Mercurial > repos > cpt > cpt_helical_wheel
comparison plotWheels/descriptors.py @ 1:9b276485c94a draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:44:43 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:9caa9aa44fd8 | 1:9b276485c94a |
---|---|
1 # -*- coding: utf-8 -*- | |
2 """ | |
3 .. currentmodule:: modlamp.descriptors | |
4 | |
5 .. moduleauthor:: modlab Alex Mueller ETH Zurich <alex.mueller@pharma.ethz.ch> | |
6 | |
7 This module incorporates different classes to calculate peptide descriptor values. The following classes are available: | |
8 | |
9 ============================= ============================================================================ | |
10 Class Characteristics | |
11 ============================= ============================================================================ | |
12 :py:class:`GlobalDescriptor` Global one-dimensional peptide descriptors calculated from the AA sequence. | |
13 :py:class:`PeptideDescriptor` AA scale based global or convoluted descriptors (auto-/cross-correlated). | |
14 ============================= ============================================================================ | |
15 | |
16 .. seealso:: :class:`modlamp.core.BaseDescriptor` from which the classes in :mod:`modlamp.descriptors` inherit. | |
17 """ | |
18 | |
19 import sys | |
20 | |
21 import numpy as np | |
22 from scipy import stats | |
23 from sklearn.externals.joblib import Parallel, delayed | |
24 | |
25 from plotWheels.core import ( | |
26 BaseDescriptor, | |
27 load_scale, | |
28 count_aas, | |
29 aa_weights, | |
30 aa_energies, | |
31 aa_formulas, | |
32 ) | |
33 | |
34 __author__ = "Alex Müller, Gisela Gabernet" | |
35 __docformat__ = "restructuredtext en" | |
36 | |
37 | |
38 def _one_autocorr(seq, window, scale): | |
39 """Private function used for calculating auto-correlated descriptors for 1 given sequence, window and an AA scale. | |
40 This function is used by the :py:func:`calculate_autocorr` method of :py:class:`PeptideDescriptor`. | |
41 | |
42 :param seq: {str} amino acid sequence to calculate descriptor for | |
43 :param window: {int} correlation-window size | |
44 :param scale: {str} amino acid scale to be used to calculate descriptor | |
45 :return: {numpy.array} calculated descriptor data | |
46 """ | |
47 try: | |
48 m = list() # list of lists to store translated sequence values | |
49 for l in range(len(seq)): # translate AA sequence into values | |
50 m.append(scale[str(seq[l])]) | |
51 # auto-correlation in defined sequence window | |
52 seqdesc = list() | |
53 for dist in range(window): # for all correlation distances | |
54 for val in range( | |
55 len(scale["A"]) | |
56 ): # for all features of the descriptor scale | |
57 valsum = list() | |
58 cntr = 0.0 | |
59 for pos in range(len(seq)): # for every position in the sequence | |
60 if (pos + dist) < len( | |
61 seq | |
62 ): # check if corr distance is possible at that sequence position | |
63 cntr += 1 # counter to scale sum | |
64 valsum.append(m[pos][val] * m[pos + dist][val]) | |
65 seqdesc.append( | |
66 sum(valsum) / cntr | |
67 ) # append scaled correlation distance values | |
68 return seqdesc | |
69 except ZeroDivisionError: | |
70 print( | |
71 "ERROR!\nThe chosen correlation window % i is larger than the sequence %s !" | |
72 % (window, seq) | |
73 ) | |
74 | |
75 | |
76 def _one_crosscorr(seq, window, scale): | |
77 """Private function used for calculating cross-correlated descriptors for 1 given sequence, window and an AA scale. | |
78 This function is used by the :py:func:`calculate_crosscorr` method of :py:class:`PeptideDescriptor`. | |
79 | |
80 :param seq: {str} amino acid sequence to calculate descriptor for | |
81 :param window: {int} correlation-window size | |
82 :param scale: {str} amino acid scale to be used to calculate descriptor | |
83 :return: {numpy.array} calculated descriptor data | |
84 """ | |
85 try: | |
86 m = list() # list of lists to store translated sequence values | |
87 for l in range(len(seq)): # translate AA sequence into values | |
88 m.append(scale[str(seq[l])]) | |
89 # auto-correlation in defined sequence window | |
90 seqdesc = list() | |
91 for val in range(len(scale["A"])): # for all features of the descriptor scale | |
92 for cc in range(len(scale["A"])): # for every feature cross correlation | |
93 if (val + cc) < len( | |
94 scale["A"] | |
95 ): # check if corr distance is in range of the num of features | |
96 for dist in range(window): # for all correlation distances | |
97 cntr = float() | |
98 valsum = list() | |
99 for pos in range( | |
100 len(seq) | |
101 ): # for every position in the sequence | |
102 if (pos + dist) < len( | |
103 seq | |
104 ): # check if corr distance is possible at that sequence pos | |
105 cntr += 1 # counter to scale sum | |
106 valsum.append(m[pos][val] * m[pos + dist][val + cc]) | |
107 seqdesc.append( | |
108 sum(valsum) / cntr | |
109 ) # append scaled correlation distance values | |
110 return seqdesc | |
111 except ZeroDivisionError: | |
112 print( | |
113 "ERROR!\nThe chosen correlation window % i is larger than the sequence %s !" | |
114 % (window, seq) | |
115 ) | |
116 | |
117 | |
118 def _one_arc(seq, modality, scale): | |
119 """Privat function used for calculating arc descriptors for one sequence and AA scale. This function is used by | |
120 :py:func:`calculate_arc` method method of :py:class:`PeptideDescriptor`. | |
121 | |
122 :param seq: {str} amino acid sequence to calculate descriptor for | |
123 :param scale: {str} amino acid scale to be used to calculate descriptor | |
124 :return: {numpy.array} calculated descriptor data | |
125 """ | |
126 desc_mat = [] | |
127 for aa in seq: | |
128 desc_mat.append(scale[aa]) | |
129 desc_mat = np.asarray(desc_mat) | |
130 | |
131 # Check descriptor dimension | |
132 desc_dim = desc_mat.shape[1] | |
133 | |
134 # list to store descriptor values for all windows | |
135 allwindows_arc = [] | |
136 | |
137 if len(seq) > 18: | |
138 window = 18 | |
139 # calculates number of windows in sequence | |
140 num_windows = len(seq) - window | |
141 else: | |
142 window = len(seq) | |
143 num_windows = 1 | |
144 | |
145 # loop through all windows | |
146 for j in range(num_windows): | |
147 # slices descriptor matrix into current window | |
148 window_mat = desc_mat[j : j + window, :] | |
149 | |
150 # defines order of amino acids in helical projection | |
151 order = [0, 11, 4, 15, 8, 1, 12, 5, 16, 9, 2, 13, 6, 17, 10, 3, 14, 7] | |
152 | |
153 # orders window descriptor matrix into helical projection order | |
154 ordered = [] | |
155 for pos in order: | |
156 try: | |
157 ordered.append(window_mat[pos, :]) | |
158 except: | |
159 # for sequences of len < 18 adding dummy vector with 2s, length of descriptor dimensions | |
160 ordered.append([2] * desc_dim) | |
161 ordered = np.asarray(ordered) | |
162 | |
163 window_arc = [] | |
164 | |
165 # loop through pharmacophoric features | |
166 for m in range(desc_dim): | |
167 all_arcs = ( | |
168 [] | |
169 ) # stores all arcs that can be found of a pharmacophoric feature | |
170 arc = 0 | |
171 | |
172 for n in range( | |
173 18 | |
174 ): # for all positions in helix, regardless of sequence length | |
175 if ( | |
176 ordered[n, m] == 0 | |
177 ): # if position does not contain pharmacophoric feature | |
178 all_arcs.append(arc) # append previous arc to all arcs list | |
179 arc = 0 # arc is initialized | |
180 elif ( | |
181 ordered[n, m] == 1 | |
182 ): # if position contains pharmacophoric feature(PF), elongate arc by 20° | |
183 arc += 20 | |
184 elif ordered[n, m] == 2: # if position doesn't contain amino acid: | |
185 if ( | |
186 ordered[n - 1, m] == 1 | |
187 ): # if previous position contained PF add 10° | |
188 arc += 10 | |
189 elif ( | |
190 ordered[n - 1, m] == 0 | |
191 ): # if previous position didn't contain PF don't add anything | |
192 arc += 0 | |
193 elif ( | |
194 ordered[n - 2, m] == 1 | |
195 ): # if previous position is empty then check second previous for PF | |
196 arc += 10 | |
197 if ( | |
198 n == 17 | |
199 ): # if we are at the last position check for position n=0 instead of next position. | |
200 if ordered[0, m] == 1: # if it contains PF add 10° extra | |
201 arc += 10 | |
202 else: # if next position contains PF add 10° extra | |
203 if ordered[n + 1, m] == 1: | |
204 arc += 10 | |
205 elif ordered[n + 1, m] == 0: | |
206 arc += 0 | |
207 else: # if next position is empty check for 2nd next position | |
208 if n == 16: | |
209 if ordered[0, m] == 1: | |
210 arc += 10 | |
211 else: | |
212 if ordered[n + 2, m] == 1: | |
213 arc += 10 | |
214 | |
215 all_arcs.append(arc) | |
216 if not arc == 360: | |
217 arc0 = all_arcs.pop() + all_arcs[0] # join first and last arc together | |
218 all_arcs = [arc0] + all_arcs[1:] | |
219 | |
220 window_arc.append( | |
221 np.max(all_arcs) | |
222 ) # append to window arcs the maximum arc of this PF | |
223 allwindows_arc.append(window_arc) # append all PF arcs of this window | |
224 | |
225 allwindows_arc = np.asarray(allwindows_arc) | |
226 | |
227 if modality == "max": | |
228 final_arc = np.max( | |
229 allwindows_arc, axis=0 | |
230 ) # calculate maximum / mean arc along all windows | |
231 elif modality == "mean": | |
232 final_arc = np.mean(allwindows_arc, axis=0) | |
233 else: | |
234 print('modality is unknown, please choose between "max" and "mean"\n.') | |
235 sys.exit() | |
236 return final_arc | |
237 | |
238 | |
239 def _charge(seq, ph=7.0, amide=False): | |
240 """Calculates charge of a single sequence. The method used is first described by Bjellqvist. In the case of | |
241 amidation, the value for the 'Cterm' pKa is 15 (and Cterm is added to the pos_pks dictionary. | |
242 The pKa scale is extracted from: http://www.hbcpnetbase.com/ (CRC Handbook of Chemistry and Physics, 96th ed). | |
243 | |
244 **pos_pks** = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04} | |
245 | |
246 **neg_pks** = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10} | |
247 | |
248 :param ph: {float} pH at which to calculate peptide charge. | |
249 :param amide: {boolean} whether the sequences have an amidated C-terminus. | |
250 :return: {array} descriptor values in the attribute :py:attr:`descriptor | |
251 """ | |
252 | |
253 if amide: | |
254 pos_pks = {"Nterm": 9.38, "K": 10.67, "R": 12.10, "H": 6.04} | |
255 neg_pks = {"Cterm": 15.0, "D": 3.71, "E": 4.15, "C": 8.14, "Y": 10.10} | |
256 else: | |
257 pos_pks = {"Nterm": 9.38, "K": 10.67, "R": 12.10, "H": 6.04} | |
258 neg_pks = {"Cterm": 2.15, "D": 3.71, "E": 4.15, "C": 8.14, "Y": 10.10} | |
259 | |
260 aa_content = count_aas(seq, scale="absolute") | |
261 aa_content["Nterm"] = 1.0 | |
262 aa_content["Cterm"] = 1.0 | |
263 pos_charge = 0.0 | |
264 for aa, pK in pos_pks.items(): | |
265 c_r = 10 ** (pK - ph) | |
266 partial_charge = c_r / (c_r + 1.0) | |
267 pos_charge += aa_content[aa] * partial_charge | |
268 neg_charge = 0.0 | |
269 for aa, pK in neg_pks.items(): | |
270 c_r = 10 ** (ph - pK) | |
271 partial_charge = c_r / (c_r + 1.0) | |
272 neg_charge += aa_content[aa] * partial_charge | |
273 return round(pos_charge - neg_charge, 3) | |
274 | |
275 | |
276 class GlobalDescriptor(BaseDescriptor): | |
277 """ | |
278 Base class for global, non-amino acid scale dependant descriptors. The following descriptors can be calculated by | |
279 the **methods** linked below: | |
280 | |
281 - `Sequence Length <modlamp.html#modlamp.descriptors.GlobalDescriptor.length>`_ | |
282 - `Molecular Formula <modlamp.html#modlamp.descriptors.GlobalDescriptor.formula>`_ | |
283 - `Molecular Weight <modlamp.html#modlamp.descriptors.GlobalDescriptor.calculate_MW>`_ | |
284 - `Sequence Charge <modlamp.html#modlamp.descriptors.GlobalDescriptor.calculate_charge>`_ | |
285 - `Charge Density <modlamp.html#modlamp.descriptors.GlobalDescriptor.charge_density>`_ | |
286 - `Isoelectric Point <modlamp.html#modlamp.descriptors.GlobalDescriptor.isoelectric_point>`_ | |
287 - `Instability Index <modlamp.html#modlamp.descriptors.GlobalDescriptor.instability_index>`_ | |
288 - `Aromaticity <modlamp.html#modlamp.descriptors.GlobalDescriptor.aromaticity>`_ | |
289 - `Aliphatic Index <modlamp.html#modlamp.descriptors.GlobalDescriptor.aliphatic_index>`_ | |
290 - `Boman Index <modlamp.html#modlamp.descriptors.GlobalDescriptor.boman_index>`_ | |
291 - `Hydrophobic Ratio <modlamp.html#modlamp.descriptors.GlobalDescriptor.hydrophobic_ratio>`_ | |
292 - `all of the above <modlamp.html#modlamp.descriptors.GlobalDescriptor.calculate_all>`_ | |
293 """ | |
294 | |
295 def length(self, append=False): | |
296 """ | |
297 Method to calculate the length (total AA count) of every sequence in the attribute :py:attr:`sequences`. | |
298 | |
299 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
300 attribute :py:attr:`descriptor`. | |
301 :return: array of sequence lengths in the attribute :py:attr:`descriptor` | |
302 :Example: | |
303 | |
304 >>> desc = GlobalDescriptor(['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR']) | |
305 >>> desc.length() | |
306 >>> desc.descriptor | |
307 array([[ 8.], [12.], [12.]]) | |
308 """ | |
309 desc = [] | |
310 for seq in self.sequences: | |
311 desc.append(float(len(seq.strip()))) | |
312 desc = np.asarray(desc).reshape(len(desc), 1) | |
313 if append: | |
314 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
315 self.featurenames.append("Length") | |
316 else: | |
317 self.descriptor = np.array(desc) | |
318 self.featurenames = ["Length"] | |
319 | |
320 def formula(self, amide=False, append=False): | |
321 """Method to calculate the molecular formula of every sequence in the attribute :py:attr:`sequences`. | |
322 | |
323 :param amide: {boolean} whether the sequences are C-terminally amidated. | |
324 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
325 attribute :py:attr:`descriptor`. | |
326 :return: array of molecular formulas {str} in the attribute :py:attr:`descriptor` | |
327 :Example: | |
328 | |
329 >>> desc = GlobalDescriptor(['KADSFLSADGHSADFSLDKKLKERL', 'ERTILSDFPQWWFASLDFLNC', 'ACDEFGHIKLMNPQRSTVWY']) | |
330 >>> desc.formula(amide=True) | |
331 >>> for v in desc.descriptor: | |
332 ... print(v[0]) | |
333 C122 H197 N35 O39 | |
334 C121 H168 N28 O33 S | |
335 C106 H157 N29 O30 S2 | |
336 | |
337 .. seealso:: :py:func:`modlamp.core.aa_formulas()` | |
338 | |
339 .. versionadded:: v2.7.6 | |
340 """ | |
341 desc = [] | |
342 formulas = aa_formulas() | |
343 for seq in self.sequences: | |
344 f = {"C": 0, "H": 0, "N": 0, "O": 0, "S": 0} | |
345 for aa in seq: # loop over all AAs | |
346 for k in f.keys(): | |
347 f[k] += formulas[aa][k] | |
348 | |
349 # substract H2O for every peptide bond | |
350 f["H"] -= 2 * (len(seq) - 1) | |
351 f["O"] -= len(seq) - 1 | |
352 | |
353 if amide: # add C-terminal amide --> replace OH with NH2 | |
354 f["O"] -= 1 | |
355 f["H"] += 1 | |
356 f["N"] += 1 | |
357 | |
358 if f["S"] != 0: | |
359 val = "C%s H%s N%s O%s %s%s" % ( | |
360 f["C"], | |
361 f["H"], | |
362 f["N"], | |
363 f["O"], | |
364 "S", | |
365 f["S"], | |
366 ) | |
367 else: | |
368 val = "C%s H%s N%s O%s" % (f["C"], f["H"], f["N"], f["O"]) | |
369 | |
370 desc.append([val]) | |
371 | |
372 if append: | |
373 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
374 self.featurenames.append("Formula") | |
375 else: | |
376 self.descriptor = np.array(desc) | |
377 self.featurenames = ["Formula"] | |
378 | |
379 def calculate_MW(self, amide=False, append=False): | |
380 """Method to calculate the molecular weight [g/mol] of every sequence in the attribute :py:attr:`sequences`. | |
381 | |
382 :param amide: {boolean} whether the sequences are C-terminally amidated (subtracts 0.95 from the MW). | |
383 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
384 attribute :py:attr:`descriptor`. | |
385 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
386 :Example: | |
387 | |
388 >>> desc = GlobalDescriptor('IAESFKGHIPL') | |
389 >>> desc.calculate_MW(amide=True) | |
390 >>> desc.descriptor | |
391 array([[ 1210.43]]) | |
392 | |
393 .. seealso:: :py:func:`modlamp.core.aa_weights()` | |
394 | |
395 .. versionchanged:: v2.1.5 amide option added | |
396 """ | |
397 desc = [] | |
398 weights = aa_weights() | |
399 for seq in self.sequences: | |
400 mw = [] | |
401 for aa in seq: # sum over aa weights | |
402 mw.append(weights[aa]) | |
403 desc.append( | |
404 round(sum(mw) - 18.015 * (len(seq) - 1), 2) | |
405 ) # sum over AA MW and subtract H20 MW for every | |
406 # peptide bond | |
407 desc = np.asarray(desc).reshape(len(desc), 1) | |
408 if ( | |
409 amide | |
410 ): # if sequences are amidated, subtract 0.98 from calculated MW (OH - NH2) | |
411 desc = [d - 0.98 for d in desc] | |
412 if append: | |
413 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
414 self.featurenames.append("MW") | |
415 else: | |
416 self.descriptor = np.array(desc) | |
417 self.featurenames = ["MW"] | |
418 | |
419 def calculate_charge(self, ph=7.0, amide=False, append=False): | |
420 """Method to overall charge of every sequence in the attribute :py:attr:`sequences`. | |
421 | |
422 The method used is first described by Bjellqvist. In the case of amidation, the value for the 'Cterm' pKa is 15 | |
423 (and Cterm is added to the pos_pKs dictionary. | |
424 The pKa scale is extracted from: http://www.hbcpnetbase.com/ (CRC Handbook of Chemistry and Physics, 96th ed). | |
425 | |
426 **pos_pKs** = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04} | |
427 | |
428 **neg_pKs** = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10} | |
429 | |
430 :param ph: {float} ph at which to calculate peptide charge. | |
431 :param amide: {boolean} whether the sequences have an amidated C-terminus. | |
432 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
433 attribute :py:attr:`descriptor`. | |
434 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
435 :Example: | |
436 | |
437 >>> desc = GlobalDescriptor('KLAKFGKRSELVALSG') | |
438 >>> desc.calculate_charge(ph=7.4, amide=True) | |
439 >>> desc.descriptor | |
440 array([[ 3.989]]) | |
441 """ | |
442 | |
443 desc = [] | |
444 for seq in self.sequences: | |
445 desc.append( | |
446 _charge(seq, ph, amide) | |
447 ) # calculate charge with helper function | |
448 desc = np.asarray(desc).reshape(len(desc), 1) | |
449 if append: | |
450 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
451 self.featurenames.append("Charge") | |
452 else: | |
453 self.descriptor = np.array(desc) | |
454 self.featurenames = ["Charge"] | |
455 | |
456 def charge_density(self, ph=7.0, amide=False, append=False): | |
457 """Method to calculate the charge density (charge / MW) of every sequences in the attributes :py:attr:`sequences` | |
458 | |
459 :param ph: {float} pH at which to calculate peptide charge. | |
460 :param amide: {boolean} whether the sequences have an amidated C-terminus. | |
461 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
462 attribute :py:attr:`descriptor`. | |
463 :return: array of descriptor values in the attribute :py:attr:`descriptor`. | |
464 :Example: | |
465 | |
466 >>> desc = GlobalDescriptor('GNSDLLIEQRTLLASDEF') | |
467 >>> desc.charge_density(ph=6, amide=True) | |
468 >>> desc.descriptor | |
469 array([[-0.00097119]]) | |
470 """ | |
471 self.calculate_charge(ph, amide) | |
472 charges = self.descriptor | |
473 self.calculate_MW(amide) | |
474 masses = self.descriptor | |
475 desc = charges / masses | |
476 desc = np.asarray(desc).reshape(len(desc), 1) | |
477 if append: | |
478 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
479 self.featurenames.append("ChargeDensity") | |
480 else: | |
481 self.descriptor = np.array(desc) | |
482 self.featurenames = ["ChargeDensity"] | |
483 | |
484 def isoelectric_point(self, amide=False, append=False): | |
485 """ | |
486 Method to calculate the isoelectric point of every sequence in the attribute :py:attr:`sequences`. | |
487 The pK scale is extracted from: http://www.hbcpnetbase.com/ (CRC Handbook of Chemistry and Physics, 96th ed). | |
488 | |
489 **pos_pKs** = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04} | |
490 | |
491 **neg_pKs** = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10} | |
492 | |
493 :param amide: {boolean} whether the sequences have an amidated C-terminus. | |
494 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
495 attribute :py:attr:`descriptor`. | |
496 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
497 :Example: | |
498 | |
499 >>> desc = GlobalDescriptor('KLFDIKFGHIPQRST') | |
500 >>> desc.isoelectric_point() | |
501 >>> desc.descriptor | |
502 array([[ 10.6796875]]) | |
503 """ | |
504 ph, ph1, ph2 = float(), float(), float() | |
505 desc = [] | |
506 for seq in self.sequences: | |
507 | |
508 # Bracket between ph1 and ph2 | |
509 ph = 7.0 | |
510 charge = _charge(seq, ph, amide) | |
511 if charge > 0.0: | |
512 ph1 = ph | |
513 charge1 = charge | |
514 while charge1 > 0.0: | |
515 ph = ph1 + 1.0 | |
516 charge = _charge(seq, ph, amide) | |
517 if charge > 0.0: | |
518 ph1 = ph | |
519 charge1 = charge | |
520 else: | |
521 ph2 = ph | |
522 break | |
523 else: | |
524 ph2 = ph | |
525 charge2 = charge | |
526 while charge2 < 0.0: | |
527 ph = ph2 - 1.0 | |
528 charge = _charge(seq, ph, amide) | |
529 if charge < 0.0: | |
530 ph2 = ph | |
531 charge2 = charge | |
532 else: | |
533 ph1 = ph | |
534 break | |
535 # Bisection | |
536 while ph2 - ph1 > 0.0001 and charge != 0.0: | |
537 ph = (ph1 + ph2) / 2.0 | |
538 charge = _charge(seq, ph, amide) | |
539 if charge > 0.0: | |
540 ph1 = ph | |
541 else: | |
542 ph2 = ph | |
543 desc.append(ph) | |
544 desc = np.asarray(desc).reshape(len(desc), 1) | |
545 if append: | |
546 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
547 self.featurenames.append("pI") | |
548 else: | |
549 self.descriptor = np.array(desc) | |
550 self.featurenames = ["pI"] | |
551 | |
552 def instability_index(self, append=False): | |
553 """ | |
554 Method to calculate the instability of every sequence in the attribute :py:attr:`sequences`. | |
555 The instability index is a prediction of protein stability based on the amino acid composition. | |
556 ([1] K. Guruprasad, B. V Reddy, M. W. Pandit, Protein Eng. 1990, 4, 155–161.) | |
557 | |
558 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
559 attribute :py:attr:`descriptor`. | |
560 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
561 :Example: | |
562 | |
563 >>> desc = GlobalDescriptor('LLASMNDLLAKRST') | |
564 >>> desc.instability_index() | |
565 >>> desc.descriptor | |
566 array([[ 63.95714286]]) | |
567 """ | |
568 | |
569 desc = [] | |
570 dimv = load_scale("instability")[1] | |
571 for seq in self.sequences: | |
572 stabindex = float() | |
573 for i in range(len(seq) - 1): | |
574 stabindex += dimv[seq[i]][seq[i + 1]] | |
575 desc.append((10.0 / len(seq)) * stabindex) | |
576 desc = np.asarray(desc).reshape(len(desc), 1) | |
577 if append: | |
578 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
579 self.featurenames.append("InstabilityInd") | |
580 else: | |
581 self.descriptor = np.array(desc) | |
582 self.featurenames = ["InstabilityInd"] | |
583 | |
584 def aromaticity(self, append=False): | |
585 """ | |
586 Method to calculate the aromaticity of every sequence in the attribute :py:attr:`sequences`. | |
587 According to Lobry, 1994, it is simply the relative frequency of Phe+Trp+Tyr. | |
588 | |
589 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
590 attribute :py:attr:`descriptor`. | |
591 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
592 :Example: | |
593 | |
594 >>> desc = GlobalDescriptor('GLFYWRFFLQRRFLYWW') | |
595 >>> desc.aromaticity() | |
596 >>> desc.descriptor | |
597 array([[ 0.52941176]]) | |
598 """ | |
599 desc = [] | |
600 for seq in self.sequences: | |
601 f = seq.count("F") | |
602 w = seq.count("W") | |
603 y = seq.count("Y") | |
604 desc.append(float(f + w + y) / len(seq)) | |
605 desc = np.asarray(desc).reshape(len(desc), 1) | |
606 if append: | |
607 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
608 self.featurenames.append("Aromaticity") | |
609 else: | |
610 self.descriptor = np.array(desc) | |
611 self.featurenames = ["Aromaticity"] | |
612 | |
613 def aliphatic_index(self, append=False): | |
614 """ | |
615 Method to calculate the aliphatic index of every sequence in the attribute :py:attr:`sequences`. | |
616 According to Ikai, 1980, the aliphatic index is a measure of thermal stability of proteins and is dependant | |
617 on the relative volume occupied by aliphatic amino acids (A,I,L & V). | |
618 ([1] A. Ikai, J. Biochem. 1980, 88, 1895–1898.) | |
619 | |
620 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
621 attribute :py:attr:`descriptor`. | |
622 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
623 :Example: | |
624 | |
625 >>> desc = GlobalDescriptor('KWLKYLKKLAKLVK') | |
626 >>> desc.aliphatic_index() | |
627 >>> desc.descriptor | |
628 array([[ 139.28571429]]) | |
629 """ | |
630 desc = [] | |
631 aa_dict = aa_weights() | |
632 for seq in self.sequences: | |
633 d = {aa: seq.count(aa) for aa in aa_dict.keys()} # count aa | |
634 d = { | |
635 k: (float(d[k]) / len(seq)) * 100 for k in d.keys() | |
636 } # get mole percent of all AA | |
637 desc.append( | |
638 d["A"] + 2.9 * d["V"] + 3.9 * (d["I"] + d["L"]) | |
639 ) # formula for calculating the AI (Ikai, 1980) | |
640 desc = np.asarray(desc).reshape(len(desc), 1) | |
641 if append: | |
642 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
643 self.featurenames.append("AliphaticInd") | |
644 else: | |
645 self.descriptor = np.array(desc) | |
646 self.featurenames = ["AliphaticInd"] | |
647 | |
648 def boman_index(self, append=False): | |
649 """Method to calculate the boman index of every sequence in the attribute :py:attr:`sequences`. | |
650 According to Boman, 2003, the boman index is a measure for protein-protein interactions and is calculated by | |
651 summing over all amino acid free energy of transfer [kcal/mol] between water and cyclohexane,[2] followed by | |
652 dividing by sequence length. | |
653 ([1] H. G. Boman, D. Wade, I. a Boman, B. Wåhlin, R. B. Merrifield, *FEBS Lett*. **1989**, *259*, 103–106. | |
654 [2] A. Radzick, R. Wolfenden, *Biochemistry* **1988**, *27*, 1664–1670.) | |
655 | |
656 .. seealso:: :py:func:`modlamp.core.aa_energies()` | |
657 | |
658 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
659 attribute :py:attr:`descriptor`. | |
660 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
661 :Example: | |
662 | |
663 >>> desc = GlobalDescriptor('GLFDIVKKVVGALGSL') | |
664 >>> desc.boman_index() | |
665 >>> desc.descriptor | |
666 array([[-1.011875]]) | |
667 """ | |
668 d = aa_energies() | |
669 desc = [] | |
670 for seq in self.sequences: | |
671 val = [] | |
672 for a in seq: | |
673 val.append(d[a]) | |
674 desc.append(sum(val) / len(val)) | |
675 desc = np.asarray(desc).reshape(len(desc), 1) | |
676 if append: | |
677 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
678 self.featurenames.append("BomanInd") | |
679 else: | |
680 self.descriptor = np.array(desc) | |
681 self.featurenames = ["BomanInd"] | |
682 | |
683 def hydrophobic_ratio(self, append=False): | |
684 """ | |
685 Method to calculate the hydrophobic ratio of every sequence in the attribute :py:attr:`sequences`, which is the | |
686 relative frequency of the amino acids **A,C,F,I,L,M & V**. | |
687 | |
688 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
689 attribute :py:attr:`descriptor`. | |
690 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
691 :Example: | |
692 | |
693 >>> desc = GlobalDescriptor('VALLYWRTVLLAIII') | |
694 >>> desc.hydrophobic_ratio() | |
695 >>> desc.descriptor | |
696 array([[ 0.73333333]]) | |
697 """ | |
698 desc = [] | |
699 aa_dict = aa_weights() | |
700 for seq in self.sequences: | |
701 pa = {aa: seq.count(aa) for aa in aa_dict.keys()} # count aa | |
702 # formula for calculating the AI (Ikai, 1980): | |
703 desc.append( | |
704 (pa["A"] + pa["C"] + pa["F"] + pa["I"] + pa["L"] + pa["M"] + pa["V"]) | |
705 / float(len(seq)) | |
706 ) | |
707 desc = np.asarray(desc).reshape(len(desc), 1) | |
708 if append: | |
709 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
710 self.featurenames.append("HydrophRatio") | |
711 else: | |
712 self.descriptor = np.array(desc) | |
713 self.featurenames = ["HydrophRatio"] | |
714 | |
715 def calculate_all(self, ph=7.4, amide=True): | |
716 """Method combining all global descriptors and appending them into the feature matrix in the attribute | |
717 :py:attr:`descriptor`. | |
718 | |
719 :param ph: {float} pH at which to calculate peptide charge | |
720 :param amide: {boolean} whether the sequences have an amidated C-terminus. | |
721 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
722 :Example: | |
723 | |
724 >>> desc = GlobalDescriptor('AFGHFKLKKLFIFGHERT') | |
725 >>> desc.calculate_all(amide=True) | |
726 >>> desc.featurenames | |
727 ['Length', 'MW', 'ChargeDensity', 'pI', 'InstabilityInd', 'Aromaticity', 'AliphaticInd', 'BomanInd', 'HydRatio'] | |
728 >>> desc.descriptor | |
729 array([[ 18., 2.17559000e+03, 1.87167619e-03, 1.16757812e+01, ... 1.10555556e+00, 4.44444444e-01]]) | |
730 >>> desc.save_descriptor('/path/to/outputfile.csv') # save the descriptor data (with feature names header) | |
731 """ | |
732 | |
733 # This is a strange way of doing it. However, the append=True option excludes length and charge, no idea why! | |
734 fn = [] | |
735 self.length() # sequence length | |
736 l = self.descriptor | |
737 fn.extend(self.featurenames) | |
738 self.calculate_MW(amide=amide) # molecular weight | |
739 mw = self.descriptor | |
740 fn.extend(self.featurenames) | |
741 self.calculate_charge(ph=ph, amide=amide) # net charge | |
742 c = self.descriptor | |
743 fn.extend(self.featurenames) | |
744 self.charge_density(ph=ph, amide=amide) # charge density | |
745 cd = self.descriptor | |
746 fn.extend(self.featurenames) | |
747 self.isoelectric_point(amide=amide) # pI | |
748 pi = self.descriptor | |
749 fn.extend(self.featurenames) | |
750 self.instability_index() # instability index | |
751 si = self.descriptor | |
752 fn.extend(self.featurenames) | |
753 self.aromaticity() # global aromaticity | |
754 ar = self.descriptor | |
755 fn.extend(self.featurenames) | |
756 self.aliphatic_index() # aliphatic index | |
757 ai = self.descriptor | |
758 fn.extend(self.featurenames) | |
759 self.boman_index() # Boman index | |
760 bi = self.descriptor | |
761 fn.extend(self.featurenames) | |
762 self.hydrophobic_ratio() # Hydrophobic ratio | |
763 hr = self.descriptor | |
764 fn.extend(self.featurenames) | |
765 | |
766 self.descriptor = np.concatenate((l, mw, c, cd, pi, si, ar, ai, bi, hr), axis=1) | |
767 self.featurenames = fn | |
768 | |
769 | |
770 class PeptideDescriptor(BaseDescriptor): | |
771 """Base class for peptide descriptors. The following **amino acid descriptor scales** are available for descriptor | |
772 calculation: | |
773 | |
774 - **AASI** (An amino acid selectivity index scale for helical antimicrobial peptides, *[1] D. Juretić, D. Vukicević, N. Ilić, N. Antcheva, A. Tossi, J. Chem. Inf. Model. 2009, 49, 2873–2882.*) | |
775 - **ABHPRK** (modlabs inhouse physicochemical feature scale (Acidic, Basic, Hydrophobic, Polar, aRomatic, Kink-inducer) | |
776 - **argos** (Argos hydrophobicity amino acid scale, *[2] Argos, P., Rao, J. K. M. & Hargrave, P. A., Eur. J. Biochem. 2005, 128, 565–575.*) | |
777 - **bulkiness** (Amino acid side chain bulkiness scale, *[3] J. M. Zimmerman, N. Eliezer, R. Simha, J. Theor. Biol. 1968, 21, 170–201.*) | |
778 - **charge_phys** (Amino acid charge at pH 7.0 - Hystidine charge +0.1.) | |
779 - **charge_acid** (Amino acid charge at acidic pH - Hystidine charge +1.0.) | |
780 - **cougar** (modlabs inhouse selection of global peptide descriptors) | |
781 - **eisenberg** (the Eisenberg hydrophobicity consensus amino acid scale, *[4] D. Eisenberg, R. M. Weiss, T. C. Terwilliger, W. Wilcox, Faraday Symp. Chem. Soc. 1982, 17, 109.*) | |
782 - **Ez** (potential that assesses energies of insertion of amino acid side chains into lipid bilayers, *[5] A. Senes, D. C. Chadi, P. B. Law, R. F. S. Walters, V. Nanda, W. F. DeGrado, J. Mol. Biol. 2007, 366, 436–448.*) | |
783 - **flexibility** (amino acid side chain flexibilitiy scale, *[6] R. Bhaskaran, P. K. Ponnuswamy, Int. J. Pept. Protein Res. 1988, 32, 241–255.*) | |
784 - **grantham** (amino acid side chain composition, polarity and molecular volume, *[8] Grantham, R. Science. 185, 862–864 (1974).*) | |
785 - **gravy** (GRAVY hydrophobicity amino acid scale, *[9] J. Kyte, R. F. Doolittle, J. Mol. Biol. 1982, 157, 105–132.*) | |
786 - **hopp-woods** (Hopp-Woods amino acid hydrophobicity scale,*[10] T. P. Hopp, K. R. Woods, Proc. Natl. Acad. Sci. 1981, 78, 3824–3828.*) | |
787 - **ISAECI** (Isotropic Surface Area (ISA) and Electronic Charge Index (ECI) of amino acid side chains, *[11] E. R. Collantes, W. J. Dunn, J. Med. Chem. 1995, 38, 2705–2713.*) | |
788 - **janin** (Janin hydrophobicity amino acid scale, *[12] J. L. Cornette, K. B. Cease, H. Margalit, J. L. Spouge, J. A. Berzofsky, C. DeLisi, J. Mol. Biol. 1987, 195, 659–685.*) | |
789 - **kytedoolittle** (Kyte & Doolittle hydrophobicity amino acid scale, *[13] J. Kyte, R. F. Doolittle, J. Mol. Biol. 1982, 157, 105–132.*) | |
790 - **levitt_alpha** (Levitt amino acid alpha-helix propensity scale, extracted from http://web.expasy.org/protscale. *[14] M. Levitt, Biochemistry 1978, 17, 4277-4285.*) | |
791 - **MSS** (A graph-theoretical index that reflects topological shape and size of amino acid side chains, *[15] C. Raychaudhury, A. Banerjee, P. Bag, S. Roy, J. Chem. Inf. Comput. Sci. 1999, 39, 248–254.*) | |
792 - **MSW** (Amino acid scale based on a PCA of the molecular surface based WHIM descriptor (MS-WHIM), extended to natural amino acids, *[16] A. Zaliani, E. Gancia, J. Chem. Inf. Comput. Sci 1999, 39, 525–533.*) | |
793 - **pepArc** (modlabs pharmacophoric feature scale, dimensions are: hydrophobicity, polarity, positive charge, negative charge, proline.) | |
794 - **pepcats** (modlabs pharmacophoric feature based PEPCATS scale, *[17] C. P. Koch, A. M. Perna, M. Pillong, N. K. Todoroff, P. Wrede, G. Folkers, J. A. Hiss, G. Schneider, PLoS Comput. Biol. 2013, 9, e1003088.*) | |
795 - **polarity** (Amino acid polarity scale, *[18] J. M. Zimmerman, N. Eliezer, R. Simha, J. Theor. Biol. 1968, 21, 170–201.*) | |
796 - **PPCALI** (modlabs inhouse scale derived from a PCA of 143 amino acid property scales, *[19] C. P. Koch, A. M. Perna, M. Pillong, N. K. Todoroff, P. Wrede, G. Folkers, J. A. Hiss, G. Schneider, PLoS Comput. Biol. 2013, 9, e1003088.*) | |
797 - **refractivity** (Relative amino acid refractivity values, *[20] T. L. McMeekin, M. Wilensky, M. L. Groves, Biochem. Biophys. Res. Commun. 1962, 7, 151–156.*) | |
798 - **t_scale** (A PCA derived scale based on amino acid side chain properties calculated with 6 different probes of the GRID program, *[21] M. Cocchi, E. Johansson, Quant. Struct. Act. Relationships 1993, 12, 1–8.*) | |
799 - **TM_tend** (Amino acid transmembrane propensity scale, extracted from http://web.expasy.org/protscale, *[22] Zhao, G., London E. Protein Sci. 2006, 15, 1987-2001.*) | |
800 - **z3** (The original three dimensional Z-scale, *[23] S. Hellberg, M. Sjöström, B. Skagerberg, S. Wold, J. Med. Chem. 1987, 30, 1126–1135.*) | |
801 - **z5** (The extended five dimensional Z-scale, *[24] M. Sandberg, L. Eriksson, J. Jonsson, M. Sjöström, S. Wold, J. Med. Chem. 1998, 41, 2481–2491.*) | |
802 | |
803 Further, amino acid scale independent methods can be calculated with help of the :class:`GlobalDescriptor` class. | |
804 | |
805 """ | |
806 | |
807 def __init__(self, seqs, scalename="Eisenberg"): | |
808 """ | |
809 :param seqs: a .fasta file with sequences, a list of sequences or a single sequence as string to calculate the | |
810 descriptor values for. | |
811 :param scalename: {str} name of the amino acid scale (one of the given list above) used to calculate the | |
812 descriptor values | |
813 :return: initialized attributes :py:attr:`sequences`, :py:attr:`names` and dictionary :py:attr:`scale` with | |
814 amino acid scale values of the scale name in :py:attr:`scalename`. | |
815 :Example: | |
816 | |
817 >>> AMP = PeptideDescriptor('KLLKLLKKLLKLLK','pepcats') | |
818 >>> AMP.sequences | |
819 ['KLLKLLKKLLKLLK'] | |
820 >>> seqs = PeptideDescriptor('/Path/to/file.fasta', 'eisenberg') # load sequences from .fasta file | |
821 >>> seqs.sequences | |
822 ['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR'...] | |
823 """ | |
824 super(PeptideDescriptor, self).__init__(seqs) | |
825 self.scalename, self.scale = load_scale(scalename.lower()) | |
826 self.all_moms = list() # for passing hydrophobic moments to calculate_profile | |
827 self.all_globs = list() # for passing global to calculate_profile | |
828 | |
829 def load_scale(self, scalename): | |
830 """Method to load amino acid values from a given scale | |
831 | |
832 :param scalename: {str} name of the amino acid scale to be loaded. | |
833 :return: loaded amino acid scale values in a dictionary in the attribute :py:attr:`scale`. | |
834 | |
835 .. seealso:: :func:`modlamp.core.load_scale()` | |
836 """ | |
837 self.scalename, self.scale = load_scale(scalename.lower()) | |
838 | |
839 def calculate_autocorr(self, window, append=False): | |
840 """Method for auto-correlating the amino acid values for a given descriptor scale | |
841 | |
842 :param window: {int} correlation window for descriptor calculation in a sliding window approach | |
843 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
844 attribute :py:attr:`descriptor`. | |
845 :return: calculated descriptor numpy.array in the attribute :py:attr:`descriptor`. | |
846 :Example: | |
847 | |
848 >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL','PPCALI') | |
849 >>> AMP.calculate_autocorr(7) | |
850 >>> AMP.descriptor | |
851 array([[ 1.28442339e+00, 1.29025116e+00, 1.03240901e+00, .... ]]) | |
852 >>> AMP.descriptor.shape | |
853 (1, 133) | |
854 | |
855 .. versionchanged:: v.2.3.0 | |
856 """ | |
857 desc = Parallel(n_jobs=-1)( | |
858 delayed(_one_autocorr)(seq, window, self.scale) for seq in self.sequences | |
859 ) | |
860 | |
861 if append: | |
862 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
863 else: | |
864 self.descriptor = np.array(desc) | |
865 | |
866 def calculate_crosscorr(self, window, append=False): | |
867 """Method for cross-correlating the amino acid values for a given descriptor scale | |
868 | |
869 :param window: {int} correlation window for descriptor calculation in a sliding window approach | |
870 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
871 attribute :py:attr:`descriptor`. | |
872 :return: calculated descriptor numpy.array in the attribute :py:attr:`descriptor`. | |
873 :Example: | |
874 | |
875 >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL','pepcats') | |
876 >>> AMP.calculate_crosscorr(7) | |
877 >>> AMP.descriptor | |
878 array([[ 0.6875 , 0.46666667, 0.42857143, 0.61538462, 0.58333333, ... ]]) | |
879 >>> AMP.descriptor.shape | |
880 (1, 147) | |
881 """ | |
882 desc = Parallel(n_jobs=-1)( | |
883 delayed(_one_crosscorr)(seq, window, self.scale) for seq in self.sequences | |
884 ) | |
885 | |
886 if append: | |
887 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
888 else: | |
889 self.descriptor = np.array(desc) | |
890 | |
891 def calculate_moment(self, window=1000, angle=100, modality="max", append=False): | |
892 """Method for calculating the maximum or mean moment of the amino acid values for a given descriptor scale and | |
893 window. | |
894 | |
895 :param window: {int} amino acid window in which to calculate the moment. If the sequence is shorter than the | |
896 window, the length of the sequence is taken. So if the default window of 1000 is chosen, for all sequences | |
897 shorter than 1000, the **global** hydrophobic moment will be calculated. Otherwise, the maximal | |
898 hydrophiobic moment for the chosen window size found in the sequence will be returned. | |
899 :param angle: {int} angle in which to calculate the moment. **100** for alpha helices, **180** for beta sheets. | |
900 :param modality: {'all', 'max' or 'mean'} Calculate respectively maximum or mean hydrophobic moment. If all, | |
901 moments for all windows are returned. | |
902 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
903 attribute :py:attr:`descriptor`. | |
904 :return: Calculated descriptor as a numpy.array in the attribute :py:attr:`descriptor` and all possible global | |
905 values in :py:attr:`all_moms` (needed for the :py:func:`calculate_profile` method) | |
906 :Example: | |
907 | |
908 >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL', 'eisenberg') | |
909 >>> AMP.calculate_moment() | |
910 >>> AMP.descriptor | |
911 array([[ 0.48790226]]) | |
912 """ | |
913 if self.scale["A"] == list: | |
914 print( | |
915 "\n Descriptor moment calculation is only possible for one dimensional descriptors.\n" | |
916 ) | |
917 | |
918 else: | |
919 desc = [] | |
920 for seq in self.sequences: | |
921 wdw = min( | |
922 window, len(seq) | |
923 ) # if sequence is shorter than window, take the whole sequence instead | |
924 mtrx = [] | |
925 mwdw = [] | |
926 | |
927 for aa in range(len(seq)): | |
928 mtrx.append(self.scale[str(seq[aa])]) | |
929 | |
930 for i in range(len(mtrx) - wdw + 1): | |
931 mwdw.append(sum(mtrx[i : i + wdw], [])) | |
932 | |
933 mwdw = np.asarray(mwdw) | |
934 rads = ( | |
935 angle * (np.pi / 180) * np.asarray(range(wdw)) | |
936 ) # calculate actual moment (radial) | |
937 vcos = (mwdw * np.cos(rads)).sum(axis=1) | |
938 vsin = (mwdw * np.sin(rads)).sum(axis=1) | |
939 moms = np.sqrt(vsin**2 + vcos**2) / wdw | |
940 | |
941 if modality == "max": # take window with maximal value | |
942 moment = np.max(moms) | |
943 elif modality == "mean": # take average value over all windows | |
944 moment = np.mean(moms) | |
945 elif modality == "all": | |
946 moment = moms | |
947 else: | |
948 print( | |
949 '\nERROR!\nModality parameter is wrong, please choose between "all", "max" and "mean".\n' | |
950 ) | |
951 return | |
952 desc.append(moment) | |
953 self.all_moms.append(moms) | |
954 | |
955 desc = np.asarray(desc).reshape(len(desc), 1) # final descriptor array | |
956 | |
957 if append: | |
958 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
959 else: | |
960 self.descriptor = np.array(desc) | |
961 | |
962 def calculate_global(self, window=1000, modality="max", append=False): | |
963 """Method for calculating a global / window averaging descriptor value of a given AA scale | |
964 | |
965 :param window: {int} amino acid window in which to calculate the moment. If the sequence is shorter than the | |
966 window, the length of the sequence is taken. | |
967 :param modality: {'max' or 'mean'} Calculate respectively maximum or mean hydrophobic moment. | |
968 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
969 attribute :py:attr:`descriptor`. | |
970 :return: Calculated descriptor as a numpy.array in the attribute :py:attr:`descriptor` and all possible global | |
971 values in :py:attr:`all_globs` (needed for the :py:func:`calculate_profile` method) | |
972 :Example: | |
973 | |
974 >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL','eisenberg') | |
975 >>> AMP.calculate_global(window=1000, modality='max') | |
976 >>> AMP.descriptor | |
977 array([[ 0.44875]]) | |
978 """ | |
979 desc = list() | |
980 for n, seq in enumerate(self.sequences): | |
981 wdw = min( | |
982 window, len(seq) | |
983 ) # if sequence is shorter than window, take the whole sequence instead | |
984 mtrx = [] | |
985 mwdw = [] | |
986 | |
987 for l in range(len(seq)): # translate AA sequence into values | |
988 mtrx.append(self.scale[str(seq[l])]) | |
989 | |
990 for i in range(len(mtrx) - wdw + 1): | |
991 mwdw.append( | |
992 sum(mtrx[i : i + wdw], []) | |
993 ) # list of all the values for the different windows | |
994 | |
995 mwdw = np.asarray(mwdw) | |
996 glob = np.sum(mwdw, axis=1) / float(wdw) | |
997 outglob = float() | |
998 | |
999 if modality in ["max", "mean"]: | |
1000 if modality == "max": | |
1001 outglob = np.max( | |
1002 glob | |
1003 ) # returned moment will be the maximum of all windows | |
1004 elif modality == "mean": | |
1005 outglob = np.mean( | |
1006 glob | |
1007 ) # returned moment will be the mean of all windows | |
1008 else: | |
1009 print( | |
1010 'Modality parameter is wrong, please choose between "max" and "mean"\n.' | |
1011 ) | |
1012 return | |
1013 desc.append(outglob) | |
1014 self.all_globs.append(glob) | |
1015 | |
1016 desc = np.asarray(desc).reshape(len(desc), 1) | |
1017 if append: | |
1018 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
1019 else: | |
1020 self.descriptor = np.array(desc) | |
1021 | |
1022 def calculate_profile(self, prof_type="uH", window=7, append=False): | |
1023 """Method for calculating hydrophobicity or hydrophobic moment profiles for given sequences and fitting for | |
1024 slope and intercept. The hydrophobicity scale used is "eisenberg" | |
1025 | |
1026 :param prof_type: prof_type of profile, available: 'H' for hydrophobicity or 'uH' for hydrophobic moment | |
1027 :param window: {int} size of sliding window used (odd-numbered). | |
1028 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
1029 attribute :py:attr:`descriptor`. | |
1030 :return: Fitted slope and intercept of calculated profile for every given sequence in the attribute | |
1031 :py:attr:`descriptor`. | |
1032 :Example: | |
1033 | |
1034 >>> AMP = PeptideDescriptor('KLLKLLKKVVGALG','kytedoolittle') | |
1035 >>> AMP.calculate_profile(prof_type='H') | |
1036 >>> AMP.descriptor | |
1037 array([[ 0.03731293, 0.19246599]]) | |
1038 """ | |
1039 if prof_type == "uH": | |
1040 self.calculate_moment(window=window) | |
1041 y_vals = self.all_moms | |
1042 elif prof_type == "H": | |
1043 self.calculate_global(window=window) | |
1044 y_vals = self.all_globs | |
1045 else: | |
1046 print( | |
1047 'prof_type parameter is unknown, choose "uH" for hydrophobic moment or "H" for hydrophobicity\n.' | |
1048 ) | |
1049 sys.exit() | |
1050 | |
1051 desc = list() | |
1052 for n, seq in enumerate(self.sequences): | |
1053 x_vals = range(len(seq))[int((window - 1) / 2) : -int((window - 1) / 2)] | |
1054 if len(seq) <= window: | |
1055 slope, intercept, r_value, p_value, std_err = [0, 0, 0, 0, 0] | |
1056 else: | |
1057 slope, intercept, r_value, p_value, std_err = stats.linregress( | |
1058 x_vals, y_vals[n] | |
1059 ) | |
1060 desc.append([slope, intercept]) | |
1061 | |
1062 if append: | |
1063 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
1064 else: | |
1065 self.descriptor = np.array(desc) | |
1066 | |
1067 def calculate_arc(self, modality="max", append=False): | |
1068 """Method for calculating property arcs as seen in the helical wheel plot. Use for binary amino acid scales only. | |
1069 | |
1070 :param modality: modality of the arc to calculate, to choose between "max" and "mean". | |
1071 :param append: if true, append to current descriptor stored in the descriptor attribute. | |
1072 :return: calculated descriptor as numpy.array in the descriptor attribute. | |
1073 | |
1074 :Example: | |
1075 | |
1076 >>> arc = PeptideDescriptor("KLLKLLKKLLKLLK", scalename="peparc") | |
1077 >>> arc.calculate_arc(modality="max", append=False) | |
1078 >>> arc.descriptor | |
1079 array([[200, 160, 160, 0, 0]]) | |
1080 """ | |
1081 desc = Parallel(n_jobs=-1)( | |
1082 delayed(_one_arc)(seq, modality, self.scale) for seq in self.sequences | |
1083 ) | |
1084 | |
1085 # Converts each of the amino acids to descriptor vector | |
1086 for seq in self.sequences: | |
1087 | |
1088 # desc_mat = [] | |
1089 # for aa in seq: | |
1090 # desc_mat.append(self.scale[aa]) | |
1091 # desc_mat = np.asarray(desc_mat) | |
1092 # | |
1093 # # Check descriptor dimension | |
1094 # desc_dim = desc_mat.shape[1] | |
1095 # | |
1096 # # list to store descriptor values for all windows | |
1097 # allwindows_arc = [] | |
1098 # | |
1099 # if len(seq) > 18: | |
1100 # window = 18 | |
1101 # # calculates number of windows in sequence | |
1102 # num_windows = len(seq) - window | |
1103 # else: | |
1104 # window = len(seq) | |
1105 # num_windows = 1 | |
1106 # | |
1107 # # loop through all windows | |
1108 # for j in range(num_windows): | |
1109 # # slices descriptor matrix into current window | |
1110 # window_mat = desc_mat[j:j + window, :] | |
1111 # | |
1112 # # defines order of amino acids in helical projection | |
1113 # order = [0, 11, 4, 15, 8, 1, 12, 5, 16, 9, 2, 13, 6, 17, 10, 3, 14, 7] | |
1114 # | |
1115 # # orders window descriptor matrix into helical projection order | |
1116 # ordered = [] | |
1117 # for pos in order: | |
1118 # try: | |
1119 # ordered.append(window_mat[pos, :]) | |
1120 # except: | |
1121 # # for sequences of len < 18 adding dummy vector with 2s, length of descriptor dimensions | |
1122 # ordered.append([2] * desc_dim) | |
1123 # ordered = np.asarray(ordered) | |
1124 # | |
1125 # window_arc = [] | |
1126 # | |
1127 # # loop through pharmacophoric features | |
1128 # for m in range(desc_dim): | |
1129 # all_arcs = [] # stores all arcs that can be found of a pharmacophoric feature | |
1130 # arc = 0 | |
1131 # | |
1132 # for n in range(18): # for all positions in helix, regardless of sequence length | |
1133 # if ordered[n, m] == 0: # if position does not contain pharmacophoric feature | |
1134 # all_arcs.append(arc) # append previous arc to all arcs list | |
1135 # arc = 0 # arc is initialized | |
1136 # elif ordered[n, m] == 1: # if position contains pharmacophoric feature(PF), elongate arc by 20° | |
1137 # arc += 20 | |
1138 # elif ordered[n, m] == 2: # if position doesn't contain amino acid: | |
1139 # if ordered[n - 1, m] == 1: # if previous position contained PF add 10° | |
1140 # arc += 10 | |
1141 # elif ordered[n - 1, m] == 0: # if previous position didn't contain PF don't add anything | |
1142 # arc += 0 | |
1143 # elif ordered[ | |
1144 # n - 2, m] == 1: # if previous position is empty then check second previous for PF | |
1145 # arc += 10 | |
1146 # if n == 17: # if we are at the last position check for position n=0 instead of next position. | |
1147 # if ordered[0, m] == 1: # if it contains PF add 10° extra | |
1148 # arc += 10 | |
1149 # else: # if next position contains PF add 10° extra | |
1150 # if ordered[n + 1, m] == 1: | |
1151 # arc += 10 | |
1152 # elif ordered[n + 1, m] == 0: | |
1153 # arc += 0 | |
1154 # else: # if next position is empty check for 2nd next position | |
1155 # if n == 16: | |
1156 # if ordered[0, m] == 1: | |
1157 # arc += 10 | |
1158 # else: | |
1159 # if ordered[n + 2, m] == 1: | |
1160 # arc += 10 | |
1161 # | |
1162 # all_arcs.append(arc) | |
1163 # if not arc == 360: | |
1164 # arc0 = all_arcs.pop() + all_arcs[0] # join first and last arc together | |
1165 # all_arcs = [arc0] + all_arcs[1:] | |
1166 # | |
1167 # window_arc.append(np.max(all_arcs)) # append to window arcs the maximum arc of this PF | |
1168 # allwindows_arc.append(window_arc) # append all PF arcs of this window | |
1169 # | |
1170 # allwindows_arc = np.asarray(allwindows_arc) | |
1171 # | |
1172 # if modality == 'max': | |
1173 # final_arc = np.max(allwindows_arc, axis=0) # calculate maximum / mean arc along all windows | |
1174 # elif modality == 'mean': | |
1175 # final_arc = np.mean(allwindows_arc, axis=0) | |
1176 # else: | |
1177 # print('modality is unknown, please choose between "max" and "mean"\n.') | |
1178 # sys.exit() | |
1179 | |
1180 if append: | |
1181 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
1182 else: | |
1183 self.descriptor = np.array(desc) |