comparison plotWheels/descriptors.py @ 1:9b276485c94a draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:44:43 +0000
parents
children
comparison
equal deleted inserted replaced
0:9caa9aa44fd8 1:9b276485c94a
1 # -*- coding: utf-8 -*-
2 """
3 .. currentmodule:: modlamp.descriptors
4
5 .. moduleauthor:: modlab Alex Mueller ETH Zurich <alex.mueller@pharma.ethz.ch>
6
7 This module incorporates different classes to calculate peptide descriptor values. The following classes are available:
8
9 ============================= ============================================================================
10 Class Characteristics
11 ============================= ============================================================================
12 :py:class:`GlobalDescriptor` Global one-dimensional peptide descriptors calculated from the AA sequence.
13 :py:class:`PeptideDescriptor` AA scale based global or convoluted descriptors (auto-/cross-correlated).
14 ============================= ============================================================================
15
16 .. seealso:: :class:`modlamp.core.BaseDescriptor` from which the classes in :mod:`modlamp.descriptors` inherit.
17 """
18
19 import sys
20
21 import numpy as np
22 from scipy import stats
23 from sklearn.externals.joblib import Parallel, delayed
24
25 from plotWheels.core import (
26 BaseDescriptor,
27 load_scale,
28 count_aas,
29 aa_weights,
30 aa_energies,
31 aa_formulas,
32 )
33
34 __author__ = "Alex Müller, Gisela Gabernet"
35 __docformat__ = "restructuredtext en"
36
37
38 def _one_autocorr(seq, window, scale):
39 """Private function used for calculating auto-correlated descriptors for 1 given sequence, window and an AA scale.
40 This function is used by the :py:func:`calculate_autocorr` method of :py:class:`PeptideDescriptor`.
41
42 :param seq: {str} amino acid sequence to calculate descriptor for
43 :param window: {int} correlation-window size
44 :param scale: {str} amino acid scale to be used to calculate descriptor
45 :return: {numpy.array} calculated descriptor data
46 """
47 try:
48 m = list() # list of lists to store translated sequence values
49 for l in range(len(seq)): # translate AA sequence into values
50 m.append(scale[str(seq[l])])
51 # auto-correlation in defined sequence window
52 seqdesc = list()
53 for dist in range(window): # for all correlation distances
54 for val in range(
55 len(scale["A"])
56 ): # for all features of the descriptor scale
57 valsum = list()
58 cntr = 0.0
59 for pos in range(len(seq)): # for every position in the sequence
60 if (pos + dist) < len(
61 seq
62 ): # check if corr distance is possible at that sequence position
63 cntr += 1 # counter to scale sum
64 valsum.append(m[pos][val] * m[pos + dist][val])
65 seqdesc.append(
66 sum(valsum) / cntr
67 ) # append scaled correlation distance values
68 return seqdesc
69 except ZeroDivisionError:
70 print(
71 "ERROR!\nThe chosen correlation window % i is larger than the sequence %s !"
72 % (window, seq)
73 )
74
75
76 def _one_crosscorr(seq, window, scale):
77 """Private function used for calculating cross-correlated descriptors for 1 given sequence, window and an AA scale.
78 This function is used by the :py:func:`calculate_crosscorr` method of :py:class:`PeptideDescriptor`.
79
80 :param seq: {str} amino acid sequence to calculate descriptor for
81 :param window: {int} correlation-window size
82 :param scale: {str} amino acid scale to be used to calculate descriptor
83 :return: {numpy.array} calculated descriptor data
84 """
85 try:
86 m = list() # list of lists to store translated sequence values
87 for l in range(len(seq)): # translate AA sequence into values
88 m.append(scale[str(seq[l])])
89 # auto-correlation in defined sequence window
90 seqdesc = list()
91 for val in range(len(scale["A"])): # for all features of the descriptor scale
92 for cc in range(len(scale["A"])): # for every feature cross correlation
93 if (val + cc) < len(
94 scale["A"]
95 ): # check if corr distance is in range of the num of features
96 for dist in range(window): # for all correlation distances
97 cntr = float()
98 valsum = list()
99 for pos in range(
100 len(seq)
101 ): # for every position in the sequence
102 if (pos + dist) < len(
103 seq
104 ): # check if corr distance is possible at that sequence pos
105 cntr += 1 # counter to scale sum
106 valsum.append(m[pos][val] * m[pos + dist][val + cc])
107 seqdesc.append(
108 sum(valsum) / cntr
109 ) # append scaled correlation distance values
110 return seqdesc
111 except ZeroDivisionError:
112 print(
113 "ERROR!\nThe chosen correlation window % i is larger than the sequence %s !"
114 % (window, seq)
115 )
116
117
118 def _one_arc(seq, modality, scale):
119 """Privat function used for calculating arc descriptors for one sequence and AA scale. This function is used by
120 :py:func:`calculate_arc` method method of :py:class:`PeptideDescriptor`.
121
122 :param seq: {str} amino acid sequence to calculate descriptor for
123 :param scale: {str} amino acid scale to be used to calculate descriptor
124 :return: {numpy.array} calculated descriptor data
125 """
126 desc_mat = []
127 for aa in seq:
128 desc_mat.append(scale[aa])
129 desc_mat = np.asarray(desc_mat)
130
131 # Check descriptor dimension
132 desc_dim = desc_mat.shape[1]
133
134 # list to store descriptor values for all windows
135 allwindows_arc = []
136
137 if len(seq) > 18:
138 window = 18
139 # calculates number of windows in sequence
140 num_windows = len(seq) - window
141 else:
142 window = len(seq)
143 num_windows = 1
144
145 # loop through all windows
146 for j in range(num_windows):
147 # slices descriptor matrix into current window
148 window_mat = desc_mat[j : j + window, :]
149
150 # defines order of amino acids in helical projection
151 order = [0, 11, 4, 15, 8, 1, 12, 5, 16, 9, 2, 13, 6, 17, 10, 3, 14, 7]
152
153 # orders window descriptor matrix into helical projection order
154 ordered = []
155 for pos in order:
156 try:
157 ordered.append(window_mat[pos, :])
158 except:
159 # for sequences of len < 18 adding dummy vector with 2s, length of descriptor dimensions
160 ordered.append([2] * desc_dim)
161 ordered = np.asarray(ordered)
162
163 window_arc = []
164
165 # loop through pharmacophoric features
166 for m in range(desc_dim):
167 all_arcs = (
168 []
169 ) # stores all arcs that can be found of a pharmacophoric feature
170 arc = 0
171
172 for n in range(
173 18
174 ): # for all positions in helix, regardless of sequence length
175 if (
176 ordered[n, m] == 0
177 ): # if position does not contain pharmacophoric feature
178 all_arcs.append(arc) # append previous arc to all arcs list
179 arc = 0 # arc is initialized
180 elif (
181 ordered[n, m] == 1
182 ): # if position contains pharmacophoric feature(PF), elongate arc by 20°
183 arc += 20
184 elif ordered[n, m] == 2: # if position doesn't contain amino acid:
185 if (
186 ordered[n - 1, m] == 1
187 ): # if previous position contained PF add 10°
188 arc += 10
189 elif (
190 ordered[n - 1, m] == 0
191 ): # if previous position didn't contain PF don't add anything
192 arc += 0
193 elif (
194 ordered[n - 2, m] == 1
195 ): # if previous position is empty then check second previous for PF
196 arc += 10
197 if (
198 n == 17
199 ): # if we are at the last position check for position n=0 instead of next position.
200 if ordered[0, m] == 1: # if it contains PF add 10° extra
201 arc += 10
202 else: # if next position contains PF add 10° extra
203 if ordered[n + 1, m] == 1:
204 arc += 10
205 elif ordered[n + 1, m] == 0:
206 arc += 0
207 else: # if next position is empty check for 2nd next position
208 if n == 16:
209 if ordered[0, m] == 1:
210 arc += 10
211 else:
212 if ordered[n + 2, m] == 1:
213 arc += 10
214
215 all_arcs.append(arc)
216 if not arc == 360:
217 arc0 = all_arcs.pop() + all_arcs[0] # join first and last arc together
218 all_arcs = [arc0] + all_arcs[1:]
219
220 window_arc.append(
221 np.max(all_arcs)
222 ) # append to window arcs the maximum arc of this PF
223 allwindows_arc.append(window_arc) # append all PF arcs of this window
224
225 allwindows_arc = np.asarray(allwindows_arc)
226
227 if modality == "max":
228 final_arc = np.max(
229 allwindows_arc, axis=0
230 ) # calculate maximum / mean arc along all windows
231 elif modality == "mean":
232 final_arc = np.mean(allwindows_arc, axis=0)
233 else:
234 print('modality is unknown, please choose between "max" and "mean"\n.')
235 sys.exit()
236 return final_arc
237
238
239 def _charge(seq, ph=7.0, amide=False):
240 """Calculates charge of a single sequence. The method used is first described by Bjellqvist. In the case of
241 amidation, the value for the 'Cterm' pKa is 15 (and Cterm is added to the pos_pks dictionary.
242 The pKa scale is extracted from: http://www.hbcpnetbase.com/ (CRC Handbook of Chemistry and Physics, 96th ed).
243
244 **pos_pks** = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04}
245
246 **neg_pks** = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10}
247
248 :param ph: {float} pH at which to calculate peptide charge.
249 :param amide: {boolean} whether the sequences have an amidated C-terminus.
250 :return: {array} descriptor values in the attribute :py:attr:`descriptor
251 """
252
253 if amide:
254 pos_pks = {"Nterm": 9.38, "K": 10.67, "R": 12.10, "H": 6.04}
255 neg_pks = {"Cterm": 15.0, "D": 3.71, "E": 4.15, "C": 8.14, "Y": 10.10}
256 else:
257 pos_pks = {"Nterm": 9.38, "K": 10.67, "R": 12.10, "H": 6.04}
258 neg_pks = {"Cterm": 2.15, "D": 3.71, "E": 4.15, "C": 8.14, "Y": 10.10}
259
260 aa_content = count_aas(seq, scale="absolute")
261 aa_content["Nterm"] = 1.0
262 aa_content["Cterm"] = 1.0
263 pos_charge = 0.0
264 for aa, pK in pos_pks.items():
265 c_r = 10 ** (pK - ph)
266 partial_charge = c_r / (c_r + 1.0)
267 pos_charge += aa_content[aa] * partial_charge
268 neg_charge = 0.0
269 for aa, pK in neg_pks.items():
270 c_r = 10 ** (ph - pK)
271 partial_charge = c_r / (c_r + 1.0)
272 neg_charge += aa_content[aa] * partial_charge
273 return round(pos_charge - neg_charge, 3)
274
275
276 class GlobalDescriptor(BaseDescriptor):
277 """
278 Base class for global, non-amino acid scale dependant descriptors. The following descriptors can be calculated by
279 the **methods** linked below:
280
281 - `Sequence Length <modlamp.html#modlamp.descriptors.GlobalDescriptor.length>`_
282 - `Molecular Formula <modlamp.html#modlamp.descriptors.GlobalDescriptor.formula>`_
283 - `Molecular Weight <modlamp.html#modlamp.descriptors.GlobalDescriptor.calculate_MW>`_
284 - `Sequence Charge <modlamp.html#modlamp.descriptors.GlobalDescriptor.calculate_charge>`_
285 - `Charge Density <modlamp.html#modlamp.descriptors.GlobalDescriptor.charge_density>`_
286 - `Isoelectric Point <modlamp.html#modlamp.descriptors.GlobalDescriptor.isoelectric_point>`_
287 - `Instability Index <modlamp.html#modlamp.descriptors.GlobalDescriptor.instability_index>`_
288 - `Aromaticity <modlamp.html#modlamp.descriptors.GlobalDescriptor.aromaticity>`_
289 - `Aliphatic Index <modlamp.html#modlamp.descriptors.GlobalDescriptor.aliphatic_index>`_
290 - `Boman Index <modlamp.html#modlamp.descriptors.GlobalDescriptor.boman_index>`_
291 - `Hydrophobic Ratio <modlamp.html#modlamp.descriptors.GlobalDescriptor.hydrophobic_ratio>`_
292 - `all of the above <modlamp.html#modlamp.descriptors.GlobalDescriptor.calculate_all>`_
293 """
294
295 def length(self, append=False):
296 """
297 Method to calculate the length (total AA count) of every sequence in the attribute :py:attr:`sequences`.
298
299 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
300 attribute :py:attr:`descriptor`.
301 :return: array of sequence lengths in the attribute :py:attr:`descriptor`
302 :Example:
303
304 >>> desc = GlobalDescriptor(['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR'])
305 >>> desc.length()
306 >>> desc.descriptor
307 array([[ 8.], [12.], [12.]])
308 """
309 desc = []
310 for seq in self.sequences:
311 desc.append(float(len(seq.strip())))
312 desc = np.asarray(desc).reshape(len(desc), 1)
313 if append:
314 self.descriptor = np.hstack((self.descriptor, np.array(desc)))
315 self.featurenames.append("Length")
316 else:
317 self.descriptor = np.array(desc)
318 self.featurenames = ["Length"]
319
320 def formula(self, amide=False, append=False):
321 """Method to calculate the molecular formula of every sequence in the attribute :py:attr:`sequences`.
322
323 :param amide: {boolean} whether the sequences are C-terminally amidated.
324 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
325 attribute :py:attr:`descriptor`.
326 :return: array of molecular formulas {str} in the attribute :py:attr:`descriptor`
327 :Example:
328
329 >>> desc = GlobalDescriptor(['KADSFLSADGHSADFSLDKKLKERL', 'ERTILSDFPQWWFASLDFLNC', 'ACDEFGHIKLMNPQRSTVWY'])
330 >>> desc.formula(amide=True)
331 >>> for v in desc.descriptor:
332 ... print(v[0])
333 C122 H197 N35 O39
334 C121 H168 N28 O33 S
335 C106 H157 N29 O30 S2
336
337 .. seealso:: :py:func:`modlamp.core.aa_formulas()`
338
339 .. versionadded:: v2.7.6
340 """
341 desc = []
342 formulas = aa_formulas()
343 for seq in self.sequences:
344 f = {"C": 0, "H": 0, "N": 0, "O": 0, "S": 0}
345 for aa in seq: # loop over all AAs
346 for k in f.keys():
347 f[k] += formulas[aa][k]
348
349 # substract H2O for every peptide bond
350 f["H"] -= 2 * (len(seq) - 1)
351 f["O"] -= len(seq) - 1
352
353 if amide: # add C-terminal amide --> replace OH with NH2
354 f["O"] -= 1
355 f["H"] += 1
356 f["N"] += 1
357
358 if f["S"] != 0:
359 val = "C%s H%s N%s O%s %s%s" % (
360 f["C"],
361 f["H"],
362 f["N"],
363 f["O"],
364 "S",
365 f["S"],
366 )
367 else:
368 val = "C%s H%s N%s O%s" % (f["C"], f["H"], f["N"], f["O"])
369
370 desc.append([val])
371
372 if append:
373 self.descriptor = np.hstack((self.descriptor, np.array(desc)))
374 self.featurenames.append("Formula")
375 else:
376 self.descriptor = np.array(desc)
377 self.featurenames = ["Formula"]
378
379 def calculate_MW(self, amide=False, append=False):
380 """Method to calculate the molecular weight [g/mol] of every sequence in the attribute :py:attr:`sequences`.
381
382 :param amide: {boolean} whether the sequences are C-terminally amidated (subtracts 0.95 from the MW).
383 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
384 attribute :py:attr:`descriptor`.
385 :return: array of descriptor values in the attribute :py:attr:`descriptor`
386 :Example:
387
388 >>> desc = GlobalDescriptor('IAESFKGHIPL')
389 >>> desc.calculate_MW(amide=True)
390 >>> desc.descriptor
391 array([[ 1210.43]])
392
393 .. seealso:: :py:func:`modlamp.core.aa_weights()`
394
395 .. versionchanged:: v2.1.5 amide option added
396 """
397 desc = []
398 weights = aa_weights()
399 for seq in self.sequences:
400 mw = []
401 for aa in seq: # sum over aa weights
402 mw.append(weights[aa])
403 desc.append(
404 round(sum(mw) - 18.015 * (len(seq) - 1), 2)
405 ) # sum over AA MW and subtract H20 MW for every
406 # peptide bond
407 desc = np.asarray(desc).reshape(len(desc), 1)
408 if (
409 amide
410 ): # if sequences are amidated, subtract 0.98 from calculated MW (OH - NH2)
411 desc = [d - 0.98 for d in desc]
412 if append:
413 self.descriptor = np.hstack((self.descriptor, np.array(desc)))
414 self.featurenames.append("MW")
415 else:
416 self.descriptor = np.array(desc)
417 self.featurenames = ["MW"]
418
419 def calculate_charge(self, ph=7.0, amide=False, append=False):
420 """Method to overall charge of every sequence in the attribute :py:attr:`sequences`.
421
422 The method used is first described by Bjellqvist. In the case of amidation, the value for the 'Cterm' pKa is 15
423 (and Cterm is added to the pos_pKs dictionary.
424 The pKa scale is extracted from: http://www.hbcpnetbase.com/ (CRC Handbook of Chemistry and Physics, 96th ed).
425
426 **pos_pKs** = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04}
427
428 **neg_pKs** = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10}
429
430 :param ph: {float} ph at which to calculate peptide charge.
431 :param amide: {boolean} whether the sequences have an amidated C-terminus.
432 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
433 attribute :py:attr:`descriptor`.
434 :return: array of descriptor values in the attribute :py:attr:`descriptor`
435 :Example:
436
437 >>> desc = GlobalDescriptor('KLAKFGKRSELVALSG')
438 >>> desc.calculate_charge(ph=7.4, amide=True)
439 >>> desc.descriptor
440 array([[ 3.989]])
441 """
442
443 desc = []
444 for seq in self.sequences:
445 desc.append(
446 _charge(seq, ph, amide)
447 ) # calculate charge with helper function
448 desc = np.asarray(desc).reshape(len(desc), 1)
449 if append:
450 self.descriptor = np.hstack((self.descriptor, np.array(desc)))
451 self.featurenames.append("Charge")
452 else:
453 self.descriptor = np.array(desc)
454 self.featurenames = ["Charge"]
455
456 def charge_density(self, ph=7.0, amide=False, append=False):
457 """Method to calculate the charge density (charge / MW) of every sequences in the attributes :py:attr:`sequences`
458
459 :param ph: {float} pH at which to calculate peptide charge.
460 :param amide: {boolean} whether the sequences have an amidated C-terminus.
461 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
462 attribute :py:attr:`descriptor`.
463 :return: array of descriptor values in the attribute :py:attr:`descriptor`.
464 :Example:
465
466 >>> desc = GlobalDescriptor('GNSDLLIEQRTLLASDEF')
467 >>> desc.charge_density(ph=6, amide=True)
468 >>> desc.descriptor
469 array([[-0.00097119]])
470 """
471 self.calculate_charge(ph, amide)
472 charges = self.descriptor
473 self.calculate_MW(amide)
474 masses = self.descriptor
475 desc = charges / masses
476 desc = np.asarray(desc).reshape(len(desc), 1)
477 if append:
478 self.descriptor = np.hstack((self.descriptor, np.array(desc)))
479 self.featurenames.append("ChargeDensity")
480 else:
481 self.descriptor = np.array(desc)
482 self.featurenames = ["ChargeDensity"]
483
484 def isoelectric_point(self, amide=False, append=False):
485 """
486 Method to calculate the isoelectric point of every sequence in the attribute :py:attr:`sequences`.
487 The pK scale is extracted from: http://www.hbcpnetbase.com/ (CRC Handbook of Chemistry and Physics, 96th ed).
488
489 **pos_pKs** = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04}
490
491 **neg_pKs** = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10}
492
493 :param amide: {boolean} whether the sequences have an amidated C-terminus.
494 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
495 attribute :py:attr:`descriptor`.
496 :return: array of descriptor values in the attribute :py:attr:`descriptor`
497 :Example:
498
499 >>> desc = GlobalDescriptor('KLFDIKFGHIPQRST')
500 >>> desc.isoelectric_point()
501 >>> desc.descriptor
502 array([[ 10.6796875]])
503 """
504 ph, ph1, ph2 = float(), float(), float()
505 desc = []
506 for seq in self.sequences:
507
508 # Bracket between ph1 and ph2
509 ph = 7.0
510 charge = _charge(seq, ph, amide)
511 if charge > 0.0:
512 ph1 = ph
513 charge1 = charge
514 while charge1 > 0.0:
515 ph = ph1 + 1.0
516 charge = _charge(seq, ph, amide)
517 if charge > 0.0:
518 ph1 = ph
519 charge1 = charge
520 else:
521 ph2 = ph
522 break
523 else:
524 ph2 = ph
525 charge2 = charge
526 while charge2 < 0.0:
527 ph = ph2 - 1.0
528 charge = _charge(seq, ph, amide)
529 if charge < 0.0:
530 ph2 = ph
531 charge2 = charge
532 else:
533 ph1 = ph
534 break
535 # Bisection
536 while ph2 - ph1 > 0.0001 and charge != 0.0:
537 ph = (ph1 + ph2) / 2.0
538 charge = _charge(seq, ph, amide)
539 if charge > 0.0:
540 ph1 = ph
541 else:
542 ph2 = ph
543 desc.append(ph)
544 desc = np.asarray(desc).reshape(len(desc), 1)
545 if append:
546 self.descriptor = np.hstack((self.descriptor, np.array(desc)))
547 self.featurenames.append("pI")
548 else:
549 self.descriptor = np.array(desc)
550 self.featurenames = ["pI"]
551
552 def instability_index(self, append=False):
553 """
554 Method to calculate the instability of every sequence in the attribute :py:attr:`sequences`.
555 The instability index is a prediction of protein stability based on the amino acid composition.
556 ([1] K. Guruprasad, B. V Reddy, M. W. Pandit, Protein Eng. 1990, 4, 155–161.)
557
558 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
559 attribute :py:attr:`descriptor`.
560 :return: array of descriptor values in the attribute :py:attr:`descriptor`
561 :Example:
562
563 >>> desc = GlobalDescriptor('LLASMNDLLAKRST')
564 >>> desc.instability_index()
565 >>> desc.descriptor
566 array([[ 63.95714286]])
567 """
568
569 desc = []
570 dimv = load_scale("instability")[1]
571 for seq in self.sequences:
572 stabindex = float()
573 for i in range(len(seq) - 1):
574 stabindex += dimv[seq[i]][seq[i + 1]]
575 desc.append((10.0 / len(seq)) * stabindex)
576 desc = np.asarray(desc).reshape(len(desc), 1)
577 if append:
578 self.descriptor = np.hstack((self.descriptor, np.array(desc)))
579 self.featurenames.append("InstabilityInd")
580 else:
581 self.descriptor = np.array(desc)
582 self.featurenames = ["InstabilityInd"]
583
584 def aromaticity(self, append=False):
585 """
586 Method to calculate the aromaticity of every sequence in the attribute :py:attr:`sequences`.
587 According to Lobry, 1994, it is simply the relative frequency of Phe+Trp+Tyr.
588
589 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
590 attribute :py:attr:`descriptor`.
591 :return: array of descriptor values in the attribute :py:attr:`descriptor`
592 :Example:
593
594 >>> desc = GlobalDescriptor('GLFYWRFFLQRRFLYWW')
595 >>> desc.aromaticity()
596 >>> desc.descriptor
597 array([[ 0.52941176]])
598 """
599 desc = []
600 for seq in self.sequences:
601 f = seq.count("F")
602 w = seq.count("W")
603 y = seq.count("Y")
604 desc.append(float(f + w + y) / len(seq))
605 desc = np.asarray(desc).reshape(len(desc), 1)
606 if append:
607 self.descriptor = np.hstack((self.descriptor, np.array(desc)))
608 self.featurenames.append("Aromaticity")
609 else:
610 self.descriptor = np.array(desc)
611 self.featurenames = ["Aromaticity"]
612
613 def aliphatic_index(self, append=False):
614 """
615 Method to calculate the aliphatic index of every sequence in the attribute :py:attr:`sequences`.
616 According to Ikai, 1980, the aliphatic index is a measure of thermal stability of proteins and is dependant
617 on the relative volume occupied by aliphatic amino acids (A,I,L & V).
618 ([1] A. Ikai, J. Biochem. 1980, 88, 1895–1898.)
619
620 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
621 attribute :py:attr:`descriptor`.
622 :return: array of descriptor values in the attribute :py:attr:`descriptor`
623 :Example:
624
625 >>> desc = GlobalDescriptor('KWLKYLKKLAKLVK')
626 >>> desc.aliphatic_index()
627 >>> desc.descriptor
628 array([[ 139.28571429]])
629 """
630 desc = []
631 aa_dict = aa_weights()
632 for seq in self.sequences:
633 d = {aa: seq.count(aa) for aa in aa_dict.keys()} # count aa
634 d = {
635 k: (float(d[k]) / len(seq)) * 100 for k in d.keys()
636 } # get mole percent of all AA
637 desc.append(
638 d["A"] + 2.9 * d["V"] + 3.9 * (d["I"] + d["L"])
639 ) # formula for calculating the AI (Ikai, 1980)
640 desc = np.asarray(desc).reshape(len(desc), 1)
641 if append:
642 self.descriptor = np.hstack((self.descriptor, np.array(desc)))
643 self.featurenames.append("AliphaticInd")
644 else:
645 self.descriptor = np.array(desc)
646 self.featurenames = ["AliphaticInd"]
647
648 def boman_index(self, append=False):
649 """Method to calculate the boman index of every sequence in the attribute :py:attr:`sequences`.
650 According to Boman, 2003, the boman index is a measure for protein-protein interactions and is calculated by
651 summing over all amino acid free energy of transfer [kcal/mol] between water and cyclohexane,[2] followed by
652 dividing by sequence length.
653 ([1] H. G. Boman, D. Wade, I. a Boman, B. Wåhlin, R. B. Merrifield, *FEBS Lett*. **1989**, *259*, 103–106.
654 [2] A. Radzick, R. Wolfenden, *Biochemistry* **1988**, *27*, 1664–1670.)
655
656 .. seealso:: :py:func:`modlamp.core.aa_energies()`
657
658 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
659 attribute :py:attr:`descriptor`.
660 :return: array of descriptor values in the attribute :py:attr:`descriptor`
661 :Example:
662
663 >>> desc = GlobalDescriptor('GLFDIVKKVVGALGSL')
664 >>> desc.boman_index()
665 >>> desc.descriptor
666 array([[-1.011875]])
667 """
668 d = aa_energies()
669 desc = []
670 for seq in self.sequences:
671 val = []
672 for a in seq:
673 val.append(d[a])
674 desc.append(sum(val) / len(val))
675 desc = np.asarray(desc).reshape(len(desc), 1)
676 if append:
677 self.descriptor = np.hstack((self.descriptor, np.array(desc)))
678 self.featurenames.append("BomanInd")
679 else:
680 self.descriptor = np.array(desc)
681 self.featurenames = ["BomanInd"]
682
683 def hydrophobic_ratio(self, append=False):
684 """
685 Method to calculate the hydrophobic ratio of every sequence in the attribute :py:attr:`sequences`, which is the
686 relative frequency of the amino acids **A,C,F,I,L,M & V**.
687
688 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
689 attribute :py:attr:`descriptor`.
690 :return: array of descriptor values in the attribute :py:attr:`descriptor`
691 :Example:
692
693 >>> desc = GlobalDescriptor('VALLYWRTVLLAIII')
694 >>> desc.hydrophobic_ratio()
695 >>> desc.descriptor
696 array([[ 0.73333333]])
697 """
698 desc = []
699 aa_dict = aa_weights()
700 for seq in self.sequences:
701 pa = {aa: seq.count(aa) for aa in aa_dict.keys()} # count aa
702 # formula for calculating the AI (Ikai, 1980):
703 desc.append(
704 (pa["A"] + pa["C"] + pa["F"] + pa["I"] + pa["L"] + pa["M"] + pa["V"])
705 / float(len(seq))
706 )
707 desc = np.asarray(desc).reshape(len(desc), 1)
708 if append:
709 self.descriptor = np.hstack((self.descriptor, np.array(desc)))
710 self.featurenames.append("HydrophRatio")
711 else:
712 self.descriptor = np.array(desc)
713 self.featurenames = ["HydrophRatio"]
714
715 def calculate_all(self, ph=7.4, amide=True):
716 """Method combining all global descriptors and appending them into the feature matrix in the attribute
717 :py:attr:`descriptor`.
718
719 :param ph: {float} pH at which to calculate peptide charge
720 :param amide: {boolean} whether the sequences have an amidated C-terminus.
721 :return: array of descriptor values in the attribute :py:attr:`descriptor`
722 :Example:
723
724 >>> desc = GlobalDescriptor('AFGHFKLKKLFIFGHERT')
725 >>> desc.calculate_all(amide=True)
726 >>> desc.featurenames
727 ['Length', 'MW', 'ChargeDensity', 'pI', 'InstabilityInd', 'Aromaticity', 'AliphaticInd', 'BomanInd', 'HydRatio']
728 >>> desc.descriptor
729 array([[ 18., 2.17559000e+03, 1.87167619e-03, 1.16757812e+01, ... 1.10555556e+00, 4.44444444e-01]])
730 >>> desc.save_descriptor('/path/to/outputfile.csv') # save the descriptor data (with feature names header)
731 """
732
733 # This is a strange way of doing it. However, the append=True option excludes length and charge, no idea why!
734 fn = []
735 self.length() # sequence length
736 l = self.descriptor
737 fn.extend(self.featurenames)
738 self.calculate_MW(amide=amide) # molecular weight
739 mw = self.descriptor
740 fn.extend(self.featurenames)
741 self.calculate_charge(ph=ph, amide=amide) # net charge
742 c = self.descriptor
743 fn.extend(self.featurenames)
744 self.charge_density(ph=ph, amide=amide) # charge density
745 cd = self.descriptor
746 fn.extend(self.featurenames)
747 self.isoelectric_point(amide=amide) # pI
748 pi = self.descriptor
749 fn.extend(self.featurenames)
750 self.instability_index() # instability index
751 si = self.descriptor
752 fn.extend(self.featurenames)
753 self.aromaticity() # global aromaticity
754 ar = self.descriptor
755 fn.extend(self.featurenames)
756 self.aliphatic_index() # aliphatic index
757 ai = self.descriptor
758 fn.extend(self.featurenames)
759 self.boman_index() # Boman index
760 bi = self.descriptor
761 fn.extend(self.featurenames)
762 self.hydrophobic_ratio() # Hydrophobic ratio
763 hr = self.descriptor
764 fn.extend(self.featurenames)
765
766 self.descriptor = np.concatenate((l, mw, c, cd, pi, si, ar, ai, bi, hr), axis=1)
767 self.featurenames = fn
768
769
770 class PeptideDescriptor(BaseDescriptor):
771 """Base class for peptide descriptors. The following **amino acid descriptor scales** are available for descriptor
772 calculation:
773
774 - **AASI** (An amino acid selectivity index scale for helical antimicrobial peptides, *[1] D. Juretić, D. Vukicević, N. Ilić, N. Antcheva, A. Tossi, J. Chem. Inf. Model. 2009, 49, 2873–2882.*)
775 - **ABHPRK** (modlabs inhouse physicochemical feature scale (Acidic, Basic, Hydrophobic, Polar, aRomatic, Kink-inducer)
776 - **argos** (Argos hydrophobicity amino acid scale, *[2] Argos, P., Rao, J. K. M. & Hargrave, P. A., Eur. J. Biochem. 2005, 128, 565–575.*)
777 - **bulkiness** (Amino acid side chain bulkiness scale, *[3] J. M. Zimmerman, N. Eliezer, R. Simha, J. Theor. Biol. 1968, 21, 170–201.*)
778 - **charge_phys** (Amino acid charge at pH 7.0 - Hystidine charge +0.1.)
779 - **charge_acid** (Amino acid charge at acidic pH - Hystidine charge +1.0.)
780 - **cougar** (modlabs inhouse selection of global peptide descriptors)
781 - **eisenberg** (the Eisenberg hydrophobicity consensus amino acid scale, *[4] D. Eisenberg, R. M. Weiss, T. C. Terwilliger, W. Wilcox, Faraday Symp. Chem. Soc. 1982, 17, 109.*)
782 - **Ez** (potential that assesses energies of insertion of amino acid side chains into lipid bilayers, *[5] A. Senes, D. C. Chadi, P. B. Law, R. F. S. Walters, V. Nanda, W. F. DeGrado, J. Mol. Biol. 2007, 366, 436–448.*)
783 - **flexibility** (amino acid side chain flexibilitiy scale, *[6] R. Bhaskaran, P. K. Ponnuswamy, Int. J. Pept. Protein Res. 1988, 32, 241–255.*)
784 - **grantham** (amino acid side chain composition, polarity and molecular volume, *[8] Grantham, R. Science. 185, 862–864 (1974).*)
785 - **gravy** (GRAVY hydrophobicity amino acid scale, *[9] J. Kyte, R. F. Doolittle, J. Mol. Biol. 1982, 157, 105–132.*)
786 - **hopp-woods** (Hopp-Woods amino acid hydrophobicity scale,*[10] T. P. Hopp, K. R. Woods, Proc. Natl. Acad. Sci. 1981, 78, 3824–3828.*)
787 - **ISAECI** (Isotropic Surface Area (ISA) and Electronic Charge Index (ECI) of amino acid side chains, *[11] E. R. Collantes, W. J. Dunn, J. Med. Chem. 1995, 38, 2705–2713.*)
788 - **janin** (Janin hydrophobicity amino acid scale, *[12] J. L. Cornette, K. B. Cease, H. Margalit, J. L. Spouge, J. A. Berzofsky, C. DeLisi, J. Mol. Biol. 1987, 195, 659–685.*)
789 - **kytedoolittle** (Kyte & Doolittle hydrophobicity amino acid scale, *[13] J. Kyte, R. F. Doolittle, J. Mol. Biol. 1982, 157, 105–132.*)
790 - **levitt_alpha** (Levitt amino acid alpha-helix propensity scale, extracted from http://web.expasy.org/protscale. *[14] M. Levitt, Biochemistry 1978, 17, 4277-4285.*)
791 - **MSS** (A graph-theoretical index that reflects topological shape and size of amino acid side chains, *[15] C. Raychaudhury, A. Banerjee, P. Bag, S. Roy, J. Chem. Inf. Comput. Sci. 1999, 39, 248–254.*)
792 - **MSW** (Amino acid scale based on a PCA of the molecular surface based WHIM descriptor (MS-WHIM), extended to natural amino acids, *[16] A. Zaliani, E. Gancia, J. Chem. Inf. Comput. Sci 1999, 39, 525–533.*)
793 - **pepArc** (modlabs pharmacophoric feature scale, dimensions are: hydrophobicity, polarity, positive charge, negative charge, proline.)
794 - **pepcats** (modlabs pharmacophoric feature based PEPCATS scale, *[17] C. P. Koch, A. M. Perna, M. Pillong, N. K. Todoroff, P. Wrede, G. Folkers, J. A. Hiss, G. Schneider, PLoS Comput. Biol. 2013, 9, e1003088.*)
795 - **polarity** (Amino acid polarity scale, *[18] J. M. Zimmerman, N. Eliezer, R. Simha, J. Theor. Biol. 1968, 21, 170–201.*)
796 - **PPCALI** (modlabs inhouse scale derived from a PCA of 143 amino acid property scales, *[19] C. P. Koch, A. M. Perna, M. Pillong, N. K. Todoroff, P. Wrede, G. Folkers, J. A. Hiss, G. Schneider, PLoS Comput. Biol. 2013, 9, e1003088.*)
797 - **refractivity** (Relative amino acid refractivity values, *[20] T. L. McMeekin, M. Wilensky, M. L. Groves, Biochem. Biophys. Res. Commun. 1962, 7, 151–156.*)
798 - **t_scale** (A PCA derived scale based on amino acid side chain properties calculated with 6 different probes of the GRID program, *[21] M. Cocchi, E. Johansson, Quant. Struct. Act. Relationships 1993, 12, 1–8.*)
799 - **TM_tend** (Amino acid transmembrane propensity scale, extracted from http://web.expasy.org/protscale, *[22] Zhao, G., London E. Protein Sci. 2006, 15, 1987-2001.*)
800 - **z3** (The original three dimensional Z-scale, *[23] S. Hellberg, M. Sjöström, B. Skagerberg, S. Wold, J. Med. Chem. 1987, 30, 1126–1135.*)
801 - **z5** (The extended five dimensional Z-scale, *[24] M. Sandberg, L. Eriksson, J. Jonsson, M. Sjöström, S. Wold, J. Med. Chem. 1998, 41, 2481–2491.*)
802
803 Further, amino acid scale independent methods can be calculated with help of the :class:`GlobalDescriptor` class.
804
805 """
806
807 def __init__(self, seqs, scalename="Eisenberg"):
808 """
809 :param seqs: a .fasta file with sequences, a list of sequences or a single sequence as string to calculate the
810 descriptor values for.
811 :param scalename: {str} name of the amino acid scale (one of the given list above) used to calculate the
812 descriptor values
813 :return: initialized attributes :py:attr:`sequences`, :py:attr:`names` and dictionary :py:attr:`scale` with
814 amino acid scale values of the scale name in :py:attr:`scalename`.
815 :Example:
816
817 >>> AMP = PeptideDescriptor('KLLKLLKKLLKLLK','pepcats')
818 >>> AMP.sequences
819 ['KLLKLLKKLLKLLK']
820 >>> seqs = PeptideDescriptor('/Path/to/file.fasta', 'eisenberg') # load sequences from .fasta file
821 >>> seqs.sequences
822 ['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR'...]
823 """
824 super(PeptideDescriptor, self).__init__(seqs)
825 self.scalename, self.scale = load_scale(scalename.lower())
826 self.all_moms = list() # for passing hydrophobic moments to calculate_profile
827 self.all_globs = list() # for passing global to calculate_profile
828
829 def load_scale(self, scalename):
830 """Method to load amino acid values from a given scale
831
832 :param scalename: {str} name of the amino acid scale to be loaded.
833 :return: loaded amino acid scale values in a dictionary in the attribute :py:attr:`scale`.
834
835 .. seealso:: :func:`modlamp.core.load_scale()`
836 """
837 self.scalename, self.scale = load_scale(scalename.lower())
838
839 def calculate_autocorr(self, window, append=False):
840 """Method for auto-correlating the amino acid values for a given descriptor scale
841
842 :param window: {int} correlation window for descriptor calculation in a sliding window approach
843 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
844 attribute :py:attr:`descriptor`.
845 :return: calculated descriptor numpy.array in the attribute :py:attr:`descriptor`.
846 :Example:
847
848 >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL','PPCALI')
849 >>> AMP.calculate_autocorr(7)
850 >>> AMP.descriptor
851 array([[ 1.28442339e+00, 1.29025116e+00, 1.03240901e+00, .... ]])
852 >>> AMP.descriptor.shape
853 (1, 133)
854
855 .. versionchanged:: v.2.3.0
856 """
857 desc = Parallel(n_jobs=-1)(
858 delayed(_one_autocorr)(seq, window, self.scale) for seq in self.sequences
859 )
860
861 if append:
862 self.descriptor = np.hstack((self.descriptor, np.array(desc)))
863 else:
864 self.descriptor = np.array(desc)
865
866 def calculate_crosscorr(self, window, append=False):
867 """Method for cross-correlating the amino acid values for a given descriptor scale
868
869 :param window: {int} correlation window for descriptor calculation in a sliding window approach
870 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
871 attribute :py:attr:`descriptor`.
872 :return: calculated descriptor numpy.array in the attribute :py:attr:`descriptor`.
873 :Example:
874
875 >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL','pepcats')
876 >>> AMP.calculate_crosscorr(7)
877 >>> AMP.descriptor
878 array([[ 0.6875 , 0.46666667, 0.42857143, 0.61538462, 0.58333333, ... ]])
879 >>> AMP.descriptor.shape
880 (1, 147)
881 """
882 desc = Parallel(n_jobs=-1)(
883 delayed(_one_crosscorr)(seq, window, self.scale) for seq in self.sequences
884 )
885
886 if append:
887 self.descriptor = np.hstack((self.descriptor, np.array(desc)))
888 else:
889 self.descriptor = np.array(desc)
890
891 def calculate_moment(self, window=1000, angle=100, modality="max", append=False):
892 """Method for calculating the maximum or mean moment of the amino acid values for a given descriptor scale and
893 window.
894
895 :param window: {int} amino acid window in which to calculate the moment. If the sequence is shorter than the
896 window, the length of the sequence is taken. So if the default window of 1000 is chosen, for all sequences
897 shorter than 1000, the **global** hydrophobic moment will be calculated. Otherwise, the maximal
898 hydrophiobic moment for the chosen window size found in the sequence will be returned.
899 :param angle: {int} angle in which to calculate the moment. **100** for alpha helices, **180** for beta sheets.
900 :param modality: {'all', 'max' or 'mean'} Calculate respectively maximum or mean hydrophobic moment. If all,
901 moments for all windows are returned.
902 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
903 attribute :py:attr:`descriptor`.
904 :return: Calculated descriptor as a numpy.array in the attribute :py:attr:`descriptor` and all possible global
905 values in :py:attr:`all_moms` (needed for the :py:func:`calculate_profile` method)
906 :Example:
907
908 >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL', 'eisenberg')
909 >>> AMP.calculate_moment()
910 >>> AMP.descriptor
911 array([[ 0.48790226]])
912 """
913 if self.scale["A"] == list:
914 print(
915 "\n Descriptor moment calculation is only possible for one dimensional descriptors.\n"
916 )
917
918 else:
919 desc = []
920 for seq in self.sequences:
921 wdw = min(
922 window, len(seq)
923 ) # if sequence is shorter than window, take the whole sequence instead
924 mtrx = []
925 mwdw = []
926
927 for aa in range(len(seq)):
928 mtrx.append(self.scale[str(seq[aa])])
929
930 for i in range(len(mtrx) - wdw + 1):
931 mwdw.append(sum(mtrx[i : i + wdw], []))
932
933 mwdw = np.asarray(mwdw)
934 rads = (
935 angle * (np.pi / 180) * np.asarray(range(wdw))
936 ) # calculate actual moment (radial)
937 vcos = (mwdw * np.cos(rads)).sum(axis=1)
938 vsin = (mwdw * np.sin(rads)).sum(axis=1)
939 moms = np.sqrt(vsin**2 + vcos**2) / wdw
940
941 if modality == "max": # take window with maximal value
942 moment = np.max(moms)
943 elif modality == "mean": # take average value over all windows
944 moment = np.mean(moms)
945 elif modality == "all":
946 moment = moms
947 else:
948 print(
949 '\nERROR!\nModality parameter is wrong, please choose between "all", "max" and "mean".\n'
950 )
951 return
952 desc.append(moment)
953 self.all_moms.append(moms)
954
955 desc = np.asarray(desc).reshape(len(desc), 1) # final descriptor array
956
957 if append:
958 self.descriptor = np.hstack((self.descriptor, np.array(desc)))
959 else:
960 self.descriptor = np.array(desc)
961
962 def calculate_global(self, window=1000, modality="max", append=False):
963 """Method for calculating a global / window averaging descriptor value of a given AA scale
964
965 :param window: {int} amino acid window in which to calculate the moment. If the sequence is shorter than the
966 window, the length of the sequence is taken.
967 :param modality: {'max' or 'mean'} Calculate respectively maximum or mean hydrophobic moment.
968 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
969 attribute :py:attr:`descriptor`.
970 :return: Calculated descriptor as a numpy.array in the attribute :py:attr:`descriptor` and all possible global
971 values in :py:attr:`all_globs` (needed for the :py:func:`calculate_profile` method)
972 :Example:
973
974 >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL','eisenberg')
975 >>> AMP.calculate_global(window=1000, modality='max')
976 >>> AMP.descriptor
977 array([[ 0.44875]])
978 """
979 desc = list()
980 for n, seq in enumerate(self.sequences):
981 wdw = min(
982 window, len(seq)
983 ) # if sequence is shorter than window, take the whole sequence instead
984 mtrx = []
985 mwdw = []
986
987 for l in range(len(seq)): # translate AA sequence into values
988 mtrx.append(self.scale[str(seq[l])])
989
990 for i in range(len(mtrx) - wdw + 1):
991 mwdw.append(
992 sum(mtrx[i : i + wdw], [])
993 ) # list of all the values for the different windows
994
995 mwdw = np.asarray(mwdw)
996 glob = np.sum(mwdw, axis=1) / float(wdw)
997 outglob = float()
998
999 if modality in ["max", "mean"]:
1000 if modality == "max":
1001 outglob = np.max(
1002 glob
1003 ) # returned moment will be the maximum of all windows
1004 elif modality == "mean":
1005 outglob = np.mean(
1006 glob
1007 ) # returned moment will be the mean of all windows
1008 else:
1009 print(
1010 'Modality parameter is wrong, please choose between "max" and "mean"\n.'
1011 )
1012 return
1013 desc.append(outglob)
1014 self.all_globs.append(glob)
1015
1016 desc = np.asarray(desc).reshape(len(desc), 1)
1017 if append:
1018 self.descriptor = np.hstack((self.descriptor, np.array(desc)))
1019 else:
1020 self.descriptor = np.array(desc)
1021
1022 def calculate_profile(self, prof_type="uH", window=7, append=False):
1023 """Method for calculating hydrophobicity or hydrophobic moment profiles for given sequences and fitting for
1024 slope and intercept. The hydrophobicity scale used is "eisenberg"
1025
1026 :param prof_type: prof_type of profile, available: 'H' for hydrophobicity or 'uH' for hydrophobic moment
1027 :param window: {int} size of sliding window used (odd-numbered).
1028 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
1029 attribute :py:attr:`descriptor`.
1030 :return: Fitted slope and intercept of calculated profile for every given sequence in the attribute
1031 :py:attr:`descriptor`.
1032 :Example:
1033
1034 >>> AMP = PeptideDescriptor('KLLKLLKKVVGALG','kytedoolittle')
1035 >>> AMP.calculate_profile(prof_type='H')
1036 >>> AMP.descriptor
1037 array([[ 0.03731293, 0.19246599]])
1038 """
1039 if prof_type == "uH":
1040 self.calculate_moment(window=window)
1041 y_vals = self.all_moms
1042 elif prof_type == "H":
1043 self.calculate_global(window=window)
1044 y_vals = self.all_globs
1045 else:
1046 print(
1047 'prof_type parameter is unknown, choose "uH" for hydrophobic moment or "H" for hydrophobicity\n.'
1048 )
1049 sys.exit()
1050
1051 desc = list()
1052 for n, seq in enumerate(self.sequences):
1053 x_vals = range(len(seq))[int((window - 1) / 2) : -int((window - 1) / 2)]
1054 if len(seq) <= window:
1055 slope, intercept, r_value, p_value, std_err = [0, 0, 0, 0, 0]
1056 else:
1057 slope, intercept, r_value, p_value, std_err = stats.linregress(
1058 x_vals, y_vals[n]
1059 )
1060 desc.append([slope, intercept])
1061
1062 if append:
1063 self.descriptor = np.hstack((self.descriptor, np.array(desc)))
1064 else:
1065 self.descriptor = np.array(desc)
1066
1067 def calculate_arc(self, modality="max", append=False):
1068 """Method for calculating property arcs as seen in the helical wheel plot. Use for binary amino acid scales only.
1069
1070 :param modality: modality of the arc to calculate, to choose between "max" and "mean".
1071 :param append: if true, append to current descriptor stored in the descriptor attribute.
1072 :return: calculated descriptor as numpy.array in the descriptor attribute.
1073
1074 :Example:
1075
1076 >>> arc = PeptideDescriptor("KLLKLLKKLLKLLK", scalename="peparc")
1077 >>> arc.calculate_arc(modality="max", append=False)
1078 >>> arc.descriptor
1079 array([[200, 160, 160, 0, 0]])
1080 """
1081 desc = Parallel(n_jobs=-1)(
1082 delayed(_one_arc)(seq, modality, self.scale) for seq in self.sequences
1083 )
1084
1085 # Converts each of the amino acids to descriptor vector
1086 for seq in self.sequences:
1087
1088 # desc_mat = []
1089 # for aa in seq:
1090 # desc_mat.append(self.scale[aa])
1091 # desc_mat = np.asarray(desc_mat)
1092 #
1093 # # Check descriptor dimension
1094 # desc_dim = desc_mat.shape[1]
1095 #
1096 # # list to store descriptor values for all windows
1097 # allwindows_arc = []
1098 #
1099 # if len(seq) > 18:
1100 # window = 18
1101 # # calculates number of windows in sequence
1102 # num_windows = len(seq) - window
1103 # else:
1104 # window = len(seq)
1105 # num_windows = 1
1106 #
1107 # # loop through all windows
1108 # for j in range(num_windows):
1109 # # slices descriptor matrix into current window
1110 # window_mat = desc_mat[j:j + window, :]
1111 #
1112 # # defines order of amino acids in helical projection
1113 # order = [0, 11, 4, 15, 8, 1, 12, 5, 16, 9, 2, 13, 6, 17, 10, 3, 14, 7]
1114 #
1115 # # orders window descriptor matrix into helical projection order
1116 # ordered = []
1117 # for pos in order:
1118 # try:
1119 # ordered.append(window_mat[pos, :])
1120 # except:
1121 # # for sequences of len < 18 adding dummy vector with 2s, length of descriptor dimensions
1122 # ordered.append([2] * desc_dim)
1123 # ordered = np.asarray(ordered)
1124 #
1125 # window_arc = []
1126 #
1127 # # loop through pharmacophoric features
1128 # for m in range(desc_dim):
1129 # all_arcs = [] # stores all arcs that can be found of a pharmacophoric feature
1130 # arc = 0
1131 #
1132 # for n in range(18): # for all positions in helix, regardless of sequence length
1133 # if ordered[n, m] == 0: # if position does not contain pharmacophoric feature
1134 # all_arcs.append(arc) # append previous arc to all arcs list
1135 # arc = 0 # arc is initialized
1136 # elif ordered[n, m] == 1: # if position contains pharmacophoric feature(PF), elongate arc by 20°
1137 # arc += 20
1138 # elif ordered[n, m] == 2: # if position doesn't contain amino acid:
1139 # if ordered[n - 1, m] == 1: # if previous position contained PF add 10°
1140 # arc += 10
1141 # elif ordered[n - 1, m] == 0: # if previous position didn't contain PF don't add anything
1142 # arc += 0
1143 # elif ordered[
1144 # n - 2, m] == 1: # if previous position is empty then check second previous for PF
1145 # arc += 10
1146 # if n == 17: # if we are at the last position check for position n=0 instead of next position.
1147 # if ordered[0, m] == 1: # if it contains PF add 10° extra
1148 # arc += 10
1149 # else: # if next position contains PF add 10° extra
1150 # if ordered[n + 1, m] == 1:
1151 # arc += 10
1152 # elif ordered[n + 1, m] == 0:
1153 # arc += 0
1154 # else: # if next position is empty check for 2nd next position
1155 # if n == 16:
1156 # if ordered[0, m] == 1:
1157 # arc += 10
1158 # else:
1159 # if ordered[n + 2, m] == 1:
1160 # arc += 10
1161 #
1162 # all_arcs.append(arc)
1163 # if not arc == 360:
1164 # arc0 = all_arcs.pop() + all_arcs[0] # join first and last arc together
1165 # all_arcs = [arc0] + all_arcs[1:]
1166 #
1167 # window_arc.append(np.max(all_arcs)) # append to window arcs the maximum arc of this PF
1168 # allwindows_arc.append(window_arc) # append all PF arcs of this window
1169 #
1170 # allwindows_arc = np.asarray(allwindows_arc)
1171 #
1172 # if modality == 'max':
1173 # final_arc = np.max(allwindows_arc, axis=0) # calculate maximum / mean arc along all windows
1174 # elif modality == 'mean':
1175 # final_arc = np.mean(allwindows_arc, axis=0)
1176 # else:
1177 # print('modality is unknown, please choose between "max" and "mean"\n.')
1178 # sys.exit()
1179
1180 if append:
1181 self.descriptor = np.hstack((self.descriptor, np.array(desc)))
1182 else:
1183 self.descriptor = np.array(desc)