Mercurial > repos > cpt > cpt_helical_wheel
comparison plotWheels/core.py @ 1:9b276485c94a draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:44:43 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:9caa9aa44fd8 | 1:9b276485c94a |
---|---|
1 # -*- coding: utf-8 -*- | |
2 """ | |
3 .. currentmodule:: modlamp.core | |
4 | |
5 .. moduleauthor:: modlab Alex Mueller ETH Zurich <alex.mueller@pharma.ethz.ch> | |
6 | |
7 Core helper functions and classes for other modules. The two main classes are: | |
8 | |
9 ============================= ======================================================================================= | |
10 Class Characteristics | |
11 ============================= ======================================================================================= | |
12 :py:class:`BaseSequence` Base class inheriting to all sequence classes in the module :py:mod:`modlamp.sequences` | |
13 :py:class:`BaseDescriptor` Base class inheriting to the two descriptor classes in :py:mod:`modlamp.descriptors` | |
14 ============================= ======================================================================================= | |
15 """ | |
16 | |
17 import os | |
18 import random | |
19 import re | |
20 | |
21 import numpy as np | |
22 import pandas as pd | |
23 import collections | |
24 import operator | |
25 from scipy.spatial import distance | |
26 from sklearn.preprocessing import MinMaxScaler, StandardScaler | |
27 from sklearn.utils import shuffle | |
28 | |
29 __author__ = "Alex Müller, Gisela Gabernet" | |
30 __docformat__ = "restructuredtext en" | |
31 | |
32 | |
class BaseSequence(object):
    """Base class for sequence classes in the module :mod:`modlamp.sequences`.
    It contains amino acid probabilities for different sequence generation classes.

    The following amino acid probabilities are used: (extracted from the
    `APD3 <http://aps.unmc.edu/AP/statistic/statistic.php>`_, March 17, 2016)

    ===  ====  ======  =========  ==========
    AA   rand  AMP     AMPnoCM    randnoCM
    ===  ====  ======  =========  ==========
    A    0.05  0.0766  0.0812275  0.05555555
    C    0.05  0.071   0.0        0.0
    D    0.05  0.026   0.0306275  0.05555555
    E    0.05  0.0264  0.0310275  0.05555555
    F    0.05  0.0405  0.0451275  0.05555555
    G    0.05  0.1172  0.1218275  0.05555555
    H    0.05  0.021   0.0256275  0.05555555
    I    0.05  0.061   0.0656275  0.05555555
    K    0.05  0.0958  0.1004275  0.05555555
    L    0.05  0.0838  0.0884275  0.05555555
    M    0.05  0.0123  0.0        0.0
    N    0.05  0.0386  0.0432275  0.05555555
    P    0.05  0.0463  0.0509275  0.05555555
    Q    0.05  0.0251  0.0297275  0.05555555
    R    0.05  0.0545  0.0591275  0.05555555
    S    0.05  0.0613  0.0659275  0.05555555
    T    0.05  0.0455  0.0501275  0.05555555
    V    0.05  0.0572  0.0618275  0.05555555
    W    0.05  0.0155  0.0201275  0.05555555
    Y    0.05  0.0244  0.0290275  0.05555555
    ===  ====  ======  =========  ==========

    All probability lists stored on the instance are ordered like :py:attr:`AAs`
    (alphabetical one-letter codes).
    """

    def __init__(self, seqnum, lenmin=7, lenmax=28):
        """
        :param seqnum: number of sequences to generate
        :param lenmin: minimal length of the generated sequences
        :param lenmax: maximal length of the generated sequences
        :return: attributes :py:attr:`seqnum`, :py:attr:`lenmin` and :py:attr:`lenmax`.
        :Example:

        >>> b = BaseSequence(10, 7, 28)
        >>> b.seqnum
        10
        >>> b.lenmin
        7
        >>> b.lenmax
        28
        """
        self.sequences = list()
        self.names = list()
        self.lenmin = int(lenmin)
        self.lenmax = int(lenmax)
        self.seqnum = int(seqnum)

        # AA classes:
        self.AA_hyd = ["G", "A", "L", "I", "V"]
        self.AA_basic = ["K", "R"]
        self.AA_acidic = ["D", "E"]
        self.AA_aroma = ["W", "Y", "F"]
        self.AA_polar = ["S", "T", "Q", "N"]
        # AA labels (every probability list below follows this order):
        self.AAs = list("ACDEFGHIKLMNPQRSTVWY")

        # fmt: off
        # AA probability from the APD3 database:
        self.prob_AMP = [
            0.0766, 0.071, 0.026, 0.0264, 0.0405, 0.1172, 0.021, 0.061, 0.0958, 0.0838,
            0.0123, 0.0386, 0.0463, 0.0251, 0.0545, 0.0613, 0.0455, 0.0572, 0.0155, 0.0244,
        ]
        # AA probability from the APD2 database without Cys and Met (synthesis reasons)
        # NOTE(review): the class docstring cites APD3 for all columns - confirm which is right.
        self.prob_AMPnoCM = [
            0.081228, 0.0, 0.030627, 0.031027, 0.045128, 0.121828, 0.025627, 0.065628, 0.100428, 0.088428,
            0.0, 0.043228, 0.050928, 0.029728, 0.059128, 0.065927, 0.050128, 0.061828, 0.020128, 0.029028,
        ]
        # fmt: on
        # equal AA probabilities:
        self.prob = [0.05] * len(self.AAs)
        # equal AA probabilities but 0 for Cys and Met:
        self.prob_randnoCM = [
            0.0 if aa in ("C", "M") else 0.05555555555 for aa in self.AAs
        ]

        # fmt: off
        # AA probability from the linear CancerPPD peptides:
        self.prob_ACP = [
            0.14526966, 0.0, 0.00690031, 0.00780824, 0.06991102, 0.04957327, 0.01725077, 0.05647358, 0.27637552, 0.17759216,
            0.00998729, 0.00798983, 0.01307427, 0.00381333, 0.02941711, 0.02651171, 0.0154349, 0.04013074, 0.0406755, 0.00581079,
        ]

        # AA probabilities for perfect amphipathic helix of different arc sizes
        self.prob_amphihel = [
            [
                0.04545455, 0.0, 0.04545454, 0.04545455, 0.0, 0.04545455, 0.04545455, 0.0, 0.25, 0.0,
                0.0, 0.04545454, 0.04545455, 0.04545454, 0.25, 0.04545454, 0.04545454, 0.0, 0.0, 0.04545454,
            ],
            [
                0.0, 0.0, 0.0, 0.0, 0.16666667, 0.0, 0.0, 0.16666667, 0.0, 0.16666667,
                0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16666667, 0.16666667, (1.0 - 0.16666667 * 5),
            ],
        ]

        # helical ACP AA probabilities, depending on the position of the AA in the helix.
        # One row per amino acid (order of self.AAs), one column per helix position (18).
        self.prob_ACPhel = np.array(
            [
                [0.0483871, 0.0, 0.0, 0.0483871, 0.01612903, 0.12903226, 0.03225807, 0.09677419, 0.19354839, 0.5, 0.0483871, 0.11290323, 0.1, 0.18518519, 0.07843137, 0.12, 0.17073172, 0.16666667],
                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01612903, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02439024, 0.19444444],
                [0.0, 0.01612903, 0.0, 0.27419355, 0.01612903, 0.0, 0.0, 0.01612903, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.06451613, 0.0, 0.01612903, 0.0483871, 0.01612903, 0.0, 0.01851852, 0.0, 0.0, 0.0, 0.0],
                [0.16129032, 0.0483871, 0.30645161, 0.0, 0.0483871, 0.0, 0.0, 0.01612903, 0.0, 0.01612903, 0.0, 0.09677419, 0.06666667, 0.01851852, 0.0, 0.02, 0.14634146, 0.0],
                [0.64516129, 0.0, 0.17741936, 0.14516129, 0.0, 0.01612903, 0.25806452, 0.11290323, 0.06451613, 0.08064516, 0.22580645, 0.03225807, 0.06666667, 0.2037037, 0.1372549, 0.1, 0.0, 0.05555556],
                [0.0, 0.0, 0.0, 0.01612903, 0.0, 0.0, 0.01612903, 0.0, 0.03225807, 0.0, 0.0, 0.20967742, 0.0, 0.0, 0.0, 0.16, 0.0, 0.0],
                [0.0483871, 0.11290323, 0.01612903, 0.08064516, 0.33870968, 0.27419355, 0.0, 0.0483871, 0.14516129, 0.06451613, 0.03225807, 0.06451613, 0.18333333, 0.0, 0.0, 0.1, 0.26829268, 0.0],
                [0.0, 0.03225807, 0.01612903, 0.12903226, 0.12903226, 0.0, 0.38709677, 0.33870968, 0.0483871, 0.03225807, 0.41935484, 0.08064516, 0.0, 0.03703704, 0.29411765, 0.04, 0.02439024, 0.02777778],
                [0.0483871, 0.70967742, 0.12903226, 0.0483871, 0.09677419, 0.32258064, 0.20967742, 0.06451613, 0.11290323, 0.06451613, 0.03225807, 0.03225807, 0.28333333, 0.24074074, 0.03921569, 0.28, 0.07317073, 0.22222222],
                [0.0, 0.01612903, 0.01612903, 0.0483871, 0.01612903, 0.03225807, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03333333, 0.0, 0.01960784, 0.02, 0.0, 0.0],
                [0.0, 0.01612903, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01612903, 0.0, 0.03225807, 0.0, 0.0, 0.0, 0.01960784, 0.02, 0.0, 0.0],
                [0.0, 0.0, 0.14516129, 0.01612903, 0.03225807, 0.01612903, 0.0, 0.0, 0.0, 0.0, 0.01612903, 0.0, 0.0, 0.12962963, 0.17647059, 0.0, 0.0, 0.0],
                [0.0, 0.0, 0.01612903, 0.01612903, 0.0, 0.0, 0.01612903, 0.0, 0.01612903, 0.0, 0.0, 0.01612903, 0.0, 0.01851852, 0.0, 0.0, 0.0, 0.0],
                [0.0, 0.01612903, 0.01612903, 0.0, 0.01612903, 0.0, 0.01612903, 0.0, 0.01612903, 0.01612903, 0.01612903, 0.01612903, 0.0, 0.01851852, 0.01960784, 0.0, 0.04878049, 0.0],
                [0.01612903, 0.0, 0.01612903, 0.12903226, 0.03225807, 0.03225807, 0.0483871, 0.17741936, 0.0, 0.03225807, 0.09677419, 0.0483871, 0.01666667, 0.0, 0.15686274, 0.1, 0.0, 0.05555556],
                [0.01612903, 0.01612903, 0.0, 0.01612903, 0.0483871, 0.01612903, 0.0, 0.01612903, 0.0, 0.01612903, 0.01612903, 0.11290323, 0.0, 0.01851852, 0.03921569, 0.02, 0.0, 0.05555556],
                [0.01612903, 0.01612903, 0.01612903, 0.01612903, 0.20967742, 0.16129032, 0.01612903, 0.0483871, 0.33870968, 0.16129032, 0.0, 0.14516129, 0.25, 0.11111111, 0.01960784, 0.02, 0.21951219, 0.22222222],
                [0.0, 0.0, 0.12903226, 0.01612903, 0.0, 0.0, 0.0, 0.0, 0.01612903, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02439024, 0.0],
                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01612903, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            ]
        )
        # fmt: on

    def save_fasta(self, filename, names=False):
        """Method to save generated sequences in a ``.FASTA`` formatted file.

        :param filename: output filename in which the sequences from :py:attr:`sequences` are saved in fasta format.
        :param names: {bool} whether sequence names from :py:attr:`names` should be saved as sequence identifiers
        :return: a FASTA formatted file containing the generated sequences
        :Example:

        >>> b = BaseSequence(2)
        >>> b.sequences = ['KLLSLSLALDLLS', 'KLPERTVVNSSDF']
        >>> b.names = ['Sequence1', 'Sequence2']
        >>> b.save_fasta('/location/of/fasta/file.fasta', names=True)
        """
        # Delegates to the module-level ``save_fasta`` helper.
        if names:
            save_fasta(filename, self.sequences, self.names)
        else:
            save_fasta(filename, self.sequences)

    def mutate_AA(self, nr, prob):
        """Method to mutate with **prob** probability a **nr** of positions per sequence randomly.

        Positions are drawn with replacement, so the same position may be hit more
        than once and a residue may be replaced by itself. Empty sequences are
        left untouched.

        :param nr: number of mutations to perform per sequence
        :param prob: probability of mutating a sequence
        :return: mutated sequences in the attribute :py:attr:`sequences`.
        :Example:

        >>> b = BaseSequence(1)
        >>> b.sequences = ['IAKAGRAIIK']
        >>> b.mutate_AA(3, 1.)
        >>> b.sequences
        ['NAKAGRAWIK']
        """
        for idx, sequence in enumerate(self.sequences):
            # mutate: yes or no? prob = mutation probability
            mutate = np.random.choice([1, 0], 1, p=[prob, 1 - float(prob)])
            if mutate[0] != 1 or not sequence:
                # not selected, or nothing to mutate in an empty sequence
                continue
            residues = list(sequence)
            cnt = 0
            while cnt < nr:  # mutate "nr" AA
                # Draw the replacement before the position: same order of random
                # draws as the original one-line assignment.
                replacement = random.choice(self.AAs)
                residues[random.randrange(len(residues))] = replacement
                cnt += 1
            self.sequences[idx] = "".join(residues)

    def filter_duplicates(self):
        """Method to filter duplicates in the sequences from the class attribute :py:attr:`sequences`

        The first occurrence of every sequence is kept, together with its name.
        If no names are present, names of the form ``Seq_<index>`` are generated first.

        :return: filtered sequences list in the attribute :py:attr:`sequences` and corresponding names.
        :Example:

        >>> b = BaseSequence(4)
        >>> b.sequences = ['KLLKLLKKLLKLLK', 'KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK', 'KLAKLAKKLAKLAK']
        >>> b.filter_duplicates()
        >>> b.sequences
        ['KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK']

        .. versionadded:: v2.2.5
        """
        if not self.names:
            self.names = ["Seq_" + str(i) for i in range(len(self.sequences))]

        # Plain first-occurrence pass; avoids the pandas calls (``get_values`` and
        # positional ``keep``) that no longer exist in current pandas releases.
        seen = set()
        unique_seqs = []
        unique_names = []
        for seq, name in zip(self.sequences, self.names):
            if seq in seen:
                continue
            seen.add(seq)
            unique_seqs.append(seq)
            unique_names.append(name)

        self.sequences = unique_seqs
        self.names = unique_names

    def keep_natural_aa(self):
        """Method to filter out sequences that do not contain natural amino acids. If the sequence contains a character
        that is not in ``['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']``.

        Sequences are compared case-insensitively and the kept sequences are stored upper-cased.

        :return: filtered sequence list in the attribute :py:attr:`sequences`. The other attributes are also filtered
            accordingly (if present).
        :Example:

        >>> b = BaseSequence(2)
        >>> b.sequences = ['BBBsdflUasUJfBJ', 'GLFDIVKKVVGALGSL']
        >>> b.keep_natural_aa()
        >>> b.sequences
        ['GLFDIVKKVVGALGSL']
        """
        natural_aa = frozenset("ACDEFGHIKLMNPQRSTVWY")

        seqs = []
        names = []

        for i, s in enumerate(self.sequences):
            upper = s.upper()
            if all(c in natural_aa for c in upper):
                seqs.append(upper)
                if hasattr(self, "names") and self.names:
                    names.append(self.names[i])

        self.sequences = seqs
        self.names = names

    def filter_aa(self, amino_acids):
        """Method to filter out corresponding names and descriptor values of sequences with given amino acids in the
        argument list *amino_acids*.

        An empty *amino_acids* list filters nothing.

        :param amino_acids: {list} amino acids to be filtered
        :return: filtered list of sequences names in the corresponding attributes.
        :Example:

        >>> b = BaseSequence(3)
        >>> b.sequences = ['AAALLLIIIKKK', 'CCEERRT', 'LLVVIIFFFQQ']
        >>> b.filter_aa(['C'])
        >>> b.sequences
        ['AAALLLIIIKKK', 'LLVVIIFFFQQ']
        """
        # Drop empty entries: an empty alternative would match every sequence.
        wanted = [aa for aa in amino_acids if aa]
        if not wanted:
            return

        # Escape so the entries are matched literally, never as regex syntax.
        pattern = re.compile("|".join(re.escape(aa) for aa in wanted))
        seqs = []
        names = []

        for i, s in enumerate(self.sequences):
            if not pattern.search(s):
                seqs.append(s)
                if hasattr(self, "names") and self.names:
                    names.append(self.names[i])

        self.sequences = seqs
        self.names = names

    def clean(self):
        """Method to clean / clear / empty the attributes :py:attr:`sequences` and :py:attr:`names`.

        :return: freshly initialized, empty class attributes.
        """
        # Re-runs the constructor with the current settings.
        self.__init__(self.seqnum, self.lenmin, self.lenmax)
841 | |
842 | |
843 class BaseDescriptor(object): | |
844 """ | |
845 Base class inheriting to both peptide descriptor classes :py:class:`modlamp.descriptors.GlobalDescriptor` and | |
846 :py:class:`modlamp.descriptors.PeptideDescriptor`. | |
847 """ | |
848 | |
def __init__(self, seqs):
    """
    :param seqs: a ``.FASTA`` file with sequences, a list / array of sequences or a single sequence as string to
        calculate the descriptor values for.
    :return: initialized attributes :py:attr:`sequences` and :py:attr:`names`.
    :Example:

    >>> AMP = BaseDescriptor('KLLKLLKKLLKLLK')
    >>> AMP.sequences
    ['KLLKLLKKLLKLLK']
    >>> seqs = BaseDescriptor('/Path/to/file.fasta')  # load sequences from .fasta file
    >>> seqs.sequences
    ['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR'...]

    Input that is not recognised is reported with a printed message; in that
    case :py:attr:`sequences` and :py:attr:`names` stay empty lists.
    """
    # Defaults first, so every code path below leaves the attributes defined.
    self.sequences = []
    self.names = []

    # For lists / arrays only the first element is inspected to decide whether
    # the input holds upper-case sequences. The emptiness checks avoid an
    # IndexError on empty input.
    if isinstance(seqs, list) and seqs and seqs[0].isupper():
        self.sequences = [s.strip() for s in seqs]
    elif isinstance(seqs, np.ndarray) and seqs.size and seqs[0].isupper():
        self.sequences = [s.strip() for s in seqs.tolist()]
    elif isinstance(seqs, str) and seqs.isupper():
        # NOTE: an all upper-case string is always taken as a sequence, even
        # if a file of that name exists.
        self.sequences = [seqs.strip()]
    elif isinstance(seqs, str) and os.path.isfile(seqs):
        if seqs.endswith(".fasta"):  # read .fasta file
            self.sequences, self.names = read_fasta(seqs)
        elif seqs.endswith(".csv"):  # read .csv file with sequences every line
            with open(seqs) as f:
                cntr = 0
                for line in f:
                    # lines that are not upper-case (e.g. a header) are skipped
                    if line.isupper():
                        self.sequences.append(line.strip())
                        self.names.append("seq_" + str(cntr))
                        cntr += 1
        else:
            print("Sorry, currently only .fasta or .csv files can be read!")
    else:
        # Wrapped in a tuple so that tuple inputs cannot break the formatting.
        print(
            "%s does not exist, is not a valid list of AA sequences or is not a valid sequence string"
            % (seqs,)
        )

    self.descriptor = np.array([[]])
    self.target = np.array([], dtype="int")
    self.scaler = None
    self.featurenames = []
897 | |
def read_fasta(self, filename):
    """Load sequences and their headers from a ``.FASTA`` file into this instance.

    The parsing is done by the module-level ``read_fasta`` helper; its two
    return values are stored in :py:attr:`sequences` and :py:attr:`names`.

    :param filename: {str} ``.FASTA`` file with sequences and headers to read
    :return: nothing; the attributes :py:attr:`sequences` and :py:attr:`names` are replaced.
    """
    loaded_sequences, loaded_names = read_fasta(filename)
    self.sequences = loaded_sequences
    self.names = loaded_names
907 | |
def save_fasta(self, filename, names=False):
    """Write the sequences in :py:attr:`sequences` to a ``.FASTA`` formatted file.

    The writing is done by the module-level ``save_fasta`` helper.

    :param filename: {str} filename of the output ``.FASTA`` file
    :param names: {bool} whether sequence names from self.names should be saved as sequence identifiers
    :return: a FASTA formatted file containing the generated sequences
    """
    if not names:
        # no identifiers requested: hand over the sequences only
        save_fasta(filename, self.sequences)
        return
    save_fasta(filename, self.sequences, self.names)
919 | |
def count_aa(self, scale="relative", average=False, append=False):
    """Method for producing the amino acid distribution for the given sequences as a descriptor

    Counting is delegated to the module-level ``count_aas`` function; the keys
    of its result become :py:attr:`featurenames`.

    :param scale: {'absolute' or 'relative'} defines whether counts or frequencies are given for each AA
    :param average: {boolean} whether the averaged amino acid counts for all sequences should be returned
    :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
        attribute :py:attr:`descriptor`. Takes precedence over *average* when both are set.
    :return: the amino acid distributions for every sequence individually in the attribute :py:attr:`descriptor`
    :raises ValueError: if :py:attr:`sequences` is empty.
    :Example:

    >>> AMP = PeptideDescriptor('ACDEFGHIKLMNPQRSTVWY')  # count_aa() does not depend on the descriptor scale
    >>> AMP.count_aa()
    >>> AMP.descriptor
    array([[ 0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05, ... ]])
    >>> AMP.descriptor.shape
    (1, 20)

    .. seealso:: :py:func:`modlamp.core.count_aas()`
    """
    # Without sequences there is no count result to take the feature names
    # from; fail with a clear message instead of an unbound local variable.
    if len(self.sequences) == 0:
        raise ValueError("No sequences available to count amino acids for.")

    counts = [count_aas(seq, scale) for seq in self.sequences]
    desc = np.array([list(od.values()) for od in counts])
    # feature names are taken from the result of the last sequence
    self.featurenames = list(counts[-1].keys())

    if append:
        self.descriptor = np.hstack((self.descriptor, desc))
    elif average:
        self.descriptor = np.mean(desc, axis=0)
    else:
        self.descriptor = desc
953 | |
def count_ngrams(self, n):
    """Count the n-grams of all sequences in self.sequences and sum them up.

    Each sequence is passed to the module-level ``count_ngrams`` function; the
    per-sequence counts are added up per n-gram.

    :param n: {int or list of ints} n-gram length(s), handed on unchanged to the module-level ``count_ngrams``
    :return: {dict} dictionary with n-grams as keys and their summed counts as values in :py:attr:`descriptor`
    :Example:

    >>> D = PeptideDescriptor('GLLDFLSLAALSLDKLVKKGALS')
    >>> D.count_ngrams([2, 3])
    >>> D.descriptor
    {'LS': 3, 'LD': 2, 'LSL': 2, 'AL': 2, ..., 'LVK': 1}

    .. seealso:: :py:func:`modlamp.core.count_ngrams()`
    """
    totals = {}
    for sequence in self.sequences:
        per_sequence = count_ngrams(sequence, n)
        for gram, occurrences in per_sequence.items():
            if gram not in totals:
                totals[gram] = occurrences
            else:
                totals[gram] += occurrences
    self.descriptor = totals
977 | |
def feature_scaling(self, stype="standard", fit=True):
    """Method for feature scaling of the calculated descriptor matrix.

    :param stype: {'standard' or 'minmax'} type of scaling to be used
    :param fit: {boolean} defines whether the used scaler is first fitting on the data (True) or
        whether the already fitted scaler in :py:attr:`scaler` should be used to transform (False).
    :return: scaled descriptor values in :py:attr:`descriptor`
    :raises ValueError: if ``fit=False`` is requested but no scaler is stored in :py:attr:`scaler`.
    :Example:

    >>> D.descriptor
    array([[0.155],[0.34],[0.16235294],[-0.08842105],[0.116]])
    >>> D.feature_scaling(stype='minmax',fit=True)
    >>> D.descriptor
    array([[0.56818182],[1.],[0.5853447],[0.],[0.47714988]])
    """
    if stype not in ("standard", "minmax"):
        print("Unknown scaler type!\nAvailable: 'standard', 'minmax'")
        return

    if fit:
        # A new scaler is only created when fitting.
        self.scaler = StandardScaler() if stype == "standard" else MinMaxScaler()
        self.descriptor = self.scaler.fit_transform(self.descriptor)
        return

    # fit=False: reuse the stored scaler. Creating a fresh one here would
    # discard the fitted state and make the transform impossible.
    if getattr(self, "scaler", None) is None:
        raise ValueError(
            "No fitted scaler available; call feature_scaling(fit=True) first."
        )
    self.descriptor = self.scaler.transform(self.descriptor)
1004 | |
def feature_shuffle(self):
    """Shuffle the feature columns of the descriptor matrix randomly.

    The matrix is transposed, passed to ``shuffle`` and transposed back.

    :return: descriptor matrix with shuffled feature columns in :py:attr:`descriptor`
    :Example:

    >>> D.descriptor
    array([[0.80685625,167.05234375,39.56818125,-0.26338667,155.16888667,33.48778]])
    >>> D.feature_shuffle()
    >>> D.descriptor
    array([[155.16888667,-0.26338667,167.05234375,0.80685625,39.56818125,33.48778]])
    """
    features_as_rows = self.descriptor.transpose()
    shuffled_rows = shuffle(features_as_rows)
    self.descriptor = shuffled_rows.transpose()
1017 | |
def sequence_order_shuffle(self):
    """Shuffle the order of the sequences in the attribute :py:attr:`sequences`.

    Only :py:attr:`sequences` is reassigned here; no other attribute is
    touched by this method.

    :return: sequences in :py:attr:`sequences` with shuffled order in the list.
    :Example:

    >>> D.sequences
    ['LILRALKGAARALKVA','VKIAKIALKIIKGLG','VGVRLIKGIGRVARGAI','LRGLRGVIRGGKAIVRVGK','GGKLVRLIARIGKGV']
    >>> D.sequence_order_shuffle()
    >>> D.sequences
    ['VGVRLIKGIGRVARGAI','LILRALKGAARALKVA','LRGLRGVIRGGKAIVRVGK','GGKLVRLIARIGKGV','VKIAKIALKIIKGLG']
    """
    reordered = shuffle(self.sequences)
    self.sequences = reordered
1031 | |
def random_selection(self, num):
    """Randomly keep *num* entries of this descriptor instance.

    The same random index set (drawn without replacement) is applied to
    :py:attr:`sequences` and, when they are present and non-empty, to
    :py:attr:`descriptor`, :py:attr:`names` and :py:attr:`target`.

    :param num: {int} number of entries to be randomly selected
    :return: updated instance
    :Example:

    >>> h = Helices(7, 28, 100)
    >>> h.generate_helices()
    >>> desc = PeptideDescriptor(h.sequences, 'eisenberg')
    >>> desc.calculate_moment()
    >>> len(desc.sequences)
    100
    >>> len(desc.descriptor)
    100
    >>> desc.random_selection(10)
    >>> len(desc.sequences)
    10
    >>> len(desc.descriptor)
    10

    .. versionadded:: v2.2.3
    """
    chosen = np.random.choice(len(self.sequences), size=num, replace=False)

    def _pick_from_list(values):
        # index through an array, then hand back a plain list
        return np.array(values)[chosen].tolist()

    self.sequences = _pick_from_list(self.sequences)

    has_descriptor = hasattr(self, "descriptor") and self.descriptor.size
    if has_descriptor:
        self.descriptor = self.descriptor[chosen]

    has_names = hasattr(self, "names") and self.names
    if has_names:
        self.names = _pick_from_list(self.names)

    has_target = hasattr(self, "target") and self.target.size
    if has_target:
        self.target = self.target[chosen]
1065 | |
def minmax_selection(self, iterations, distmetric="euclidean", seed=0):
    """Method to select a specified number of sequences according to the minmax algorithm.

    The first entry is picked at random (seeded). In every further round the
    distances between the remaining entries and the already selected ones are
    computed; for each selected entry its most distant remaining entry is
    determined, and of those candidates the one with the smallest such
    distance is added to the selection.

    :param iterations: {int} Number of sequences to retrieve.
    :param distmetric: Distance metric to calculate the distances between the sequences in descriptor space.
        Choose from 'euclidean' or 'minkowski'.
    :param seed: {int} Set a random seed for numpy to pick the first sequence. Note that this seeds numpy's
        global random state.
    :return: updated instance
    :raises ValueError: if the descriptor matrix is empty or more sequences are requested than available.

    .. seealso:: **SciPy** http://docs.scipy.org/doc/scipy/reference/spatial.distance.html
    """
    data = np.asarray(self.descriptor)
    n_total = len(data)
    if n_total == 0 or data.size == 0:
        raise ValueError("The descriptor matrix is empty; nothing to select from.")
    if iterations > n_total:
        raise ValueError(
            "Cannot select %d sequences out of %d available." % (iterations, n_total)
        )

    # Track ORIGINAL row indices of the entries still in the pool, so that
    # selections never have to be looked up by value afterwards (which breaks
    # for duplicate rows and after deletions).
    remaining = np.arange(n_total)

    # Randomly selecting the first peptide; randint's upper bound is exclusive,
    # so the index is always valid.
    np.random.seed(seed)
    first = int(np.random.randint(0, n_total))
    selected = [first]
    remaining = np.delete(remaining, first)

    for _ in range(iterations - 1):
        # Distances from every pool entry (rows) to every selected entry (columns)
        dist = distance.cdist(data[remaining], data[selected], distmetric)

        # Most distant pool entry for every selected entry
        maxidx = np.argmax(dist, axis=0)
        maxcols = np.max(dist, axis=0)

        # Choosing minimal distance among the maximal distances
        pick = int(maxidx[np.argmin(maxcols)])

        # Record the original index BEFORE removing the entry from the pool
        selected.append(int(remaining[pick]))
        remaining = np.delete(remaining, pick)

    self.sequences = np.array(self.sequences)[selected].tolist()
    if hasattr(self, "descriptor") and self.descriptor.size:
        self.descriptor = self.descriptor[selected]
    if hasattr(self, "names") and self.names:
        self.names = np.array(self.names)[selected].tolist()
    if hasattr(self, "target") and self.target.size:
        # targets are taken from the target vector, not from the descriptor
        self.target = self.target[selected]
1123 | |
def filter_sequences(self, sequences):
    """Method to filter out entries for given sequences in *sequences* out of a descriptor instance. All
    corresponding attribute values of these sequences (e.g. in :py:attr:`descriptor`, :py:attr:`names`) are deleted
    as well. The instance is updated in place; nothing is returned.

    Every occurrence of a given sequence is removed, also when it is present more than once.

    :param sequences: {list} sequences to be filtered out of the whole instance, including corresponding data.
        A single sequence string is accepted as well.
    :return: updated instance without filtered sequences
    :raises ValueError: if one of the given sequences is not present in :py:attr:`sequences`.
    :Example:

    >>> sequences = ['KLLKLLKKLLKLLK', 'ACDEFGHIK', 'GLFDIVKKVV', 'GLFDIVKKVVGALG', 'GLFDIVKKVVGALGSL']
    >>> desc = PeptideDescriptor(sequences, 'pepcats')
    >>> desc.calculate_crosscorr(7)
    >>> len(desc.descriptor)
    5
    >>> desc.filter_sequences('KLLKLLKKLLKLLK')
    >>> len(desc.descriptor)
    4
    >>> desc.sequences
    ['ACDEFGHIK', 'GLFDIVKKVV', 'GLFDIVKKVVGALG', 'GLFDIVKKVVGALGSL']
    """
    if isinstance(
        sequences, str
    ):  # check if sequences is only one sequence string and convert it to a list
        sequences = [sequences]

    # Unknown sequences are an error, as before (list.index raised ValueError).
    present = set(self.sequences)
    for s in sequences:
        if s not in present:
            raise ValueError("%r is not in list" % (s,))

    # Collect the indices of ALL matching entries, not just the first hit.
    unwanted = set(sequences)
    indices = [i for i, s in enumerate(self.sequences) if s in unwanted]
    if not indices:
        return  # nothing requested, nothing to remove

    self.sequences = np.delete(np.array(self.sequences), indices, 0).tolist()
    if hasattr(self, "descriptor") and self.descriptor.size:
        self.descriptor = np.delete(self.descriptor, indices, 0)
    if hasattr(self, "names") and self.names:
        self.names = np.delete(np.array(self.names), indices, 0).tolist()
    if hasattr(self, "target") and self.target.size:
        self.target = np.delete(self.target, indices, 0)
1159 | |
def filter_values(self, values, operator="=="):
    """Keep only the entries whose descriptor values satisfy a comparison against *values*.

    The features (columns) of :py:attr:`descriptor` are processed one after the other: for feature ``d`` only
    the rows for which ``descriptor[:, d] <operator> values[d]`` holds are kept, and the next feature is
    evaluated on the rows that survived. *values* therefore needs one entry per feature.

    :param values: {list} reference values, one per feature of :py:attr:`descriptor`
    :param operator: {str} filter criterion, one of ``==``, ``<``, ``>``, ``<=`` and ``>=``
    :return: nothing; :py:attr:`descriptor`, :py:attr:`sequences` and, if present and non-empty,
        :py:attr:`names` and :py:attr:`target` are reduced in place to the matching entries
    :raises KeyError: if *operator* is not one of the available operators
    :Example:

    >>> desc.descriptor  # desc = BaseDescriptor instance
    array([[ 0.7666517 ],
        [ 0.38373498]])
    >>> desc.filter_values([0.5], '<')
    >>> desc.descriptor
    array([[ 0.38373498]])
    """
    # dispatch table: operator string -> element-wise comparison
    comparisons = {
        "==": lambda column, reference: column == reference,
        "<": lambda column, reference: column < reference,
        ">": lambda column, reference: column > reference,
        "<=": lambda column, reference: column <= reference,
        ">=": lambda column, reference: column >= reference,
    }

    n_features = self.descriptor.shape[1]
    for feature in range(n_features):
        if operator not in comparisons:
            raise KeyError(
                "available operators: ``==``, ``<``, ``>``, ``<=``and ``>=``"
            )
        matches = comparisons[operator](self.descriptor[:, feature], values[feature])
        keep = np.where(matches)[0]

        # reduce every per-sequence attribute to the rows that passed this feature
        self.sequences = np.array(self.sequences)[keep].tolist()
        if hasattr(self, "descriptor") and self.descriptor.size:
            self.descriptor = self.descriptor[keep]
        if hasattr(self, "names") and self.names:
            self.names = np.array(self.names)[keep].tolist()
        if hasattr(self, "target") and self.target.size:
            self.target = self.target[keep]
1203 | |
def filter_aa(self, amino_acids):
    """Method to filter out sequences (and their names, descriptor and target values) that contain any of the
    amino acids given in the argument *amino_acids*.

    The entries of *amino_acids* are matched literally, so characters with a special meaning in regular
    expressions (e.g. ``*``) are safe to pass. An empty *amino_acids* list filters nothing.

    :param amino_acids: {list} amino acids to be filtered
    :return: filtered list of sequences, descriptor values, target values and names in the corresponding
        attributes. :py:attr:`names`, :py:attr:`descriptor` and :py:attr:`target` are always (re)assigned,
        :py:attr:`target` as an integer array.
    :Example:

    >>> b = BaseSequence(3)
    >>> b.sequences = ['AAALLLIIIKKK', 'CCEERRT', 'LLVVIIFFFQQ']
    >>> b.filter_aa(['C'])
    >>> b.sequences
    ['AAALLLIIIKKK', 'LLVVIIFFFQQ']
    """
    # Escape every entry so it is matched literally, and drop empty entries: an empty alternative (or an
    # empty overall pattern) would match every sequence and silently delete all data.
    literals = [re.escape(aa) for aa in amino_acids if aa]
    pattern = re.compile("|".join(literals)) if literals else None

    seqs = []
    desc = []
    names = []
    target = []

    for i, s in enumerate(self.sequences):
        if pattern is not None and pattern.search(s):
            continue  # sequence contains an unwanted amino acid
        seqs.append(s)
        if hasattr(self, "descriptor") and self.descriptor.size:
            desc.append(self.descriptor[i])
        if hasattr(self, "names") and self.names:
            names.append(self.names[i])
        if hasattr(self, "target") and self.target.size:
            target.append(self.target[i])

    self.sequences = seqs
    self.names = names
    self.descriptor = np.array(desc)
    self.target = np.array(target, dtype="int")
1239 | |
def filter_duplicates(self):
    """Method to filter duplicates in the sequences from the class attribute :py:attr:`sequences`

    Only the first occurrence of every sequence is kept. The attributes :py:attr:`names`,
    :py:attr:`descriptor` and :py:attr:`target` are reduced to the same entries, keeping their own data types.
    Missing or empty attributes are filled with defaults before filtering: names ``Seq_<index>``, a zero
    target vector and a zero descriptor vector.

    :return: filtered sequences list in the attribute :py:attr:`sequences` and corresponding names,
        descriptor values and targets in their attributes.
    :Example:

    >>> b = BaseDescriptor(['KLLKLLKKLLKLLK', 'KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK', 'KLAKLAKKLAKLAK'])
    >>> b.filter_duplicates()
    >>> b.sequences
    ['KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK']

    .. versionadded:: v2.2.5
    """
    n_seqs = len(self.sequences)

    # Emptiness is tested through sizes/lengths: the truth value of a multi-element numpy array is ambiguous
    # and raises a ValueError.
    names = getattr(self, "names", None)
    if names is None or len(names) == 0:
        names = ["Seq_" + str(i) for i in range(n_seqs)]
    target = getattr(self, "target", None)
    if target is None or np.size(target) == 0:
        target = [0] * n_seqs
    descriptor = getattr(self, "descriptor", None)
    if descriptor is None or np.size(descriptor) == 0:
        descriptor = np.zeros(n_seqs)

    # indices of the first occurrence of every sequence, in original order
    seen = set()
    first_idx = []
    for i, seq in enumerate(self.sequences):
        if seq not in seen:
            seen.add(seq)
            first_idx.append(i)
    keep = np.array(first_idx, dtype=int)

    # plain index selection works for descriptors of any dimensionality and does not coerce values to strings
    self.sequences = [self.sequences[i] for i in first_idx]
    self.names = [names[i] for i in first_idx]
    self.descriptor = np.asarray(descriptor)[keep]
    self.target = np.asarray(target)[keep]
1270 | |
def keep_natural_aa(self):
    """Keep only sequences that consist exclusively of the 20 natural amino acids.

    A sequence is discarded if, after conversion to upper case, it contains any character that is not in
    ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']. Kept sequences are
    stored in upper case.

    :return: filtered sequence list in the attribute :py:attr:`sequences`. The attributes :py:attr:`names`,
        :py:attr:`descriptor` and :py:attr:`target` are reassigned with the entries of the kept sequences
        (empty if the attribute was missing or empty); :py:attr:`target` is stored as an integer array.
    :Example:

    >>> b = BaseSequence(2)
    >>> b.sequences = ['BBBsdflUasUJfBJ', 'GLFDIVKKVVGALGSL']
    >>> b.keep_natural_aa()
    >>> b.sequences
    ['GLFDIVKKVVGALGSL']
    """
    natural = set("ACDEFGHIKLMNPQRSTVWY")

    kept_seqs, kept_desc, kept_names, kept_target = [], [], [], []

    for idx, raw in enumerate(self.sequences):
        upper = raw.upper()
        if not set(upper) <= natural:
            continue  # at least one non-natural character
        kept_seqs.append(upper)
        if hasattr(self, "descriptor") and self.descriptor.size:
            kept_desc.append(self.descriptor[idx])
        if hasattr(self, "names") and self.names:
            kept_names.append(self.names[idx])
        if hasattr(self, "target") and self.target.size:
            kept_target.append(self.target[idx])

    self.sequences = kept_seqs
    self.names = kept_names
    self.descriptor = np.array(kept_desc)
    self.target = np.array(kept_target, dtype="int")
1329 | |
def load_descriptordata(
    self, filename, delimiter=",", targets=False, skip_header=0
):
    """Method to load any data file with sequences and descriptor values into the attributes of this instance.

    The file is expected to contain the sequences in the first column, followed by the descriptor values and,
    if *targets* is ``True``, a target class vector in the last column.

    .. note:: Headers are not considered. To skip initial lines in the file, use the *skip_header* option.

    :param filename: {str} filename of the data file to be loaded
    :param delimiter: {str} column delimiter
    :param targets: {boolean} whether last column in the file contains a target class vector
    :param skip_header: {int} number of initial lines to skip in the file
    :return: loaded sequences (as a list), descriptor values and targets in the corresponding attributes. If
        *targets* is ``True``, the target column is stored in :py:attr:`target` and is not part of
        :py:attr:`descriptor`.
    """
    # numeric pass: the sequence column cannot be parsed as float and is dropped
    data = np.genfromtxt(filename, delimiter=delimiter, skip_header=skip_header)
    data = data[:, 1:]

    # string pass: has to skip the same header lines, otherwise sequences and descriptor rows are misaligned
    seqs = np.genfromtxt(
        filename, delimiter=delimiter, dtype="str", skip_header=skip_header
    )

    if targets:
        self.target = np.array(data[:, -1], dtype="int")
        data = data[:, :-1]  # class labels must not stay in the feature matrix

    # stored as a list, like everywhere else in the class (e.g. ``self.sequences.index`` is used for lookups)
    self.sequences = seqs[:, 0].tolist()
    self.descriptor = data
1352 | |
def save_descriptor(self, filename, delimiter=",", targets=None, header=None):
    """Method to save the descriptor values to a .csv/.txt file

    Every row contains the peptide name (only if :py:attr:`names` has one entry per sequence), the sequence,
    the descriptor values and, if given, the target value. All values are written as text.

    :param filename: filename of the output file
    :param delimiter: column delimiter
    :param targets: target class vector to be added to descriptor (same length as :py:attr:`sequences`); it is
        ignored if its length differs
    :param header: {str} header to be written at the beginning of the file (if ``None``: the column labels
        ``Name`` (if names are written), ``Sequence``, the feature names and ``Target`` (if targets are
        written) are taken)
    :return: output file with peptide names and descriptor values
    """
    # unicode instead of fixed-width byte strings: bytes are written as ``b'...'`` under Python 3 and the
    # fixed width silently truncated long sequences
    seqs = np.array(self.sequences, dtype=str)[:, np.newaxis]
    ids = np.array(self.names, dtype=str)[:, np.newaxis]
    has_ids = ids.shape == seqs.shape
    columns = [ids, seqs] if has_ids else [seqs]

    descriptor = np.asarray(self.descriptor)
    if descriptor.ndim == 1:  # single feature stored as a flat vector
        descriptor = descriptor[:, np.newaxis]
    columns.append(descriptor.astype(str))

    # ``targets`` may be a numpy array, whose truth value is ambiguous -> explicit None check
    has_targets = targets is not None and len(targets) == len(self.sequences)
    if has_targets:
        columns.append(np.array(targets).astype(str)[:, np.newaxis])

    data = np.hstack(columns)

    if not header:
        labels = ["Name"] if has_ids else []
        labels.append("Sequence")
        # feature names may be plain strings or one-element lists; take the full name in both cases
        labels.extend(f if isinstance(f, str) else f[0] for f in self.featurenames)
        if has_targets:
            labels.append("Target")
        header = ", ".join(labels)
    np.savetxt(filename, data, delimiter=delimiter, fmt="%s", header=header)
1377 | |
1378 | |
1379 def load_scale(scalename): | |
1380 """Method to load scale values for a given amino acid scale | |
1381 | |
1382 :param scalename: amino acid scale name, for available scales see the | |
1383 :class:`modlamp.descriptors.PeptideDescriptor()` documentation. | |
1384 :return: amino acid scale values in dictionary format. | |
1385 """ | |
1386 # predefined amino acid scales dictionary | |
1387 scales = { | |
1388 "aasi": { | |
1389 "A": [1.89], | |
1390 "C": [1.73], | |
1391 "D": [3.13], | |
1392 "E": [3.14], | |
1393 "F": [1.53], | |
1394 "G": [2.67], | |
1395 "H": [3], | |
1396 "I": [1.97], | |
1397 "K": [2.28], | |
1398 "L": [1.74], | |
1399 "M": [2.5], | |
1400 "N": [2.33], | |
1401 "P": [0.22], | |
1402 "Q": [3.05], | |
1403 "R": [1.91], | |
1404 "S": [2.14], | |
1405 "T": [2.18], | |
1406 "V": [2.37], | |
1407 "W": [2], | |
1408 "Y": [2.01], | |
1409 }, | |
1410 "abhprk": { | |
1411 "A": [0, 0, 0, 0, 0, 0], | |
1412 "C": [0, 0, 0, 0, 0, 0], | |
1413 "D": [1, 0, 0, 1, 0, 0], | |
1414 "E": [1, 0, 0, 1, 0, 0], | |
1415 "F": [0, 0, 1, 0, 1, 0], | |
1416 "G": [0, 0, 0, 0, 0, 0], | |
1417 "H": [0, 0, 0, 1, 1, 0], | |
1418 "I": [0, 0, 1, 0, 0, 0], | |
1419 "K": [0, 1, 0, 1, 0, 0], | |
1420 "L": [0, 0, 1, 0, 0, 0], | |
1421 "M": [0, 0, 1, 0, 0, 0], | |
1422 "N": [0, 0, 0, 1, 0, 0], | |
1423 "P": [0, 0, 0, 0, 0, 1], | |
1424 "Q": [0, 0, 0, 1, 0, 0], | |
1425 "R": [0, 1, 0, 1, 0, 0], | |
1426 "S": [0, 0, 0, 1, 0, 0], | |
1427 "T": [0, 0, 0, 1, 0, 0], | |
1428 "V": [0, 0, 1, 0, 0, 0], | |
1429 "W": [0, 0, 1, 0, 1, 0], | |
1430 "Y": [0, 0, 0, 1, 1, 0], | |
1431 }, | |
1432 "argos": { | |
1433 "I": [0.77], | |
1434 "F": [1.2], | |
1435 "V": [0.14], | |
1436 "L": [2.3], | |
1437 "W": [0.07], | |
1438 "M": [2.3], | |
1439 "A": [0.64], | |
1440 "G": [-0.48], | |
1441 "C": [0.25], | |
1442 "Y": [-0.41], | |
1443 "P": [-0.31], | |
1444 "T": [-0.13], | |
1445 "S": [-0.25], | |
1446 "H": [-0.87], | |
1447 "E": [-0.94], | |
1448 "N": [-0.89], | |
1449 "Q": [-0.61], | |
1450 "D": [-1], | |
1451 "K": [-1], | |
1452 "R": [-0.68], | |
1453 }, | |
1454 "bulkiness": { | |
1455 "A": [0.443], | |
1456 "C": [0.551], | |
1457 "D": [0.453], | |
1458 "E": [0.557], | |
1459 "F": [0.898], | |
1460 "G": [0], | |
1461 "H": [0.563], | |
1462 "I": [0.985], | |
1463 "K": [0.674], | |
1464 "L": [0.985], | |
1465 "M": [0.703], | |
1466 "N": [0.516], | |
1467 "P": [0.768], | |
1468 "Q": [0.605], | |
1469 "R": [0.596], | |
1470 "S": [0.332], | |
1471 "T": [0.677], | |
1472 "V": [0.995], | |
1473 "W": [1], | |
1474 "Y": [0.801], | |
1475 }, | |
1476 "charge_phys": { | |
1477 "A": [0.0], | |
1478 "C": [-0.1], | |
1479 "D": [-1.0], | |
1480 "E": [-1.0], | |
1481 "F": [0.0], | |
1482 "G": [0.0], | |
1483 "H": [0.1], | |
1484 "I": [0.0], | |
1485 "K": [1.0], | |
1486 "L": [0.0], | |
1487 "M": [0.0], | |
1488 "N": [0.0], | |
1489 "P": [0.0], | |
1490 "Q": [0.0], | |
1491 "R": [1.0], | |
1492 "S": [0.0], | |
1493 "T": [0.0], | |
1494 "V": [0.0], | |
1495 "W": [0.0], | |
1496 "Y": [0.0], | |
1497 }, | |
1498 "charge_acid": { | |
1499 "A": [0.0], | |
1500 "C": [-0.1], | |
1501 "D": [-1.0], | |
1502 "E": [-1.0], | |
1503 "F": [0.0], | |
1504 "G": [0.0], | |
1505 "H": [1.0], | |
1506 "I": [0.0], | |
1507 "K": [1.0], | |
1508 "L": [0.0], | |
1509 "M": [0.0], | |
1510 "N": [0.0], | |
1511 "P": [0.0], | |
1512 "Q": [0.0], | |
1513 "R": [1.0], | |
1514 "S": [0.0], | |
1515 "T": [0.0], | |
1516 "V": [0.0], | |
1517 "W": [0.0], | |
1518 "Y": [0.0], | |
1519 }, | |
1520 "cougar": { | |
1521 "A": [0.25, 0.62, 1.89], | |
1522 "C": [0.208, 0.29, 1.73], | |
1523 "D": [0.875, -0.9, 3.13], | |
1524 "E": [0.833, -0.74, 3.14], | |
1525 "F": [0.042, 1.2, 1.53], | |
1526 "G": [1, 0.48, 2.67], | |
1527 "H": [0.083, -0.4, 3], | |
1528 "I": [0.667, 1.4, 1.97], | |
1529 "K": [0.708, -1.5, 2.28], | |
1530 "L": [0.292, 1.1, 1.74], | |
1531 "M": [0, 0.64, 2.5], | |
1532 "N": [0.667, -0.78, 2.33], | |
1533 "P": [0.875, 0.12, 0.22], | |
1534 "Q": [0.792, -0.85, 3.05], | |
1535 "R": [0.958, -2.5, 1.91], | |
1536 "S": [0.875, -0.18, 2.14], | |
1537 "T": [0.583, -0.05, 2.18], | |
1538 "V": [0.375, 1.1, 2.37], | |
1539 "W": [0.042, 0.81, 2], | |
1540 "Y": [0.5, 0.26, 2.01], | |
1541 }, | |
1542 "eisenberg": { | |
1543 "I": [1.4], | |
1544 "F": [1.2], | |
1545 "V": [1.1], | |
1546 "L": [1.1], | |
1547 "W": [0.81], | |
1548 "M": [0.64], | |
1549 "A": [0.62], | |
1550 "G": [0.48], | |
1551 "C": [0.29], | |
1552 "Y": [0.26], | |
1553 "P": [0.12], | |
1554 "T": [-0.05], | |
1555 "S": [-0.18], | |
1556 "H": [-0.4], | |
1557 "E": [-0.74], | |
1558 "N": [-0.78], | |
1559 "Q": [-0.85], | |
1560 "D": [-0.9], | |
1561 "K": [-1.5], | |
1562 "R": [-2.5], | |
1563 }, | |
1564 "ez": { | |
1565 "A": [-0.29, 10.22, 4.67], | |
1566 "C": [0.95, 13.69, 5.77], | |
1567 "D": [1.19, 14.25, 8.98], | |
1568 "E": [1.3, 14.66, 4.16], | |
1569 "F": [-0.8, 19.67, 7.12], | |
1570 "G": [-0.01, 13.86, 6], | |
1571 "H": [0.75, 12.26, 2.77], | |
1572 "I": [-0.56, 14.34, 10.69], | |
1573 "K": [1.66, 11.11, 2.09], | |
1574 "L": [-0.64, 17.34, 8.61], | |
1575 "M": [-0.28, 18.04, 7.13], | |
1576 "N": [0.89, 12.78, 6.28], | |
1577 "P": [0.83, 18.09, 3.53], | |
1578 "Q": [1.21, 10.46, 2.59], | |
1579 "R": [1.55, 9.34, 4.68], | |
1580 "S": [0.1, 13.86, 6], | |
1581 "T": [0.01, 13.86, 6], | |
1582 "V": [-0.47, 11.35, 4.97], | |
1583 "W": [-0.85, 11.65, 7.2], | |
1584 "Y": [-0.42, 13.04, 6.2], | |
1585 }, | |
1586 "flexibility": { | |
1587 "A": [0.25], | |
1588 "C": [0.208], | |
1589 "D": [0.875], | |
1590 "E": [0.833], | |
1591 "F": [0.042], | |
1592 "G": [1], | |
1593 "H": [0.083], | |
1594 "I": [0.667], | |
1595 "K": [0.708], | |
1596 "L": [0.292], | |
1597 "M": [0.0], | |
1598 "N": [0.667], | |
1599 "P": [0.875], | |
1600 "Q": [0.792], | |
1601 "R": [0.958], | |
1602 "S": [0.875], | |
1603 "T": [0.583], | |
1604 "V": [0.375], | |
1605 "W": [0.042], | |
1606 "Y": [0.5], | |
1607 }, | |
1608 "grantham": { | |
1609 "A": [0, 8.1, 31], | |
1610 "C": [2.75, 5.5, 55], | |
1611 "D": [1.38, 13.0, 54], | |
1612 "E": [0.92, 12.3, 83], | |
1613 "F": [0, 5.2, 132], | |
1614 "G": [0.74, 9.0, 3], | |
1615 "H": [0.58, 10.4, 96], | |
1616 "I": [0, 5.2, 111], | |
1617 "K": [0.33, 11.3, 119], | |
1618 "L": [0, 4.9, 111], | |
1619 "M": [0, 5.7, 105], | |
1620 "N": [1.33, 11.6, 56], | |
1621 "P": [0.39, 8.0, 32.5], | |
1622 "Q": [0.89, 10.5, 85], | |
1623 "R": [0.65, 10.5, 124], | |
1624 "S": [1.42, 9.2, 32], | |
1625 "T": [0.71, 8.6, 61], | |
1626 "V": [0, 5.9, 84], | |
1627 "W": [0.13, 5.4, 170], | |
1628 "Y": [0.20, 6.2, 136], | |
1629 }, | |
1630 "gravy": { | |
1631 "I": [4.5], | |
1632 "V": [4.2], | |
1633 "L": [3.8], | |
1634 "F": [2.8], | |
1635 "C": [2.5], | |
1636 "M": [1.9], | |
1637 "A": [1.8], | |
1638 "G": [-0.4], | |
1639 "T": [-0.7], | |
1640 "W": [-0.9], | |
1641 "S": [-0.8], | |
1642 "Y": [-1.3], | |
1643 "P": [-1.6], | |
1644 "H": [-3.2], | |
1645 "E": [-3.5], | |
1646 "Q": [-3.5], | |
1647 "D": [-3.5], | |
1648 "N": [-3.5], | |
1649 "K": [-3.9], | |
1650 "R": [-4.5], | |
1651 }, | |
1652 "hopp-woods": { | |
1653 "A": [-0.5], | |
1654 "C": [-1], | |
1655 "D": [3], | |
1656 "E": [3], | |
1657 "F": [-2.5], | |
1658 "G": [0], | |
1659 "H": [-0.5], | |
1660 "I": [-1.8], | |
1661 "K": [3], | |
1662 "L": [-1.8], | |
1663 "M": [-1.3], | |
1664 "N": [0.2], | |
1665 "P": [0], | |
1666 "Q": [0.2], | |
1667 "R": [3], | |
1668 "S": [0.3], | |
1669 "T": [-0.4], | |
1670 "V": [-1.5], | |
1671 "W": [-3.4], | |
1672 "Y": [-2.3], | |
1673 }, | |
1674 "isaeci": { | |
1675 "A": [62.9, 0.05], | |
1676 "C": [78.51, 0.15], | |
1677 "D": [18.46, 1.25], | |
1678 "E": [30.19, 1.31], | |
1679 "F": [189.42, 0.14], | |
1680 "G": [19.93, 0.02], | |
1681 "H": [87.38, 0.56], | |
1682 "I": [149.77, 0.09], | |
1683 "K": [102.78, 0.53], | |
1684 "L": [154.35, 0.1], | |
1685 "M": [132.22, 0.34], | |
1686 "N": [19.53, 1.36], | |
1687 "P": [122.35, 0.16], | |
1688 "Q": [17.87, 1.31], | |
1689 "R": [52.98, 1.69], | |
1690 "S": [19.75, 0.56], | |
1691 "T": [59.44, 0.65], | |
1692 "V": [120.91, 0.07], | |
1693 "W": [179.16, 1.08], | |
1694 "Y": [132.16, 0.72], | |
1695 }, | |
1696 "janin": { | |
1697 "I": [1.2], | |
1698 "F": [0.87], | |
1699 "V": [1], | |
1700 "L": [0.87], | |
1701 "W": [0.59], | |
1702 "M": [0.73], | |
1703 "A": [0.59], | |
1704 "G": [0.59], | |
1705 "C": [1.4], | |
1706 "Y": [-0.4], | |
1707 "P": [-0.26], | |
1708 "T": [-0.12], | |
1709 "S": [0.02], | |
1710 "H": [0.02], | |
1711 "E": [-0.83], | |
1712 "N": [-0.55], | |
1713 "Q": [-0.83], | |
1714 "D": [-0.69], | |
1715 "K": [-2.4], | |
1716 "R": [-1.8], | |
1717 }, | |
1718 "kytedoolittle": { | |
1719 "I": [1.7], | |
1720 "F": [1.1], | |
1721 "V": [1.6], | |
1722 "L": [1.4], | |
1723 "W": [-0.14], | |
1724 "M": [0.8], | |
1725 "A": [0.77], | |
1726 "G": [0.03], | |
1727 "C": [1], | |
1728 "Y": [-0.27], | |
1729 "P": [-0.37], | |
1730 "T": [-0.07], | |
1731 "S": [-0.1], | |
1732 "H": [-0.91], | |
1733 "E": [-1], | |
1734 "N": [-1], | |
1735 "Q": [-1], | |
1736 "D": [-1], | |
1737 "K": [-1.1], | |
1738 "R": [-1.3], | |
1739 }, | |
1740 "levitt_alpha": { | |
1741 "A": [1.29], | |
1742 "C": [1.11], | |
1743 "D": [1.04], | |
1744 "E": [1.44], | |
1745 "F": [1.07], | |
1746 "G": [0.56], | |
1747 "H": [1.22], | |
1748 "I": [0.97], | |
1749 "K": [1.23], | |
1750 "L": [1.3], | |
1751 "M": [1.47], | |
1752 "N": [0.9], | |
1753 "P": [0.52], | |
1754 "Q": [1.27], | |
1755 "R": [0.96], | |
1756 "S": [0.82], | |
1757 "T": [0.82], | |
1758 "V": [0.91], | |
1759 "W": [0.99], | |
1760 "Y": [0.72], | |
1761 }, | |
1762 "mss": { | |
1763 "A": [13.02], | |
1764 "C": [23.7067], | |
1765 "D": [22.02], | |
1766 "E": [20.0233], | |
1767 "F": [23.5288], | |
1768 "G": [1.01], | |
1769 "H": [23.5283], | |
1770 "I": [22.3611], | |
1771 "K": [18.9756], | |
1772 "L": [19.6944], | |
1773 "M": [21.92], | |
1774 "N": [21.8567], | |
1775 "P": [19.0242], | |
1776 "Q": [19.9689], | |
1777 "R": [19.0434], | |
1778 "S": [18.3533], | |
1779 "T": [22.3567], | |
1780 "V": [21.0267], | |
1781 "W": [26.1975], | |
1782 "Y": [24.1954], | |
1783 }, | |
1784 "msw": { | |
1785 "A": [-0.73, 0.2, -0.62], | |
1786 "C": [-0.66, 0.26, -0.27], | |
1787 "D": [0.11, -1, -0.96], | |
1788 "E": [0.24, -0.39, -0.04], | |
1789 "F": [0.76, 0.85, -0.34], | |
1790 "G": [-0.31, -0.28, -0.75], | |
1791 "H": [0.84, 0.67, -0.78], | |
1792 "I": [-0.91, 0.83, -0.25], | |
1793 "K": [-0.51, 0.08, 0.6], | |
1794 "L": [-0.74, 0.72, -0.16], | |
1795 "M": [-0.7, 1, -0.32], | |
1796 "N": [0.14, 0.2, -0.66], | |
1797 "P": [-0.43, 0.73, -0.6], | |
1798 "Q": [0.3, 1, -0.3], | |
1799 "R": [-0.22, 0.27, 1], | |
1800 "S": [-0.8, 0.61, -1], | |
1801 "T": [-0.58, 0.85, -0.89], | |
1802 "V": [-1, 0.79, -0.58], | |
1803 "W": [1, 0.98, -0.47], | |
1804 "Y": [0.97, 0.66, -0.16], | |
1805 }, | |
1806 "pepcats": { | |
1807 "A": [1, 0, 0, 0, 0, 0], | |
1808 "C": [1, 0, 1, 1, 0, 0], | |
1809 "D": [0, 0, 1, 0, 0, 1], | |
1810 "E": [0, 0, 1, 0, 0, 1], | |
1811 "F": [1, 1, 0, 0, 0, 0], | |
1812 "G": [0, 0, 0, 0, 0, 0], | |
1813 "H": [1, 1, 0, 1, 1, 0], | |
1814 "I": [1, 0, 0, 0, 0, 0], | |
1815 "K": [1, 0, 0, 1, 1, 0], | |
1816 "L": [1, 0, 0, 0, 0, 0], | |
1817 "M": [1, 0, 1, 0, 0, 0], | |
1818 "N": [0, 0, 1, 1, 0, 0], | |
1819 "P": [1, 0, 0, 0, 0, 0], | |
1820 "Q": [0, 0, 1, 1, 0, 0], | |
1821 "R": [1, 0, 0, 1, 1, 0], | |
1822 "S": [0, 0, 1, 1, 0, 0], | |
1823 "T": [0, 0, 1, 1, 0, 0], | |
1824 "V": [1, 0, 0, 0, 0, 0], | |
1825 "W": [1, 1, 0, 1, 0, 0], | |
1826 "Y": [1, 1, 1, 1, 0, 0], | |
1827 }, | |
1828 "peparc": { | |
1829 "A": [1, 0, 0, 0, 0], | |
1830 "C": [0, 1, 0, 0, 0], | |
1831 "D": [0, 1, 0, 1, 0], | |
1832 "E": [0, 1, 0, 1, 0], | |
1833 "F": [1, 0, 0, 0, 0], | |
1834 "G": [0, 0, 0, 0, 0], | |
1835 "H": [0, 1, 1, 0, 0], | |
1836 "I": [1, 0, 0, 0, 0], | |
1837 "K": [0, 1, 1, 0, 0], | |
1838 "L": [1, 0, 0, 0, 0], | |
1839 "M": [1, 0, 0, 0, 0], | |
1840 "N": [0, 1, 0, 0, 0], | |
1841 "P": [0, 0, 0, 0, 1], | |
1842 "Q": [0, 1, 0, 0, 0], | |
1843 "R": [0, 1, 1, 0, 0], | |
1844 "S": [0, 1, 0, 0, 0], | |
1845 "T": [0, 1, 0, 0, 0], | |
1846 "V": [1, 0, 0, 0, 0], | |
1847 "W": [1, 0, 0, 0, 0], | |
1848 "Y": [1, 0, 0, 0, 0], | |
1849 }, | |
1850 "polarity": { | |
1851 "A": [0.395], | |
1852 "C": [0.074], | |
1853 "D": [1.0], | |
1854 "E": [0.914], | |
1855 "F": [0.037], | |
1856 "G": [0.506], | |
1857 "H": [0.679], | |
1858 "I": [0.037], | |
1859 "K": [0.79], | |
1860 "L": [0.0], | |
1861 "M": [0.099], | |
1862 "N": [0.827], | |
1863 "P": [0.383], | |
1864 "Q": [0.691], | |
1865 "R": [0.691], | |
1866 "S": [0.531], | |
1867 "T": [0.457], | |
1868 "V": [0.123], | |
1869 "W": [0.062], | |
1870 "Y": [0.16], | |
1871 }, | |
1872 "ppcali": { | |
1873 "A": [ | |
1874 0.070781, | |
1875 0.036271, | |
1876 2.042, | |
1877 0.083272, | |
1878 0.69089, | |
1879 0.15948, | |
1880 -0.80893, | |
1881 0.24698, | |
1882 0.86525, | |
1883 0.68563, | |
1884 -0.24665, | |
1885 0.61314, | |
1886 -0.53343, | |
1887 -0.50878, | |
1888 -1.3646, | |
1889 2.2679, | |
1890 -1.5644, | |
1891 -0.75043, | |
1892 -0.65875, | |
1893 ], | |
1894 "C": [ | |
1895 0.61013, | |
1896 -0.93043, | |
1897 -0.85983, | |
1898 -2.2704, | |
1899 1.5877, | |
1900 -2.0066, | |
1901 -0.30314, | |
1902 1.2544, | |
1903 -0.2832, | |
1904 -1.2844, | |
1905 -0.73449, | |
1906 -0.11235, | |
1907 -0.41152, | |
1908 -0.0050164, | |
1909 0.28307, | |
1910 0.20522, | |
1911 -0.021084, | |
1912 -0.15627, | |
1913 -0.32689, | |
1914 ], | |
1915 "D": [ | |
1916 -1.3215, | |
1917 0.24063, | |
1918 -0.032754, | |
1919 -0.37863, | |
1920 1.2051, | |
1921 1.0001, | |
1922 2.1827, | |
1923 0.19212, | |
1924 -0.60529, | |
1925 0.37639, | |
1926 -0.46451, | |
1927 -0.46788, | |
1928 1.4077, | |
1929 -2.1661, | |
1930 0.72604, | |
1931 -0.12332, | |
1932 -0.8243, | |
1933 -0.082989, | |
1934 0.053476, | |
1935 ], | |
1936 "E": [ | |
1937 -0.87713, | |
1938 1.4905, | |
1939 1.0755, | |
1940 0.35944, | |
1941 1.567, | |
1942 0.41365, | |
1943 1.0944, | |
1944 0.72634, | |
1945 -0.74957, | |
1946 0.038939, | |
1947 0.075057, | |
1948 0.78637, | |
1949 -1.4543, | |
1950 1.6667, | |
1951 -0.097439, | |
1952 -0.24293, | |
1953 1.7687, | |
1954 0.36174, | |
1955 -0.11585, | |
1956 ], | |
1957 "F": [ | |
1958 1.3557, | |
1959 -0.10336, | |
1960 -0.4309, | |
1961 0.41269, | |
1962 -0.083356, | |
1963 0.83783, | |
1964 0.095381, | |
1965 -0.65222, | |
1966 -0.3119, | |
1967 0.43293, | |
1968 -1.0011, | |
1969 -0.66855, | |
1970 -0.10242, | |
1971 1.2066, | |
1972 2.6234, | |
1973 1.9981, | |
1974 -0.25016, | |
1975 0.71979, | |
1976 0.21569, | |
1977 ], | |
1978 "G": [ | |
1979 -1.0818, | |
1980 -2.1561, | |
1981 0.77082, | |
1982 -0.92747, | |
1983 -1.0748, | |
1984 1.7997, | |
1985 -1.3708, | |
1986 1.279, | |
1987 -1.2098, | |
1988 0.46065, | |
1989 0.43076, | |
1990 0.20037, | |
1991 -0.2302, | |
1992 0.2646, | |
1993 0.57149, | |
1994 -0.68432, | |
1995 0.19341, | |
1996 -0.061606, | |
1997 -0.08071, | |
1998 ], | |
1999 "H": [ | |
2000 -0.050161, | |
2001 0.69246, | |
2002 -0.88397, | |
2003 -0.64601, | |
2004 0.24622, | |
2005 0.10487, | |
2006 -1.1317, | |
2007 -2.3661, | |
2008 -0.89918, | |
2009 0.46391, | |
2010 -0.62359, | |
2011 2.5478, | |
2012 -0.34737, | |
2013 -0.52062, | |
2014 0.17522, | |
2015 -0.88648, | |
2016 -0.4755, | |
2017 0.023187, | |
2018 -0.28261, | |
2019 ], | |
2020 "I": [ | |
2021 1.4829, | |
2022 -0.46435, | |
2023 0.50189, | |
2024 0.55724, | |
2025 -0.51535, | |
2026 -0.29914, | |
2027 0.97236, | |
2028 -0.15793, | |
2029 -0.98246, | |
2030 -0.54347, | |
2031 0.97806, | |
2032 0.37577, | |
2033 1.618, | |
2034 0.62323, | |
2035 -0.59359, | |
2036 -0.35483, | |
2037 -0.085017, | |
2038 0.55825, | |
2039 -2.7542, | |
2040 ], | |
2041 "K": [ | |
2042 -0.85344, | |
2043 1.529, | |
2044 0.27747, | |
2045 0.32993, | |
2046 -1.1786, | |
2047 -0.16633, | |
2048 -1.0459, | |
2049 0.44621, | |
2050 0.41027, | |
2051 -2.5318, | |
2052 0.91329, | |
2053 0.53385, | |
2054 0.61417, | |
2055 -1.111, | |
2056 1.1323, | |
2057 0.95105, | |
2058 0.76769, | |
2059 -0.016115, | |
2060 0.054995, | |
2061 ], | |
2062 "L": [ | |
2063 1.2857, | |
2064 0.039488, | |
2065 1.5378, | |
2066 0.87969, | |
2067 -0.21419, | |
2068 0.40389, | |
2069 -0.20426, | |
2070 -0.14351, | |
2071 0.61024, | |
2072 -1.1927, | |
2073 -2.2149, | |
2074 -0.84248, | |
2075 -0.5061, | |
2076 -0.48548, | |
2077 0.10791, | |
2078 -2.1503, | |
2079 -0.12006, | |
2080 -0.60222, | |
2081 0.26546, | |
2082 ], | |
2083 "M": [ | |
2084 1.137, | |
2085 0.64388, | |
2086 0.13724, | |
2087 -0.2988, | |
2088 1.2288, | |
2089 0.24981, | |
2090 -1.6427, | |
2091 -0.75868, | |
2092 -0.54902, | |
2093 1.0571, | |
2094 1.272, | |
2095 -1.9104, | |
2096 0.70919, | |
2097 -0.93575, | |
2098 -0.6314, | |
2099 -0.079654, | |
2100 1.634, | |
2101 -0.0021923, | |
2102 0.49825, | |
2103 ], | |
2104 "N": [ | |
2105 -1.084, | |
2106 -0.176, | |
2107 -0.47062, | |
2108 -0.92245, | |
2109 -0.32953, | |
2110 0.74278, | |
2111 0.34551, | |
2112 -1.4605, | |
2113 0.25219, | |
2114 -1.2107, | |
2115 -0.59978, | |
2116 -0.79183, | |
2117 1.3268, | |
2118 1.9839, | |
2119 -1.6137, | |
2120 0.5333, | |
2121 0.033889, | |
2122 -1.0331, | |
2123 0.83019, | |
2124 ], | |
2125 "P": [ | |
2126 -1.1823, | |
2127 -1.6911, | |
2128 -1.1331, | |
2129 3.073, | |
2130 1.1942, | |
2131 -0.93426, | |
2132 -0.72985, | |
2133 -0.042441, | |
2134 -0.19264, | |
2135 -0.21603, | |
2136 -0.1239, | |
2137 0.054016, | |
2138 0.15241, | |
2139 -0.019691, | |
2140 -0.20543, | |
2141 0.10206, | |
2142 0.07671, | |
2143 -0.081968, | |
2144 0.20348, | |
2145 ], | |
2146 "Q": [ | |
2147 -0.57747, | |
2148 0.97452, | |
2149 -0.077547, | |
2150 -0.0033488, | |
2151 0.17184, | |
2152 -0.52537, | |
2153 -0.27362, | |
2154 -0.1366, | |
2155 0.2057, | |
2156 -0.013066, | |
2157 1.8834, | |
2158 -1.2736, | |
2159 -0.84991, | |
2160 1.0445, | |
2161 0.69027, | |
2162 -1.2866, | |
2163 -2.6776, | |
2164 0.1683, | |
2165 0.086105, | |
2166 ], | |
2167 "R": [ | |
2168 -0.62245, | |
2169 1.545, | |
2170 -0.61966, | |
2171 0.19057, | |
2172 -1.7485, | |
2173 -1.3909, | |
2174 -0.47526, | |
2175 1.3938, | |
2176 -0.84556, | |
2177 1.7344, | |
2178 -1.6516, | |
2179 -0.52678, | |
2180 0.6791, | |
2181 0.24374, | |
2182 -0.62551, | |
2183 -0.0028271, | |
2184 -0.053884, | |
2185 0.14926, | |
2186 -0.17232, | |
2187 ], | |
2188 "S": [ | |
2189 -0.86409, | |
2190 -0.77147, | |
2191 0.38542, | |
2192 -0.59389, | |
2193 -0.53313, | |
2194 -0.47585, | |
2195 0.31966, | |
2196 -0.89716, | |
2197 1.8029, | |
2198 0.26431, | |
2199 -0.23173, | |
2200 -0.37626, | |
2201 -0.47349, | |
2202 -0.42878, | |
2203 -0.47297, | |
2204 -0.079826, | |
2205 0.57043, | |
2206 3.2057, | |
2207 -0.18413, | |
2208 ], | |
2209 "T": [ | |
2210 -0.33027, | |
2211 -0.57447, | |
2212 0.18653, | |
2213 -0.28941, | |
2214 -0.62681, | |
2215 -1.0737, | |
2216 0.80363, | |
2217 -0.59525, | |
2218 1.8786, | |
2219 1.3971, | |
2220 0.63929, | |
2221 0.21281, | |
2222 -0.067048, | |
2223 0.096271, | |
2224 1.323, | |
2225 -0.36173, | |
2226 1.2261, | |
2227 -2.2771, | |
2228 -0.65412, | |
2229 ], | |
2230 "V": [ | |
2231 1.1675, | |
2232 -0.61554, | |
2233 0.95405, | |
2234 0.11662, | |
2235 -0.74473, | |
2236 -1.1482, | |
2237 1.1309, | |
2238 0.12079, | |
2239 -0.77171, | |
2240 0.18597, | |
2241 0.93442, | |
2242 1.201, | |
2243 0.3826, | |
2244 -0.091573, | |
2245 -0.31269, | |
2246 0.074367, | |
2247 -0.22946, | |
2248 0.24322, | |
2249 2.9836, | |
2250 ], | |
2251 "W": [ | |
2252 1.1881, | |
2253 0.43789, | |
2254 -1.7915, | |
2255 0.138, | |
2256 0.43088, | |
2257 1.6467, | |
2258 -0.11987, | |
2259 1.7369, | |
2260 2.0818, | |
2261 0.33122, | |
2262 0.31829, | |
2263 1.1586, | |
2264 0.67649, | |
2265 0.30819, | |
2266 -0.55772, | |
2267 -0.54491, | |
2268 -0.17969, | |
2269 0.24477, | |
2270 0.38674, | |
2271 ], | |
2272 "Y": [ | |
2273 0.54671, | |
2274 -0.1468, | |
2275 -1.5688, | |
2276 0.19001, | |
2277 -1.2736, | |
2278 0.66162, | |
2279 1.1614, | |
2280 -0.18614, | |
2281 -0.70654, | |
2282 -0.43634, | |
2283 0.44775, | |
2284 -0.71366, | |
2285 -2.5907, | |
2286 -1.1649, | |
2287 -1.1576, | |
2288 0.66572, | |
2289 0.21019, | |
2290 -0.61016, | |
2291 -0.34844, | |
2292 ], | |
2293 }, | |
2294 "refractivity": { | |
2295 "A": [0.102045615], | |
2296 "C": [0.841053374], | |
2297 "D": [0.282153774], | |
2298 "E": [0.405831178], | |
2299 "F": [0.691276746], | |
2300 "G": [0], | |
2301 "H": [0.512814484], | |
2302 "I": [0.448154244], | |
2303 "K": [0.50058782], | |
2304 "L": [0.441570656], | |
2305 "M": [0.508817305], | |
2306 "N": [0.282153774], | |
2307 "P": [0.256995062], | |
2308 "Q": [0.405831178], | |
2309 "R": [0.626851634], | |
2310 "S": [0.149306372], | |
2311 "T": [0.258876087], | |
2312 "V": [0.327298378], | |
2313 "W": [1], | |
2314 "Y": [0.741359041], | |
2315 }, | |
2316 "t_scale": { | |
2317 "A": [-8.4, -8.01, -3.73, -3.65, -6.12, -1.59, 1.56], | |
2318 "C": [-2.44, -1.96, 0.93, -2.35, 1.31, 2.29, -1.52], | |
2319 "D": [-6.84, -0.94, 17.68, -0.03, 3.44, 9.07, 4.32], | |
2320 "E": [-6.5, 16.2, 17.28, 3.11, -4.75, -2.54, 4.72], | |
2321 "F": [21.59, -5.73, 1.03, -3.3, 2.64, -5.02, 1.7], | |
2322 "G": [-8.48, -10.37, -5.14, -6.51, -11.84, -3.6, 2.01], | |
2323 "H": [15.28, -3.67, 6.72, -6.38, 4.12, -1.55, -2.85], | |
2324 "I": [-2.97, 4.64, -0.77, 11, 3.26, -4.36, -7.88], | |
2325 "K": [2.7, 13.46, -14.03, -2.55, 2.77, 0.15, 3.19], | |
2326 "L": [2.61, 5.96, 1.97, 2.59, -4.77, -4.84, -5.44], | |
2327 "M": [3.38, 12.43, -4.77, 0.45, -1.55, -0.6, 3.26], | |
2328 "N": [-3.11, -1.22, 6.26, -9.38, 9.94, 7.66, -4.81], | |
2329 "P": [-5.35, -9.07, -1.52, -8.79, -8.73, 4.29, -9.91], | |
2330 "Q": [-5.31, 15.64, 8.44, 1.03, -4.32, -4.4, -0.52], | |
2331 "R": [-2.27, 18.9, -18.24, -3.47, 3.03, 6.64, 0.45], | |
2332 "S": [-15.88, -11.21, -2.44, -3.61, 3.46, -0.37, 8.98], | |
2333 "T": [-17.81, -13.64, -5.19, 10.57, 6.91, -4.43, 3.49], | |
2334 "V": [-5.8, -6.15, -2.26, 9.87, 5.28, -1.49, -7.54], | |
2335 "W": [21.68, -8.78, -2.53, 15.53, -8.15, 11.98, 3.23], | |
2336 "Y": [23.9, -6.47, 0.31, -4.14, 4.08, -7.28, 3.59], | |
2337 }, | |
2338 "tm_tend": { | |
2339 "A": [0.38], | |
2340 "C": [-0.3], | |
2341 "D": [-3.27], | |
2342 "E": [-2.9], | |
2343 "F": [1.98], | |
2344 "G": [-0.19], | |
2345 "H": [-1.44], | |
2346 "I": [1.97], | |
2347 "K": [-3.46], | |
2348 "L": [1.82], | |
2349 "M": [1.4], | |
2350 "N": [-1.62], | |
2351 "P": [-1.44], | |
2352 "Q": [-1.84], | |
2353 "R": [-2.57], | |
2354 "S": [-0.53], | |
2355 "T": [-0.32], | |
2356 "V": [1.46], | |
2357 "W": [1.53], | |
2358 "Y": [0.49], | |
2359 }, | |
2360 "z3": { | |
2361 "A": [0.07, -1.73, 0.09], | |
2362 "C": [0.71, -0.97, 4.13], | |
2363 "D": [3.64, 1.13, 2.36], | |
2364 "E": [3.08, 0.39, -0.07], | |
2365 "F": [-4.92, 1.3, 0.45], | |
2366 "G": [2.23, -5.36, 0.3], | |
2367 "H": [2.41, 1.74, 1.11], | |
2368 "I": [-4.44, -1.68, -1.03], | |
2369 "K": [2.84, 1.41, -3.14], | |
2370 "L": [-4.19, -1.03, -0.98], | |
2371 "M": [-2.49, -0.27, -0.41], | |
2372 "N": [3.22, 1.45, 0.84], | |
2373 "P": [-1.22, 0.88, 2.23], | |
2374 "Q": [2.18, 0.53, -1.14], | |
2375 "R": [2.88, 2.52, -3.44], | |
2376 "S": [1.96, -1.63, 0.57], | |
2377 "T": [0.92, -2.09, -1.4], | |
2378 "V": [-2.69, -2.53, -1.29], | |
2379 "W": [-4.75, 3.65, 0.85], | |
2380 "Y": [-1.39, 2.32, 0.01], | |
2381 }, | |
2382 "z5": { | |
2383 "A": [0.24, -2.32, 0.6, -0.14, 1.3], | |
2384 "C": [0.84, -1.67, 3.71, 0.18, -2.65], | |
2385 "D": [3.98, 0.93, 1.93, -2.46, 0.75], | |
2386 "E": [3.11, 0.26, -0.11, -3.04, -0.25], | |
2387 "F": [-4.22, 1.94, 1.06, 0.54, -0.62], | |
2388 "G": [2.05, -4.06, 0.36, -0.82, -0.38], | |
2389 "H": [2.47, 1.95, 0.26, 3.9, 0.09], | |
2390 "I": [-3.89, -1.73, -1.71, -0.84, 0.26], | |
2391 "K": [2.29, 0.89, -2.49, 1.49, 0.31], | |
2392 "L": [-4.28, -1.3, -1.49, -0.72, 0.84], | |
2393 "M": [-2.85, -0.22, 0.47, 1.94, -0.98], | |
2394 "N": [3.05, 1.62, 1.04, -1.15, 1.61], | |
2395 "P": [-1.66, 0.27, 1.84, 0.7, 2], | |
2396 "Q": [1.75, 0.5, -1.44, -1.34, 0.66], | |
2397 "R": [3.52, 2.5, -3.5, 1.99, -0.17], | |
2398 "S": [2.39, -1.07, 1.15, -1.39, 0.67], | |
2399 "T": [0.75, -2.18, -1.12, -1.46, -0.4], | |
2400 "V": [-2.59, -2.64, -1.54, -0.85, -0.02], | |
2401 "W": [-4.36, 3.94, 0.59, 3.44, -1.59], | |
2402 "Y": [-2.54, 2.44, 0.43, 0.04, -1.47], | |
2403 }, | |
2404 } | |
2405 if scalename == "all": | |
2406 d = { | |
2407 "I": [], | |
2408 "F": [], | |
2409 "V": [], | |
2410 "L": [], | |
2411 "W": [], | |
2412 "M": [], | |
2413 "A": [], | |
2414 "G": [], | |
2415 "C": [], | |
2416 "Y": [], | |
2417 "P": [], | |
2418 "T": [], | |
2419 "S": [], | |
2420 "H": [], | |
2421 "E": [], | |
2422 "N": [], | |
2423 "Q": [], | |
2424 "D": [], | |
2425 "K": [], | |
2426 "R": [], | |
2427 } | |
2428 for scale in scales.keys(): | |
2429 for k, v in scales[scale].items(): | |
2430 d[k].extend(v) | |
2431 return "all", d | |
2432 | |
2433 elif scalename == "instability": | |
2434 d = { | |
2435 "A": { | |
2436 "A": 1.0, | |
2437 "C": 44.94, | |
2438 "E": 1.0, | |
2439 "D": -7.49, | |
2440 "G": 1.0, | |
2441 "F": 1.0, | |
2442 "I": 1.0, | |
2443 "H": -7.49, | |
2444 "K": 1.0, | |
2445 "M": 1.0, | |
2446 "L": 1.0, | |
2447 "N": 1.0, | |
2448 "Q": 1.0, | |
2449 "P": 20.26, | |
2450 "S": 1.0, | |
2451 "R": 1.0, | |
2452 "T": 1.0, | |
2453 "W": 1.0, | |
2454 "V": 1.0, | |
2455 "Y": 1.0, | |
2456 }, | |
2457 "C": { | |
2458 "A": 1.0, | |
2459 "C": 1.0, | |
2460 "E": 1.0, | |
2461 "D": 20.26, | |
2462 "G": 1.0, | |
2463 "F": 1.0, | |
2464 "I": 1.0, | |
2465 "H": 33.6, | |
2466 "K": 1.0, | |
2467 "M": 33.6, | |
2468 "L": 20.26, | |
2469 "N": 1.0, | |
2470 "Q": -6.54, | |
2471 "P": 20.26, | |
2472 "S": 1.0, | |
2473 "R": 1.0, | |
2474 "T": 33.6, | |
2475 "W": 24.68, | |
2476 "V": -6.54, | |
2477 "Y": 1.0, | |
2478 }, | |
2479 "E": { | |
2480 "A": 1.0, | |
2481 "C": 44.94, | |
2482 "E": 33.6, | |
2483 "D": 20.26, | |
2484 "G": 1.0, | |
2485 "F": 1.0, | |
2486 "I": 20.26, | |
2487 "H": -6.54, | |
2488 "K": 1.0, | |
2489 "M": 1.0, | |
2490 "L": 1.0, | |
2491 "N": 1.0, | |
2492 "Q": 20.26, | |
2493 "P": 20.26, | |
2494 "S": 20.26, | |
2495 "R": 1.0, | |
2496 "T": 1.0, | |
2497 "W": -14.03, | |
2498 "V": 1.0, | |
2499 "Y": 1.0, | |
2500 }, | |
2501 "D": { | |
2502 "A": 1.0, | |
2503 "C": 1.0, | |
2504 "E": 1.0, | |
2505 "D": 1.0, | |
2506 "G": 1.0, | |
2507 "F": -6.54, | |
2508 "I": 1.0, | |
2509 "H": 1.0, | |
2510 "K": -7.49, | |
2511 "M": 1.0, | |
2512 "L": 1.0, | |
2513 "N": 1.0, | |
2514 "Q": 1.0, | |
2515 "P": 1.0, | |
2516 "S": 20.26, | |
2517 "R": -6.54, | |
2518 "T": -14.03, | |
2519 "W": 1.0, | |
2520 "V": 1.0, | |
2521 "Y": 1.0, | |
2522 }, | |
2523 "G": { | |
2524 "A": -7.49, | |
2525 "C": 1.0, | |
2526 "E": -6.54, | |
2527 "D": 1.0, | |
2528 "G": 13.34, | |
2529 "F": 1.0, | |
2530 "I": -7.49, | |
2531 "H": 1.0, | |
2532 "K": -7.49, | |
2533 "M": 1.0, | |
2534 "L": 1.0, | |
2535 "N": -7.49, | |
2536 "Q": 1.0, | |
2537 "P": 1.0, | |
2538 "S": 1.0, | |
2539 "R": 1.0, | |
2540 "T": -7.49, | |
2541 "W": 13.34, | |
2542 "V": 1.0, | |
2543 "Y": -7.49, | |
2544 }, | |
2545 "F": { | |
2546 "A": 1.0, | |
2547 "C": 1.0, | |
2548 "E": 1.0, | |
2549 "D": 13.34, | |
2550 "G": 1.0, | |
2551 "F": 1.0, | |
2552 "I": 1.0, | |
2553 "H": 1.0, | |
2554 "K": -14.03, | |
2555 "M": 1.0, | |
2556 "L": 1.0, | |
2557 "N": 1.0, | |
2558 "Q": 1.0, | |
2559 "P": 20.26, | |
2560 "S": 1.0, | |
2561 "R": 1.0, | |
2562 "T": 1.0, | |
2563 "W": 1.0, | |
2564 "V": 1.0, | |
2565 "Y": 33.601, | |
2566 }, | |
2567 "I": { | |
2568 "A": 1.0, | |
2569 "C": 1.0, | |
2570 "E": 44.94, | |
2571 "D": 1.0, | |
2572 "G": 1.0, | |
2573 "F": 1.0, | |
2574 "I": 1.0, | |
2575 "H": 13.34, | |
2576 "K": -7.49, | |
2577 "M": 1.0, | |
2578 "L": 20.26, | |
2579 "N": 1.0, | |
2580 "Q": 1.0, | |
2581 "P": -1.88, | |
2582 "S": 1.0, | |
2583 "R": 1.0, | |
2584 "T": 1.0, | |
2585 "W": 1.0, | |
2586 "V": -7.49, | |
2587 "Y": 1.0, | |
2588 }, | |
2589 "H": { | |
2590 "A": 1.0, | |
2591 "C": 1.0, | |
2592 "E": 1.0, | |
2593 "D": 1.0, | |
2594 "G": -9.37, | |
2595 "F": -9.37, | |
2596 "I": 44.94, | |
2597 "H": 1.0, | |
2598 "K": 24.68, | |
2599 "M": 1.0, | |
2600 "L": 1.0, | |
2601 "N": 24.68, | |
2602 "Q": 1.0, | |
2603 "P": -1.88, | |
2604 "S": 1.0, | |
2605 "R": 1.0, | |
2606 "T": -6.54, | |
2607 "W": -1.88, | |
2608 "V": 1.0, | |
2609 "Y": 44.94, | |
2610 }, | |
2611 "K": { | |
2612 "A": 1.0, | |
2613 "C": 1.0, | |
2614 "E": 1.0, | |
2615 "D": 1.0, | |
2616 "G": -7.49, | |
2617 "F": 1.0, | |
2618 "I": -7.49, | |
2619 "H": 1.0, | |
2620 "K": 1.0, | |
2621 "M": 33.6, | |
2622 "L": -7.49, | |
2623 "N": 1.0, | |
2624 "Q": 24.64, | |
2625 "P": -6.54, | |
2626 "S": 1.0, | |
2627 "R": 33.6, | |
2628 "T": 1.0, | |
2629 "W": 1.0, | |
2630 "V": -7.49, | |
2631 "Y": 1.0, | |
2632 }, | |
2633 "M": { | |
2634 "A": 13.34, | |
2635 "C": 1.0, | |
2636 "E": 1.0, | |
2637 "D": 1.0, | |
2638 "G": 1.0, | |
2639 "F": 1.0, | |
2640 "I": 1.0, | |
2641 "H": 58.28, | |
2642 "K": 1.0, | |
2643 "M": -1.88, | |
2644 "L": 1.0, | |
2645 "N": 1.0, | |
2646 "Q": -6.54, | |
2647 "P": 44.94, | |
2648 "S": 44.94, | |
2649 "R": -6.54, | |
2650 "T": -1.88, | |
2651 "W": 1.0, | |
2652 "V": 1.0, | |
2653 "Y": 24.68, | |
2654 }, | |
2655 "L": { | |
2656 "A": 1.0, | |
2657 "C": 1.0, | |
2658 "E": 1.0, | |
2659 "D": 1.0, | |
2660 "G": 1.0, | |
2661 "F": 1.0, | |
2662 "I": 1.0, | |
2663 "H": 1.0, | |
2664 "K": -7.49, | |
2665 "M": 1.0, | |
2666 "L": 1.0, | |
2667 "N": 1.0, | |
2668 "Q": 33.6, | |
2669 "P": 20.26, | |
2670 "S": 1.0, | |
2671 "R": 20.26, | |
2672 "T": 1.0, | |
2673 "W": 24.68, | |
2674 "V": 1.0, | |
2675 "Y": 1.0, | |
2676 }, | |
2677 "N": { | |
2678 "A": 1.0, | |
2679 "C": -1.88, | |
2680 "E": 1.0, | |
2681 "D": 1.0, | |
2682 "G": -14.03, | |
2683 "F": -14.03, | |
2684 "I": 44.94, | |
2685 "H": 1.0, | |
2686 "K": 24.68, | |
2687 "M": 1.0, | |
2688 "L": 1.0, | |
2689 "N": 1.0, | |
2690 "Q": -6.54, | |
2691 "P": -1.88, | |
2692 "S": 1.0, | |
2693 "R": 1.0, | |
2694 "T": -7.49, | |
2695 "W": -9.37, | |
2696 "V": 1.0, | |
2697 "Y": 1.0, | |
2698 }, | |
2699 "Q": { | |
2700 "A": 1.0, | |
2701 "C": -6.54, | |
2702 "E": 20.26, | |
2703 "D": 20.26, | |
2704 "G": 1.0, | |
2705 "F": -6.54, | |
2706 "I": 1.0, | |
2707 "H": 1.0, | |
2708 "K": 1.0, | |
2709 "M": 1.0, | |
2710 "L": 1.0, | |
2711 "N": 1.0, | |
2712 "Q": 20.26, | |
2713 "P": 20.26, | |
2714 "S": 44.94, | |
2715 "R": 1.0, | |
2716 "T": 1.0, | |
2717 "W": 1.0, | |
2718 "V": -6.54, | |
2719 "Y": -6.54, | |
2720 }, | |
2721 "P": { | |
2722 "A": 20.26, | |
2723 "C": -6.54, | |
2724 "E": 18.38, | |
2725 "D": -6.54, | |
2726 "G": 1.0, | |
2727 "F": 20.26, | |
2728 "I": 1.0, | |
2729 "H": 1.0, | |
2730 "K": 1.0, | |
2731 "M": -6.54, | |
2732 "L": 1.0, | |
2733 "N": 1.0, | |
2734 "Q": 20.26, | |
2735 "P": 20.26, | |
2736 "S": 20.26, | |
2737 "R": -6.54, | |
2738 "T": 1.0, | |
2739 "W": -1.88, | |
2740 "V": 20.26, | |
2741 "Y": 1.0, | |
2742 }, | |
2743 "S": { | |
2744 "A": 1.0, | |
2745 "C": 33.6, | |
2746 "E": 20.26, | |
2747 "D": 1.0, | |
2748 "G": 1.0, | |
2749 "F": 1.0, | |
2750 "I": 1.0, | |
2751 "H": 1.0, | |
2752 "K": 1.0, | |
2753 "M": 1.0, | |
2754 "L": 1.0, | |
2755 "N": 1.0, | |
2756 "Q": 20.26, | |
2757 "P": 44.94, | |
2758 "S": 20.26, | |
2759 "R": 20.26, | |
2760 "T": 1.0, | |
2761 "W": 1.0, | |
2762 "V": 1.0, | |
2763 "Y": 1.0, | |
2764 }, | |
2765 "R": { | |
2766 "A": 1.0, | |
2767 "C": 1.0, | |
2768 "E": 1.0, | |
2769 "D": 1.0, | |
2770 "G": -7.49, | |
2771 "F": 1.0, | |
2772 "I": 1.0, | |
2773 "H": 20.26, | |
2774 "K": 1.0, | |
2775 "M": 1.0, | |
2776 "L": 1.0, | |
2777 "N": 13.34, | |
2778 "Q": 20.26, | |
2779 "P": 20.26, | |
2780 "S": 44.94, | |
2781 "R": 58.28, | |
2782 "T": 1.0, | |
2783 "W": 58.28, | |
2784 "V": 1.0, | |
2785 "Y": -6.54, | |
2786 }, | |
2787 "T": { | |
2788 "A": 1.0, | |
2789 "C": 1.0, | |
2790 "E": 20.26, | |
2791 "D": 1.0, | |
2792 "G": -7.49, | |
2793 "F": 13.34, | |
2794 "I": 1.0, | |
2795 "H": 1.0, | |
2796 "K": 1.0, | |
2797 "M": 1.0, | |
2798 "L": 1.0, | |
2799 "N": -14.03, | |
2800 "Q": -6.54, | |
2801 "P": 1.0, | |
2802 "S": 1.0, | |
2803 "R": 1.0, | |
2804 "T": 1.0, | |
2805 "W": -14.03, | |
2806 "V": 1.0, | |
2807 "Y": 1.0, | |
2808 }, | |
2809 "W": { | |
2810 "A": -14.03, | |
2811 "C": 1.0, | |
2812 "E": 1.0, | |
2813 "D": 1.0, | |
2814 "G": -9.37, | |
2815 "F": 1.0, | |
2816 "I": 1.0, | |
2817 "H": 24.68, | |
2818 "K": 1.0, | |
2819 "M": 24.68, | |
2820 "L": 13.34, | |
2821 "N": 13.34, | |
2822 "Q": 1.0, | |
2823 "P": 1.0, | |
2824 "S": 1.0, | |
2825 "R": 1.0, | |
2826 "T": -14.03, | |
2827 "W": 1.0, | |
2828 "V": -7.49, | |
2829 "Y": 1.0, | |
2830 }, | |
2831 "V": { | |
2832 "A": 1.0, | |
2833 "C": 1.0, | |
2834 "E": 1.0, | |
2835 "D": -14.03, | |
2836 "G": -7.49, | |
2837 "F": 1.0, | |
2838 "I": 1.0, | |
2839 "H": 1.0, | |
2840 "K": -1.88, | |
2841 "M": 1.0, | |
2842 "L": 1.0, | |
2843 "N": 1.0, | |
2844 "Q": 1.0, | |
2845 "P": 20.26, | |
2846 "S": 1.0, | |
2847 "R": 1.0, | |
2848 "T": -7.49, | |
2849 "W": 1.0, | |
2850 "V": 1.0, | |
2851 "Y": -6.54, | |
2852 }, | |
2853 "Y": { | |
2854 "A": 24.68, | |
2855 "C": 1.0, | |
2856 "E": -6.54, | |
2857 "D": 24.68, | |
2858 "G": -7.49, | |
2859 "F": 1.0, | |
2860 "I": 1.0, | |
2861 "H": 13.34, | |
2862 "K": 1.0, | |
2863 "M": 44.94, | |
2864 "L": 1.0, | |
2865 "N": 1.0, | |
2866 "Q": 1.0, | |
2867 "P": 13.34, | |
2868 "S": 1.0, | |
2869 "R": -15.91, | |
2870 "T": -7.49, | |
2871 "W": -9.37, | |
2872 "V": 1.0, | |
2873 "Y": 13.34, | |
2874 }, | |
2875 } | |
2876 return "instability", d | |
2877 | |
2878 else: | |
2879 return scalename, scales[scalename] | |
2880 | |
2881 | |
def read_fasta(inputfile):
    """Load sequences and their names from a FASTA formatted file.

    Every line starting with ``>`` opens a new record. The record name is the header text up to the first space,
    without the leading ``>``. All following lines up to the next header are stripped of surrounding white space
    and concatenated into the sequence of that record. Lines located before the first header are ignored.

    :param inputfile: .fasta file with sequences and headers to read
    :return: tuple ``(sequences, names)`` holding two lists of equal length. An empty file gives two empty lists,
        a header that is not followed by any sequence line gives an empty string as its sequence.
    """
    names = []  # record names, in file order
    sequences = []  # record sequences, in file order
    parts = None  # pieces of the record currently being read; None until the first header is seen
    with open(inputfile) as handle:
        for line in handle:
            if line.startswith(">"):
                # A new header closes the previous record. Records are delimited by headers only, never by
                # comparing line contents, so repeated or blank lines cannot end a record prematurely.
                if parts is not None:
                    sequences.append("".join(parts))
                names.append(line.split(" ")[0][1:].strip())  # FASTA name without description
                parts = []
            elif parts is not None:
                parts.append(line.strip())  # remove potential white space
    if parts is not None:
        sequences.append("".join(parts))  # close the last record of the file
    return sequences, names
2909 | |
2910 | |
def save_fasta(filename, sequences, names=None):
    """Write the given sequences to a file in FASTA format.

    :param filename: {str} output filename (ending .fasta)
    :param sequences: {list} sequences to be saved to file
    :param names: {list} sequence identifiers to use as headers, matched to ``sequences`` by position. If not
        given (or empty), the headers ``Seq_0``, ``Seq_1``, ... are written instead.
    :return: nothing is returned; a FASTA formatted file containing the sequences is written to ``filename``
    """
    # remove the output file first, if it exists
    if os.path.exists(filename):
        os.remove(filename)

    with open(filename, "w") as out:
        for index, sequence in enumerate(sequences):
            header = str(names[index]) if names else "Seq_" + str(index)
            out.write(">" + header + "\n")
            out.write(sequence + "\n")
2929 | |
2930 | |
def aa_weights():
    """Provide the molecular weights of all natural amino acids.

    :return: dictionary with the one-letter amino acid codes as keys and the corresponding weights as values

    .. versionadded:: v2.4.1
    """
    letters = "ACDEFGHIKLMNPQRSTVWY"
    # one weight per letter above, in the same order
    masses = (
        89.093, 121.158, 133.103, 147.129, 165.189,
        75.067, 155.155, 131.173, 146.188, 131.173,
        149.211, 132.118, 115.131, 146.145, 174.20,
        105.093, 119.119, 117.146, 204.225, 181.189,
    )
    return dict(zip(letters, masses))
2961 | |
2962 | |
def count_aas(seq, scale="relative"):
    """Count how often each of the 20 natural amino acids occurs in a given sequence.

    :param seq: {str} amino acid sequence
    :param scale: {'absolute' or 'relative'} ``'relative'`` divides every count by the sequence length
        (frequencies); any other value leaves the plain counts.
    :return: {OrderedDict} amino acid letters in alphabetical order as keys and their counts or frequencies
        (floats) as values.
    """
    if seq == "":
        seq = " "  # a one-character placeholder keeps the relative scaling from dividing by zero
    divisor = len(seq) if scale == "relative" else 1.0
    pairs = [
        (letter, float(seq.count(letter)) / divisor)
        for letter in "ACDEFGHIKLMNPQRSTVWY"
    ]
    return collections.OrderedDict(sorted(pairs))
3000 | |
3001 | |
def count_ngrams(seq, n):
    """Count the n-grams of an amino acid sequence. N can be one integer or a list of integers.

    Every n-gram is counted once per position at which it starts, so overlapping occurrences are all counted
    (``"AAA"`` contains the 2-gram ``"AA"`` twice).

    :param seq: {str} amino acid sequence
    :param n: {int or list of ints} length(s) of the n-grams to count
    :return: {OrderedDict} n-grams as keys and their counts in the sequence as values, sorted by decreasing
        count. N-grams with equal counts keep the order in which they were first encountered.
    """
    if seq == "":
        seq = " "  # kept for backward compatibility: an empty sequence is treated as a single space
    if isinstance(n, int):
        n = [n]
    tally = collections.Counter()
    # dict.fromkeys drops repeated lengths (keeping their order) so that no n-gram is counted twice
    for size in dict.fromkeys(n):
        # Tally the sliding windows themselves: str.count() would skip overlapping matches.
        tally.update(seq[start : start + size] for start in range(len(seq) - (size - 1)))
    # most_common() sorts by decreasing count and is stable for equal counts
    return collections.OrderedDict(tally.most_common())
3021 | |
3022 | |
def aa_energies():
    """Provide the free energies of transfer between cyclohexane and water for all natural amino acids.

    Reference: H. G. Boman, D. Wade, I. a Boman, B. Wåhlin, R. B. Merrifield, *FEBS Lett*. **1989**, *259*, 103–106.

    :return: dictionary with amino acid letters as keys and the corresponding energies as values.
    """
    table = (
        ("L", -4.92), ("I", -4.92), ("V", -4.04), ("F", -2.98),
        ("M", -2.35), ("W", -2.33), ("A", -1.81), ("C", -1.28),
        ("G", -0.94), ("Y", 0.14), ("T", 2.57), ("S", 3.40),
        ("H", 4.66), ("Q", 5.54), ("K", 5.55), ("N", 6.64),
        ("E", 6.81), ("D", 8.72), ("R", 14.92), ("P", 0.0),
    )
    return dict(table)
3052 | |
3053 | |
def ngrams_apd():
    """Provide the most frequent 2-, 3- and 4-grams from all sequences in the `APD3
    <http://aps.unmc.edu/AP/>`_, version August 2016 with 2727 sequences.
    For all 2-, 3- and 4-grams, all possible n-grams were generated from all sequences and the top 50 most
    frequent ones assembled into a list. Finally, leading and trailing spaces were stripped, and duplicates as well
    as n-grams containing spaces were removed.

    :return: numpy.array containing most frequent ngrams
    """
    frequent = [
        "AGK", "CKI", "RR", "YGGG", "LSGL", "RG", "YGGY", "PRP",
        "LGGG", "GV", "GT", "GS", "GR", "IAG", "GG", "GF",
        "GC", "GGYG", "GA", "GL", "GK", "GI", "IPC", "KAA",
        "LAK", "GLGG", "GGLG", "CKIT", "GAGK", "LLSG", "LKK", "FLP",
        "LSG", "SCK", "LLS", "GETC", "VLG", "GKLL", "LLG", "C",
        "KCKI", "G", "VGK", "CSC", "TKKC", "GCS", "GKA", "IGK",
        "GESC", "KVCY", "KKL", "KKI", "KKC", "LGGL", "GLL", "CGE",
        "GGYC", "GLLS", "GLF", "AKK", "GKAA", "ESCV", "GLP", "CGES",
        "PCGE", "FL", "CGET", "GLW", "KGAA", "KAAL", "GGY", "GGG",
        "IKG", "LKG", "GGL", "CK", "GTC", "CG", "SKKC", "CS",
        "CR", "KC", "AGKA", "KA", "KG", "LKCK", "SCKL", "KK",
        "KI", "KN", "KL", "SK", "KV", "SL", "SC", "SG",
        "AAA", "VAK", "AAL", "AAK", "GGGG", "KNVA", "GGGL", "GYG",
        "LG", "LA", "LL", "LK", "LS", "LP", "GCSC", "TC",
        "GAA", "AA", "VA", "VC", "AG", "VG", "AI", "AK",
        "VL", "AL", "TPGC", "IK", "IA", "IG", "YGG", "LGK",
        "CSCK", "GYGG", "LGG", "KGA",
    ]
    return np.array(frequent)
3199 | |
3200 | |
def aa_formulas():
    """
    Provide the molecular formulas of all amino acids. All amino acids are considered in the neutral form
    (uncharged).

    :return: dictionary with amino acid letters as keys; every value is a dictionary giving the number of
        ``C``, ``H``, ``N``, ``O`` and ``S`` atoms of that amino acid.
    """
    elements = ("C", "H", "N", "O", "S")
    # (amino acid, atom counts in the order of ``elements``)
    atom_counts = (
        ("A", (3, 7, 1, 2, 0)),
        ("C", (3, 7, 1, 2, 1)),
        ("D", (4, 7, 1, 4, 0)),
        ("E", (5, 9, 1, 4, 0)),
        ("F", (9, 11, 1, 2, 0)),
        ("G", (2, 5, 1, 2, 0)),
        ("H", (6, 9, 3, 2, 0)),
        ("I", (6, 13, 1, 2, 0)),
        ("K", (6, 14, 2, 2, 0)),
        ("L", (6, 13, 1, 2, 0)),
        ("M", (5, 11, 1, 2, 1)),
        ("N", (4, 8, 2, 3, 0)),
        ("P", (5, 9, 1, 2, 0)),
        ("Q", (5, 10, 2, 3, 0)),
        ("R", (6, 14, 4, 2, 0)),
        ("S", (3, 7, 1, 3, 0)),
        ("T", (4, 9, 1, 3, 0)),
        ("V", (5, 11, 1, 2, 0)),
        ("W", (11, 12, 2, 2, 0)),
        ("Y", (9, 11, 1, 3, 0)),
    )
    return {residue: dict(zip(elements, counts)) for residue, counts in atom_counts}