Mercurial > repos > cpt > cpt_helical_wheel
comparison plotWheels/core.py @ 1:9b276485c94a draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
| author | cpt |
|---|---|
| date | Mon, 05 Jun 2023 02:44:43 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:9caa9aa44fd8 | 1:9b276485c94a |
|---|---|
| 1 # -*- coding: utf-8 -*- | |
| 2 """ | |
| 3 .. currentmodule:: modlamp.core | |
| 4 | |
| 5 .. moduleauthor:: modlab Alex Mueller ETH Zurich <alex.mueller@pharma.ethz.ch> | |
| 6 | |
| 7 Core helper functions and classes for other modules. The two main classes are: | |
| 8 | |
| 9 ============================= ======================================================================================= | |
| 10 Class Characteristics | |
| 11 ============================= ======================================================================================= | |
| 12 :py:class:`BaseSequence` Base class inheriting to all sequence classes in the module :py:mod:`modlamp.sequences` | |
| 13 :py:class:`BaseDescriptor` Base class inheriting to the two descriptor classes in :py:mod:`modlamp.descriptors` | |
| 14 ============================= ======================================================================================= | |
| 15 """ | |
| 16 | |
| 17 import os | |
| 18 import random | |
| 19 import re | |
| 20 | |
| 21 import numpy as np | |
| 22 import pandas as pd | |
| 23 import collections | |
| 24 import operator | |
| 25 from scipy.spatial import distance | |
| 26 from sklearn.preprocessing import MinMaxScaler, StandardScaler | |
| 27 from sklearn.utils import shuffle | |
| 28 | |
| 29 __author__ = "Alex Müller, Gisela Gabernet" | |
| 30 __docformat__ = "restructuredtext en" | |
| 31 | |
| 32 | |
class BaseSequence(object):
    """Base class for sequence classes in the module :mod:`modlamp.sequences`.
    It contains amino acid probabilities for different sequence generation classes.

    The following amino acid probabilities are used: (extracted from the
    `APD3 <http://aps.unmc.edu/AP/statistic/statistic.php>`_, March 17, 2016)

    ===  ====  ======  =========  ==========
    AA   rand  AMP     AMPnoCM    randnoCM
    ===  ====  ======  =========  ==========
    A    0.05  0.0766  0.0812275  0.05555555
    C    0.05  0.071   0.0        0.0
    D    0.05  0.026   0.0306275  0.05555555
    E    0.05  0.0264  0.0310275  0.05555555
    F    0.05  0.0405  0.0451275  0.05555555
    G    0.05  0.1172  0.1218275  0.05555555
    H    0.05  0.021   0.0256275  0.05555555
    I    0.05  0.061   0.0656275  0.05555555
    K    0.05  0.0958  0.1004275  0.05555555
    L    0.05  0.0838  0.0884275  0.05555555
    M    0.05  0.0123  0.0        0.0
    N    0.05  0.0386  0.0432275  0.05555555
    P    0.05  0.0463  0.0509275  0.05555555
    Q    0.05  0.0251  0.0297275  0.05555555
    R    0.05  0.0545  0.0591275  0.05555555
    S    0.05  0.0613  0.0659275  0.05555555
    T    0.05  0.0455  0.0501275  0.05555555
    V    0.05  0.0572  0.0618275  0.05555555
    W    0.05  0.0155  0.0201275  0.05555555
    Y    0.05  0.0244  0.0290275  0.05555555
    ===  ====  ======  =========  ==========

    """

    # One-letter codes accepted by :py:meth:`keep_natural_aa`.
    _NATURAL_AA = frozenset("ACDEFGHIKLMNPQRSTVWY")

    def __init__(self, seqnum, lenmin=7, lenmax=28):
        """
        :param seqnum: number of sequences to generate
        :param lenmin: minimal length of the generated sequences
        :param lenmax: maximal length of the generated sequences
        :return: attributes :py:attr:`seqnum`, :py:attr:`lenmin` and :py:attr:`lenmax`.
        :Example:

        >>> b = BaseSequence(10, 7, 28)
        >>> b.seqnum
        10
        >>> b.lenmin
        7
        >>> b.lenmax
        28
        """
        self.sequences = list()
        self.names = list()
        self.lenmin = int(lenmin)
        self.lenmax = int(lenmax)
        self.seqnum = int(seqnum)

        # AA classes:
        self.AA_hyd = ["G", "A", "L", "I", "V"]
        self.AA_basic = ["K", "R"]
        self.AA_acidic = ["D", "E"]
        self.AA_aroma = ["W", "Y", "F"]
        self.AA_polar = ["S", "T", "Q", "N"]
        # AA labels; every probability vector below is ordered like this list:
        self.AAs = list("ACDEFGHIKLMNPQRSTVWY")
        # AA probability from the APD3 database:
        self.prob_AMP = [
            0.0766, 0.071, 0.026, 0.0264, 0.0405,
            0.1172, 0.021, 0.061, 0.0958, 0.0838,
            0.0123, 0.0386, 0.0463, 0.0251, 0.0545,
            0.0613, 0.0455, 0.0572, 0.0155, 0.0244,
        ]
        # AA probability from the APD2 database without Cys and Met (synthesis reasons)
        self.prob_AMPnoCM = [
            0.081228, 0.0, 0.030627, 0.031027, 0.045128,
            0.121828, 0.025627, 0.065628, 0.100428, 0.088428,
            0.0, 0.043228, 0.050928, 0.029728, 0.059128,
            0.065927, 0.050128, 0.061828, 0.020128, 0.029028,
        ]
        # equal AA probabilities:
        self.prob = [0.05] * 20
        # equal AA probabilities but 0 for Cys and Met:
        self.prob_randnoCM = [
            0.0 if aa in ("C", "M") else 0.05555555555 for aa in self.AAs
        ]

        # AA probability from the linear CancerPPD peptides:
        self.prob_ACP = [
            0.14526966, 0.0, 0.00690031, 0.00780824, 0.06991102,
            0.04957327, 0.01725077, 0.05647358, 0.27637552, 0.17759216,
            0.00998729, 0.00798983, 0.01307427, 0.00381333, 0.02941711,
            0.02651171, 0.0154349, 0.04013074, 0.0406755, 0.00581079,
        ]

        # AA probabilities for perfect amphipathic helix of different arc sizes
        self.prob_amphihel = [
            [
                0.04545455, 0.0, 0.04545454, 0.04545455, 0.0,
                0.04545455, 0.04545455, 0.0, 0.25, 0.0,
                0.0, 0.04545454, 0.04545455, 0.04545454, 0.25,
                0.04545454, 0.04545454, 0.0, 0.0, 0.04545454,
            ],
            [
                0.0, 0.0, 0.0, 0.0, 0.16666667,
                0.0, 0.0, 0.16666667, 0.0, 0.16666667,
                0.0, 0.0, 0.0, 0.0, 0.0,
                0.0, 0.0, 0.16666667, 0.16666667, (1.0 - 0.16666667 * 5),
            ],
        ]

        # helical ACP AA probabilities, depending on the position of the AA in the helix.
        # One row per amino acid (order of self.AAs), 18 values per row.
        self.prob_ACPhel = np.array(
            [
                [0.0483871, 0.0, 0.0, 0.0483871, 0.01612903, 0.12903226,
                 0.03225807, 0.09677419, 0.19354839, 0.5, 0.0483871, 0.11290323,
                 0.1, 0.18518519, 0.07843137, 0.12, 0.17073172, 0.16666667],
                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                 0.0, 0.0, 0.0, 0.0, 0.01612903, 0.0,
                 0.0, 0.0, 0.0, 0.0, 0.02439024, 0.19444444],
                [0.0, 0.01612903, 0.0, 0.27419355, 0.01612903, 0.0,
                 0.0, 0.01612903, 0.0, 0.0, 0.0, 0.0,
                 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                 0.0, 0.06451613, 0.0, 0.01612903, 0.0483871, 0.01612903,
                 0.0, 0.01851852, 0.0, 0.0, 0.0, 0.0],
                [0.16129032, 0.0483871, 0.30645161, 0.0, 0.0483871, 0.0,
                 0.0, 0.01612903, 0.0, 0.01612903, 0.0, 0.09677419,
                 0.06666667, 0.01851852, 0.0, 0.02, 0.14634146, 0.0],
                [0.64516129, 0.0, 0.17741936, 0.14516129, 0.0, 0.01612903,
                 0.25806452, 0.11290323, 0.06451613, 0.08064516, 0.22580645, 0.03225807,
                 0.06666667, 0.2037037, 0.1372549, 0.1, 0.0, 0.05555556],
                [0.0, 0.0, 0.0, 0.01612903, 0.0, 0.0,
                 0.01612903, 0.0, 0.03225807, 0.0, 0.0, 0.20967742,
                 0.0, 0.0, 0.0, 0.16, 0.0, 0.0],
                [0.0483871, 0.11290323, 0.01612903, 0.08064516, 0.33870968, 0.27419355,
                 0.0, 0.0483871, 0.14516129, 0.06451613, 0.03225807, 0.06451613,
                 0.18333333, 0.0, 0.0, 0.1, 0.26829268, 0.0],
                [0.0, 0.03225807, 0.01612903, 0.12903226, 0.12903226, 0.0,
                 0.38709677, 0.33870968, 0.0483871, 0.03225807, 0.41935484, 0.08064516,
                 0.0, 0.03703704, 0.29411765, 0.04, 0.02439024, 0.02777778],
                [0.0483871, 0.70967742, 0.12903226, 0.0483871, 0.09677419, 0.32258064,
                 0.20967742, 0.06451613, 0.11290323, 0.06451613, 0.03225807, 0.03225807,
                 0.28333333, 0.24074074, 0.03921569, 0.28, 0.07317073, 0.22222222],
                [0.0, 0.01612903, 0.01612903, 0.0483871, 0.01612903, 0.03225807,
                 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                 0.03333333, 0.0, 0.01960784, 0.02, 0.0, 0.0],
                [0.0, 0.01612903, 0.0, 0.0, 0.0, 0.0,
                 0.0, 0.0, 0.01612903, 0.0, 0.03225807, 0.0,
                 0.0, 0.0, 0.01960784, 0.02, 0.0, 0.0],
                [0.0, 0.0, 0.14516129, 0.01612903, 0.03225807, 0.01612903,
                 0.0, 0.0, 0.0, 0.0, 0.01612903, 0.0,
                 0.0, 0.12962963, 0.17647059, 0.0, 0.0, 0.0],
                [0.0, 0.0, 0.01612903, 0.01612903, 0.0, 0.0,
                 0.01612903, 0.0, 0.01612903, 0.0, 0.0, 0.01612903,
                 0.0, 0.01851852, 0.0, 0.0, 0.0, 0.0],
                [0.0, 0.01612903, 0.01612903, 0.0, 0.01612903, 0.0,
                 0.01612903, 0.0, 0.01612903, 0.01612903, 0.01612903, 0.01612903,
                 0.0, 0.01851852, 0.01960784, 0.0, 0.04878049, 0.0],
                [0.01612903, 0.0, 0.01612903, 0.12903226, 0.03225807, 0.03225807,
                 0.0483871, 0.17741936, 0.0, 0.03225807, 0.09677419, 0.0483871,
                 0.01666667, 0.0, 0.15686274, 0.1, 0.0, 0.05555556],
                [0.01612903, 0.01612903, 0.0, 0.01612903, 0.0483871, 0.01612903,
                 0.0, 0.01612903, 0.0, 0.01612903, 0.01612903, 0.11290323,
                 0.0, 0.01851852, 0.03921569, 0.02, 0.0, 0.05555556],
                [0.01612903, 0.01612903, 0.01612903, 0.01612903, 0.20967742, 0.16129032,
                 0.01612903, 0.0483871, 0.33870968, 0.16129032, 0.0, 0.14516129,
                 0.25, 0.11111111, 0.01960784, 0.02, 0.21951219, 0.22222222],
                [0.0, 0.0, 0.12903226, 0.01612903, 0.0, 0.0,
                 0.0, 0.0, 0.01612903, 0.0, 0.0, 0.0,
                 0.0, 0.0, 0.0, 0.0, 0.02439024, 0.0],
                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                 0.0, 0.0, 0.0, 0.0, 0.0, 0.01612903,
                 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            ]
        )

    def save_fasta(self, filename, names=False):
        """Method to save generated sequences in a ``.FASTA`` formatted file.

        :param filename: output filename in which the sequences from :py:attr:`sequences` are saved in fasta format.
        :param names: {bool} whether sequence names from :py:attr:`names` should be saved as sequence identifiers
        :return: a FASTA formatted file containing the generated sequences
        :Example:

        >>> b = BaseSequence(2)
        >>> b.sequences = ['KLLSLSLALDLLS', 'KLPERTVVNSSDF']
        >>> b.names = ['Sequence1', 'Sequence2']
        >>> b.save_fasta('/location/of/fasta/file.fasta', names=True)
        """
        # Delegates to the module-level ``save_fasta`` helper of this file.
        if names:
            save_fasta(filename, self.sequences, self.names)
        else:
            save_fasta(filename, self.sequences)

    def mutate_AA(self, nr, prob):
        """Method to mutate with **prob** probability a **nr** of positions per sequence randomly.

        Positions are drawn with replacement, so the same position may be hit more than once and a
        position may be "mutated" to the residue it already holds. Empty sequences are left untouched.

        :param nr: number of mutations to perform per sequence
        :param prob: probability of mutating a sequence
        :return: mutated sequences in the attribute :py:attr:`sequences`.
        :Example:

        >>> b = BaseSequence(1)
        >>> b.sequences = ['IAKAGRAIIK']
        >>> b.mutate_AA(3, 1.)
        >>> b.sequences  # random, output will differ
        ['NAKAGRAWIK']
        """
        for s, sequence in enumerate(self.sequences):
            # mutate: yes or no? prob = mutation probability
            mutate = np.random.choice([1, 0], 1, p=[prob, 1 - float(prob)])
            if mutate[0] != 1:
                continue
            seq = list(sequence)
            if not seq:  # nothing to mutate; random.choice would raise on an empty range
                continue
            cnt = 0
            while cnt < nr:  # mutate "nr" AA
                seq[random.choice(range(len(seq)))] = random.choice(self.AAs)
                cnt += 1
            self.sequences[s] = "".join(seq)

    def filter_duplicates(self):
        """Method to filter duplicates in the sequences from the class attribute :py:attr:`sequences`

        The first occurrence of every sequence is kept, together with its name. If :py:attr:`names`
        is empty, names of the form ``Seq_<index>`` are generated before filtering.

        :return: filtered sequences list in the attribute :py:attr:`sequences` and corresponding names.
        :Example:

        >>> b = BaseSequence(4)
        >>> b.sequences = ['KLLKLLKKLLKLLK', 'KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK', 'KLAKLAKKLAKLAK']
        >>> b.filter_duplicates()
        >>> b.sequences
        ['KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK']

        .. versionadded:: v2.2.5
        """
        if not self.names:
            self.names = ["Seq_" + str(i) for i in range(len(self.sequences))]

        # Order-preserving de-duplication without pandas: the previous implementation relied on
        # positional ``drop_duplicates`` arguments and ``Series.get_values()``, both of which are
        # rejected by current pandas releases.
        seen = set()
        seqs = []
        names = []
        for seq, name in zip(self.sequences, self.names):
            if seq in seen:
                continue
            seen.add(seq)
            seqs.append(seq)
            names.append(name)

        self.sequences = seqs
        self.names = names

    def keep_natural_aa(self):
        """Method to filter out sequences that do not contain natural amino acids. If the sequence contains a character
        that is not in ``['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']``.

        Sequences are compared in upper case and the kept sequences are stored in upper case.

        :return: filtered sequence list in the attribute :py:attr:`sequences`. The other attributes are also filtered
            accordingly (if present).
        :Example:

        >>> b = BaseSequence(2)
        >>> b.sequences = ['BBBsdflUasUJfBJ', 'GLFDIVKKVVGALGSL']
        >>> b.keep_natural_aa()
        >>> b.sequences
        ['GLFDIVKKVVGALGSL']
        """
        seqs = []
        names = []
        has_names = bool(getattr(self, "names", None))

        for i, s in enumerate(self.sequences):
            upper = s.upper()
            if set(upper) <= self._NATURAL_AA:
                seqs.append(upper)
                if has_names:
                    names.append(self.names[i])

        self.sequences = seqs
        self.names = names

    def filter_aa(self, amino_acids):
        """Method to filter out corresponding names and descriptor values of sequences with given amino acids in the
        argument list *amino_acids*.

        The entries of *amino_acids* are joined with ``|`` and used as a regular expression. An empty
        list is a no-op (previously the resulting empty pattern matched, and removed, every sequence).

        :param amino_acids: {list} amino acids to be filtered
        :return: filtered list of sequences names in the corresponding attributes.
        :Example:

        >>> b = BaseSequence(3)
        >>> b.sequences = ['AAALLLIIIKKK', 'CCEERRT', 'LLVVIIFFFQQ']
        >>> b.filter_aa(['C'])
        >>> b.sequences
        ['AAALLLIIIKKK', 'LLVVIIFFFQQ']
        """
        if len(amino_acids) == 0:
            return

        pattern = re.compile("|".join(amino_acids))
        seqs = []
        names = []
        has_names = bool(getattr(self, "names", None))

        for i, s in enumerate(self.sequences):
            if not pattern.search(s):
                seqs.append(s)
                if has_names:
                    names.append(self.names[i])

        self.sequences = seqs
        self.names = names

    def clean(self):
        """Method to clean / clear / empty the attributes :py:attr:`sequences` and :py:attr:`names`.

        Re-runs ``__init__`` with the current :py:attr:`seqnum`, :py:attr:`lenmin` and :py:attr:`lenmax`.

        :return: freshly initialized, empty class attributes.
        """
        self.__init__(self.seqnum, self.lenmin, self.lenmax)
| 841 | |
| 842 | |
| 843 class BaseDescriptor(object): | |
| 844 """ | |
| 845 Base class inheriting to both peptide descriptor classes :py:class:`modlamp.descriptors.GlobalDescriptor` and | |
| 846 :py:class:`modlamp.descriptors.PeptideDescriptor`. | |
| 847 """ | |
| 848 | |
def __init__(self, seqs):
    """
    :param seqs: a ``.FASTA`` file with sequences, a list / array of sequences or a single sequence as string to
        calculate the descriptor values for. A ``.csv`` file with one sequence per line is accepted as well.
    :return: initialized attributes :py:attr:`sequences` and :py:attr:`names`. If *seqs* cannot be interpreted, a
        message is printed and both attributes are left as empty lists.
    :Example:

    >>> AMP = BaseDescriptor('KLLKLLKKLLKLLK')
    >>> AMP.sequences
    ['KLLKLLKKLLKLLK']
    >>> seqs = BaseDescriptor('/Path/to/file.fasta')  # load sequences from .fasta file
    >>> seqs.sequences
    ['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR'...]
    """
    # Define both attributes up front so that an unreadable input leaves the instance usable
    # instead of failing later with AttributeError.
    self.sequences = []
    self.names = []

    if isinstance(seqs, list) and seqs and seqs[0].isupper():
        self.sequences = [s.strip() for s in seqs]
    elif isinstance(seqs, np.ndarray) and seqs.size and seqs[0].isupper():
        self.sequences = [s.strip() for s in seqs.tolist()]
    elif isinstance(seqs, str) and seqs.isupper():
        # an all-uppercase string is treated as a single sequence, not as a path
        self.sequences = [seqs.strip()]
    elif isinstance(seqs, str) and os.path.isfile(seqs):
        if seqs.endswith(".fasta"):  # read .fasta file
            self.sequences, self.names = read_fasta(seqs)
        elif seqs.endswith(".csv"):  # read .csv file with sequences every line
            with open(seqs) as f:
                for line in f:
                    if line.isupper():  # skip lines that are not upper case sequences
                        self.names.append("seq_" + str(len(self.sequences)))
                        self.sequences.append(line.strip())
        else:
            print("Sorry, currently only .fasta or .csv files can be read!")
    else:
        # wrap in a 1-tuple so that tuple inputs do not break the %-formatting
        print(
            "%s does not exist, is not a valid list of AA sequences or is not a valid sequence string"
            % (seqs,)
        )

    self.descriptor = np.array([[]])
    self.target = np.array([], dtype="int")
    self.scaler = None
    self.featurenames = []
| 897 | |
def read_fasta(self, filename):
    """Load sequences and their headers from a ``.FASTA`` formatted file.

    The work is delegated to the module-level ``read_fasta`` function; its two return values are stored
    in :py:attr:`sequences` and :py:attr:`names`.

    :param filename: {str} ``.FASTA`` file with sequences and headers to read
    :return: nothing; :py:attr:`sequences` and :py:attr:`names` are replaced.
    """
    loaded_sequences, loaded_names = read_fasta(filename)
    self.sequences = loaded_sequences
    self.names = loaded_names
| 907 | |
def save_fasta(self, filename, names=False):
    """Write the sequences in :py:attr:`sequences` to a ``.FASTA`` formatted file.

    The work is delegated to the module-level ``save_fasta`` function.

    :param filename: {str} filename of the output ``.FASTA`` file
    :param names: {bool} if true, :py:attr:`names` is passed along to be used as sequence identifiers
    :return: a FASTA formatted file containing the sequences
    """
    if not names:
        save_fasta(filename, self.sequences)
        return
    save_fasta(filename, self.sequences, self.names)
| 919 | |
def count_aa(self, scale="relative", average=False, append=False):
    """Method for producing the amino acid distribution for the given sequences as a descriptor

    :param scale: {'absolute' or 'relative'} defines whether counts or frequencies are given for each AA
    :param average: {boolean} whether the averaged amino acid counts for all sequences should be returned
    :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
        attribute :py:attr:`descriptor`. Takes precedence over *average*. If the descriptor is still empty, the
        new values simply become the descriptor.
    :return: the amino acid distributions for every sequence individually in the attribute :py:attr:`descriptor`;
        the keys returned by ``count_aas`` are stored in :py:attr:`featurenames`.
    :Example:

    >>> AMP = PeptideDescriptor('ACDEFGHIKLMNPQRSTVWY')  # count_aa() does not depend on the descriptor scale
    >>> AMP.count_aa()
    >>> AMP.descriptor
    array([[ 0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05, ... ]])
    >>> AMP.descriptor.shape
    (1, 20)

    .. seealso:: :py:func:`modlamp.core.count_aa()`
    """
    desc = list()
    od = None  # stays None when there are no sequences
    for seq in self.sequences:
        od = count_aas(seq, scale)
        desc.append(list(od.values()))

    desc = np.array(desc)
    # Without sequences there is no count dictionary to take the names from.
    self.featurenames = list(od.keys()) if od is not None else []

    if append:
        if np.size(self.descriptor):
            self.descriptor = np.hstack((self.descriptor, desc))
        else:
            # The initial descriptor has shape (1, 0); stacking it with more than one row would fail.
            self.descriptor = desc
    elif average:
        self.descriptor = np.mean(desc, axis=0)
    else:
        self.descriptor = desc
| 953 | |
def count_ngrams(self, n):
    """Count n-grams over all sequences in :py:attr:`sequences`.

    Every sequence is passed to the module-level ``count_ngrams`` function and the returned per-sequence
    counts are summed per n-gram.

    :param n: {int or list of ints} forwarded unchanged to ``count_ngrams``
        (presumably the n-gram length(s) to count -- confirm against that function)
    :return: {dict} dictionary with n-grams as keys and their summed counts as values in :py:attr:`descriptor`
    :Example:

    >>> D = PeptideDescriptor('GLLDFLSLAALSLDKLVKKGALS')
    >>> D.count_ngrams([2, 3])
    >>> D.descriptor
    {'LS': 3, 'LD': 2, 'LSL': 2, 'AL': 2, ..., 'LVK': 1}

    .. seealso:: :py:func:`modlamp.core.count_ngrams()`
    """
    totals = dict()
    for sequence in self.sequences:
        for gram, count in count_ngrams(sequence, n).items():
            totals[gram] = totals[gram] + count if gram in totals else count
    self.descriptor = totals
| 977 | |
def feature_scaling(self, stype="standard", fit=True):
    """Method for feature scaling of the calculated descriptor matrix.

    :param stype: {'standard' or 'minmax'} type of scaling to be used
    :param fit: {boolean} defines whether a new scaler is first fitted on the data (True) or
        whether the already fitted scaler in :py:attr:`scaler` should be used to transform (False).
        With ``fit=False`` the stored scaler is reused as is and *stype* only matters if no scaler
        has been stored yet.
    :return: scaled descriptor values in :py:attr:`descriptor`
    :Example:

    >>> D.descriptor
    array([[0.155],[0.34],[0.16235294],[-0.08842105],[0.116]])
    >>> D.feature_scaling(stype='minmax', fit=True)
    >>> D.descriptor
    array([[0.56818182],[1.],[0.5853447],[0.],[0.47714988]])
    """
    if stype not in ("standard", "minmax"):
        print("Unknown scaler type!\nAvailable: 'standard', 'minmax'")
        return

    # Only build a new scaler when it is going to be fitted (or none exists yet). Previously a
    # fresh, unfitted scaler always replaced the stored one, so ``fit=False`` could never reuse
    # a fitted scaler.
    if fit or getattr(self, "scaler", None) is None:
        self.scaler = StandardScaler() if stype == "standard" else MinMaxScaler()

    if fit:
        self.descriptor = self.scaler.fit_transform(self.descriptor)
    else:
        self.descriptor = self.scaler.transform(self.descriptor)
| 1004 | |
def feature_shuffle(self):
    """Randomly permute the feature columns of :py:attr:`descriptor`.

    The matrix is transposed, passed to ``shuffle`` and transposed back, so whole columns are moved.

    :return: nothing; the descriptor matrix with shuffled feature columns is stored in :py:attr:`descriptor`
    :Example:

    >>> D.descriptor
    array([[0.80685625,167.05234375,39.56818125,-0.26338667,155.16888667,33.48778]])
    >>> D.feature_shuffle()
    >>> D.descriptor
    array([[155.16888667,-0.26338667,167.05234375,0.80685625,39.56818125,33.48778]])
    """
    features_as_rows = self.descriptor.transpose()
    permuted = shuffle(features_as_rows)
    self.descriptor = permuted.transpose()
| 1017 | |
def sequence_order_shuffle(self):
    """Randomly reorder the entries of :py:attr:`sequences`.

    Only :py:attr:`sequences` is touched; no other attribute is reordered by this method.

    :return: sequences in :py:attr:`sequences` with shuffled order in the list.
    :Example:

    >>> D.sequences
    ['LILRALKGAARALKVA','VKIAKIALKIIKGLG','VGVRLIKGIGRVARGAI','LRGLRGVIRGGKAIVRVGK','GGKLVRLIARIGKGV']
    >>> D.sequence_order_shuffle()
    >>> D.sequences
    ['VGVRLIKGIGRVARGAI','LILRALKGAARALKVA','LRGLRGVIRGGKAIVRVGK','GGKLVRLIARIGKGV','VKIAKIALKIIKGLG']
    """
    reordered = shuffle(self.sequences)
    self.sequences = reordered
| 1031 | |
def random_selection(self, num):
    """Keep a random subset of *num* entries of the instance.

    Indices are drawn without replacement with ``np.random.choice``. The same indices are applied to
    :py:attr:`sequences` and, when present and non-empty, to :py:attr:`descriptor`, :py:attr:`names`
    and :py:attr:`target`.

    :param num: {int} number of entries to be randomly selected
    :return: updated instance
    :Example:

    >>> h = Helices(7, 28, 100)
    >>> h.generate_helices()
    >>> desc = PeptideDescriptor(h.sequences, 'eisenberg')
    >>> desc.calculate_moment()
    >>> len(desc.sequences)
    100
    >>> len(desc.descriptor)
    100
    >>> desc.random_selection(10)
    >>> len(desc.sequences)
    10
    >>> len(desc.descriptor)
    10

    .. versionadded:: v2.2.3
    """
    chosen = np.random.choice(len(self.sequences), size=num, replace=False)

    all_sequences = np.array(self.sequences)
    self.sequences = all_sequences[chosen].tolist()

    if hasattr(self, "descriptor"):
        if self.descriptor.size:
            self.descriptor = self.descriptor[chosen]

    if hasattr(self, "names"):
        if self.names:
            all_names = np.array(self.names)
            self.names = all_names[chosen].tolist()

    if hasattr(self, "target"):
        if self.target.size:
            self.target = self.target[chosen]
| 1065 | |
def minmax_selection(self, iterations, distmetric="euclidean", seed=0):
    """Method to select a specified number of sequences according to the minmax algorithm.

    A first entry is picked at random. In every following iteration the distances between the
    remaining entries (pool) and the already selected ones are computed; for every selected entry
    the most distant pool entry is determined, and of those candidates the one with the smallest
    distance is added to the selection.

    :param iterations: {int} Number of sequences to retrieve.
    :param distmetric: Distance metric to calculate the distances between the sequences in descriptor space.
        Passed to ``scipy.spatial.distance.cdist``, e.g. 'euclidean' or 'minkowski'.
    :param seed: {int} Set a random seed for numpy to pick the first sequence.
    :return: updated instance; :py:attr:`sequences`, :py:attr:`descriptor` and, if non-empty,
        :py:attr:`names` and :py:attr:`target` are reduced to the selected entries.

    .. seealso:: **SciPy** http://docs.scipy.org/doc/scipy/reference/spatial.distance.html
    """
    pool = self.descriptor  # rows still available for selection
    # Original row index of every pool row. Tracking the indices directly avoids looking rows up
    # by value, which breaks on duplicate descriptor rows.
    remaining = np.arange(len(pool))
    minmaxidx = list()  # original indices of the selected rows

    # Randomly selecting the first peptide; the upper bound of randint is exclusive, so the
    # index is always valid.
    np.random.seed(seed)
    idx = int(np.random.randint(0, len(pool)))
    sele = pool[idx : idx + 1, :]
    minmaxidx.append(int(remaining[idx]))

    # Deleting peptide in selection from pool
    pool = np.delete(pool, idx, axis=0)
    remaining = np.delete(remaining, idx)

    for _ in range(iterations - 1):
        # Calculating distance from sele to the rest of the peptides
        dist = distance.cdist(pool, sele, distmetric)

        # Choosing maximal distances for every sele instance
        maxidx = np.argmax(dist, axis=0)
        maxcols = np.max(dist, axis=0)

        # Choosing minimal distance among the maximal distances
        minmax = np.argmin(maxcols)
        maxidx = int(maxidx[minmax])

        # Adding it to selection (recording its original index first) and removing from pool
        sele = np.append(sele, pool[maxidx : maxidx + 1, :], axis=0)
        minmaxidx.append(int(remaining[maxidx]))
        pool = np.delete(pool, maxidx, axis=0)
        remaining = np.delete(remaining, maxidx)

    self.sequences = np.array(self.sequences)[minmaxidx].tolist()
    if getattr(self, "names", None):
        self.names = np.array(self.names)[minmaxidx].tolist()
    if hasattr(self, "target") and self.target.size:
        self.target = self.target[minmaxidx]
    self.descriptor = self.descriptor[minmaxidx]
| 1123 | |
def filter_sequences(self, sequences):
    """Remove the given sequences and all data belonging to them from the instance.

    For every queried sequence the entry at its first position in :py:attr:`sequences` is dropped, together with
    the matching rows in :py:attr:`descriptor`, :py:attr:`names` and :py:attr:`target` (where present and
    non-empty).

    :param sequences: {list} sequences to be filtered out of the whole instance, including corresponding data.
        A single sequence may also be passed as a plain string.
    :return: updated instance without filtered sequences
    :Example:

    >>> sequences = ['KLLKLLKKLLKLLK', 'ACDEFGHIK', 'GLFDIVKKVV', 'GLFDIVKKVVGALG', 'GLFDIVKKVVGALGSL']
    >>> desc = PeptideDescriptor(sequences, 'pepcats')
    >>> desc.calculate_crosscorr(7)
    >>> len(desc.descriptor)
    5
    >>> desc.filter_sequences('KLLKLLKKLLKLLK')
    >>> len(desc.descriptor)
    4
    >>> desc.sequences
    ['ACDEFGHIK', 'GLFDIVKKVV', 'GLFDIVKKVVGALG', 'GLFDIVKKVVGALGSL']
    """
    # a bare string is one query, not an iterable of single letters
    queries = [sequences] if isinstance(sequences, str) else sequences

    # position of every query in the current sequence list
    drop = [self.sequences.index(query) for query in queries]

    remaining = np.delete(np.array(self.sequences), drop, 0)
    self.sequences = remaining.tolist()

    if hasattr(self, "descriptor") and self.descriptor.size:
        self.descriptor = np.delete(self.descriptor, drop, 0)
    if hasattr(self, "names") and self.names:
        remaining_names = np.delete(np.array(self.names), drop, 0)
        self.names = remaining_names.tolist()
    if hasattr(self, "target") and self.target.size:
        self.target = np.delete(self.target, drop, 0)
| 1159 | |
def filter_values(self, values, operator="=="):
    """Keep only the entries whose descriptor values satisfy a comparison against *values*.

    The comparison is applied feature by feature (column by column of :py:attr:`descriptor`); after every feature
    the instance is reduced to the rows that passed, so an entry survives only if all of its features pass.

    :param values: {list} reference values, one per feature (column) of :py:attr:`descriptor`
    :param operator: {str} filter criterion, available operators are ``==``, ``<``, ``>``, ``<=`` and ``>=``.
    :return: descriptor matrix and updated sequences containing only entries with descriptor values given in
        *values* in the corresponding attributes.
    :raises KeyError: if *operator* is not one of the available operators.
    :Example:

    >>> desc.descriptor  # desc = BaseDescriptor instance
    array([[ 0.7666517 ],
        [ 0.38373498]])
    >>> desc.filter_values([0.5], '<')
    >>> desc.descriptor
    array([[ 0.38373498]])
    """
    # NOTE: the parameter name shadows the ``operator`` module inside this method,
    # so the comparisons are spelled out as small lambdas.
    comparisons = {
        "==": lambda column, ref: column == ref,
        "<": lambda column, ref: column < ref,
        ">": lambda column, ref: column > ref,
        "<=": lambda column, ref: column <= ref,
        ">=": lambda column, ref: column >= ref,
    }

    n_features = self.descriptor.shape[1]
    for feature in range(n_features):
        if operator not in comparisons:
            raise KeyError(
                "available operators: ``==``, ``<``, ``>``, ``<=``and ``>=``"
            )
        compare = comparisons[operator]
        passed = np.where(compare(self.descriptor[:, feature], values[feature]))[0]

        # reduce sequences, descriptor, names and targets to the rows that passed
        self.sequences = np.array(self.sequences)[passed].tolist()
        if hasattr(self, "descriptor") and self.descriptor.size:
            self.descriptor = self.descriptor[passed]
        if hasattr(self, "names") and self.names:
            self.names = np.array(self.names)[passed].tolist()
        if hasattr(self, "target") and self.target.size:
            self.target = self.target[passed]
| 1203 | |
def filter_aa(self, amino_acids):
    """Drop every sequence that contains at least one of the given amino acids.

    The matching entries of :py:attr:`names`, :py:attr:`descriptor` and :py:attr:`target` are dropped as well.
    All four attributes are (re)assigned; attributes that were missing or empty end up as empty containers.

    :param amino_acids: list of amino acids to be filtered
    :return: filtered list of sequences, descriptor values, target values and names in the corresponding attributes.
    :Example:

    >>> b = BaseSequence(3)
    >>> b.sequences = ['AAALLLIIIKKK', 'CCEERRT', 'LLVVIIFFFQQ']
    >>> b.filter_aa(['C'])
    >>> b.sequences
    ['AAALLLIIIKKK', 'LLVVIIFFFQQ']
    """
    # one alternation pattern that hits if any of the amino acids occurs
    unwanted = re.compile("|".join(amino_acids))
    kept = [
        pos for pos, seq in enumerate(self.sequences) if not unwanted.search(seq)
    ]

    # The optional attributes are only inspected when something survives the filter.
    has_descriptor = (
        bool(kept) and hasattr(self, "descriptor") and bool(self.descriptor.size)
    )
    has_names = bool(kept) and hasattr(self, "names") and bool(self.names)
    has_target = bool(kept) and hasattr(self, "target") and bool(self.target.size)

    kept_names = [self.names[pos] for pos in kept] if has_names else []
    kept_rows = [self.descriptor[pos] for pos in kept] if has_descriptor else []
    kept_targets = [self.target[pos] for pos in kept] if has_target else []

    self.sequences = [self.sequences[pos] for pos in kept]
    self.names = kept_names
    self.descriptor = np.array(kept_rows)
    self.target = np.array(kept_targets, dtype="int")
| 1239 | |
def filter_duplicates(self):
    """Method to filter duplicates in the sequences from the class attribute :py:attr:`sequences`

    Only the first occurrence of every sequence is kept. The attributes :py:attr:`names`, :py:attr:`descriptor`
    and :py:attr:`target` are reduced to the same entries; their dtypes are preserved. Attributes that are
    missing or empty are filled with placeholders first (``Seq_<i>`` names, zero targets, zero descriptor).

    :return: filtered sequences list in the attribute :py:attr:`sequences` and corresponding names.
    :Example:

    >>> b = BaseDescriptor(['KLLKLLKKLLKLLK', 'KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK', 'KLAKLAKKLAKLAK'])
    >>> b.filter_duplicates()
    >>> b.sequences
    ['KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK']

    .. versionadded:: v2.2.5
    """

    def _missing(value):
        # ``not value`` is ambiguous for NumPy arrays, so test the size explicitly
        return value is None or np.size(value) == 0

    n_seqs = len(self.sequences)

    names = getattr(self, "names", None)
    if _missing(names):
        names = ["Seq_" + str(i) for i in range(n_seqs)]
    target = getattr(self, "target", None)
    if _missing(target):
        target = np.zeros(n_seqs, dtype=int)
    descriptor = getattr(self, "descriptor", None)
    if _missing(descriptor):
        descriptor = np.zeros(n_seqs)

    # indices of the first occurrence of every sequence, in original order
    seen = set()
    first_idx = []
    for i, seq in enumerate(self.sequences):
        if seq not in seen:
            seen.add(seq)
            first_idx.append(i)
    keep = np.array(first_idx, dtype=int)

    # Index every attribute directly instead of stacking them into one table:
    # this works for 2-D descriptors and keeps the numeric dtypes intact.
    self.sequences = np.asarray(self.sequences)[keep].tolist()
    self.names = np.asarray(names)[keep].tolist()
    self.descriptor = np.asarray(descriptor)[keep]
    self.target = np.asarray(target)[keep]
| 1270 | |
def keep_natural_aa(self):
    """Keep only sequences that consist exclusively of the 20 natural amino acids.

    A sequence is dropped if, after upper-casing, it contains a character that is not in
    ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']. Kept sequences are stored
    upper-cased.

    :return: filtered sequence list in the attribute :py:attr:`sequences`. The other attributes are also filtered
        accordingly (if present).
    :Example:

    >>> b = BaseSequence(2)
    >>> b.sequences = ['BBBsdflUasUJfBJ', 'GLFDIVKKVVGALGSL']
    >>> b.keep_natural_aa()
    >>> b.sequences
    ['GLFDIVKKVVGALGSL']
    """
    natural = frozenset("ACDEFGHIKLMNPQRSTVWY")

    # (position, upper-cased sequence) of every sequence made of natural residues only
    kept = []
    for pos, raw in enumerate(self.sequences):
        upper = raw.upper()
        if natural.issuperset(upper):
            kept.append((pos, upper))

    # The optional attributes are only inspected when something survives the filter.
    has_descriptor = (
        bool(kept) and hasattr(self, "descriptor") and bool(self.descriptor.size)
    )
    has_names = bool(kept) and hasattr(self, "names") and bool(self.names)
    has_target = bool(kept) and hasattr(self, "target") and bool(self.target.size)

    kept_names = [self.names[pos] for pos, _ in kept] if has_names else []
    kept_rows = [self.descriptor[pos] for pos, _ in kept] if has_descriptor else []
    kept_targets = [self.target[pos] for pos, _ in kept] if has_target else []

    self.sequences = [upper for _, upper in kept]
    self.names = kept_names
    self.descriptor = np.array(kept_rows)
    self.target = np.array(kept_targets, dtype="int")
| 1329 | |
def load_descriptordata(
    self, filename, delimiter=",", targets=False, skip_header=0
):
    """Method to load any data file with sequences and descriptor values into the attributes of this instance.

    The first column of the file is read as the sequences, all following columns as numeric descriptor values.

    .. note:: Headers are not considered. To skip initial lines in the file, use the *skip_header* option.

    :param filename: {str} filename of the data file to be loaded
    :param delimiter: {str} column delimiter
    :param targets: {boolean} whether last column in the file contains a target class vector. If ``True``, the
        column is stored as integers in :py:attr:`target`; it also remains the last column of
        :py:attr:`descriptor`.
    :param skip_header: {int} number of initial lines to skip in the file
    :return: loaded sequences, descriptor values and targets in the corresponding attributes.
    """
    data = np.genfromtxt(filename, delimiter=delimiter, skip_header=skip_header)
    data = data[:, 1:]  # skip sequences as they are "nan" when read as float

    # Read the file a second time as text to get the sequence column. The same
    # header lines must be skipped, otherwise sequences and descriptor rows
    # would no longer line up.
    seqs = np.genfromtxt(
        filename, delimiter=delimiter, dtype="str", skip_header=skip_header
    )
    seqs = seqs[:, 0]

    if targets:
        self.target = np.array(data[:, -1], dtype="int")
    self.sequences = seqs
    self.descriptor = data
| 1352 | |
def save_descriptor(self, filename, delimiter=",", targets=None, header=None):
    """Method to save the descriptor values to a .csv/.txt file

    Every row holds the name (only if :py:attr:`names` has one entry per sequence), the sequence, the descriptor
    values and, if given, the target value.

    :param filename: filename of the output file
    :param delimiter: column delimiter
    :param targets: target class vector to be added to descriptor (same length as :py:attr:`sequences`); it is
        ignored if empty or of a different length.
    :param header: {str} header to be written at the beginning of the file (if ``None``: feature names are taken)
    :return: output file with peptide names and descriptor values
    """
    # Use text (unicode) arrays: byte-string arrays would be written as ``b'...'``
    # literals by ``savetxt`` on Python 3 and a fixed width would truncate long entries.
    seqs = np.array(self.sequences, dtype=str)[:, np.newaxis]
    ids = np.array(self.names, dtype=str)[:, np.newaxis]
    if ids.shape == seqs.shape:
        names = np.hstack((ids, seqs))
    else:
        names = seqs

    # explicit length checks instead of truthiness, so NumPy arrays are accepted as targets
    use_targets = (
        targets is not None
        and len(targets) > 0
        and len(targets) == len(self.sequences)
    )
    if use_targets:
        target = np.array(targets)[:, np.newaxis]
        data = np.hstack((names, self.descriptor, target))
    else:
        data = np.hstack((names, self.descriptor))

    if not header:
        # NOTE(review): takes the first element of every feature name entry, so
        # ``self.featurenames`` is presumably a list of lists -- confirm against callers.
        featurenames = [["Sequence"]] + self.featurenames
        header = ", ".join([f[0] for f in featurenames])
    np.savetxt(filename, data, delimiter=delimiter, fmt="%s", header=header)
| 1377 | |
| 1378 | |
| 1379 def load_scale(scalename): | |
| 1380 """Method to load scale values for a given amino acid scale | |
| 1381 | |
| 1382 :param scalename: amino acid scale name, for available scales see the | |
| 1383 :class:`modlamp.descriptors.PeptideDescriptor()` documentation. | |
| 1384 :return: amino acid scale values in dictionary format. | |
| 1385 """ | |
| 1386 # predefined amino acid scales dictionary | |
| 1387 scales = { | |
| 1388 "aasi": { | |
| 1389 "A": [1.89], | |
| 1390 "C": [1.73], | |
| 1391 "D": [3.13], | |
| 1392 "E": [3.14], | |
| 1393 "F": [1.53], | |
| 1394 "G": [2.67], | |
| 1395 "H": [3], | |
| 1396 "I": [1.97], | |
| 1397 "K": [2.28], | |
| 1398 "L": [1.74], | |
| 1399 "M": [2.5], | |
| 1400 "N": [2.33], | |
| 1401 "P": [0.22], | |
| 1402 "Q": [3.05], | |
| 1403 "R": [1.91], | |
| 1404 "S": [2.14], | |
| 1405 "T": [2.18], | |
| 1406 "V": [2.37], | |
| 1407 "W": [2], | |
| 1408 "Y": [2.01], | |
| 1409 }, | |
| 1410 "abhprk": { | |
| 1411 "A": [0, 0, 0, 0, 0, 0], | |
| 1412 "C": [0, 0, 0, 0, 0, 0], | |
| 1413 "D": [1, 0, 0, 1, 0, 0], | |
| 1414 "E": [1, 0, 0, 1, 0, 0], | |
| 1415 "F": [0, 0, 1, 0, 1, 0], | |
| 1416 "G": [0, 0, 0, 0, 0, 0], | |
| 1417 "H": [0, 0, 0, 1, 1, 0], | |
| 1418 "I": [0, 0, 1, 0, 0, 0], | |
| 1419 "K": [0, 1, 0, 1, 0, 0], | |
| 1420 "L": [0, 0, 1, 0, 0, 0], | |
| 1421 "M": [0, 0, 1, 0, 0, 0], | |
| 1422 "N": [0, 0, 0, 1, 0, 0], | |
| 1423 "P": [0, 0, 0, 0, 0, 1], | |
| 1424 "Q": [0, 0, 0, 1, 0, 0], | |
| 1425 "R": [0, 1, 0, 1, 0, 0], | |
| 1426 "S": [0, 0, 0, 1, 0, 0], | |
| 1427 "T": [0, 0, 0, 1, 0, 0], | |
| 1428 "V": [0, 0, 1, 0, 0, 0], | |
| 1429 "W": [0, 0, 1, 0, 1, 0], | |
| 1430 "Y": [0, 0, 0, 1, 1, 0], | |
| 1431 }, | |
| 1432 "argos": { | |
| 1433 "I": [0.77], | |
| 1434 "F": [1.2], | |
| 1435 "V": [0.14], | |
| 1436 "L": [2.3], | |
| 1437 "W": [0.07], | |
| 1438 "M": [2.3], | |
| 1439 "A": [0.64], | |
| 1440 "G": [-0.48], | |
| 1441 "C": [0.25], | |
| 1442 "Y": [-0.41], | |
| 1443 "P": [-0.31], | |
| 1444 "T": [-0.13], | |
| 1445 "S": [-0.25], | |
| 1446 "H": [-0.87], | |
| 1447 "E": [-0.94], | |
| 1448 "N": [-0.89], | |
| 1449 "Q": [-0.61], | |
| 1450 "D": [-1], | |
| 1451 "K": [-1], | |
| 1452 "R": [-0.68], | |
| 1453 }, | |
| 1454 "bulkiness": { | |
| 1455 "A": [0.443], | |
| 1456 "C": [0.551], | |
| 1457 "D": [0.453], | |
| 1458 "E": [0.557], | |
| 1459 "F": [0.898], | |
| 1460 "G": [0], | |
| 1461 "H": [0.563], | |
| 1462 "I": [0.985], | |
| 1463 "K": [0.674], | |
| 1464 "L": [0.985], | |
| 1465 "M": [0.703], | |
| 1466 "N": [0.516], | |
| 1467 "P": [0.768], | |
| 1468 "Q": [0.605], | |
| 1469 "R": [0.596], | |
| 1470 "S": [0.332], | |
| 1471 "T": [0.677], | |
| 1472 "V": [0.995], | |
| 1473 "W": [1], | |
| 1474 "Y": [0.801], | |
| 1475 }, | |
| 1476 "charge_phys": { | |
| 1477 "A": [0.0], | |
| 1478 "C": [-0.1], | |
| 1479 "D": [-1.0], | |
| 1480 "E": [-1.0], | |
| 1481 "F": [0.0], | |
| 1482 "G": [0.0], | |
| 1483 "H": [0.1], | |
| 1484 "I": [0.0], | |
| 1485 "K": [1.0], | |
| 1486 "L": [0.0], | |
| 1487 "M": [0.0], | |
| 1488 "N": [0.0], | |
| 1489 "P": [0.0], | |
| 1490 "Q": [0.0], | |
| 1491 "R": [1.0], | |
| 1492 "S": [0.0], | |
| 1493 "T": [0.0], | |
| 1494 "V": [0.0], | |
| 1495 "W": [0.0], | |
| 1496 "Y": [0.0], | |
| 1497 }, | |
| 1498 "charge_acid": { | |
| 1499 "A": [0.0], | |
| 1500 "C": [-0.1], | |
| 1501 "D": [-1.0], | |
| 1502 "E": [-1.0], | |
| 1503 "F": [0.0], | |
| 1504 "G": [0.0], | |
| 1505 "H": [1.0], | |
| 1506 "I": [0.0], | |
| 1507 "K": [1.0], | |
| 1508 "L": [0.0], | |
| 1509 "M": [0.0], | |
| 1510 "N": [0.0], | |
| 1511 "P": [0.0], | |
| 1512 "Q": [0.0], | |
| 1513 "R": [1.0], | |
| 1514 "S": [0.0], | |
| 1515 "T": [0.0], | |
| 1516 "V": [0.0], | |
| 1517 "W": [0.0], | |
| 1518 "Y": [0.0], | |
| 1519 }, | |
| 1520 "cougar": { | |
| 1521 "A": [0.25, 0.62, 1.89], | |
| 1522 "C": [0.208, 0.29, 1.73], | |
| 1523 "D": [0.875, -0.9, 3.13], | |
| 1524 "E": [0.833, -0.74, 3.14], | |
| 1525 "F": [0.042, 1.2, 1.53], | |
| 1526 "G": [1, 0.48, 2.67], | |
| 1527 "H": [0.083, -0.4, 3], | |
| 1528 "I": [0.667, 1.4, 1.97], | |
| 1529 "K": [0.708, -1.5, 2.28], | |
| 1530 "L": [0.292, 1.1, 1.74], | |
| 1531 "M": [0, 0.64, 2.5], | |
| 1532 "N": [0.667, -0.78, 2.33], | |
| 1533 "P": [0.875, 0.12, 0.22], | |
| 1534 "Q": [0.792, -0.85, 3.05], | |
| 1535 "R": [0.958, -2.5, 1.91], | |
| 1536 "S": [0.875, -0.18, 2.14], | |
| 1537 "T": [0.583, -0.05, 2.18], | |
| 1538 "V": [0.375, 1.1, 2.37], | |
| 1539 "W": [0.042, 0.81, 2], | |
| 1540 "Y": [0.5, 0.26, 2.01], | |
| 1541 }, | |
| 1542 "eisenberg": { | |
| 1543 "I": [1.4], | |
| 1544 "F": [1.2], | |
| 1545 "V": [1.1], | |
| 1546 "L": [1.1], | |
| 1547 "W": [0.81], | |
| 1548 "M": [0.64], | |
| 1549 "A": [0.62], | |
| 1550 "G": [0.48], | |
| 1551 "C": [0.29], | |
| 1552 "Y": [0.26], | |
| 1553 "P": [0.12], | |
| 1554 "T": [-0.05], | |
| 1555 "S": [-0.18], | |
| 1556 "H": [-0.4], | |
| 1557 "E": [-0.74], | |
| 1558 "N": [-0.78], | |
| 1559 "Q": [-0.85], | |
| 1560 "D": [-0.9], | |
| 1561 "K": [-1.5], | |
| 1562 "R": [-2.5], | |
| 1563 }, | |
| 1564 "ez": { | |
| 1565 "A": [-0.29, 10.22, 4.67], | |
| 1566 "C": [0.95, 13.69, 5.77], | |
| 1567 "D": [1.19, 14.25, 8.98], | |
| 1568 "E": [1.3, 14.66, 4.16], | |
| 1569 "F": [-0.8, 19.67, 7.12], | |
| 1570 "G": [-0.01, 13.86, 6], | |
| 1571 "H": [0.75, 12.26, 2.77], | |
| 1572 "I": [-0.56, 14.34, 10.69], | |
| 1573 "K": [1.66, 11.11, 2.09], | |
| 1574 "L": [-0.64, 17.34, 8.61], | |
| 1575 "M": [-0.28, 18.04, 7.13], | |
| 1576 "N": [0.89, 12.78, 6.28], | |
| 1577 "P": [0.83, 18.09, 3.53], | |
| 1578 "Q": [1.21, 10.46, 2.59], | |
| 1579 "R": [1.55, 9.34, 4.68], | |
| 1580 "S": [0.1, 13.86, 6], | |
| 1581 "T": [0.01, 13.86, 6], | |
| 1582 "V": [-0.47, 11.35, 4.97], | |
| 1583 "W": [-0.85, 11.65, 7.2], | |
| 1584 "Y": [-0.42, 13.04, 6.2], | |
| 1585 }, | |
| 1586 "flexibility": { | |
| 1587 "A": [0.25], | |
| 1588 "C": [0.208], | |
| 1589 "D": [0.875], | |
| 1590 "E": [0.833], | |
| 1591 "F": [0.042], | |
| 1592 "G": [1], | |
| 1593 "H": [0.083], | |
| 1594 "I": [0.667], | |
| 1595 "K": [0.708], | |
| 1596 "L": [0.292], | |
| 1597 "M": [0.0], | |
| 1598 "N": [0.667], | |
| 1599 "P": [0.875], | |
| 1600 "Q": [0.792], | |
| 1601 "R": [0.958], | |
| 1602 "S": [0.875], | |
| 1603 "T": [0.583], | |
| 1604 "V": [0.375], | |
| 1605 "W": [0.042], | |
| 1606 "Y": [0.5], | |
| 1607 }, | |
| 1608 "grantham": { | |
| 1609 "A": [0, 8.1, 31], | |
| 1610 "C": [2.75, 5.5, 55], | |
| 1611 "D": [1.38, 13.0, 54], | |
| 1612 "E": [0.92, 12.3, 83], | |
| 1613 "F": [0, 5.2, 132], | |
| 1614 "G": [0.74, 9.0, 3], | |
| 1615 "H": [0.58, 10.4, 96], | |
| 1616 "I": [0, 5.2, 111], | |
| 1617 "K": [0.33, 11.3, 119], | |
| 1618 "L": [0, 4.9, 111], | |
| 1619 "M": [0, 5.7, 105], | |
| 1620 "N": [1.33, 11.6, 56], | |
| 1621 "P": [0.39, 8.0, 32.5], | |
| 1622 "Q": [0.89, 10.5, 85], | |
| 1623 "R": [0.65, 10.5, 124], | |
| 1624 "S": [1.42, 9.2, 32], | |
| 1625 "T": [0.71, 8.6, 61], | |
| 1626 "V": [0, 5.9, 84], | |
| 1627 "W": [0.13, 5.4, 170], | |
| 1628 "Y": [0.20, 6.2, 136], | |
| 1629 }, | |
| 1630 "gravy": { | |
| 1631 "I": [4.5], | |
| 1632 "V": [4.2], | |
| 1633 "L": [3.8], | |
| 1634 "F": [2.8], | |
| 1635 "C": [2.5], | |
| 1636 "M": [1.9], | |
| 1637 "A": [1.8], | |
| 1638 "G": [-0.4], | |
| 1639 "T": [-0.7], | |
| 1640 "W": [-0.9], | |
| 1641 "S": [-0.8], | |
| 1642 "Y": [-1.3], | |
| 1643 "P": [-1.6], | |
| 1644 "H": [-3.2], | |
| 1645 "E": [-3.5], | |
| 1646 "Q": [-3.5], | |
| 1647 "D": [-3.5], | |
| 1648 "N": [-3.5], | |
| 1649 "K": [-3.9], | |
| 1650 "R": [-4.5], | |
| 1651 }, | |
| 1652 "hopp-woods": { | |
| 1653 "A": [-0.5], | |
| 1654 "C": [-1], | |
| 1655 "D": [3], | |
| 1656 "E": [3], | |
| 1657 "F": [-2.5], | |
| 1658 "G": [0], | |
| 1659 "H": [-0.5], | |
| 1660 "I": [-1.8], | |
| 1661 "K": [3], | |
| 1662 "L": [-1.8], | |
| 1663 "M": [-1.3], | |
| 1664 "N": [0.2], | |
| 1665 "P": [0], | |
| 1666 "Q": [0.2], | |
| 1667 "R": [3], | |
| 1668 "S": [0.3], | |
| 1669 "T": [-0.4], | |
| 1670 "V": [-1.5], | |
| 1671 "W": [-3.4], | |
| 1672 "Y": [-2.3], | |
| 1673 }, | |
| 1674 "isaeci": { | |
| 1675 "A": [62.9, 0.05], | |
| 1676 "C": [78.51, 0.15], | |
| 1677 "D": [18.46, 1.25], | |
| 1678 "E": [30.19, 1.31], | |
| 1679 "F": [189.42, 0.14], | |
| 1680 "G": [19.93, 0.02], | |
| 1681 "H": [87.38, 0.56], | |
| 1682 "I": [149.77, 0.09], | |
| 1683 "K": [102.78, 0.53], | |
| 1684 "L": [154.35, 0.1], | |
| 1685 "M": [132.22, 0.34], | |
| 1686 "N": [19.53, 1.36], | |
| 1687 "P": [122.35, 0.16], | |
| 1688 "Q": [17.87, 1.31], | |
| 1689 "R": [52.98, 1.69], | |
| 1690 "S": [19.75, 0.56], | |
| 1691 "T": [59.44, 0.65], | |
| 1692 "V": [120.91, 0.07], | |
| 1693 "W": [179.16, 1.08], | |
| 1694 "Y": [132.16, 0.72], | |
| 1695 }, | |
| 1696 "janin": { | |
| 1697 "I": [1.2], | |
| 1698 "F": [0.87], | |
| 1699 "V": [1], | |
| 1700 "L": [0.87], | |
| 1701 "W": [0.59], | |
| 1702 "M": [0.73], | |
| 1703 "A": [0.59], | |
| 1704 "G": [0.59], | |
| 1705 "C": [1.4], | |
| 1706 "Y": [-0.4], | |
| 1707 "P": [-0.26], | |
| 1708 "T": [-0.12], | |
| 1709 "S": [0.02], | |
| 1710 "H": [0.02], | |
| 1711 "E": [-0.83], | |
| 1712 "N": [-0.55], | |
| 1713 "Q": [-0.83], | |
| 1714 "D": [-0.69], | |
| 1715 "K": [-2.4], | |
| 1716 "R": [-1.8], | |
| 1717 }, | |
| 1718 "kytedoolittle": { | |
| 1719 "I": [1.7], | |
| 1720 "F": [1.1], | |
| 1721 "V": [1.6], | |
| 1722 "L": [1.4], | |
| 1723 "W": [-0.14], | |
| 1724 "M": [0.8], | |
| 1725 "A": [0.77], | |
| 1726 "G": [0.03], | |
| 1727 "C": [1], | |
| 1728 "Y": [-0.27], | |
| 1729 "P": [-0.37], | |
| 1730 "T": [-0.07], | |
| 1731 "S": [-0.1], | |
| 1732 "H": [-0.91], | |
| 1733 "E": [-1], | |
| 1734 "N": [-1], | |
| 1735 "Q": [-1], | |
| 1736 "D": [-1], | |
| 1737 "K": [-1.1], | |
| 1738 "R": [-1.3], | |
| 1739 }, | |
| 1740 "levitt_alpha": { | |
| 1741 "A": [1.29], | |
| 1742 "C": [1.11], | |
| 1743 "D": [1.04], | |
| 1744 "E": [1.44], | |
| 1745 "F": [1.07], | |
| 1746 "G": [0.56], | |
| 1747 "H": [1.22], | |
| 1748 "I": [0.97], | |
| 1749 "K": [1.23], | |
| 1750 "L": [1.3], | |
| 1751 "M": [1.47], | |
| 1752 "N": [0.9], | |
| 1753 "P": [0.52], | |
| 1754 "Q": [1.27], | |
| 1755 "R": [0.96], | |
| 1756 "S": [0.82], | |
| 1757 "T": [0.82], | |
| 1758 "V": [0.91], | |
| 1759 "W": [0.99], | |
| 1760 "Y": [0.72], | |
| 1761 }, | |
| 1762 "mss": { | |
| 1763 "A": [13.02], | |
| 1764 "C": [23.7067], | |
| 1765 "D": [22.02], | |
| 1766 "E": [20.0233], | |
| 1767 "F": [23.5288], | |
| 1768 "G": [1.01], | |
| 1769 "H": [23.5283], | |
| 1770 "I": [22.3611], | |
| 1771 "K": [18.9756], | |
| 1772 "L": [19.6944], | |
| 1773 "M": [21.92], | |
| 1774 "N": [21.8567], | |
| 1775 "P": [19.0242], | |
| 1776 "Q": [19.9689], | |
| 1777 "R": [19.0434], | |
| 1778 "S": [18.3533], | |
| 1779 "T": [22.3567], | |
| 1780 "V": [21.0267], | |
| 1781 "W": [26.1975], | |
| 1782 "Y": [24.1954], | |
| 1783 }, | |
| 1784 "msw": { | |
| 1785 "A": [-0.73, 0.2, -0.62], | |
| 1786 "C": [-0.66, 0.26, -0.27], | |
| 1787 "D": [0.11, -1, -0.96], | |
| 1788 "E": [0.24, -0.39, -0.04], | |
| 1789 "F": [0.76, 0.85, -0.34], | |
| 1790 "G": [-0.31, -0.28, -0.75], | |
| 1791 "H": [0.84, 0.67, -0.78], | |
| 1792 "I": [-0.91, 0.83, -0.25], | |
| 1793 "K": [-0.51, 0.08, 0.6], | |
| 1794 "L": [-0.74, 0.72, -0.16], | |
| 1795 "M": [-0.7, 1, -0.32], | |
| 1796 "N": [0.14, 0.2, -0.66], | |
| 1797 "P": [-0.43, 0.73, -0.6], | |
| 1798 "Q": [0.3, 1, -0.3], | |
| 1799 "R": [-0.22, 0.27, 1], | |
| 1800 "S": [-0.8, 0.61, -1], | |
| 1801 "T": [-0.58, 0.85, -0.89], | |
| 1802 "V": [-1, 0.79, -0.58], | |
| 1803 "W": [1, 0.98, -0.47], | |
| 1804 "Y": [0.97, 0.66, -0.16], | |
| 1805 }, | |
| 1806 "pepcats": { | |
| 1807 "A": [1, 0, 0, 0, 0, 0], | |
| 1808 "C": [1, 0, 1, 1, 0, 0], | |
| 1809 "D": [0, 0, 1, 0, 0, 1], | |
| 1810 "E": [0, 0, 1, 0, 0, 1], | |
| 1811 "F": [1, 1, 0, 0, 0, 0], | |
| 1812 "G": [0, 0, 0, 0, 0, 0], | |
| 1813 "H": [1, 1, 0, 1, 1, 0], | |
| 1814 "I": [1, 0, 0, 0, 0, 0], | |
| 1815 "K": [1, 0, 0, 1, 1, 0], | |
| 1816 "L": [1, 0, 0, 0, 0, 0], | |
| 1817 "M": [1, 0, 1, 0, 0, 0], | |
| 1818 "N": [0, 0, 1, 1, 0, 0], | |
| 1819 "P": [1, 0, 0, 0, 0, 0], | |
| 1820 "Q": [0, 0, 1, 1, 0, 0], | |
| 1821 "R": [1, 0, 0, 1, 1, 0], | |
| 1822 "S": [0, 0, 1, 1, 0, 0], | |
| 1823 "T": [0, 0, 1, 1, 0, 0], | |
| 1824 "V": [1, 0, 0, 0, 0, 0], | |
| 1825 "W": [1, 1, 0, 1, 0, 0], | |
| 1826 "Y": [1, 1, 1, 1, 0, 0], | |
| 1827 }, | |
| 1828 "peparc": { | |
| 1829 "A": [1, 0, 0, 0, 0], | |
| 1830 "C": [0, 1, 0, 0, 0], | |
| 1831 "D": [0, 1, 0, 1, 0], | |
| 1832 "E": [0, 1, 0, 1, 0], | |
| 1833 "F": [1, 0, 0, 0, 0], | |
| 1834 "G": [0, 0, 0, 0, 0], | |
| 1835 "H": [0, 1, 1, 0, 0], | |
| 1836 "I": [1, 0, 0, 0, 0], | |
| 1837 "K": [0, 1, 1, 0, 0], | |
| 1838 "L": [1, 0, 0, 0, 0], | |
| 1839 "M": [1, 0, 0, 0, 0], | |
| 1840 "N": [0, 1, 0, 0, 0], | |
| 1841 "P": [0, 0, 0, 0, 1], | |
| 1842 "Q": [0, 1, 0, 0, 0], | |
| 1843 "R": [0, 1, 1, 0, 0], | |
| 1844 "S": [0, 1, 0, 0, 0], | |
| 1845 "T": [0, 1, 0, 0, 0], | |
| 1846 "V": [1, 0, 0, 0, 0], | |
| 1847 "W": [1, 0, 0, 0, 0], | |
| 1848 "Y": [1, 0, 0, 0, 0], | |
| 1849 }, | |
| 1850 "polarity": { | |
| 1851 "A": [0.395], | |
| 1852 "C": [0.074], | |
| 1853 "D": [1.0], | |
| 1854 "E": [0.914], | |
| 1855 "F": [0.037], | |
| 1856 "G": [0.506], | |
| 1857 "H": [0.679], | |
| 1858 "I": [0.037], | |
| 1859 "K": [0.79], | |
| 1860 "L": [0.0], | |
| 1861 "M": [0.099], | |
| 1862 "N": [0.827], | |
| 1863 "P": [0.383], | |
| 1864 "Q": [0.691], | |
| 1865 "R": [0.691], | |
| 1866 "S": [0.531], | |
| 1867 "T": [0.457], | |
| 1868 "V": [0.123], | |
| 1869 "W": [0.062], | |
| 1870 "Y": [0.16], | |
| 1871 }, | |
| 1872 "ppcali": { | |
| 1873 "A": [ | |
| 1874 0.070781, | |
| 1875 0.036271, | |
| 1876 2.042, | |
| 1877 0.083272, | |
| 1878 0.69089, | |
| 1879 0.15948, | |
| 1880 -0.80893, | |
| 1881 0.24698, | |
| 1882 0.86525, | |
| 1883 0.68563, | |
| 1884 -0.24665, | |
| 1885 0.61314, | |
| 1886 -0.53343, | |
| 1887 -0.50878, | |
| 1888 -1.3646, | |
| 1889 2.2679, | |
| 1890 -1.5644, | |
| 1891 -0.75043, | |
| 1892 -0.65875, | |
| 1893 ], | |
| 1894 "C": [ | |
| 1895 0.61013, | |
| 1896 -0.93043, | |
| 1897 -0.85983, | |
| 1898 -2.2704, | |
| 1899 1.5877, | |
| 1900 -2.0066, | |
| 1901 -0.30314, | |
| 1902 1.2544, | |
| 1903 -0.2832, | |
| 1904 -1.2844, | |
| 1905 -0.73449, | |
| 1906 -0.11235, | |
| 1907 -0.41152, | |
| 1908 -0.0050164, | |
| 1909 0.28307, | |
| 1910 0.20522, | |
| 1911 -0.021084, | |
| 1912 -0.15627, | |
| 1913 -0.32689, | |
| 1914 ], | |
| 1915 "D": [ | |
| 1916 -1.3215, | |
| 1917 0.24063, | |
| 1918 -0.032754, | |
| 1919 -0.37863, | |
| 1920 1.2051, | |
| 1921 1.0001, | |
| 1922 2.1827, | |
| 1923 0.19212, | |
| 1924 -0.60529, | |
| 1925 0.37639, | |
| 1926 -0.46451, | |
| 1927 -0.46788, | |
| 1928 1.4077, | |
| 1929 -2.1661, | |
| 1930 0.72604, | |
| 1931 -0.12332, | |
| 1932 -0.8243, | |
| 1933 -0.082989, | |
| 1934 0.053476, | |
| 1935 ], | |
| 1936 "E": [ | |
| 1937 -0.87713, | |
| 1938 1.4905, | |
| 1939 1.0755, | |
| 1940 0.35944, | |
| 1941 1.567, | |
| 1942 0.41365, | |
| 1943 1.0944, | |
| 1944 0.72634, | |
| 1945 -0.74957, | |
| 1946 0.038939, | |
| 1947 0.075057, | |
| 1948 0.78637, | |
| 1949 -1.4543, | |
| 1950 1.6667, | |
| 1951 -0.097439, | |
| 1952 -0.24293, | |
| 1953 1.7687, | |
| 1954 0.36174, | |
| 1955 -0.11585, | |
| 1956 ], | |
| 1957 "F": [ | |
| 1958 1.3557, | |
| 1959 -0.10336, | |
| 1960 -0.4309, | |
| 1961 0.41269, | |
| 1962 -0.083356, | |
| 1963 0.83783, | |
| 1964 0.095381, | |
| 1965 -0.65222, | |
| 1966 -0.3119, | |
| 1967 0.43293, | |
| 1968 -1.0011, | |
| 1969 -0.66855, | |
| 1970 -0.10242, | |
| 1971 1.2066, | |
| 1972 2.6234, | |
| 1973 1.9981, | |
| 1974 -0.25016, | |
| 1975 0.71979, | |
| 1976 0.21569, | |
| 1977 ], | |
| 1978 "G": [ | |
| 1979 -1.0818, | |
| 1980 -2.1561, | |
| 1981 0.77082, | |
| 1982 -0.92747, | |
| 1983 -1.0748, | |
| 1984 1.7997, | |
| 1985 -1.3708, | |
| 1986 1.279, | |
| 1987 -1.2098, | |
| 1988 0.46065, | |
| 1989 0.43076, | |
| 1990 0.20037, | |
| 1991 -0.2302, | |
| 1992 0.2646, | |
| 1993 0.57149, | |
| 1994 -0.68432, | |
| 1995 0.19341, | |
| 1996 -0.061606, | |
| 1997 -0.08071, | |
| 1998 ], | |
| 1999 "H": [ | |
| 2000 -0.050161, | |
| 2001 0.69246, | |
| 2002 -0.88397, | |
| 2003 -0.64601, | |
| 2004 0.24622, | |
| 2005 0.10487, | |
| 2006 -1.1317, | |
| 2007 -2.3661, | |
| 2008 -0.89918, | |
| 2009 0.46391, | |
| 2010 -0.62359, | |
| 2011 2.5478, | |
| 2012 -0.34737, | |
| 2013 -0.52062, | |
| 2014 0.17522, | |
| 2015 -0.88648, | |
| 2016 -0.4755, | |
| 2017 0.023187, | |
| 2018 -0.28261, | |
| 2019 ], | |
| 2020 "I": [ | |
| 2021 1.4829, | |
| 2022 -0.46435, | |
| 2023 0.50189, | |
| 2024 0.55724, | |
| 2025 -0.51535, | |
| 2026 -0.29914, | |
| 2027 0.97236, | |
| 2028 -0.15793, | |
| 2029 -0.98246, | |
| 2030 -0.54347, | |
| 2031 0.97806, | |
| 2032 0.37577, | |
| 2033 1.618, | |
| 2034 0.62323, | |
| 2035 -0.59359, | |
| 2036 -0.35483, | |
| 2037 -0.085017, | |
| 2038 0.55825, | |
| 2039 -2.7542, | |
| 2040 ], | |
| 2041 "K": [ | |
| 2042 -0.85344, | |
| 2043 1.529, | |
| 2044 0.27747, | |
| 2045 0.32993, | |
| 2046 -1.1786, | |
| 2047 -0.16633, | |
| 2048 -1.0459, | |
| 2049 0.44621, | |
| 2050 0.41027, | |
| 2051 -2.5318, | |
| 2052 0.91329, | |
| 2053 0.53385, | |
| 2054 0.61417, | |
| 2055 -1.111, | |
| 2056 1.1323, | |
| 2057 0.95105, | |
| 2058 0.76769, | |
| 2059 -0.016115, | |
| 2060 0.054995, | |
| 2061 ], | |
| 2062 "L": [ | |
| 2063 1.2857, | |
| 2064 0.039488, | |
| 2065 1.5378, | |
| 2066 0.87969, | |
| 2067 -0.21419, | |
| 2068 0.40389, | |
| 2069 -0.20426, | |
| 2070 -0.14351, | |
| 2071 0.61024, | |
| 2072 -1.1927, | |
| 2073 -2.2149, | |
| 2074 -0.84248, | |
| 2075 -0.5061, | |
| 2076 -0.48548, | |
| 2077 0.10791, | |
| 2078 -2.1503, | |
| 2079 -0.12006, | |
| 2080 -0.60222, | |
| 2081 0.26546, | |
| 2082 ], | |
| 2083 "M": [ | |
| 2084 1.137, | |
| 2085 0.64388, | |
| 2086 0.13724, | |
| 2087 -0.2988, | |
| 2088 1.2288, | |
| 2089 0.24981, | |
| 2090 -1.6427, | |
| 2091 -0.75868, | |
| 2092 -0.54902, | |
| 2093 1.0571, | |
| 2094 1.272, | |
| 2095 -1.9104, | |
| 2096 0.70919, | |
| 2097 -0.93575, | |
| 2098 -0.6314, | |
| 2099 -0.079654, | |
| 2100 1.634, | |
| 2101 -0.0021923, | |
| 2102 0.49825, | |
| 2103 ], | |
| 2104 "N": [ | |
| 2105 -1.084, | |
| 2106 -0.176, | |
| 2107 -0.47062, | |
| 2108 -0.92245, | |
| 2109 -0.32953, | |
| 2110 0.74278, | |
| 2111 0.34551, | |
| 2112 -1.4605, | |
| 2113 0.25219, | |
| 2114 -1.2107, | |
| 2115 -0.59978, | |
| 2116 -0.79183, | |
| 2117 1.3268, | |
| 2118 1.9839, | |
| 2119 -1.6137, | |
| 2120 0.5333, | |
| 2121 0.033889, | |
| 2122 -1.0331, | |
| 2123 0.83019, | |
| 2124 ], | |
| 2125 "P": [ | |
| 2126 -1.1823, | |
| 2127 -1.6911, | |
| 2128 -1.1331, | |
| 2129 3.073, | |
| 2130 1.1942, | |
| 2131 -0.93426, | |
| 2132 -0.72985, | |
| 2133 -0.042441, | |
| 2134 -0.19264, | |
| 2135 -0.21603, | |
| 2136 -0.1239, | |
| 2137 0.054016, | |
| 2138 0.15241, | |
| 2139 -0.019691, | |
| 2140 -0.20543, | |
| 2141 0.10206, | |
| 2142 0.07671, | |
| 2143 -0.081968, | |
| 2144 0.20348, | |
| 2145 ], | |
| 2146 "Q": [ | |
| 2147 -0.57747, | |
| 2148 0.97452, | |
| 2149 -0.077547, | |
| 2150 -0.0033488, | |
| 2151 0.17184, | |
| 2152 -0.52537, | |
| 2153 -0.27362, | |
| 2154 -0.1366, | |
| 2155 0.2057, | |
| 2156 -0.013066, | |
| 2157 1.8834, | |
| 2158 -1.2736, | |
| 2159 -0.84991, | |
| 2160 1.0445, | |
| 2161 0.69027, | |
| 2162 -1.2866, | |
| 2163 -2.6776, | |
| 2164 0.1683, | |
| 2165 0.086105, | |
| 2166 ], | |
| 2167 "R": [ | |
| 2168 -0.62245, | |
| 2169 1.545, | |
| 2170 -0.61966, | |
| 2171 0.19057, | |
| 2172 -1.7485, | |
| 2173 -1.3909, | |
| 2174 -0.47526, | |
| 2175 1.3938, | |
| 2176 -0.84556, | |
| 2177 1.7344, | |
| 2178 -1.6516, | |
| 2179 -0.52678, | |
| 2180 0.6791, | |
| 2181 0.24374, | |
| 2182 -0.62551, | |
| 2183 -0.0028271, | |
| 2184 -0.053884, | |
| 2185 0.14926, | |
| 2186 -0.17232, | |
| 2187 ], | |
| 2188 "S": [ | |
| 2189 -0.86409, | |
| 2190 -0.77147, | |
| 2191 0.38542, | |
| 2192 -0.59389, | |
| 2193 -0.53313, | |
| 2194 -0.47585, | |
| 2195 0.31966, | |
| 2196 -0.89716, | |
| 2197 1.8029, | |
| 2198 0.26431, | |
| 2199 -0.23173, | |
| 2200 -0.37626, | |
| 2201 -0.47349, | |
| 2202 -0.42878, | |
| 2203 -0.47297, | |
| 2204 -0.079826, | |
| 2205 0.57043, | |
| 2206 3.2057, | |
| 2207 -0.18413, | |
| 2208 ], | |
| 2209 "T": [ | |
| 2210 -0.33027, | |
| 2211 -0.57447, | |
| 2212 0.18653, | |
| 2213 -0.28941, | |
| 2214 -0.62681, | |
| 2215 -1.0737, | |
| 2216 0.80363, | |
| 2217 -0.59525, | |
| 2218 1.8786, | |
| 2219 1.3971, | |
| 2220 0.63929, | |
| 2221 0.21281, | |
| 2222 -0.067048, | |
| 2223 0.096271, | |
| 2224 1.323, | |
| 2225 -0.36173, | |
| 2226 1.2261, | |
| 2227 -2.2771, | |
| 2228 -0.65412, | |
| 2229 ], | |
| 2230 "V": [ | |
| 2231 1.1675, | |
| 2232 -0.61554, | |
| 2233 0.95405, | |
| 2234 0.11662, | |
| 2235 -0.74473, | |
| 2236 -1.1482, | |
| 2237 1.1309, | |
| 2238 0.12079, | |
| 2239 -0.77171, | |
| 2240 0.18597, | |
| 2241 0.93442, | |
| 2242 1.201, | |
| 2243 0.3826, | |
| 2244 -0.091573, | |
| 2245 -0.31269, | |
| 2246 0.074367, | |
| 2247 -0.22946, | |
| 2248 0.24322, | |
| 2249 2.9836, | |
| 2250 ], | |
| 2251 "W": [ | |
| 2252 1.1881, | |
| 2253 0.43789, | |
| 2254 -1.7915, | |
| 2255 0.138, | |
| 2256 0.43088, | |
| 2257 1.6467, | |
| 2258 -0.11987, | |
| 2259 1.7369, | |
| 2260 2.0818, | |
| 2261 0.33122, | |
| 2262 0.31829, | |
| 2263 1.1586, | |
| 2264 0.67649, | |
| 2265 0.30819, | |
| 2266 -0.55772, | |
| 2267 -0.54491, | |
| 2268 -0.17969, | |
| 2269 0.24477, | |
| 2270 0.38674, | |
| 2271 ], | |
| 2272 "Y": [ | |
| 2273 0.54671, | |
| 2274 -0.1468, | |
| 2275 -1.5688, | |
| 2276 0.19001, | |
| 2277 -1.2736, | |
| 2278 0.66162, | |
| 2279 1.1614, | |
| 2280 -0.18614, | |
| 2281 -0.70654, | |
| 2282 -0.43634, | |
| 2283 0.44775, | |
| 2284 -0.71366, | |
| 2285 -2.5907, | |
| 2286 -1.1649, | |
| 2287 -1.1576, | |
| 2288 0.66572, | |
| 2289 0.21019, | |
| 2290 -0.61016, | |
| 2291 -0.34844, | |
| 2292 ], | |
| 2293 }, | |
| 2294 "refractivity": { | |
| 2295 "A": [0.102045615], | |
| 2296 "C": [0.841053374], | |
| 2297 "D": [0.282153774], | |
| 2298 "E": [0.405831178], | |
| 2299 "F": [0.691276746], | |
| 2300 "G": [0], | |
| 2301 "H": [0.512814484], | |
| 2302 "I": [0.448154244], | |
| 2303 "K": [0.50058782], | |
| 2304 "L": [0.441570656], | |
| 2305 "M": [0.508817305], | |
| 2306 "N": [0.282153774], | |
| 2307 "P": [0.256995062], | |
| 2308 "Q": [0.405831178], | |
| 2309 "R": [0.626851634], | |
| 2310 "S": [0.149306372], | |
| 2311 "T": [0.258876087], | |
| 2312 "V": [0.327298378], | |
| 2313 "W": [1], | |
| 2314 "Y": [0.741359041], | |
| 2315 }, | |
| 2316 "t_scale": { | |
| 2317 "A": [-8.4, -8.01, -3.73, -3.65, -6.12, -1.59, 1.56], | |
| 2318 "C": [-2.44, -1.96, 0.93, -2.35, 1.31, 2.29, -1.52], | |
| 2319 "D": [-6.84, -0.94, 17.68, -0.03, 3.44, 9.07, 4.32], | |
| 2320 "E": [-6.5, 16.2, 17.28, 3.11, -4.75, -2.54, 4.72], | |
| 2321 "F": [21.59, -5.73, 1.03, -3.3, 2.64, -5.02, 1.7], | |
| 2322 "G": [-8.48, -10.37, -5.14, -6.51, -11.84, -3.6, 2.01], | |
| 2323 "H": [15.28, -3.67, 6.72, -6.38, 4.12, -1.55, -2.85], | |
| 2324 "I": [-2.97, 4.64, -0.77, 11, 3.26, -4.36, -7.88], | |
| 2325 "K": [2.7, 13.46, -14.03, -2.55, 2.77, 0.15, 3.19], | |
| 2326 "L": [2.61, 5.96, 1.97, 2.59, -4.77, -4.84, -5.44], | |
| 2327 "M": [3.38, 12.43, -4.77, 0.45, -1.55, -0.6, 3.26], | |
| 2328 "N": [-3.11, -1.22, 6.26, -9.38, 9.94, 7.66, -4.81], | |
| 2329 "P": [-5.35, -9.07, -1.52, -8.79, -8.73, 4.29, -9.91], | |
| 2330 "Q": [-5.31, 15.64, 8.44, 1.03, -4.32, -4.4, -0.52], | |
| 2331 "R": [-2.27, 18.9, -18.24, -3.47, 3.03, 6.64, 0.45], | |
| 2332 "S": [-15.88, -11.21, -2.44, -3.61, 3.46, -0.37, 8.98], | |
| 2333 "T": [-17.81, -13.64, -5.19, 10.57, 6.91, -4.43, 3.49], | |
| 2334 "V": [-5.8, -6.15, -2.26, 9.87, 5.28, -1.49, -7.54], | |
| 2335 "W": [21.68, -8.78, -2.53, 15.53, -8.15, 11.98, 3.23], | |
| 2336 "Y": [23.9, -6.47, 0.31, -4.14, 4.08, -7.28, 3.59], | |
| 2337 }, | |
| 2338 "tm_tend": { | |
| 2339 "A": [0.38], | |
| 2340 "C": [-0.3], | |
| 2341 "D": [-3.27], | |
| 2342 "E": [-2.9], | |
| 2343 "F": [1.98], | |
| 2344 "G": [-0.19], | |
| 2345 "H": [-1.44], | |
| 2346 "I": [1.97], | |
| 2347 "K": [-3.46], | |
| 2348 "L": [1.82], | |
| 2349 "M": [1.4], | |
| 2350 "N": [-1.62], | |
| 2351 "P": [-1.44], | |
| 2352 "Q": [-1.84], | |
| 2353 "R": [-2.57], | |
| 2354 "S": [-0.53], | |
| 2355 "T": [-0.32], | |
| 2356 "V": [1.46], | |
| 2357 "W": [1.53], | |
| 2358 "Y": [0.49], | |
| 2359 }, | |
| 2360 "z3": { | |
| 2361 "A": [0.07, -1.73, 0.09], | |
| 2362 "C": [0.71, -0.97, 4.13], | |
| 2363 "D": [3.64, 1.13, 2.36], | |
| 2364 "E": [3.08, 0.39, -0.07], | |
| 2365 "F": [-4.92, 1.3, 0.45], | |
| 2366 "G": [2.23, -5.36, 0.3], | |
| 2367 "H": [2.41, 1.74, 1.11], | |
| 2368 "I": [-4.44, -1.68, -1.03], | |
| 2369 "K": [2.84, 1.41, -3.14], | |
| 2370 "L": [-4.19, -1.03, -0.98], | |
| 2371 "M": [-2.49, -0.27, -0.41], | |
| 2372 "N": [3.22, 1.45, 0.84], | |
| 2373 "P": [-1.22, 0.88, 2.23], | |
| 2374 "Q": [2.18, 0.53, -1.14], | |
| 2375 "R": [2.88, 2.52, -3.44], | |
| 2376 "S": [1.96, -1.63, 0.57], | |
| 2377 "T": [0.92, -2.09, -1.4], | |
| 2378 "V": [-2.69, -2.53, -1.29], | |
| 2379 "W": [-4.75, 3.65, 0.85], | |
| 2380 "Y": [-1.39, 2.32, 0.01], | |
| 2381 }, | |
| 2382 "z5": { | |
| 2383 "A": [0.24, -2.32, 0.6, -0.14, 1.3], | |
| 2384 "C": [0.84, -1.67, 3.71, 0.18, -2.65], | |
| 2385 "D": [3.98, 0.93, 1.93, -2.46, 0.75], | |
| 2386 "E": [3.11, 0.26, -0.11, -3.04, -0.25], | |
| 2387 "F": [-4.22, 1.94, 1.06, 0.54, -0.62], | |
| 2388 "G": [2.05, -4.06, 0.36, -0.82, -0.38], | |
| 2389 "H": [2.47, 1.95, 0.26, 3.9, 0.09], | |
| 2390 "I": [-3.89, -1.73, -1.71, -0.84, 0.26], | |
| 2391 "K": [2.29, 0.89, -2.49, 1.49, 0.31], | |
| 2392 "L": [-4.28, -1.3, -1.49, -0.72, 0.84], | |
| 2393 "M": [-2.85, -0.22, 0.47, 1.94, -0.98], | |
| 2394 "N": [3.05, 1.62, 1.04, -1.15, 1.61], | |
| 2395 "P": [-1.66, 0.27, 1.84, 0.7, 2], | |
| 2396 "Q": [1.75, 0.5, -1.44, -1.34, 0.66], | |
| 2397 "R": [3.52, 2.5, -3.5, 1.99, -0.17], | |
| 2398 "S": [2.39, -1.07, 1.15, -1.39, 0.67], | |
| 2399 "T": [0.75, -2.18, -1.12, -1.46, -0.4], | |
| 2400 "V": [-2.59, -2.64, -1.54, -0.85, -0.02], | |
| 2401 "W": [-4.36, 3.94, 0.59, 3.44, -1.59], | |
| 2402 "Y": [-2.54, 2.44, 0.43, 0.04, -1.47], | |
| 2403 }, | |
| 2404 } | |
| 2405 if scalename == "all": | |
| 2406 d = { | |
| 2407 "I": [], | |
| 2408 "F": [], | |
| 2409 "V": [], | |
| 2410 "L": [], | |
| 2411 "W": [], | |
| 2412 "M": [], | |
| 2413 "A": [], | |
| 2414 "G": [], | |
| 2415 "C": [], | |
| 2416 "Y": [], | |
| 2417 "P": [], | |
| 2418 "T": [], | |
| 2419 "S": [], | |
| 2420 "H": [], | |
| 2421 "E": [], | |
| 2422 "N": [], | |
| 2423 "Q": [], | |
| 2424 "D": [], | |
| 2425 "K": [], | |
| 2426 "R": [], | |
| 2427 } | |
| 2428 for scale in scales.keys(): | |
| 2429 for k, v in scales[scale].items(): | |
| 2430 d[k].extend(v) | |
| 2431 return "all", d | |
| 2432 | |
| 2433 elif scalename == "instability": | |
| 2434 d = { | |
| 2435 "A": { | |
| 2436 "A": 1.0, | |
| 2437 "C": 44.94, | |
| 2438 "E": 1.0, | |
| 2439 "D": -7.49, | |
| 2440 "G": 1.0, | |
| 2441 "F": 1.0, | |
| 2442 "I": 1.0, | |
| 2443 "H": -7.49, | |
| 2444 "K": 1.0, | |
| 2445 "M": 1.0, | |
| 2446 "L": 1.0, | |
| 2447 "N": 1.0, | |
| 2448 "Q": 1.0, | |
| 2449 "P": 20.26, | |
| 2450 "S": 1.0, | |
| 2451 "R": 1.0, | |
| 2452 "T": 1.0, | |
| 2453 "W": 1.0, | |
| 2454 "V": 1.0, | |
| 2455 "Y": 1.0, | |
| 2456 }, | |
| 2457 "C": { | |
| 2458 "A": 1.0, | |
| 2459 "C": 1.0, | |
| 2460 "E": 1.0, | |
| 2461 "D": 20.26, | |
| 2462 "G": 1.0, | |
| 2463 "F": 1.0, | |
| 2464 "I": 1.0, | |
| 2465 "H": 33.6, | |
| 2466 "K": 1.0, | |
| 2467 "M": 33.6, | |
| 2468 "L": 20.26, | |
| 2469 "N": 1.0, | |
| 2470 "Q": -6.54, | |
| 2471 "P": 20.26, | |
| 2472 "S": 1.0, | |
| 2473 "R": 1.0, | |
| 2474 "T": 33.6, | |
| 2475 "W": 24.68, | |
| 2476 "V": -6.54, | |
| 2477 "Y": 1.0, | |
| 2478 }, | |
| 2479 "E": { | |
| 2480 "A": 1.0, | |
| 2481 "C": 44.94, | |
| 2482 "E": 33.6, | |
| 2483 "D": 20.26, | |
| 2484 "G": 1.0, | |
| 2485 "F": 1.0, | |
| 2486 "I": 20.26, | |
| 2487 "H": -6.54, | |
| 2488 "K": 1.0, | |
| 2489 "M": 1.0, | |
| 2490 "L": 1.0, | |
| 2491 "N": 1.0, | |
| 2492 "Q": 20.26, | |
| 2493 "P": 20.26, | |
| 2494 "S": 20.26, | |
| 2495 "R": 1.0, | |
| 2496 "T": 1.0, | |
| 2497 "W": -14.03, | |
| 2498 "V": 1.0, | |
| 2499 "Y": 1.0, | |
| 2500 }, | |
| 2501 "D": { | |
| 2502 "A": 1.0, | |
| 2503 "C": 1.0, | |
| 2504 "E": 1.0, | |
| 2505 "D": 1.0, | |
| 2506 "G": 1.0, | |
| 2507 "F": -6.54, | |
| 2508 "I": 1.0, | |
| 2509 "H": 1.0, | |
| 2510 "K": -7.49, | |
| 2511 "M": 1.0, | |
| 2512 "L": 1.0, | |
| 2513 "N": 1.0, | |
| 2514 "Q": 1.0, | |
| 2515 "P": 1.0, | |
| 2516 "S": 20.26, | |
| 2517 "R": -6.54, | |
| 2518 "T": -14.03, | |
| 2519 "W": 1.0, | |
| 2520 "V": 1.0, | |
| 2521 "Y": 1.0, | |
| 2522 }, | |
| 2523 "G": { | |
| 2524 "A": -7.49, | |
| 2525 "C": 1.0, | |
| 2526 "E": -6.54, | |
| 2527 "D": 1.0, | |
| 2528 "G": 13.34, | |
| 2529 "F": 1.0, | |
| 2530 "I": -7.49, | |
| 2531 "H": 1.0, | |
| 2532 "K": -7.49, | |
| 2533 "M": 1.0, | |
| 2534 "L": 1.0, | |
| 2535 "N": -7.49, | |
| 2536 "Q": 1.0, | |
| 2537 "P": 1.0, | |
| 2538 "S": 1.0, | |
| 2539 "R": 1.0, | |
| 2540 "T": -7.49, | |
| 2541 "W": 13.34, | |
| 2542 "V": 1.0, | |
| 2543 "Y": -7.49, | |
| 2544 }, | |
| 2545 "F": { | |
| 2546 "A": 1.0, | |
| 2547 "C": 1.0, | |
| 2548 "E": 1.0, | |
| 2549 "D": 13.34, | |
| 2550 "G": 1.0, | |
| 2551 "F": 1.0, | |
| 2552 "I": 1.0, | |
| 2553 "H": 1.0, | |
| 2554 "K": -14.03, | |
| 2555 "M": 1.0, | |
| 2556 "L": 1.0, | |
| 2557 "N": 1.0, | |
| 2558 "Q": 1.0, | |
| 2559 "P": 20.26, | |
| 2560 "S": 1.0, | |
| 2561 "R": 1.0, | |
| 2562 "T": 1.0, | |
| 2563 "W": 1.0, | |
| 2564 "V": 1.0, | |
| 2565 "Y": 33.601, | |
| 2566 }, | |
| 2567 "I": { | |
| 2568 "A": 1.0, | |
| 2569 "C": 1.0, | |
| 2570 "E": 44.94, | |
| 2571 "D": 1.0, | |
| 2572 "G": 1.0, | |
| 2573 "F": 1.0, | |
| 2574 "I": 1.0, | |
| 2575 "H": 13.34, | |
| 2576 "K": -7.49, | |
| 2577 "M": 1.0, | |
| 2578 "L": 20.26, | |
| 2579 "N": 1.0, | |
| 2580 "Q": 1.0, | |
| 2581 "P": -1.88, | |
| 2582 "S": 1.0, | |
| 2583 "R": 1.0, | |
| 2584 "T": 1.0, | |
| 2585 "W": 1.0, | |
| 2586 "V": -7.49, | |
| 2587 "Y": 1.0, | |
| 2588 }, | |
| 2589 "H": { | |
| 2590 "A": 1.0, | |
| 2591 "C": 1.0, | |
| 2592 "E": 1.0, | |
| 2593 "D": 1.0, | |
| 2594 "G": -9.37, | |
| 2595 "F": -9.37, | |
| 2596 "I": 44.94, | |
| 2597 "H": 1.0, | |
| 2598 "K": 24.68, | |
| 2599 "M": 1.0, | |
| 2600 "L": 1.0, | |
| 2601 "N": 24.68, | |
| 2602 "Q": 1.0, | |
| 2603 "P": -1.88, | |
| 2604 "S": 1.0, | |
| 2605 "R": 1.0, | |
| 2606 "T": -6.54, | |
| 2607 "W": -1.88, | |
| 2608 "V": 1.0, | |
| 2609 "Y": 44.94, | |
| 2610 }, | |
| 2611 "K": { | |
| 2612 "A": 1.0, | |
| 2613 "C": 1.0, | |
| 2614 "E": 1.0, | |
| 2615 "D": 1.0, | |
| 2616 "G": -7.49, | |
| 2617 "F": 1.0, | |
| 2618 "I": -7.49, | |
| 2619 "H": 1.0, | |
| 2620 "K": 1.0, | |
| 2621 "M": 33.6, | |
| 2622 "L": -7.49, | |
| 2623 "N": 1.0, | |
| 2624 "Q": 24.64, | |
| 2625 "P": -6.54, | |
| 2626 "S": 1.0, | |
| 2627 "R": 33.6, | |
| 2628 "T": 1.0, | |
| 2629 "W": 1.0, | |
| 2630 "V": -7.49, | |
| 2631 "Y": 1.0, | |
| 2632 }, | |
| 2633 "M": { | |
| 2634 "A": 13.34, | |
| 2635 "C": 1.0, | |
| 2636 "E": 1.0, | |
| 2637 "D": 1.0, | |
| 2638 "G": 1.0, | |
| 2639 "F": 1.0, | |
| 2640 "I": 1.0, | |
| 2641 "H": 58.28, | |
| 2642 "K": 1.0, | |
| 2643 "M": -1.88, | |
| 2644 "L": 1.0, | |
| 2645 "N": 1.0, | |
| 2646 "Q": -6.54, | |
| 2647 "P": 44.94, | |
| 2648 "S": 44.94, | |
| 2649 "R": -6.54, | |
| 2650 "T": -1.88, | |
| 2651 "W": 1.0, | |
| 2652 "V": 1.0, | |
| 2653 "Y": 24.68, | |
| 2654 }, | |
| 2655 "L": { | |
| 2656 "A": 1.0, | |
| 2657 "C": 1.0, | |
| 2658 "E": 1.0, | |
| 2659 "D": 1.0, | |
| 2660 "G": 1.0, | |
| 2661 "F": 1.0, | |
| 2662 "I": 1.0, | |
| 2663 "H": 1.0, | |
| 2664 "K": -7.49, | |
| 2665 "M": 1.0, | |
| 2666 "L": 1.0, | |
| 2667 "N": 1.0, | |
| 2668 "Q": 33.6, | |
| 2669 "P": 20.26, | |
| 2670 "S": 1.0, | |
| 2671 "R": 20.26, | |
| 2672 "T": 1.0, | |
| 2673 "W": 24.68, | |
| 2674 "V": 1.0, | |
| 2675 "Y": 1.0, | |
| 2676 }, | |
| 2677 "N": { | |
| 2678 "A": 1.0, | |
| 2679 "C": -1.88, | |
| 2680 "E": 1.0, | |
| 2681 "D": 1.0, | |
| 2682 "G": -14.03, | |
| 2683 "F": -14.03, | |
| 2684 "I": 44.94, | |
| 2685 "H": 1.0, | |
| 2686 "K": 24.68, | |
| 2687 "M": 1.0, | |
| 2688 "L": 1.0, | |
| 2689 "N": 1.0, | |
| 2690 "Q": -6.54, | |
| 2691 "P": -1.88, | |
| 2692 "S": 1.0, | |
| 2693 "R": 1.0, | |
| 2694 "T": -7.49, | |
| 2695 "W": -9.37, | |
| 2696 "V": 1.0, | |
| 2697 "Y": 1.0, | |
| 2698 }, | |
| 2699 "Q": { | |
| 2700 "A": 1.0, | |
| 2701 "C": -6.54, | |
| 2702 "E": 20.26, | |
| 2703 "D": 20.26, | |
| 2704 "G": 1.0, | |
| 2705 "F": -6.54, | |
| 2706 "I": 1.0, | |
| 2707 "H": 1.0, | |
| 2708 "K": 1.0, | |
| 2709 "M": 1.0, | |
| 2710 "L": 1.0, | |
| 2711 "N": 1.0, | |
| 2712 "Q": 20.26, | |
| 2713 "P": 20.26, | |
| 2714 "S": 44.94, | |
| 2715 "R": 1.0, | |
| 2716 "T": 1.0, | |
| 2717 "W": 1.0, | |
| 2718 "V": -6.54, | |
| 2719 "Y": -6.54, | |
| 2720 }, | |
| 2721 "P": { | |
| 2722 "A": 20.26, | |
| 2723 "C": -6.54, | |
| 2724 "E": 18.38, | |
| 2725 "D": -6.54, | |
| 2726 "G": 1.0, | |
| 2727 "F": 20.26, | |
| 2728 "I": 1.0, | |
| 2729 "H": 1.0, | |
| 2730 "K": 1.0, | |
| 2731 "M": -6.54, | |
| 2732 "L": 1.0, | |
| 2733 "N": 1.0, | |
| 2734 "Q": 20.26, | |
| 2735 "P": 20.26, | |
| 2736 "S": 20.26, | |
| 2737 "R": -6.54, | |
| 2738 "T": 1.0, | |
| 2739 "W": -1.88, | |
| 2740 "V": 20.26, | |
| 2741 "Y": 1.0, | |
| 2742 }, | |
| 2743 "S": { | |
| 2744 "A": 1.0, | |
| 2745 "C": 33.6, | |
| 2746 "E": 20.26, | |
| 2747 "D": 1.0, | |
| 2748 "G": 1.0, | |
| 2749 "F": 1.0, | |
| 2750 "I": 1.0, | |
| 2751 "H": 1.0, | |
| 2752 "K": 1.0, | |
| 2753 "M": 1.0, | |
| 2754 "L": 1.0, | |
| 2755 "N": 1.0, | |
| 2756 "Q": 20.26, | |
| 2757 "P": 44.94, | |
| 2758 "S": 20.26, | |
| 2759 "R": 20.26, | |
| 2760 "T": 1.0, | |
| 2761 "W": 1.0, | |
| 2762 "V": 1.0, | |
| 2763 "Y": 1.0, | |
| 2764 }, | |
| 2765 "R": { | |
| 2766 "A": 1.0, | |
| 2767 "C": 1.0, | |
| 2768 "E": 1.0, | |
| 2769 "D": 1.0, | |
| 2770 "G": -7.49, | |
| 2771 "F": 1.0, | |
| 2772 "I": 1.0, | |
| 2773 "H": 20.26, | |
| 2774 "K": 1.0, | |
| 2775 "M": 1.0, | |
| 2776 "L": 1.0, | |
| 2777 "N": 13.34, | |
| 2778 "Q": 20.26, | |
| 2779 "P": 20.26, | |
| 2780 "S": 44.94, | |
| 2781 "R": 58.28, | |
| 2782 "T": 1.0, | |
| 2783 "W": 58.28, | |
| 2784 "V": 1.0, | |
| 2785 "Y": -6.54, | |
| 2786 }, | |
| 2787 "T": { | |
| 2788 "A": 1.0, | |
| 2789 "C": 1.0, | |
| 2790 "E": 20.26, | |
| 2791 "D": 1.0, | |
| 2792 "G": -7.49, | |
| 2793 "F": 13.34, | |
| 2794 "I": 1.0, | |
| 2795 "H": 1.0, | |
| 2796 "K": 1.0, | |
| 2797 "M": 1.0, | |
| 2798 "L": 1.0, | |
| 2799 "N": -14.03, | |
| 2800 "Q": -6.54, | |
| 2801 "P": 1.0, | |
| 2802 "S": 1.0, | |
| 2803 "R": 1.0, | |
| 2804 "T": 1.0, | |
| 2805 "W": -14.03, | |
| 2806 "V": 1.0, | |
| 2807 "Y": 1.0, | |
| 2808 }, | |
| 2809 "W": { | |
| 2810 "A": -14.03, | |
| 2811 "C": 1.0, | |
| 2812 "E": 1.0, | |
| 2813 "D": 1.0, | |
| 2814 "G": -9.37, | |
| 2815 "F": 1.0, | |
| 2816 "I": 1.0, | |
| 2817 "H": 24.68, | |
| 2818 "K": 1.0, | |
| 2819 "M": 24.68, | |
| 2820 "L": 13.34, | |
| 2821 "N": 13.34, | |
| 2822 "Q": 1.0, | |
| 2823 "P": 1.0, | |
| 2824 "S": 1.0, | |
| 2825 "R": 1.0, | |
| 2826 "T": -14.03, | |
| 2827 "W": 1.0, | |
| 2828 "V": -7.49, | |
| 2829 "Y": 1.0, | |
| 2830 }, | |
| 2831 "V": { | |
| 2832 "A": 1.0, | |
| 2833 "C": 1.0, | |
| 2834 "E": 1.0, | |
| 2835 "D": -14.03, | |
| 2836 "G": -7.49, | |
| 2837 "F": 1.0, | |
| 2838 "I": 1.0, | |
| 2839 "H": 1.0, | |
| 2840 "K": -1.88, | |
| 2841 "M": 1.0, | |
| 2842 "L": 1.0, | |
| 2843 "N": 1.0, | |
| 2844 "Q": 1.0, | |
| 2845 "P": 20.26, | |
| 2846 "S": 1.0, | |
| 2847 "R": 1.0, | |
| 2848 "T": -7.49, | |
| 2849 "W": 1.0, | |
| 2850 "V": 1.0, | |
| 2851 "Y": -6.54, | |
| 2852 }, | |
| 2853 "Y": { | |
| 2854 "A": 24.68, | |
| 2855 "C": 1.0, | |
| 2856 "E": -6.54, | |
| 2857 "D": 24.68, | |
| 2858 "G": -7.49, | |
| 2859 "F": 1.0, | |
| 2860 "I": 1.0, | |
| 2861 "H": 13.34, | |
| 2862 "K": 1.0, | |
| 2863 "M": 44.94, | |
| 2864 "L": 1.0, | |
| 2865 "N": 1.0, | |
| 2866 "Q": 1.0, | |
| 2867 "P": 13.34, | |
| 2868 "S": 1.0, | |
| 2869 "R": -15.91, | |
| 2870 "T": -7.49, | |
| 2871 "W": -9.37, | |
| 2872 "V": 1.0, | |
| 2873 "Y": 13.34, | |
| 2874 }, | |
| 2875 } | |
| 2876 return "instability", d | |
| 2877 | |
| 2878 else: | |
| 2879 return scalename, scales[scalename] | |
| 2880 | |
| 2881 | |
def read_fasta(inputfile):
    """Load sequences and their names from a FASTA formatted file.

    Every line starting with ``>`` opens a new record. The record name is the
    text between ``>`` and the first space of that line (the description is
    dropped). All following lines up to the next header are stripped of
    surrounding whitespace and concatenated into the record's sequence.
    Lines that appear before the first header are ignored.

    The file is parsed by position only: a sequence line that happens to be
    identical to the last line of the file no longer terminates a record
    early, an empty file yields two empty lists instead of raising, and a
    header without any sequence lines yields an empty sequence so that both
    returned lists always have the same length.

    :param inputfile: path of the .fasta file with sequences and headers to read
    :return: tuple ``(sequences, names)``; two lists of equal length holding the
        sequences and the corresponding sequence names in file order.
    """
    names = []  # record names, in file order
    sequences = []  # record sequences, in file order
    chunks = None  # pieces of the record being read; None until the first header
    with open(inputfile) as handle:
        for line in handle:
            if line.startswith(">"):
                if chunks is not None:
                    # a new header closes the previous record
                    sequences.append("".join(chunks))
                # FASTA name without description as molecule name
                names.append(line.split(" ")[0][1:].strip())
                chunks = []
            elif chunks is not None:
                chunks.append(line.strip())  # remove potential white space
    if chunks is not None:
        # end of file closes the last record
        sequences.append("".join(chunks))
    return sequences, names
| 2909 | |
| 2910 | |
def save_fasta(filename, sequences, names=None):
    """Write the given sequences to a file in FASTA format.

    Each sequence is written as a header line (``>`` followed by its name)
    and a line holding the sequence itself. If no names are given, the
    headers are ``>Seq_0``, ``>Seq_1``, ... following the sequence position.

    :param filename: {str} output filename (ending .fasta)
    :param sequences: {list} sequences to be saved to file
    :param names: {list} optional sequence names used as sequence identifiers;
        looked up by the position of the sequence
    :return: a FASTA formatted file containing the given sequences
    """
    # get rid of an already existing output file before writing the new one
    if os.path.exists(filename):
        os.remove(filename)

    with open(filename, "w") as out:
        for idx, sequence in enumerate(sequences):
            header = str(names[idx]) if names else "Seq_" + str(idx)
            out.write(">" + header + "\n")
            out.write(sequence + "\n")
| 2929 | |
| 2930 | |
def aa_weights():
    """Return the molecular weight table of the natural amino acids.

    :return: dictionary mapping the one-letter amino acid codes (in
        alphabetical order) to their molecular weights

    .. versionadded:: v2.4.1
    """
    weight_table = (
        ("A", 89.093),
        ("C", 121.158),
        ("D", 133.103),
        ("E", 147.129),
        ("F", 165.189),
        ("G", 75.067),
        ("H", 155.155),
        ("I", 131.173),
        ("K", 146.188),
        ("L", 131.173),
        ("M", 149.211),
        ("N", 132.118),
        ("P", 115.131),
        ("Q", 146.145),
        ("R", 174.20),
        ("S", 105.093),
        ("T", 119.119),
        ("V", 117.146),
        ("W", 204.225),
        ("Y", 181.189),
    )
    return dict(weight_table)
| 2961 | |
| 2962 | |
def count_aas(seq, scale="relative"):
    """Count how often each of the 20 natural amino acids occurs in a sequence.

    :param seq: {str} amino acid sequence
    :param scale: {'absolute' or 'relative'} with ``'relative'`` the counts are
        divided by the sequence length (frequencies); any other value returns
        the plain counts
    :return: {OrderedDict} amino acid letters (alphabetically ordered) as keys
        and their counts or frequencies in the sequence as float values.
    """
    if seq == "":
        # a single blank keeps the relative scaling from dividing by zero
        seq = " "
    divisor = len(seq) if scale == "relative" else 1.0
    result = collections.OrderedDict()
    for residue in "ACDEFGHIKLMNPQRSTVWY":  # already in alphabetical order
        result[residue] = float(seq.count(residue)) / divisor
    return result
| 3000 | |
| 3001 | |
def count_ngrams(seq, n):
    """Count the n-grams of an amino acid sequence. N can be one integer or a list of integers.

    The n-grams are taken with a sliding window of step one, so overlapping
    occurrences are counted individually (``"AAAA"`` contains the 2-gram
    ``"AA"`` three times). Counting the windows themselves, rather than
    calling ``str.count`` (which only counts non-overlapping matches), keeps
    the counts consistent with the generated n-grams.

    :param seq: {str} amino acid sequence
    :param n: {int or list of ints} length(s) of the n-grams to count
    :return: {OrderedDict} n-grams as keys and their counts in the sequence as
        values, ordered by decreasing count; n-grams with equal counts keep
        the order of their first occurrence.
    """
    if seq == "":
        seq = " "
    if isinstance(n, int):
        n = [n]
    ngrams = []
    for size in n:
        # every window of length `size`, moving one position at a time
        ngrams.extend(seq[j : j + size] for j in range(len(seq) - (size - 1)))
    # most_common() sorts by count (descending) and is stable for ties
    return collections.OrderedDict(collections.Counter(ngrams).most_common())
| 3021 | |
| 3022 | |
def aa_energies():
    """Return the free energies of transfer between cyclohexane and water for all natural amino acids.
    H. G. Boman, D. Wade, I. a Boman, B. Wåhlin, R. B. Merrifield, *FEBS Lett*. **1989**, *259*, 103–106.

    :return: dictionary with amino acid letters as keys and the corresponding energies as values.
    """
    energy_table = (
        ("L", -4.92),
        ("I", -4.92),
        ("V", -4.04),
        ("F", -2.98),
        ("M", -2.35),
        ("W", -2.33),
        ("A", -1.81),
        ("C", -1.28),
        ("G", -0.94),
        ("Y", 0.14),
        ("T", 2.57),
        ("S", 3.40),
        ("H", 4.66),
        ("Q", 5.54),
        ("K", 5.55),
        ("N", 6.64),
        ("E", 6.81),
        ("D", 8.72),
        ("R", 14.92),
        ("P", 0.0),
    )
    return dict(energy_table)
| 3052 | |
| 3053 | |
def ngrams_apd():
    """Return the most frequent 2-, 3- and 4-grams from all sequences in the `APD3
    <http://aps.unmc.edu/AP/>`_, version August 2016 with 2727 sequences.
    For all 2-, 3- and 4-grams, all possible n-grams were generated from all sequences and the top 50 most
    frequent assembled into a list. Finally, leading and trailing spaces were stripped and duplicates as well
    as n-grams containing spaces were removed.

    :return: numpy.array containing most frequent ngrams
    """
    # whitespace separated table; the order of the entries is kept as is
    most_frequent = """
        AGK CKI RR YGGG LSGL RG YGGY PRP LGGG GV
        GT GS GR IAG GG GF GC GGYG GA GL
        GK GI IPC KAA LAK GLGG GGLG CKIT GAGK LLSG
        LKK FLP LSG SCK LLS GETC VLG GKLL LLG C
        KCKI G VGK CSC TKKC GCS GKA IGK GESC KVCY
        KKL KKI KKC LGGL GLL CGE GGYC GLLS GLF AKK
        GKAA ESCV GLP CGES PCGE FL CGET GLW KGAA KAAL
        GGY GGG IKG LKG GGL CK GTC CG SKKC CS
        CR KC AGKA KA KG LKCK SCKL KK KI KN
        KL SK KV SL SC SG AAA VAK AAL AAK
        GGGG KNVA GGGL GYG LG LA LL LK LS LP
        GCSC TC GAA AA VA VC AG VG AI AK
        VL AL TPGC IK IA IG YGG LGK CSCK GYGG
        LGG KGA
    """
    return np.array(most_frequent.split())
| 3199 | |
| 3200 | |
def aa_formulas():
    """
    Return the molecular formulas of all amino acids. All amino acids are considered in the neutral form
    (uncharged).

    :return: dictionary mapping every amino acid letter to a dictionary with the number of ``C``, ``H``,
        ``N``, ``O`` and ``S`` atoms of that amino acid.
    """
    elements = ("C", "H", "N", "O", "S")
    # amino acid letter followed by its atom counts in the order of `elements`
    atom_counts = (
        ("A", (3, 7, 1, 2, 0)),
        ("C", (3, 7, 1, 2, 1)),
        ("D", (4, 7, 1, 4, 0)),
        ("E", (5, 9, 1, 4, 0)),
        ("F", (9, 11, 1, 2, 0)),
        ("G", (2, 5, 1, 2, 0)),
        ("H", (6, 9, 3, 2, 0)),
        ("I", (6, 13, 1, 2, 0)),
        ("K", (6, 14, 2, 2, 0)),
        ("L", (6, 13, 1, 2, 0)),
        ("M", (5, 11, 1, 2, 1)),
        ("N", (4, 8, 2, 3, 0)),
        ("P", (5, 9, 1, 2, 0)),
        ("Q", (5, 10, 2, 3, 0)),
        ("R", (6, 14, 4, 2, 0)),
        ("S", (3, 7, 1, 3, 0)),
        ("T", (4, 9, 1, 3, 0)),
        ("V", (5, 11, 1, 2, 0)),
        ("W", (11, 12, 2, 2, 0)),
        ("Y", (9, 11, 1, 3, 0)),
    )
    return {letter: dict(zip(elements, counts)) for letter, counts in atom_counts}
