Mercurial > repos > cpt > cpt_helical_wheel

diff plotWheels/core.py @ 1:9b276485c94a draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author: cpt
date: Mon, 05 Jun 2023 02:44:43 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/plotWheels/core.py	Mon Jun 05 02:44:43 2023 +0000
@@ -0,0 +1,3228 @@
+# -*- coding: utf-8 -*-
+"""
+.. currentmodule:: modlamp.core
+
+.. moduleauthor:: modlab Alex Mueller ETH Zurich <alex.mueller@pharma.ethz.ch>
+
+Core helper functions and classes for other modules. The two main classes are:
+
+=============================    =======================================================================================
+Class                            Characteristics
+=============================    =======================================================================================
+:py:class:`BaseSequence`         Base class inheriting to all sequence classes in the module :py:mod:`modlamp.sequences`
+:py:class:`BaseDescriptor`       Base class inheriting to the two descriptor classes in :py:mod:`modlamp.descriptors`
+=============================    =======================================================================================
+"""
+
+import os
+import random
+import re
+
+import numpy as np
+import pandas as pd
+import collections
+import operator
+from scipy.spatial import distance
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
+from sklearn.utils import shuffle
+
+__author__ = "Alex Müller, Gisela Gabernet"
+__docformat__ = "restructuredtext en"
+
+
+class BaseSequence(object):
+    """Base class for sequence classes in the module :mod:`modlamp.sequences`.
+    It contains amino acid probabilities for different sequence generation classes.
+
+    The following amino acid probabilities are used: (extracted from the
+    `APD3 <http://aps.unmc.edu/AP/statistic/statistic.php>`_, March 17, 2016)
+
+    ===  ====    ======   =========    ==========
+    AA   rand    AMP      AMPnoCM      randnoCM
+    ===  ====    ======   =========    ==========
+    A    0.05    0.0766   0.0812275    0.05555555
+    C    0.05    0.071    0.0          0.0
+    D    0.05    0.026    0.0306275    0.05555555
+    E    0.05    0.0264   0.0310275    0.05555555
+    F    0.05    0.0405   0.0451275    0.05555555
+    G    0.05    0.1172   0.1218275    0.05555555
+    H    0.05    0.021    0.0256275    0.05555555
+    I    0.05    0.061    0.0656275    0.05555555
+    K    0.05    0.0958   0.1004275    0.05555555
+    L    0.05    0.0838   0.0884275    0.05555555
+    M    0.05    0.0123   0.0          0.0
+    N    0.05    0.0386   0.0432275    0.05555555
+    P    0.05    0.0463   0.0509275    0.05555555
+    Q    0.05    0.0251   0.0297275    0.05555555
+    R    0.05    0.0545   0.0591275    0.05555555
+    S    0.05    0.0613   0.0659275    0.05555555
+    T    0.05    0.0455   0.0501275    0.05555555
+    V    0.05    0.0572   0.0618275    0.05555555
+    W    0.05    0.0155   0.0201275    0.05555555
+    Y    0.05    0.0244   0.0290275    0.05555555
+    ===  ====    ======   =========    ==========
+
+    """
+
+    def __init__(self, seqnum, lenmin=7, lenmax=28):
+        """
+        :param seqnum: number of sequences to generate
+        :param lenmin: minimal length of the generated sequences
+        :param lenmax: maximal length of the generated sequences
+        :return: attributes :py:attr:`seqnum`, :py:attr:`lenmin` and :py:attr:`lenmax`.
+        :Example:
+
+        >>> b = BaseSequence(10, 7, 28)
+        >>> b.seqnum
+        10
+        >>> b.lenmin
+        7
+        >>> b.lenmax
+        28
+        """
+        self.sequences = list()
+        self.names = list()
+        self.lenmin = int(lenmin)
+        self.lenmax = int(lenmax)
+        self.seqnum = int(seqnum)
+
+        # AA classes:
+        self.AA_hyd = ["G", "A", "L", "I", "V"]
+        self.AA_basic = ["K", "R"]
+        self.AA_acidic = ["D", "E"]
+        self.AA_aroma = ["W", "Y", "F"]
+        self.AA_polar = ["S", "T", "Q", "N"]
+        # AA labels:
+        self.AAs = [
+            "A",
+            "C",
+            "D",
+            "E",
+            "F",
+            "G",
+            "H",
+            "I",
+            "K",
+            "L",
+            "M",
+            "N",
+            "P",
+            "Q",
+            "R",
+            "S",
+            "T",
+            "V",
+            "W",
+            "Y",
+        ]
+        # AA probability from the APD3 database:
+        self.prob_AMP = [
+            0.0766,
+            0.071,
+            0.026,
+            0.0264,
+            0.0405,
+            0.1172,
+            0.021,
+            0.061,
+            0.0958,
+            0.0838,
+            0.0123,
+            0.0386,
+            0.0463,
+            0.0251,
+            0.0545,
+            0.0613,
+            0.0455,
+            0.0572,
+            0.0155,
+            0.0244,
+        ]
+        # AA probability from the APD2 database without Cys and Met (synthesis reasons)
+        self.prob_AMPnoCM = [
+            0.081228,
+            0.0,
+            0.030627,
+            0.031027,
+            0.045128,
+            0.121828,
+            0.025627,
+            0.065628,
+            0.100428,
+            0.088428,
+            0.0,
+            0.043228,
+            0.050928,
+            0.029728,
+            0.059128,
+            0.065927,
+            0.050128,
+            0.061828,
+            0.020128,
+            0.029028,
+        ]
+        # equal AA probabilities:
+        self.prob = [
+            0.05,
+            0.05,
+            0.05,
+            0.05,
+            0.05,
+            0.05,
+            0.05,
+            0.05,
+            0.05,
+            0.05,
+            0.05,
+            0.05,
+            0.05,
+            0.05,
+            0.05,
+            0.05,
+            0.05,
+            0.05,
+            0.05,
+            0.05,
+        ]
+        # equal AA probabilities but 0 for Cys and Met:
+        self.prob_randnoCM = [
+            0.05555555555,
+            0.0,
+            0.05555555555,
+            0.05555555555,
+            0.05555555555,
+            0.05555555555,
+            0.05555555555,
+            0.05555555555,
+            0.05555555555,
+            0.05555555555,
+            0.0,
+            0.05555555555,
+            0.05555555555,
+            0.05555555555,
+            0.05555555555,
+            0.05555555555,
+            0.05555555555,
+            0.05555555555,
+            0.05555555555,
+            0.05555555555,
+        ]
+
+        # AA probability from the linear CancerPPD peptides:
+        self.prob_ACP = [
+            0.14526966,
+            0.0,
+            0.00690031,
+            0.00780824,
+            0.06991102,
+            0.04957327,
+            0.01725077,
+            0.05647358,
+            0.27637552,
+            0.17759216,
+            0.00998729,
+            0.00798983,
+            0.01307427,
+            0.00381333,
+            0.02941711,
+            0.02651171,
+            0.0154349,
+            0.04013074,
+            0.0406755,
+            0.00581079,
+        ]
+
+        # AA probabilities for perfect amphipathic helix of different arc sizes
+        self.prob_amphihel = [
+            [
+                0.04545455,
+                0.0,
+                0.04545454,
+                0.04545455,
+                0.0,
+                0.04545455,
+                0.04545455,
+                0.0,
+                0.25,
+                0.0,
+                0.0,
+                0.04545454,
+                0.04545455,
+                0.04545454,
+                0.25,
+                0.04545454,
+                0.04545454,
+                0.0,
+                0.0,
+                0.04545454,
+            ],
+            [
+                0.0,
+                0.0,
+                0.0,
+                0.0,
+                0.16666667,
+                0.0,
+                0.0,
+                0.16666667,
+                0.0,
+                0.16666667,
+                0.0,
+                0.0,
+                0.0,
+                0.0,
+                0.0,
+                0.0,
+                0.0,
+                0.16666667,
+                0.16666667,
+                (1.0 - 0.16666667 * 5),
+            ],
+        ]
+
+        # helical ACP AA probabilities, depending on the position of the AA in the helix.
+        self.prob_ACPhel = np.array(
+            [
+                [
+                    0.0483871,
+                    0.0,
+                    0.0,
+                    0.0483871,
+                    0.01612903,
+                    0.12903226,
+                    0.03225807,
+                    0.09677419,
+                    0.19354839,
+                    0.5,
+                    0.0483871,
+                    0.11290323,
+                    0.1,
+                    0.18518519,
+                    0.07843137,
+                    0.12,
+                    0.17073172,
+                    0.16666667,
+                ],
+                [
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.02439024,
+                    0.19444444,
+                ],
+                [
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.27419355,
+                    0.01612903,
+                    0.0,
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                ],
+                [
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.06451613,
+                    0.0,
+                    0.01612903,
+                    0.0483871,
+                    0.01612903,
+                    0.0,
+                    0.01851852,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                ],
+                [
+                    0.16129032,
+                    0.0483871,
+                    0.30645161,
+                    0.0,
+                    0.0483871,
+                    0.0,
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.09677419,
+                    0.06666667,
+                    0.01851852,
+                    0.0,
+                    0.02,
+                    0.14634146,
+                    0.0,
+                ],
+                [
+                    0.64516129,
+                    0.0,
+                    0.17741936,
+                    0.14516129,
+                    0.0,
+                    0.01612903,
+                    0.25806452,
+                    0.11290323,
+                    0.06451613,
+                    0.08064516,
+                    0.22580645,
+                    0.03225807,
+                    0.06666667,
+                    0.2037037,
+                    0.1372549,
+                    0.1,
+                    0.0,
+                    0.05555556,
+                ],
+                [
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.03225807,
+                    0.0,
+                    0.0,
+                    0.20967742,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.16,
+                    0.0,
+                    0.0,
+                ],
+                [
+                    0.0483871,
+                    0.11290323,
+                    0.01612903,
+                    0.08064516,
+                    0.33870968,
+                    0.27419355,
+                    0.0,
+                    0.0483871,
+                    0.14516129,
+                    0.06451613,
+                    0.03225807,
+                    0.06451613,
+                    0.18333333,
+                    0.0,
+                    0.0,
+                    0.1,
+                    0.26829268,
+                    0.0,
+                ],
+                [
+                    0.0,
+                    0.03225807,
+                    0.01612903,
+                    0.12903226,
+                    0.12903226,
+                    0.0,
+                    0.38709677,
+                    0.33870968,
+                    0.0483871,
+                    0.03225807,
+                    0.41935484,
+                    0.08064516,
+                    0.0,
+                    0.03703704,
+                    0.29411765,
+                    0.04,
+                    0.02439024,
+                    0.02777778,
+                ],
+                [
+                    0.0483871,
+                    0.70967742,
+                    0.12903226,
+                    0.0483871,
+                    0.09677419,
+                    0.32258064,
+                    0.20967742,
+                    0.06451613,
+                    0.11290323,
+                    0.06451613,
+                    0.03225807,
+                    0.03225807,
+                    0.28333333,
+                    0.24074074,
+                    0.03921569,
+                    0.28,
+                    0.07317073,
+                    0.22222222,
+                ],
+                [
+                    0.0,
+                    0.01612903,
+                    0.01612903,
+                    0.0483871,
+                    0.01612903,
+                    0.03225807,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.03333333,
+                    0.0,
+                    0.01960784,
+                    0.02,
+                    0.0,
+                    0.0,
+                ],
+                [
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.03225807,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.01960784,
+                    0.02,
+                    0.0,
+                    0.0,
+                ],
+                [
+                    0.0,
+                    0.0,
+                    0.14516129,
+                    0.01612903,
+                    0.03225807,
+                    0.01612903,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.0,
+                    0.12962963,
+                    0.17647059,
+                    0.0,
+                    0.0,
+                    0.0,
+                ],
+                [
+                    0.0,
+                    0.0,
+                    0.01612903,
+                    0.01612903,
+                    0.0,
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.01851852,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                ],
+                [
+                    0.0,
+                    0.01612903,
+                    0.01612903,
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.01612903,
+                    0.01612903,
+                    0.01612903,
+                    0.01612903,
+                    0.0,
+                    0.01851852,
+                    0.01960784,
+                    0.0,
+                    0.04878049,
+                    0.0,
+                ],
+                [
+                    0.01612903,
+                    0.0,
+                    0.01612903,
+                    0.12903226,
+                    0.03225807,
+                    0.03225807,
+                    0.0483871,
+                    0.17741936,
+                    0.0,
+                    0.03225807,
+                    0.09677419,
+                    0.0483871,
+                    0.01666667,
+                    0.0,
+                    0.15686274,
+                    0.1,
+                    0.0,
+                    0.05555556,
+                ],
+                [
+                    0.01612903,
+                    0.01612903,
+                    0.0,
+                    0.01612903,
+                    0.0483871,
+                    0.01612903,
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.01612903,
+                    0.01612903,
+                    0.11290323,
+                    0.0,
+                    0.01851852,
+                    0.03921569,
+                    0.02,
+                    0.0,
+                    0.05555556,
+                ],
+                [
+                    0.01612903,
+                    0.01612903,
+                    0.01612903,
+                    0.01612903,
+                    0.20967742,
+                    0.16129032,
+                    0.01612903,
+                    0.0483871,
+                    0.33870968,
+                    0.16129032,
+                    0.0,
+                    0.14516129,
+                    0.25,
+                    0.11111111,
+                    0.01960784,
+                    0.02,
+                    0.21951219,
+                    0.22222222,
+                ],
+                [
+                    0.0,
+                    0.0,
+                    0.12903226,
+                    0.01612903,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.02439024,
+                    0.0,
+                ],
+                [
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.01612903,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                ],
+            ]
+        )
+
+    def save_fasta(self, filename, names=False):
+        """Method to save generated sequences in a ``.FASTA`` formatted file.
+
+        :param filename: output filename in which the sequences from :py:attr:`sequences` are safed in fasta format.
+        :param names: {bool} whether sequence names from :py:attr:`names` should be saved as sequence identifiers
+        :return: a FASTA formatted file containing the generated sequences
+        :Example:
+
+        >>> b = BaseSequence(2)
+        >>> b.sequences = ['KLLSLSLALDLLS', 'KLPERTVVNSSDF']
+        >>> b.names = ['Sequence1', 'Sequence2']
+        >>> b.save_fasta('/location/of/fasta/file.fasta', names=True)
+        """
+        if names:
+            save_fasta(filename, self.sequences, self.names)
+        else:
+            save_fasta(filename, self.sequences)
+
+    def mutate_AA(self, nr, prob):
+        """Method to mutate with **prob** probability a **nr** of positions per sequence randomly.
+
+        :param nr: number of mutations to perform per sequence
+        :param prob: probability of mutating a sequence
+        :return: mutated sequences in the attribute :py:attr:`sequences`.
+        :Example:
+
+        >>> b = BaseSequence(1)
+        >>> b.sequences = ['IAKAGRAIIK']
+        >>> b.mutate_AA(3, 1.)
+        >>> b.sequences
+        ['NAKAGRAWIK']
+        """
+        for s in range(len(self.sequences)):
+            # mutate: yes or no? prob = mutation probability
+            mutate = np.random.choice([1, 0], 1, p=[prob, 1 - float(prob)])
+            if mutate == 1:
+                seq = list(self.sequences[s])
+                cnt = 0
+                while cnt < nr:  # mutate "nr" AA
+                    seq[random.choice(range(len(seq)))] = random.choice(self.AAs)
+                    cnt += 1
+                self.sequences[s] = "".join(seq)
+
+    def filter_duplicates(self):
+        """Method to filter duplicates in the sequences from the class attribute :py:attr:`sequences`
+
+        :return: filtered sequences list in the attribute :py:attr:`sequences` and corresponding names.
+        :Example:
+
+        >>> b = BaseSequence(4)
+        >>> b.sequences = ['KLLKLLKKLLKLLK', 'KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK', 'KLAKLAKKLAKLAK']
+        >>> b.filter_duplicates()
+        >>> b.sequences
+        ['KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK']
+
+        .. versionadded:: v2.2.5
+        """
+        if not self.names:
+            self.names = ["Seq_" + str(i) for i in range(len(self.sequences))]
+        df = pd.DataFrame(
+            list(zip(self.sequences, self.names)), columns=["Sequences", "Names"]
+        )
+        df = df.drop_duplicates(
+            "Sequences", "first"
+        )  # keep first occurrence of duplicate
+        self.sequences = df["Sequences"].get_values().tolist()
+        self.names = df["Names"].get_values().tolist()
+
+    def keep_natural_aa(self):
+        """Method to filter out sequences that do not contain natural amino acids. If the sequence contains a character
+        that is not in ``['A','C','D,'E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']``.
+
+        :return: filtered sequence list in the attribute :py:attr:`sequences`. The other attributes are also filtered
+            accordingly (if present).
+        :Example:
+
+        >>> b = BaseSequence(2)
+        >>> b.sequences = ['BBBsdflUasUJfBJ', 'GLFDIVKKVVGALGSL']
+        >>> b.keep_natural_aa()
+        >>> b.sequences
+        ['GLFDIVKKVVGALGSL']
+        """
+        natural_aa = [
+            "A",
+            "C",
+            "D",
+            "E",
+            "F",
+            "G",
+            "H",
+            "I",
+            "K",
+            "L",
+            "M",
+            "N",
+            "P",
+            "Q",
+            "R",
+            "S",
+            "T",
+            "V",
+            "W",
+            "Y",
+        ]
+
+        seqs = []
+        names = []
+
+        for i, s in enumerate(self.sequences):
+            seq = list(s.upper())
+            if all(c in natural_aa for c in seq):
+                seqs.append(s.upper())
+                if hasattr(self, "names") and self.names:
+                    names.append(self.names[i])
+
+        self.sequences = seqs
+        self.names = names
+
+    def filter_aa(self, amino_acids):
+        """Method to filter out corresponding names and descriptor values of sequences with given amino acids in the
+        argument list *aminoacids*.
+
+        :param amino_acids: {list} amino acids to be filtered
+        :return: filtered list of sequences names in the corresponding attributes.
+        :Example:
+
+        >>> b = BaseSequence(3)
+        >>> b.sequences = ['AAALLLIIIKKK', 'CCEERRT', 'LLVVIIFFFQQ']
+        >>> b.filter_aa(['C'])
+        >>> b.sequences
+        ['AAALLLIIIKKK', 'LLVVIIFFFQQ']
+        """
+
+        pattern = re.compile("|".join(amino_acids))
+        seqs = []
+        names = []
+
+        for i, s in enumerate(self.sequences):
+            if not pattern.search(s):
+                seqs.append(s)
+                if hasattr(self, "names") and self.names:
+                    names.append(self.names[i])
+
+        self.sequences = seqs
+        self.names = names
+
+    def clean(self):
+        """Method to clean / clear / empty the attributes :py:attr:`sequences` and :py:attr:`names`.
+
+        :return: freshly initialized, empty class attributes.
+        """
+        self.__init__(self.seqnum, self.lenmin, self.lenmax)
+
+
+class BaseDescriptor(object):
+    """
+    Base class inheriting to both peptide descriptor classes :py:class:`modlamp.descriptors.GlobalDescriptor` and
+    :py:class:`modlamp.descriptors.PeptideDescriptor`.
+    """
+
+    def __init__(self, seqs):
+        """
+        :param seqs: a ``.FASTA`` file with sequences, a list / array of sequences or a single sequence as string to
+            calculate the descriptor values for.
+        :return: initialized attributes :py:attr:`sequences` and :py:attr:`names`.
+        :Example:
+
+        >>> AMP = BaseDescriptor('KLLKLLKKLLKLLK','pepCATS')
+        >>> AMP.sequences
+        ['KLLKLLKKLLKLLK']
+        >>> seqs = BaseDescriptor('/Path/to/file.fasta', 'eisenberg')  # load sequences from .fasta file
+        >>> seqs.sequences
+        ['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR'...]
+        """
+        if type(seqs) == list and seqs[0].isupper():
+            self.sequences = [s.strip() for s in seqs]
+            self.names = []
+        elif type(seqs) == np.ndarray and seqs[0].isupper():
+            self.sequences = [s.strip() for s in seqs.tolist()]
+            self.names = []
+        elif type(seqs) == str and seqs.isupper():
+            self.sequences = [seqs.strip()]
+            self.names = []
+        elif os.path.isfile(seqs):
+            if seqs.endswith(".fasta"):  # read .fasta file
+                self.sequences, self.names = read_fasta(seqs)
+            elif seqs.endswith(".csv"):  # read .csv file with sequences every line
+                with open(seqs) as f:
+                    self.sequences = list()
+                    cntr = 0
+                    self.names = []
+                    for line in f:
+                        if line.isupper():
+                            self.sequences.append(line.strip())
+                            self.names.append("seq_" + str(cntr))
+                            cntr += 1
+            else:
+                print("Sorry, currently only .fasta or .csv files can be read!")
+        else:
+            print(
+                "%s does not exist, is not a valid list of AA sequences or is not a valid sequence string"
+                % seqs
+            )
+
+        self.descriptor = np.array([[]])
+        self.target = np.array([], dtype="int")
+        self.scaler = None
+        self.featurenames = []
+
+    def read_fasta(self, filename):
+        """Method for loading sequences from a ``.FASTA`` formatted file into the attributes :py:attr:`sequences` and
+        :py:attr:`names`.
+
+        :param filename: {str} ``.FASTA`` file with sequences and headers to read
+        :return: {list} sequences in the attribute :py:attr:`sequences` with corresponding sequence names in
+            :py:attr:`names`.
+        """
+        self.sequences, self.names = read_fasta(filename)
+
+    def save_fasta(self, filename, names=False):
+        """Method for saving sequences from :py:attr:`sequences` to a ``.FASTA`` formatted file.
+
+        :param filename: {str} filename of the output ``.FASTA`` file
+        :param names: {bool} whether sequence names from self.names should be saved as sequence identifiers
+        :return: a FASTA formatted file containing the generated sequences
+        """
+        if names:
+            save_fasta(filename, self.sequences, self.names)
+        else:
+            save_fasta(filename, self.sequences)
+
+    def count_aa(self, scale="relative", average=False, append=False):
+        """Method for producing the amino acid distribution for the given sequences as a descriptor
+
+        :param scale: {'absolute' or 'relative'} defines whether counts or frequencies are given for each AA
+        :param average: {boolean} whether the averaged amino acid counts for all sequences should be returned
+        :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+            attribute :py:attr:`descriptor`.
+        :return: the amino acid distributions for every sequence individually in the attribute :py:attr:`descriptor`
+        :Example:
+
+        >>> AMP = PeptideDescriptor('ACDEFGHIKLMNPQRSTVWY') # aa_count() does not depend on the descriptor scale
+        >>> AMP.count_aa()
+        >>> AMP.descriptor
+        array([[ 0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05, ... ]])
+        >>> AMP.descriptor.shape
+        (1, 20)
+
+        .. seealso:: :py:func:`modlamp.core.count_aa()`
+        """
+        desc = list()
+        for seq in self.sequences:
+            od = count_aas(seq, scale)
+            desc.append(list(od.values()))
+
+        desc = np.array(desc)
+        self.featurenames = list(od.keys())
+
+        if append:
+            self.descriptor = np.hstack((self.descriptor, desc))
+        elif average:
+            self.descriptor = np.mean(desc, axis=0)
+        else:
+            self.descriptor = desc
+
+    def count_ngrams(self, n):
+        """Method for producing n-grams of all sequences in self.sequences
+
+        :param n: {int or list of ints} defines whether counts or frequencies are given for each AA
+        :return: {dict} dictionary with n-grams as keys and their counts in the sequence as values in :py:attr:`descriptor`
+        :Example:
+
+        >>> D = PeptideDescriptor('GLLDFLSLAALSLDKLVKKGALS')
+        >>> D.count_ngrams([2, 3])
+        >>> D.descriptor
+        {'LS': 3, 'LD': 2, 'LSL': 2, 'AL': 2, ..., 'LVK': 1}
+
+        .. seealso:: :py:func:`modlamp.core.count_ngrams()`
+        """
+        ngrams = dict()
+        for seq in self.sequences:
+            d = count_ngrams(seq, n)
+            for k, v in d.items():
+                if k in ngrams.keys():
+                    ngrams[k] += v
+                else:
+                    ngrams[k] = v
+        self.descriptor = ngrams
+
+    def feature_scaling(self, stype="standard", fit=True):
+        """Method for feature scaling of the calculated descriptor matrix.
+
+        :param stype: {'standard' or 'minmax'} type of scaling to be used
+        :param fit: {boolean} defines whether the used scaler is first fitting on the data (True) or
+            whether the already fitted scaler in :py:attr:`scaler` should be used to transform (False).
+        :return: scaled descriptor values in :py:attr:`descriptor`
+        :Example:
+
+        >>> D.descriptor
+        array([[0.155],[0.34],[0.16235294],[-0.08842105],[0.116]])
+        >>> D.feature_scaling(type='minmax',fit=True)
+        array([[0.56818182],[1.],[0.5853447],[0.],[0.47714988]])
+        """
+        if stype in ["standard", "minmax"]:
+            if stype == "standard":
+                self.scaler = StandardScaler()
+            elif stype == "minmax":
+                self.scaler = MinMaxScaler()
+
+            if fit:
+                self.descriptor = self.scaler.fit_transform(self.descriptor)
+            else:
+                self.descriptor = self.scaler.transform(self.descriptor)
+        else:
+            print("Unknown scaler type!\nAvailable: 'standard', 'minmax'")
+
+    def feature_shuffle(self):
+        """Method for shuffling feature columns randomly.
+
+        :return: descriptor matrix with shuffled feature columns in :py:attr:`descriptor`
+        :Example:
+
+        >>> D.descriptor
+        array([[0.80685625,167.05234375,39.56818125,-0.26338667,155.16888667,33.48778]])
+        >>> D.feature_shuffle()
+        array([[155.16888667,-0.26338667,167.05234375,0.80685625,39.56818125,33.48778]])
+        """
+        self.descriptor = shuffle(self.descriptor.transpose()).transpose()
+
+    def sequence_order_shuffle(self):
+        """Method for shuffling sequence order in the attribute :py:attr:`sequences`.
+
+        :return: sequences in :py:attr:`sequences` with shuffled order in the list.
+        :Example:
+
+        >>> D.sequences
+        ['LILRALKGAARALKVA','VKIAKIALKIIKGLG','VGVRLIKGIGRVARGAI','LRGLRGVIRGGKAIVRVGK','GGKLVRLIARIGKGV']
+        >>> D.sequence_order_shuffle()
+        >>> D.sequences
+        ['VGVRLIKGIGRVARGAI','LILRALKGAARALKVA','LRGLRGVIRGGKAIVRVGK','GGKLVRLIARIGKGV','VKIAKIALKIIKGLG']
+        """
+        self.sequences = shuffle(self.sequences)
+
+    def random_selection(self, num):
+        """Method to randomly select a specified number of sequences (with names and descriptors if present) out of a given
+        descriptor instance.
+
+        :param num: {int} number of entries to be randomly selected
+        :return: updated instance
+        :Example:
+
+        >>> h = Helices(7, 28, 100)
+        >>> h.generate_helices()
+        >>> desc = PeptideDescriptor(h.sequences, 'eisenberg')
+        >>> desc.calculate_moment()
+        >>> len(desc.sequences)
+        100
+        >>> len(desc.descriptor)
+        100
+        >>> desc.random_selection(10)
+        >>> len(desc.descriptor)
+        10
+        >>> len(desc.descriptor)
+        10
+
+        .. versionadded:: v2.2.3
+        """
+
+        sel = np.random.choice(len(self.sequences), size=num, replace=False)
+        self.sequences = np.array(self.sequences)[sel].tolist()
+        if hasattr(self, "descriptor") and self.descriptor.size:
+            self.descriptor = self.descriptor[sel]
+        if hasattr(self, "names") and self.names:
+            self.names = np.array(self.names)[sel].tolist()
+        if hasattr(self, "target") and self.target.size:
+            self.target = self.target[sel]
+
+    def minmax_selection(self, iterations, distmetric="euclidean", seed=0):
+        """Method to select a specified number of sequences according to the minmax algorithm.
+
+        :param iterations: {int} Number of sequences to retrieve.
+        :param distmetric: Distance metric to calculate the distances between the sequences in descriptor space.
+            Choose from 'euclidean' or 'minkowsky'.
+        :param seed: {int} Set a random seed for numpy to pick the first sequence.
+        :return: updated instance
+
+        .. seealso:: **SciPy** http://docs.scipy.org/doc/scipy/reference/spatial.distance.html
+        """
+
+        # Storing M into pool, where selections get deleted
+        pool = self.descriptor  # Store pool where selections get deleted
+        minmaxidx = list()  # Store original indices of selections to return
+
+        # Randomly selecting first peptide into the sele
+        np.random.seed(seed)
+        idx = int(np.random.random_integers(0, len(pool), 1))
+        sele = pool[idx : idx + 1, :]
+        minmaxidx.append(
+            int(*np.where(np.all(self.descriptor == pool[idx : idx + 1, :], axis=1)))
+        )
+
+        # Deleting peptide in selection from pool
+        pool = np.delete(pool, idx, axis=0)
+
+        for i in range(iterations - 1):
+            # Calculating distance from sele to the rest of the peptides
+            dist = distance.cdist(pool, sele, distmetric)
+
+            # Choosing maximal distances for every sele instance
+            maxidx = np.argmax(dist, axis=0)
+            maxcols = np.max(dist, axis=0)
+
+            # Choosing minimal distance among the maximal distances
+            minmax = np.argmin(maxcols)
+            maxidx = int(maxidx[minmax])
+
+            # Adding it to selection and removing from pool
+            sele = np.append(sele, pool[maxidx : maxidx + 1, :], axis=0)
+            pool = np.delete(pool, maxidx, axis=0)
+            minmaxidx.append(
+                int(
+                    *np.where(
+                        np.all(self.descriptor == pool[maxidx : maxidx + 1, :], axis=1)
+                    )
+                )
+            )
+
+        self.sequences = np.array(self.sequences)[minmaxidx].tolist()
+        if hasattr(self, "descriptor") and self.descriptor.size:
+            self.descriptor = self.descriptor[minmaxidx]
+        if hasattr(self, "names") and self.names:
+            self.names = np.array(self.names)[minmaxidx].tolist()
+        if hasattr(self, "target") and self.target.size:
+            self.target = self.descriptor[minmaxidx]
+
+    def filter_sequences(self, sequences):
+        """Method to filter out entries for given sequences in *sequences* out of a descriptor instance. All
+        corresponding attribute values of these sequences (e.g. in :py:attr:`descriptor`, :py:attr:`name`) are deleted
+        as well. The method returns an updated descriptor instance.
+
+        :param sequences: {list} sequences to be filtered out of the whole instance, including corresponding data
+        :return: updated instance without filtered sequences
+        :Example:
+
+        >>> sequences = ['KLLKLLKKLLKLLK', 'ACDEFGHIK', 'GLFDIVKKVV', 'GLFDIVKKVVGALG', 'GLFDIVKKVVGALGSL']
+        >>> desc = PeptideDescriptor(sequences, 'pepcats')
+        >>> desc.calculate_crosscorr(7)
+        >>> len(desc.descriptor)
+        5
+        >>> desc.filter_sequences('KLLKLLKKLLKLLK')
+        >>> len(desc.descriptor)
+        4
+        >>> desc.sequences
+        ['ACDEFGHIK', 'GLFDIVKKVV', 'GLFDIVKKVVGALG', 'GLFDIVKKVVGALGSL']
+        """
+        indices = list()
+        if isinstance(
+            sequences, str
+        ):  # check if sequences is only one sequence string and convert it to a list
+            sequences = [sequences]
+        for s in sequences:  # get indices of queried sequences
+            indices.append(self.sequences.index(s))
+
+        self.sequences = np.delete(np.array(self.sequences), indices, 0).tolist()
+        if hasattr(self, "descriptor") and self.descriptor.size:
+            self.descriptor = np.delete(self.descriptor, indices, 0)
+        if hasattr(self, "names") and self.names:
+            self.names = np.delete(np.array(self.names), indices, 0).tolist()
+        if hasattr(self, "target") and self.target.size:
+            self.target = np.delete(self.target, indices, 0)
+
+    def filter_values(self, values, operator="=="):
+        """Method to filter the descriptor matrix in the attribute :py:attr:`descriptor` for a given list of values (same
+        size as the number of features in the descriptor matrix!) The operator option tells the method whether to
+        filter for values equal, lower, higher ect. to the given values in the *values* array.
+
+        :param values: {list} values to filter the attribute :py:attr:`descriptor` for
+        :param operator: {str} filter criterion, available the operators ``==``, ``<``, ``>``, ``<=``and ``>=``.
+        :return: descriptor matrix and updated sequences containing only entries with descriptor values given in
+            *values* in the corresponding attributes.
+        :Example:
+
+        >>> desc.descriptor  # desc = BaseDescriptor instance
+        array([[ 0.7666517 ],
+               [ 0.38373498]])
+        >>> desc.filter_values([0.5], '<')
+        >>> desc.descriptor
+        array([[ 0.38373498]])
+        """
+        dim = self.descriptor.shape[1]
+        for d in range(dim):  # for all the features in self.descriptor
+            if operator == "==":
+                indices = np.where(self.descriptor[:, d] == values[d])[0]
+            elif operator == "<":
+                indices = np.where(self.descriptor[:, d] < values[d])[0]
+            elif operator == ">":
+                indices = np.where(self.descriptor[:, d] > values[d])[0]
+            elif operator == "<=":
+                indices = np.where(self.descriptor[:, d] <= values[d])[0]
+            elif operator == ">=":
+                indices = np.where(self.descriptor[:, d] >= values[d])[0]
+            else:
+                raise KeyError(
+                    "available operators: ``==``, ``<``, ``>``, ``<=``and ``>=``"
+                )
+
+            # filter descriptor matrix, sequence list and names list according to obtained indices
+            self.sequences = np.array(self.sequences)[indices].tolist()
+            if hasattr(self, "descriptor") and self.descriptor.size:
+                self.descriptor = self.descriptor[indices]
+            if hasattr(self, "names") and self.names:
+                self.names = np.array(self.names)[indices].tolist()
+            if hasattr(self, "target") and self.target.size:
+                self.target = self.target[indices]
+
+    def filter_aa(self, amino_acids):
+        """Method to filter out corresponding names and descriptor values of sequences with given amino acids in the
+        argument list *aminoacids*.
+
+        :param amino_acids: list of amino acids to be filtered
+        :return: filtered list of sequences, descriptor values, target values and names in the corresponding attributes.
+        :Example:
+
+        >>> b = BaseSequence(3)
+        >>> b.sequences = ['AAALLLIIIKKK', 'CCEERRT', 'LLVVIIFFFQQ']
+        >>> b.filter_aa(['C'])
+        >>> b.sequences
+        ['AAALLLIIIKKK', 'LLVVIIFFFQQ']
+        """
+
+        pattern = re.compile("|".join(amino_acids))
+        seqs = []
+        desc = []
+        names = []
+        target = []
+
+        for i, s in enumerate(self.sequences):
+            if not pattern.search(s):
+                seqs.append(s)
+                if hasattr(self, "descriptor") and self.descriptor.size:
+                    desc.append(self.descriptor[i])
+                if hasattr(self, "names") and self.names:
+                    names.append(self.names[i])
+                if hasattr(self, "target") and self.target.size:
+                    target.append(self.target[i])
+
+        self.sequences = seqs
+        self.names = names
+        self.descriptor = np.array(desc)
+        self.target = np.array(target, dtype="int")
+
+    def filter_duplicates(self):
+        """Method to filter duplicates in the sequences from the class attribute :py:attr:`sequences`
+
+        :return: filtered sequences list in the attribute :py:attr:`sequences` and corresponding names.
+        :Example:
+
+        >>> b = BaseDescriptor(['KLLKLLKKLLKLLK', 'KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK', 'KLAKLAKKLAKLAK'])
+        >>> b.filter_duplicates()
+        >>> b.sequences
+        ['KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK']
+
+        .. versionadded:: v2.2.5
+        """
+        if not self.names:
+            self.names = ["Seq_" + str(i) for i in range(len(self.sequences))]
+        if not self.target:
+            self.target = [0] * len(self.sequences)
+        if not self.descriptor:
+            self.descriptor = np.zeros(len(self.sequences))
+        df = pd.DataFrame(
+            np.array([self.sequences, self.names, self.descriptor, self.target]).T,
+            columns=["Sequences", "Names", "Descriptor", "Target"],
+        )
+        df = df.drop_duplicates(
+            "Sequences", "first"
+        )  # keep first occurrence of duplicate
+        self.sequences = df["Sequences"].get_values().tolist()
+        self.names = df["Names"].get_values().tolist()
+        self.descriptor = df["Descriptor"].get_values()
+        self.target = df["Target"].get_values()
+
+    def keep_natural_aa(self):
+        """Method to filter out sequences that do not contain natural amino acids. If the sequence contains a character
+        that is not in ['A','C','D,'E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'].
+
+        :return: filtered sequence list in the attribute :py:attr:`sequences`. The other attributes are also filtered
+            accordingly (if present).
+        :Example:
+
+        >>> b = BaseSequence(2)
+        >>> b.sequences = ['BBBsdflUasUJfBJ', 'GLFDIVKKVVGALGSL']
+        >>> b.keep_natural_aa()
+        >>> b.sequences
+        ['GLFDIVKKVVGALGSL']
+        """
+
+        natural_aa = [
+            "A",
+            "C",
+            "D",
+            "E",
+            "F",
+            "G",
+            "H",
+            "I",
+            "K",
+            "L",
+            "M",
+            "N",
+            "P",
+            "Q",
+            "R",
+            "S",
+            "T",
+            "V",
+            "W",
+            "Y",
+        ]
+
+        seqs = []
+        desc = []
+        names = []
+        target = []
+
+        for i, s in enumerate(self.sequences):
+            seq = list(s.upper())
+            if all(c in natural_aa for c in seq):
+                seqs.append(s.upper())
+                if hasattr(self, "descriptor") and self.descriptor.size:
+                    desc.append(self.descriptor[i])
+                if hasattr(self, "names") and self.names:
+                    names.append(self.names[i])
+                if hasattr(self, "target") and self.target.size:
+                    target.append(self.target[i])
+
+        self.sequences = seqs
+        self.names = names
+        self.descriptor = np.array(desc)
+        self.target = np.array(target, dtype="int")
+
+    def load_descriptordata(
+        self, filename, delimiter=",", targets=False, skip_header=0
+    ):
+        """Method to load any data file with sequences and descriptor values and save it to a new insatnce of the
+        class :class:`modlamp.descriptors.PeptideDescriptor`.
+
+        .. note:: Headers are not considered. To skip initial lines in the file, use the *skip_header* option.
+
+        :param filename: {str} filename of the data file to be loaded
+        :param delimiter: {str} column delimiter
+        :param targets: {boolean} whether last column in the file contains a target class vector
+        :param skip_header: {int} number of initial lines to skip in the file
+        :return: loaded sequences, descriptor values and targets in the corresponding attributes.
+        """
+        data = np.genfromtxt(filename, delimiter=delimiter, skip_header=skip_header)
+        data = data[:, 1:]  # skip sequences as they are "nan" when read as float
+        seqs = np.genfromtxt(filename, delimiter=delimiter, dtype="str")
+        seqs = seqs[:, 0]
+        if targets:
+            self.target = np.array(data[:, -1], dtype="int")
+        self.sequences = seqs
+        self.descriptor = data
+
+    def save_descriptor(self, filename, delimiter=",", targets=None, header=None):
+        """Method to save the descriptor values to a .csv/.txt file
+
+        :param filename: filename of the output file
+        :param delimiter: column delimiter
+        :param targets: target class vector to be added to descriptor (same length as :py:attr:`sequences`)
+        :param header: {str} header to be written at the beginning of the file (if ``None``: feature names are taken)
+        :return: output file with peptide names and descriptor values
+        """
+        seqs = np.array(self.sequences, dtype="|S80")[:, np.newaxis]
+        ids = np.array(self.names, dtype="|S80")[:, np.newaxis]
+        if ids.shape == seqs.shape:
+            names = np.hstack((ids, seqs))
+        else:
+            names = seqs
+        if targets and len(targets) == len(self.sequences):
+            target = np.array(targets)[:, np.newaxis]
+            data = np.hstack((names, self.descriptor, target))
+        else:
+            data = np.hstack((names, self.descriptor))
+        if not header:
+            featurenames = [["Sequence"]] + self.featurenames
+            header = ", ".join([f[0] for f in featurenames])
+        np.savetxt(filename, data, delimiter=delimiter, fmt="%s", header=header)
+
+
+def load_scale(scalename):
+    """Method to load scale values for a given amino acid scale
+
+    :param scalename: amino acid scale name, for available scales see the
+        :class:`modlamp.descriptors.PeptideDescriptor()` documentation.
+    :return: amino acid scale values in dictionary format.
+    """
+    # predefined amino acid scales dictionary
+    scales = {
+        "aasi": {
+            "A": [1.89],
+            "C": [1.73],
+            "D": [3.13],
+            "E": [3.14],
+            "F": [1.53],
+            "G": [2.67],
+            "H": [3],
+            "I": [1.97],
+            "K": [2.28],
+            "L": [1.74],
+            "M": [2.5],
+            "N": [2.33],
+            "P": [0.22],
+            "Q": [3.05],
+            "R": [1.91],
+            "S": [2.14],
+            "T": [2.18],
+            "V": [2.37],
+            "W": [2],
+            "Y": [2.01],
+        },
+        "abhprk": {
+            "A": [0, 0, 0, 0, 0, 0],
+            "C": [0, 0, 0, 0, 0, 0],
+            "D": [1, 0, 0, 1, 0, 0],
+            "E": [1, 0, 0, 1, 0, 0],
+            "F": [0, 0, 1, 0, 1, 0],
+            "G": [0, 0, 0, 0, 0, 0],
+            "H": [0, 0, 0, 1, 1, 0],
+            "I": [0, 0, 1, 0, 0, 0],
+            "K": [0, 1, 0, 1, 0, 0],
+            "L": [0, 0, 1, 0, 0, 0],
+            "M": [0, 0, 1, 0, 0, 0],
+            "N": [0, 0, 0, 1, 0, 0],
+            "P": [0, 0, 0, 0, 0, 1],
+            "Q": [0, 0, 0, 1, 0, 0],
+            "R": [0, 1, 0, 1, 0, 0],
+            "S": [0, 0, 0, 1, 0, 0],
+            "T": [0, 0, 0, 1, 0, 0],
+            "V": [0, 0, 1, 0, 0, 0],
+            "W": [0, 0, 1, 0, 1, 0],
+            "Y": [0, 0, 0, 1, 1, 0],
+        },
+        "argos": {
+            "I": [0.77],
+            "F": [1.2],
+            "V": [0.14],
+            "L": [2.3],
+            "W": [0.07],
+            "M": [2.3],
+            "A": [0.64],
+            "G": [-0.48],
+            "C": [0.25],
+            "Y": [-0.41],
+            "P": [-0.31],
+            "T": [-0.13],
+            "S": [-0.25],
+            "H": [-0.87],
+            "E": [-0.94],
+            "N": [-0.89],
+            "Q": [-0.61],
+            "D": [-1],
+            "K": [-1],
+            "R": [-0.68],
+        },
+        "bulkiness": {
+            "A": [0.443],
+            "C": [0.551],
+            "D": [0.453],
+            "E": [0.557],
+            "F": [0.898],
+            "G": [0],
+            "H": [0.563],
+            "I": [0.985],
+            "K": [0.674],
+            "L": [0.985],
+            "M": [0.703],
+            "N": [0.516],
+            "P": [0.768],
+            "Q": [0.605],
+            "R": [0.596],
+            "S": [0.332],
+            "T": [0.677],
+            "V": [0.995],
+            "W": [1],
+            "Y": [0.801],
+        },
+        "charge_phys": {
+            "A": [0.0],
+            "C": [-0.1],
+            "D": [-1.0],
+            "E": [-1.0],
+            "F": [0.0],
+            "G": [0.0],
+            "H": [0.1],
+            "I": [0.0],
+            "K": [1.0],
+            "L": [0.0],
+            "M": [0.0],
+            "N": [0.0],
+            "P": [0.0],
+            "Q": [0.0],
+            "R": [1.0],
+            "S": [0.0],
+            "T": [0.0],
+            "V": [0.0],
+            "W": [0.0],
+            "Y": [0.0],
+        },
+        "charge_acid": {
+            "A": [0.0],
+            "C": [-0.1],
+            "D": [-1.0],
+            "E": [-1.0],
+            "F": [0.0],
+            "G": [0.0],
+            "H": [1.0],
+            "I": [0.0],
+            "K": [1.0],
+            "L": [0.0],
+            "M": [0.0],
+            "N": [0.0],
+            "P": [0.0],
+            "Q": [0.0],
+            "R": [1.0],
+            "S": [0.0],
+            "T": [0.0],
+            "V": [0.0],
+            "W": [0.0],
+            "Y": [0.0],
+        },
+        "cougar": {
+            "A": [0.25, 0.62, 1.89],
+            "C": [0.208, 0.29, 1.73],
+            "D": [0.875, -0.9, 3.13],
+            "E": [0.833, -0.74, 3.14],
+            "F": [0.042, 1.2, 1.53],
+            "G": [1, 0.48, 2.67],
+            "H": [0.083, -0.4, 3],
+            "I": [0.667, 1.4, 1.97],
+            "K": [0.708, -1.5, 2.28],
+            "L": [0.292, 1.1, 1.74],
+            "M": [0, 0.64, 2.5],
+            "N": [0.667, -0.78, 2.33],
+            "P": [0.875, 0.12, 0.22],
+            "Q": [0.792, -0.85, 3.05],
+            "R": [0.958, -2.5, 1.91],
+            "S": [0.875, -0.18, 2.14],
+            "T": [0.583, -0.05, 2.18],
+            "V": [0.375, 1.1, 2.37],
+            "W": [0.042, 0.81, 2],
+            "Y": [0.5, 0.26, 2.01],
+        },
+        "eisenberg": {
+            "I": [1.4],
+            "F": [1.2],
+            "V": [1.1],
+            "L": [1.1],
+            "W": [0.81],
+            "M": [0.64],
+            "A": [0.62],
+            "G": [0.48],
+            "C": [0.29],
+            "Y": [0.26],
+            "P": [0.12],
+            "T": [-0.05],
+            "S": [-0.18],
+            "H": [-0.4],
+            "E": [-0.74],
+            "N": [-0.78],
+            "Q": [-0.85],
+            "D": [-0.9],
+            "K": [-1.5],
+            "R": [-2.5],
+        },
+        "ez": {
+            "A": [-0.29, 10.22, 4.67],
+            "C": [0.95, 13.69, 5.77],
+            "D": [1.19, 14.25, 8.98],
+            "E": [1.3, 14.66, 4.16],
+            "F": [-0.8, 19.67, 7.12],
+            "G": [-0.01, 13.86, 6],
+            "H": [0.75, 12.26, 2.77],
+            "I": [-0.56, 14.34, 10.69],
+            "K": [1.66, 11.11, 2.09],
+            "L": [-0.64, 17.34, 8.61],
+            "M": [-0.28, 18.04, 7.13],
+            "N": [0.89, 12.78, 6.28],
+            "P": [0.83, 18.09, 3.53],
+            "Q": [1.21, 10.46, 2.59],
+            "R": [1.55, 9.34, 4.68],
+            "S": [0.1, 13.86, 6],
+            "T": [0.01, 13.86, 6],
+            "V": [-0.47, 11.35, 4.97],
+            "W": [-0.85, 11.65, 7.2],
+            "Y": [-0.42, 13.04, 6.2],
+        },
+        "flexibility": {
+            "A": [0.25],
+            "C": [0.208],
+            "D": [0.875],
+            "E": [0.833],
+            "F": [0.042],
+            "G": [1],
+            "H": [0.083],
+            "I": [0.667],
+            "K": [0.708],
+            "L": [0.292],
+            "M": [0.0],
+            "N": [0.667],
+            "P": [0.875],
+            "Q": [0.792],
+            "R": [0.958],
+            "S": [0.875],
+            "T": [0.583],
+            "V": [0.375],
+            "W": [0.042],
+            "Y": [0.5],
+        },
+        "grantham": {
+            "A": [0, 8.1, 31],
+            "C": [2.75, 5.5, 55],
+            "D": [1.38, 13.0, 54],
+            "E": [0.92, 12.3, 83],
+            "F": [0, 5.2, 132],
+            "G": [0.74, 9.0, 3],
+            "H": [0.58, 10.4, 96],
+            "I": [0, 5.2, 111],
+            "K": [0.33, 11.3, 119],
+            "L": [0, 4.9, 111],
+            "M": [0, 5.7, 105],
+            "N": [1.33, 11.6, 56],
+            "P": [0.39, 8.0, 32.5],
+            "Q": [0.89, 10.5, 85],
+            "R": [0.65, 10.5, 124],
+            "S": [1.42, 9.2, 32],
+            "T": [0.71, 8.6, 61],
+            "V": [0, 5.9, 84],
+            "W": [0.13, 5.4, 170],
+            "Y": [0.20, 6.2, 136],
+        },
+        "gravy": {
+            "I": [4.5],
+            "V": [4.2],
+            "L": [3.8],
+            "F": [2.8],
+            "C": [2.5],
+            "M": [1.9],
+            "A": [1.8],
+            "G": [-0.4],
+            "T": [-0.7],
+            "W": [-0.9],
+            "S": [-0.8],
+            "Y": [-1.3],
+            "P": [-1.6],
+            "H": [-3.2],
+            "E": [-3.5],
+            "Q": [-3.5],
+            "D": [-3.5],
+            "N": [-3.5],
+            "K": [-3.9],
+            "R": [-4.5],
+        },
+        "hopp-woods": {
+            "A": [-0.5],
+            "C": [-1],
+            "D": [3],
+            "E": [3],
+            "F": [-2.5],
+            "G": [0],
+            "H": [-0.5],
+            "I": [-1.8],
+            "K": [3],
+            "L": [-1.8],
+            "M": [-1.3],
+            "N": [0.2],
+            "P": [0],
+            "Q": [0.2],
+            "R": [3],
+            "S": [0.3],
+            "T": [-0.4],
+            "V": [-1.5],
+            "W": [-3.4],
+            "Y": [-2.3],
+        },
+        "isaeci": {
+            "A": [62.9, 0.05],
+            "C": [78.51, 0.15],
+            "D": [18.46, 1.25],
+            "E": [30.19, 1.31],
+            "F": [189.42, 0.14],
+            "G": [19.93, 0.02],
+            "H": [87.38, 0.56],
+            "I": [149.77, 0.09],
+            "K": [102.78, 0.53],
+            "L": [154.35, 0.1],
+            "M": [132.22, 0.34],
+            "N": [19.53, 1.36],
+            "P": [122.35, 0.16],
+            "Q": [17.87, 1.31],
+            "R": [52.98, 1.69],
+            "S": [19.75, 0.56],
+            "T": [59.44, 0.65],
+            "V": [120.91, 0.07],
+            "W": [179.16, 1.08],
+            "Y": [132.16, 0.72],
+        },
+        "janin": {
+            "I": [1.2],
+            "F": [0.87],
+            "V": [1],
+            "L": [0.87],
+            "W": [0.59],
+            "M": [0.73],
+            "A": [0.59],
+            "G": [0.59],
+            "C": [1.4],
+            "Y": [-0.4],
+            "P": [-0.26],
+            "T": [-0.12],
+            "S": [0.02],
+            "H": [0.02],
+            "E": [-0.83],
+            "N": [-0.55],
+            "Q": [-0.83],
+            "D": [-0.69],
+            "K": [-2.4],
+            "R": [-1.8],
+        },
+        "kytedoolittle": {
+            "I": [1.7],
+            "F": [1.1],
+            "V": [1.6],
+            "L": [1.4],
+            "W": [-0.14],
+            "M": [0.8],
+            "A": [0.77],
+            "G": [0.03],
+            "C": [1],
+            "Y": [-0.27],
+            "P": [-0.37],
+            "T": [-0.07],
+            "S": [-0.1],
+            "H": [-0.91],
+            "E": [-1],
+            "N": [-1],
+            "Q": [-1],
+            "D": [-1],
+            "K": [-1.1],
+            "R": [-1.3],
+        },
+        "levitt_alpha": {
+            "A": [1.29],
+            "C": [1.11],
+            "D": [1.04],
+            "E": [1.44],
+            "F": [1.07],
+            "G": [0.56],
+            "H": [1.22],
+            "I": [0.97],
+            "K": [1.23],
+            "L": [1.3],
+            "M": [1.47],
+            "N": [0.9],
+            "P": [0.52],
+            "Q": [1.27],
+            "R": [0.96],
+            "S": [0.82],
+            "T": [0.82],
+            "V": [0.91],
+            "W": [0.99],
+            "Y": [0.72],
+        },
+        "mss": {
+            "A": [13.02],
+            "C": [23.7067],
+            "D": [22.02],
+            "E": [20.0233],
+            "F": [23.5288],
+            "G": [1.01],
+            "H": [23.5283],
+            "I": [22.3611],
+            "K": [18.9756],
+            "L": [19.6944],
+            "M": [21.92],
+            "N": [21.8567],
+            "P": [19.0242],
+            "Q": [19.9689],
+            "R": [19.0434],
+            "S": [18.3533],
+            "T": [22.3567],
+            "V": [21.0267],
+            "W": [26.1975],
+            "Y": [24.1954],
+        },
+        "msw": {
+            "A": [-0.73, 0.2, -0.62],
+            "C": [-0.66, 0.26, -0.27],
+            "D": [0.11, -1, -0.96],
+            "E": [0.24, -0.39, -0.04],
+            "F": [0.76, 0.85, -0.34],
+            "G": [-0.31, -0.28, -0.75],
+            "H": [0.84, 0.67, -0.78],
+            "I": [-0.91, 0.83, -0.25],
+            "K": [-0.51, 0.08, 0.6],
+            "L": [-0.74, 0.72, -0.16],
+            "M": [-0.7, 1, -0.32],
+            "N": [0.14, 0.2, -0.66],
+            "P": [-0.43, 0.73, -0.6],
+            "Q": [0.3, 1, -0.3],
+            "R": [-0.22, 0.27, 1],
+            "S": [-0.8, 0.61, -1],
+            "T": [-0.58, 0.85, -0.89],
+            "V": [-1, 0.79, -0.58],
+            "W": [1, 0.98, -0.47],
+            "Y": [0.97, 0.66, -0.16],
+        },
+        "pepcats": {
+            "A": [1, 0, 0, 0, 0, 0],
+            "C": [1, 0, 1, 1, 0, 0],
+            "D": [0, 0, 1, 0, 0, 1],
+            "E": [0, 0, 1, 0, 0, 1],
+            "F": [1, 1, 0, 0, 0, 0],
+            "G": [0, 0, 0, 0, 0, 0],
+            "H": [1, 1, 0, 1, 1, 0],
+            "I": [1, 0, 0, 0, 0, 0],
+            "K": [1, 0, 0, 1, 1, 0],
+            "L": [1, 0, 0, 0, 0, 0],
+            "M": [1, 0, 1, 0, 0, 0],
+            "N": [0, 0, 1, 1, 0, 0],
+            "P": [1, 0, 0, 0, 0, 0],
+            "Q": [0, 0, 1, 1, 0, 0],
+            "R": [1, 0, 0, 1, 1, 0],
+            "S": [0, 0, 1, 1, 0, 0],
+            "T": [0, 0, 1, 1, 0, 0],
+            "V": [1, 0, 0, 0, 0, 0],
+            "W": [1, 1, 0, 1, 0, 0],
+            "Y": [1, 1, 1, 1, 0, 0],
+        },
+        "peparc": {
+            "A": [1, 0, 0, 0, 0],
+            "C": [0, 1, 0, 0, 0],
+            "D": [0, 1, 0, 1, 0],
+            "E": [0, 1, 0, 1, 0],
+            "F": [1, 0, 0, 0, 0],
+            "G": [0, 0, 0, 0, 0],
+            "H": [0, 1, 1, 0, 0],
+            "I": [1, 0, 0, 0, 0],
+            "K": [0, 1, 1, 0, 0],
+            "L": [1, 0, 0, 0, 0],
+            "M": [1, 0, 0, 0, 0],
+            "N": [0, 1, 0, 0, 0],
+            "P": [0, 0, 0, 0, 1],
+            "Q": [0, 1, 0, 0, 0],
+            "R": [0, 1, 1, 0, 0],
+            "S": [0, 1, 0, 0, 0],
+            "T": [0, 1, 0, 0, 0],
+            "V": [1, 0, 0, 0, 0],
+            "W": [1, 0, 0, 0, 0],
+            "Y": [1, 0, 0, 0, 0],
+        },
+        "polarity": {
+            "A": [0.395],
+            "C": [0.074],
+            "D": [1.0],
+            "E": [0.914],
+            "F": [0.037],
+            "G": [0.506],
+            "H": [0.679],
+            "I": [0.037],
+            "K": [0.79],
+            "L": [0.0],
+            "M": [0.099],
+            "N": [0.827],
+            "P": [0.383],
+            "Q": [0.691],
+            "R": [0.691],
+            "S": [0.531],
+            "T": [0.457],
+            "V": [0.123],
+            "W": [0.062],
+            "Y": [0.16],
+        },
+        "ppcali": {
+            "A": [
+                0.070781,
+                0.036271,
+                2.042,
+                0.083272,
+                0.69089,
+                0.15948,
+                -0.80893,
+                0.24698,
+                0.86525,
+                0.68563,
+                -0.24665,
+                0.61314,
+                -0.53343,
+                -0.50878,
+                -1.3646,
+                2.2679,
+                -1.5644,
+                -0.75043,
+                -0.65875,
+            ],
+            "C": [
+                0.61013,
+                -0.93043,
+                -0.85983,
+                -2.2704,
+                1.5877,
+                -2.0066,
+                -0.30314,
+                1.2544,
+                -0.2832,
+                -1.2844,
+                -0.73449,
+                -0.11235,
+                -0.41152,
+                -0.0050164,
+                0.28307,
+                0.20522,
+                -0.021084,
+                -0.15627,
+                -0.32689,
+            ],
+            "D": [
+                -1.3215,
+                0.24063,
+                -0.032754,
+                -0.37863,
+                1.2051,
+                1.0001,
+                2.1827,
+                0.19212,
+                -0.60529,
+                0.37639,
+                -0.46451,
+                -0.46788,
+                1.4077,
+                -2.1661,
+                0.72604,
+                -0.12332,
+                -0.8243,
+                -0.082989,
+                0.053476,
+            ],
+            "E": [
+                -0.87713,
+                1.4905,
+                1.0755,
+                0.35944,
+                1.567,
+                0.41365,
+                1.0944,
+                0.72634,
+                -0.74957,
+                0.038939,
+                0.075057,
+                0.78637,
+                -1.4543,
+                1.6667,
+                -0.097439,
+                -0.24293,
+                1.7687,
+                0.36174,
+                -0.11585,
+            ],
+            "F": [
+                1.3557,
+                -0.10336,
+                -0.4309,
+                0.41269,
+                -0.083356,
+                0.83783,
+                0.095381,
+                -0.65222,
+                -0.3119,
+                0.43293,
+                -1.0011,
+                -0.66855,
+                -0.10242,
+                1.2066,
+                2.6234,
+                1.9981,
+                -0.25016,
+                0.71979,
+                0.21569,
+            ],
+            "G": [
+                -1.0818,
+                -2.1561,
+                0.77082,
+                -0.92747,
+                -1.0748,
+                1.7997,
+                -1.3708,
+                1.279,
+                -1.2098,
+                0.46065,
+                0.43076,
+                0.20037,
+                -0.2302,
+                0.2646,
+                0.57149,
+                -0.68432,
+                0.19341,
+                -0.061606,
+                -0.08071,
+            ],
+            "H": [
+                -0.050161,
+                0.69246,
+                -0.88397,
+                -0.64601,
+                0.24622,
+                0.10487,
+                -1.1317,
+                -2.3661,
+                -0.89918,
+                0.46391,
+                -0.62359,
+                2.5478,
+                -0.34737,
+                -0.52062,
+                0.17522,
+                -0.88648,
+                -0.4755,
+                0.023187,
+                -0.28261,
+            ],
+            "I": [
+                1.4829,
+                -0.46435,
+                0.50189,
+                0.55724,
+                -0.51535,
+                -0.29914,
+                0.97236,
+                -0.15793,
+                -0.98246,
+                -0.54347,
+                0.97806,
+                0.37577,
+                1.618,
+                0.62323,
+                -0.59359,
+                -0.35483,
+                -0.085017,
+                0.55825,
+                -2.7542,
+            ],
+            "K": [
+                -0.85344,
+                1.529,
+                0.27747,
+                0.32993,
+                -1.1786,
+                -0.16633,
+                -1.0459,
+                0.44621,
+                0.41027,
+                -2.5318,
+                0.91329,
+                0.53385,
+                0.61417,
+                -1.111,
+                1.1323,
+                0.95105,
+                0.76769,
+                -0.016115,
+                0.054995,
+            ],
+            "L": [
+                1.2857,
+                0.039488,
+                1.5378,
+                0.87969,
+                -0.21419,
+                0.40389,
+                -0.20426,
+                -0.14351,
+                0.61024,
+                -1.1927,
+                -2.2149,
+                -0.84248,
+                -0.5061,
+                -0.48548,
+                0.10791,
+                -2.1503,
+                -0.12006,
+                -0.60222,
+                0.26546,
+            ],
+            "M": [
+                1.137,
+                0.64388,
+                0.13724,
+                -0.2988,
+                1.2288,
+                0.24981,
+                -1.6427,
+                -0.75868,
+                -0.54902,
+                1.0571,
+                1.272,
+                -1.9104,
+                0.70919,
+                -0.93575,
+                -0.6314,
+                -0.079654,
+                1.634,
+                -0.0021923,
+                0.49825,
+            ],
+            "N": [
+                -1.084,
+                -0.176,
+                -0.47062,
+                -0.92245,
+                -0.32953,
+                0.74278,
+                0.34551,
+                -1.4605,
+                0.25219,
+                -1.2107,
+                -0.59978,
+                -0.79183,
+                1.3268,
+                1.9839,
+                -1.6137,
+                0.5333,
+                0.033889,
+                -1.0331,
+                0.83019,
+            ],
+            "P": [
+                -1.1823,
+                -1.6911,
+                -1.1331,
+                3.073,
+                1.1942,
+                -0.93426,
+                -0.72985,
+                -0.042441,
+                -0.19264,
+                -0.21603,
+                -0.1239,
+                0.054016,
+                0.15241,
+                -0.019691,
+                -0.20543,
+                0.10206,
+                0.07671,
+                -0.081968,
+                0.20348,
+            ],
+            "Q": [
+                -0.57747,
+                0.97452,
+                -0.077547,
+                -0.0033488,
+                0.17184,
+                -0.52537,
+                -0.27362,
+                -0.1366,
+                0.2057,
+                -0.013066,
+                1.8834,
+                -1.2736,
+                -0.84991,
+                1.0445,
+                0.69027,
+                -1.2866,
+                -2.6776,
+                0.1683,
+                0.086105,
+            ],
+            "R": [
+                -0.62245,
+                1.545,
+                -0.61966,
+                0.19057,
+                -1.7485,
+                -1.3909,
+                -0.47526,
+                1.3938,
+                -0.84556,
+                1.7344,
+                -1.6516,
+                -0.52678,
+                0.6791,
+                0.24374,
+                -0.62551,
+                -0.0028271,
+                -0.053884,
+                0.14926,
+                -0.17232,
+            ],
+            "S": [
+                -0.86409,
+                -0.77147,
+                0.38542,
+                -0.59389,
+                -0.53313,
+                -0.47585,
+                0.31966,
+                -0.89716,
+                1.8029,
+                0.26431,
+                -0.23173,
+                -0.37626,
+                -0.47349,
+                -0.42878,
+                -0.47297,
+                -0.079826,
+                0.57043,
+                3.2057,
+                -0.18413,
+            ],
+            "T": [
+                -0.33027,
+                -0.57447,
+                0.18653,
+                -0.28941,
+                -0.62681,
+                -1.0737,
+                0.80363,
+                -0.59525,
+                1.8786,
+                1.3971,
+                0.63929,
+                0.21281,
+                -0.067048,
+                0.096271,
+                1.323,
+                -0.36173,
+                1.2261,
+                -2.2771,
+                -0.65412,
+            ],
+            "V": [
+                1.1675,
+                -0.61554,
+                0.95405,
+                0.11662,
+                -0.74473,
+                -1.1482,
+                1.1309,
+                0.12079,
+                -0.77171,
+                0.18597,
+                0.93442,
+                1.201,
+                0.3826,
+                -0.091573,
+                -0.31269,
+                0.074367,
+                -0.22946,
+                0.24322,
+                2.9836,
+            ],
+            "W": [
+                1.1881,
+                0.43789,
+                -1.7915,
+                0.138,
+                0.43088,
+                1.6467,
+                -0.11987,
+                1.7369,
+                2.0818,
+                0.33122,
+                0.31829,
+                1.1586,
+                0.67649,
+                0.30819,
+                -0.55772,
+                -0.54491,
+                -0.17969,
+                0.24477,
+                0.38674,
+            ],
+            "Y": [
+                0.54671,
+                -0.1468,
+                -1.5688,
+                0.19001,
+                -1.2736,
+                0.66162,
+                1.1614,
+                -0.18614,
+                -0.70654,
+                -0.43634,
+                0.44775,
+                -0.71366,
+                -2.5907,
+                -1.1649,
+                -1.1576,
+                0.66572,
+                0.21019,
+                -0.61016,
+                -0.34844,
+            ],
+        },
+        "refractivity": {
+            "A": [0.102045615],
+            "C": [0.841053374],
+            "D": [0.282153774],
+            "E": [0.405831178],
+            "F": [0.691276746],
+            "G": [0],
+            "H": [0.512814484],
+            "I": [0.448154244],
+            "K": [0.50058782],
+            "L": [0.441570656],
+            "M": [0.508817305],
+            "N": [0.282153774],
+            "P": [0.256995062],
+            "Q": [0.405831178],
+            "R": [0.626851634],
+            "S": [0.149306372],
+            "T": [0.258876087],
+            "V": [0.327298378],
+            "W": [1],
+            "Y": [0.741359041],
+        },
+        "t_scale": {
+            "A": [-8.4, -8.01, -3.73, -3.65, -6.12, -1.59, 1.56],
+            "C": [-2.44, -1.96, 0.93, -2.35, 1.31, 2.29, -1.52],
+            "D": [-6.84, -0.94, 17.68, -0.03, 3.44, 9.07, 4.32],
+            "E": [-6.5, 16.2, 17.28, 3.11, -4.75, -2.54, 4.72],
+            "F": [21.59, -5.73, 1.03, -3.3, 2.64, -5.02, 1.7],
+            "G": [-8.48, -10.37, -5.14, -6.51, -11.84, -3.6, 2.01],
+            "H": [15.28, -3.67, 6.72, -6.38, 4.12, -1.55, -2.85],
+            "I": [-2.97, 4.64, -0.77, 11, 3.26, -4.36, -7.88],
+            "K": [2.7, 13.46, -14.03, -2.55, 2.77, 0.15, 3.19],
+            "L": [2.61, 5.96, 1.97, 2.59, -4.77, -4.84, -5.44],
+            "M": [3.38, 12.43, -4.77, 0.45, -1.55, -0.6, 3.26],
+            "N": [-3.11, -1.22, 6.26, -9.38, 9.94, 7.66, -4.81],
+            "P": [-5.35, -9.07, -1.52, -8.79, -8.73, 4.29, -9.91],
+            "Q": [-5.31, 15.64, 8.44, 1.03, -4.32, -4.4, -0.52],
+            "R": [-2.27, 18.9, -18.24, -3.47, 3.03, 6.64, 0.45],
+            "S": [-15.88, -11.21, -2.44, -3.61, 3.46, -0.37, 8.98],
+            "T": [-17.81, -13.64, -5.19, 10.57, 6.91, -4.43, 3.49],
+            "V": [-5.8, -6.15, -2.26, 9.87, 5.28, -1.49, -7.54],
+            "W": [21.68, -8.78, -2.53, 15.53, -8.15, 11.98, 3.23],
+            "Y": [23.9, -6.47, 0.31, -4.14, 4.08, -7.28, 3.59],
+        },
+        "tm_tend": {
+            "A": [0.38],
+            "C": [-0.3],
+            "D": [-3.27],
+            "E": [-2.9],
+            "F": [1.98],
+            "G": [-0.19],
+            "H": [-1.44],
+            "I": [1.97],
+            "K": [-3.46],
+            "L": [1.82],
+            "M": [1.4],
+            "N": [-1.62],
+            "P": [-1.44],
+            "Q": [-1.84],
+            "R": [-2.57],
+            "S": [-0.53],
+            "T": [-0.32],
+            "V": [1.46],
+            "W": [1.53],
+            "Y": [0.49],
+        },
+        "z3": {
+            "A": [0.07, -1.73, 0.09],
+            "C": [0.71, -0.97, 4.13],
+            "D": [3.64, 1.13, 2.36],
+            "E": [3.08, 0.39, -0.07],
+            "F": [-4.92, 1.3, 0.45],
+            "G": [2.23, -5.36, 0.3],
+            "H": [2.41, 1.74, 1.11],
+            "I": [-4.44, -1.68, -1.03],
+            "K": [2.84, 1.41, -3.14],
+            "L": [-4.19, -1.03, -0.98],
+            "M": [-2.49, -0.27, -0.41],
+            "N": [3.22, 1.45, 0.84],
+            "P": [-1.22, 0.88, 2.23],
+            "Q": [2.18, 0.53, -1.14],
+            "R": [2.88, 2.52, -3.44],
+            "S": [1.96, -1.63, 0.57],
+            "T": [0.92, -2.09, -1.4],
+            "V": [-2.69, -2.53, -1.29],
+            "W": [-4.75, 3.65, 0.85],
+            "Y": [-1.39, 2.32, 0.01],
+        },
+        "z5": {
+            "A": [0.24, -2.32, 0.6, -0.14, 1.3],
+            "C": [0.84, -1.67, 3.71, 0.18, -2.65],
+            "D": [3.98, 0.93, 1.93, -2.46, 0.75],
+            "E": [3.11, 0.26, -0.11, -3.04, -0.25],
+            "F": [-4.22, 1.94, 1.06, 0.54, -0.62],
+            "G": [2.05, -4.06, 0.36, -0.82, -0.38],
+            "H": [2.47, 1.95, 0.26, 3.9, 0.09],
+            "I": [-3.89, -1.73, -1.71, -0.84, 0.26],
+            "K": [2.29, 0.89, -2.49, 1.49, 0.31],
+            "L": [-4.28, -1.3, -1.49, -0.72, 0.84],
+            "M": [-2.85, -0.22, 0.47, 1.94, -0.98],
+            "N": [3.05, 1.62, 1.04, -1.15, 1.61],
+            "P": [-1.66, 0.27, 1.84, 0.7, 2],
+            "Q": [1.75, 0.5, -1.44, -1.34, 0.66],
+            "R": [3.52, 2.5, -3.5, 1.99, -0.17],
+            "S": [2.39, -1.07, 1.15, -1.39, 0.67],
+            "T": [0.75, -2.18, -1.12, -1.46, -0.4],
+            "V": [-2.59, -2.64, -1.54, -0.85, -0.02],
+            "W": [-4.36, 3.94, 0.59, 3.44, -1.59],
+            "Y": [-2.54, 2.44, 0.43, 0.04, -1.47],
+        },
+    }
+    if scalename == "all":
+        d = {
+            "I": [],
+            "F": [],
+            "V": [],
+            "L": [],
+            "W": [],
+            "M": [],
+            "A": [],
+            "G": [],
+            "C": [],
+            "Y": [],
+            "P": [],
+            "T": [],
+            "S": [],
+            "H": [],
+            "E": [],
+            "N": [],
+            "Q": [],
+            "D": [],
+            "K": [],
+            "R": [],
+        }
+        for scale in scales.keys():
+            for k, v in scales[scale].items():
+                d[k].extend(v)
+        return "all", d
+
+    elif scalename == "instability":
+        d = {
+            "A": {
+                "A": 1.0,
+                "C": 44.94,
+                "E": 1.0,
+                "D": -7.49,
+                "G": 1.0,
+                "F": 1.0,
+                "I": 1.0,
+                "H": -7.49,
+                "K": 1.0,
+                "M": 1.0,
+                "L": 1.0,
+                "N": 1.0,
+                "Q": 1.0,
+                "P": 20.26,
+                "S": 1.0,
+                "R": 1.0,
+                "T": 1.0,
+                "W": 1.0,
+                "V": 1.0,
+                "Y": 1.0,
+            },
+            "C": {
+                "A": 1.0,
+                "C": 1.0,
+                "E": 1.0,
+                "D": 20.26,
+                "G": 1.0,
+                "F": 1.0,
+                "I": 1.0,
+                "H": 33.6,
+                "K": 1.0,
+                "M": 33.6,
+                "L": 20.26,
+                "N": 1.0,
+                "Q": -6.54,
+                "P": 20.26,
+                "S": 1.0,
+                "R": 1.0,
+                "T": 33.6,
+                "W": 24.68,
+                "V": -6.54,
+                "Y": 1.0,
+            },
+            "E": {
+                "A": 1.0,
+                "C": 44.94,
+                "E": 33.6,
+                "D": 20.26,
+                "G": 1.0,
+                "F": 1.0,
+                "I": 20.26,
+                "H": -6.54,
+                "K": 1.0,
+                "M": 1.0,
+                "L": 1.0,
+                "N": 1.0,
+                "Q": 20.26,
+                "P": 20.26,
+                "S": 20.26,
+                "R": 1.0,
+                "T": 1.0,
+                "W": -14.03,
+                "V": 1.0,
+                "Y": 1.0,
+            },
+            "D": {
+                "A": 1.0,
+                "C": 1.0,
+                "E": 1.0,
+                "D": 1.0,
+                "G": 1.0,
+                "F": -6.54,
+                "I": 1.0,
+                "H": 1.0,
+                "K": -7.49,
+                "M": 1.0,
+                "L": 1.0,
+                "N": 1.0,
+                "Q": 1.0,
+                "P": 1.0,
+                "S": 20.26,
+                "R": -6.54,
+                "T": -14.03,
+                "W": 1.0,
+                "V": 1.0,
+                "Y": 1.0,
+            },
+            "G": {
+                "A": -7.49,
+                "C": 1.0,
+                "E": -6.54,
+                "D": 1.0,
+                "G": 13.34,
+                "F": 1.0,
+                "I": -7.49,
+                "H": 1.0,
+                "K": -7.49,
+                "M": 1.0,
+                "L": 1.0,
+                "N": -7.49,
+                "Q": 1.0,
+                "P": 1.0,
+                "S": 1.0,
+                "R": 1.0,
+                "T": -7.49,
+                "W": 13.34,
+                "V": 1.0,
+                "Y": -7.49,
+            },
+            "F": {
+                "A": 1.0,
+                "C": 1.0,
+                "E": 1.0,
+                "D": 13.34,
+                "G": 1.0,
+                "F": 1.0,
+                "I": 1.0,
+                "H": 1.0,
+                "K": -14.03,
+                "M": 1.0,
+                "L": 1.0,
+                "N": 1.0,
+                "Q": 1.0,
+                "P": 20.26,
+                "S": 1.0,
+                "R": 1.0,
+                "T": 1.0,
+                "W": 1.0,
+                "V": 1.0,
+                "Y": 33.601,
+            },
+            "I": {
+                "A": 1.0,
+                "C": 1.0,
+                "E": 44.94,
+                "D": 1.0,
+                "G": 1.0,
+                "F": 1.0,
+                "I": 1.0,
+                "H": 13.34,
+                "K": -7.49,
+                "M": 1.0,
+                "L": 20.26,
+                "N": 1.0,
+                "Q": 1.0,
+                "P": -1.88,
+                "S": 1.0,
+                "R": 1.0,
+                "T": 1.0,
+                "W": 1.0,
+                "V": -7.49,
+                "Y": 1.0,
+            },
+            "H": {
+                "A": 1.0,
+                "C": 1.0,
+                "E": 1.0,
+                "D": 1.0,
+                "G": -9.37,
+                "F": -9.37,
+                "I": 44.94,
+                "H": 1.0,
+                "K": 24.68,
+                "M": 1.0,
+                "L": 1.0,
+                "N": 24.68,
+                "Q": 1.0,
+                "P": -1.88,
+                "S": 1.0,
+                "R": 1.0,
+                "T": -6.54,
+                "W": -1.88,
+                "V": 1.0,
+                "Y": 44.94,
+            },
+            "K": {
+                "A": 1.0,
+                "C": 1.0,
+                "E": 1.0,
+                "D": 1.0,
+                "G": -7.49,
+                "F": 1.0,
+                "I": -7.49,
+                "H": 1.0,
+                "K": 1.0,
+                "M": 33.6,
+                "L": -7.49,
+                "N": 1.0,
+                "Q": 24.64,
+                "P": -6.54,
+                "S": 1.0,
+                "R": 33.6,
+                "T": 1.0,
+                "W": 1.0,
+                "V": -7.49,
+                "Y": 1.0,
+            },
+            "M": {
+                "A": 13.34,
+                "C": 1.0,
+                "E": 1.0,
+                "D": 1.0,
+                "G": 1.0,
+                "F": 1.0,
+                "I": 1.0,
+                "H": 58.28,
+                "K": 1.0,
+                "M": -1.88,
+                "L": 1.0,
+                "N": 1.0,
+                "Q": -6.54,
+                "P": 44.94,
+                "S": 44.94,
+                "R": -6.54,
+                "T": -1.88,
+                "W": 1.0,
+                "V": 1.0,
+                "Y": 24.68,
+            },
+            "L": {
+                "A": 1.0,
+                "C": 1.0,
+                "E": 1.0,
+                "D": 1.0,
+                "G": 1.0,
+                "F": 1.0,
+                "I": 1.0,
+                "H": 1.0,
+                "K": -7.49,
+                "M": 1.0,
+                "L": 1.0,
+                "N": 1.0,
+                "Q": 33.6,
+                "P": 20.26,
+                "S": 1.0,
+                "R": 20.26,
+                "T": 1.0,
+                "W": 24.68,
+                "V": 1.0,
+                "Y": 1.0,
+            },
+            "N": {
+                "A": 1.0,
+                "C": -1.88,
+                "E": 1.0,
+                "D": 1.0,
+                "G": -14.03,
+                "F": -14.03,
+                "I": 44.94,
+                "H": 1.0,
+                "K": 24.68,
+                "M": 1.0,
+                "L": 1.0,
+                "N": 1.0,
+                "Q": -6.54,
+                "P": -1.88,
+                "S": 1.0,
+                "R": 1.0,
+                "T": -7.49,
+                "W": -9.37,
+                "V": 1.0,
+                "Y": 1.0,
+            },
+            "Q": {
+                "A": 1.0,
+                "C": -6.54,
+                "E": 20.26,
+                "D": 20.26,
+                "G": 1.0,
+                "F": -6.54,
+                "I": 1.0,
+                "H": 1.0,
+                "K": 1.0,
+                "M": 1.0,
+                "L": 1.0,
+                "N": 1.0,
+                "Q": 20.26,
+                "P": 20.26,
+                "S": 44.94,
+                "R": 1.0,
+                "T": 1.0,
+                "W": 1.0,
+                "V": -6.54,
+                "Y": -6.54,
+            },
+            "P": {
+                "A": 20.26,
+                "C": -6.54,
+                "E": 18.38,
+                "D": -6.54,
+                "G": 1.0,
+                "F": 20.26,
+                "I": 1.0,
+                "H": 1.0,
+                "K": 1.0,
+                "M": -6.54,
+                "L": 1.0,
+                "N": 1.0,
+                "Q": 20.26,
+                "P": 20.26,
+                "S": 20.26,
+                "R": -6.54,
+                "T": 1.0,
+                "W": -1.88,
+                "V": 20.26,
+                "Y": 1.0,
+            },
+            "S": {
+                "A": 1.0,
+                "C": 33.6,
+                "E": 20.26,
+                "D": 1.0,
+                "G": 1.0,
+                "F": 1.0,
+                "I": 1.0,
+                "H": 1.0,
+                "K": 1.0,
+                "M": 1.0,
+                "L": 1.0,
+                "N": 1.0,
+                "Q": 20.26,
+                "P": 44.94,
+                "S": 20.26,
+                "R": 20.26,
+                "T": 1.0,
+                "W": 1.0,
+                "V": 1.0,
+                "Y": 1.0,
+            },
+            "R": {
+                "A": 1.0,
+                "C": 1.0,
+                "E": 1.0,
+                "D": 1.0,
+                "G": -7.49,
+                "F": 1.0,
+                "I": 1.0,
+                "H": 20.26,
+                "K": 1.0,
+                "M": 1.0,
+                "L": 1.0,
+                "N": 13.34,
+                "Q": 20.26,
+                "P": 20.26,
+                "S": 44.94,
+                "R": 58.28,
+                "T": 1.0,
+                "W": 58.28,
+                "V": 1.0,
+                "Y": -6.54,
+            },
+            "T": {
+                "A": 1.0,
+                "C": 1.0,
+                "E": 20.26,
+                "D": 1.0,
+                "G": -7.49,
+                "F": 13.34,
+                "I": 1.0,
+                "H": 1.0,
+                "K": 1.0,
+                "M": 1.0,
+                "L": 1.0,
+                "N": -14.03,
+                "Q": -6.54,
+                "P": 1.0,
+                "S": 1.0,
+                "R": 1.0,
+                "T": 1.0,
+                "W": -14.03,
+                "V": 1.0,
+                "Y": 1.0,
+            },
+            "W": {
+                "A": -14.03,
+                "C": 1.0,
+                "E": 1.0,
+                "D": 1.0,
+                "G": -9.37,
+                "F": 1.0,
+                "I": 1.0,
+                "H": 24.68,
+                "K": 1.0,
+                "M": 24.68,
+                "L": 13.34,
+                "N": 13.34,
+                "Q": 1.0,
+                "P": 1.0,
+                "S": 1.0,
+                "R": 1.0,
+                "T": -14.03,
+                "W": 1.0,
+                "V": -7.49,
+                "Y": 1.0,
+            },
+            "V": {
+                "A": 1.0,
+                "C": 1.0,
+                "E": 1.0,
+                "D": -14.03,
+                "G": -7.49,
+                "F": 1.0,
+                "I": 1.0,
+                "H": 1.0,
+                "K": -1.88,
+                "M": 1.0,
+                "L": 1.0,
+                "N": 1.0,
+                "Q": 1.0,
+                "P": 20.26,
+                "S": 1.0,
+                "R": 1.0,
+                "T": -7.49,
+                "W": 1.0,
+                "V": 1.0,
+                "Y": -6.54,
+            },
+            "Y": {
+                "A": 24.68,
+                "C": 1.0,
+                "E": -6.54,
+                "D": 24.68,
+                "G": -7.49,
+                "F": 1.0,
+                "I": 1.0,
+                "H": 13.34,
+                "K": 1.0,
+                "M": 44.94,
+                "L": 1.0,
+                "N": 1.0,
+                "Q": 1.0,
+                "P": 13.34,
+                "S": 1.0,
+                "R": -15.91,
+                "T": -7.49,
+                "W": -9.37,
+                "V": 1.0,
+                "Y": 13.34,
+            },
+        }
+        return "instability", d
+
+    else:
+        return scalename, scales[scalename]
+
+
+def read_fasta(inputfile):
+    """Method for loading sequences from a FASTA formatted file into :py:attr:`sequences` & :py:attr:`names`.
+    This method is used by the base class :class:`modlamp.descriptors.PeptideDescriptor` if the input is a FASTA file.
+
+    :param inputfile: .fasta file with sequences and headers to read
+    :return: list of sequences in the attribute :py:attr:`sequences` with corresponding sequence names in
+        :py:attr:`names`.
+    """
+    names = list()  # list for storing names
+    sequences = list()  # list for storing sequences
+    seq = str()
+    with open(inputfile) as f:
+        all = f.readlines()
+        last = all[-1]
+        for line in all:
+            if line.startswith(">"):
+                names.append(
+                    line.split(" ")[0][1:].strip()
+                )  # add FASTA name without description as molecule name
+                sequences.append(seq.strip())
+                seq = str()
+            elif line == last:
+                seq += line.strip()  # remove potential white space
+                sequences.append(seq.strip())
+            else:
+                seq += line.strip()  # remove potential white space
+    return sequences[1:], names
+
+
+def save_fasta(filename, sequences, names=None):
+    """Method for saving sequences in the instance :py:attr:`sequences` to a file in FASTA format.
+
+    :param filename: {str} output filename (ending .fasta)
+    :param sequences: {list} sequences to be saved to file
+    :param names: {list} whether sequence names from self.names should be saved as sequence identifiers
+    :return: a FASTA formatted file containing the generated sequences
+    """
+    if os.path.exists(filename):
+        os.remove(filename)  # remove outputfile, it it exists
+
+    with open(filename, "w") as o:
+        for n, seq in enumerate(sequences):
+            if names:
+                o.write(">" + str(names[n]) + "\n")
+            else:
+                o.write(">Seq_" + str(n) + "\n")
+            o.write(seq + "\n")
+
+
+def aa_weights():
+    """Function holding molecular weight data on all natural amino acids.
+
+    :return: dictionary with amino acid letters and corresponding weights
+
+    .. versionadded:: v2.4.1
+    """
+    weights = {
+        "A": 89.093,
+        "C": 121.158,
+        "D": 133.103,
+        "E": 147.129,
+        "F": 165.189,
+        "G": 75.067,
+        "H": 155.155,
+        "I": 131.173,
+        "K": 146.188,
+        "L": 131.173,
+        "M": 149.211,
+        "N": 132.118,
+        "P": 115.131,
+        "Q": 146.145,
+        "R": 174.20,
+        "S": 105.093,
+        "T": 119.119,
+        "V": 117.146,
+        "W": 204.225,
+        "Y": 181.189,
+    }
+    return weights
+
+
+def count_aas(seq, scale="relative"):
+    """Function to count the amino acids occuring in a given sequence.
+
+    :param seq: {str} amino acid sequence
+    :param scale: {'absolute' or 'relative'} defines whether counts or frequencies are given for each AA
+    :return: {dict} dictionary with amino acids as keys and their counts in the sequence as values.
+    """
+    if seq == "":  # error if len(seq) == 0
+        seq = " "
+    aas = [
+        "A",
+        "C",
+        "D",
+        "E",
+        "F",
+        "G",
+        "H",
+        "I",
+        "K",
+        "L",
+        "M",
+        "N",
+        "P",
+        "Q",
+        "R",
+        "S",
+        "T",
+        "V",
+        "W",
+        "Y",
+    ]
+    scl = 1.0
+    if scale == "relative":
+        scl = len(seq)
+    aa = {a: (float(seq.count(a)) / scl) for a in aas}
+    aa = collections.OrderedDict(sorted(list(aa.items())))
+    return aa
+
+
+def count_ngrams(seq, n):
+    """Function to count the n-grams of an amino acid sequence. N can be one integer or a list of integers
+
+    :param seq: {str} amino acid sequence
+    :param n: {int or list of ints} defines whether counts or frequencies are given for each AA
+    :return: {dict} dictionary with n-grams as keys and their counts in the sequence as values.
+    """
+    if seq == "":
+        seq = " "
+    if isinstance(n, int):
+        n = [n]
+    ngrams = list()
+    for i in n:
+        ngrams.extend([seq[j : j + i] for j in range(len(seq) - (i - 1))])
+    counts = {g: (seq.count(g)) for g in set(ngrams)}
+    counts = collections.OrderedDict(
+        sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
+    )
+    return counts
+
+
+def aa_energies():
+    """Function holding free energies of transfer between cyclohexane and water for all natural amino acids.
+    H. G. Boman, D. Wade, I. a Boman, B. Wåhlin, R. B. Merrifield, *FEBS Lett*. **1989**, *259*, 103–106.
+
+    :return: dictionary with amino acid letters and corresponding energies.
+    """
+    energies = {
+        "L": -4.92,
+        "I": -4.92,
+        "V": -4.04,
+        "F": -2.98,
+        "M": -2.35,
+        "W": -2.33,
+        "A": -1.81,
+        "C": -1.28,
+        "G": -0.94,
+        "Y": 0.14,
+        "T": 2.57,
+        "S": 3.40,
+        "H": 4.66,
+        "Q": 5.54,
+        "K": 5.55,
+        "N": 6.64,
+        "E": 6.81,
+        "D": 8.72,
+        "R": 14.92,
+        "P": 0.0,
+    }
+    return energies
+
+
+def ngrams_apd():
+    """Function returning the most frequent 2-, 3- and 4-grams from all sequences in the `APD3
+    <http://aps.unmc.edu/AP/>`_, version August 2016 with 2727 sequences.
+    For all 2, 3 and 4grams, all possible ngrams were generated from all sequences and the top 50 most frequent
+    assembled into a list. Finally, leading and tailing spaces were striped and duplicates as well as ngrams containing
+    spaces were removed.
+
+    :return: numpy.array containing most frequent ngrams
+    """
+    return np.array(
+        [
+            "AGK",
+            "CKI",
+            "RR",
+            "YGGG",
+            "LSGL",
+            "RG",
+            "YGGY",
+            "PRP",
+            "LGGG",
+            "GV",
+            "GT",
+            "GS",
+            "GR",
+            "IAG",
+            "GG",
+            "GF",
+            "GC",
+            "GGYG",
+            "GA",
+            "GL",
+            "GK",
+            "GI",
+            "IPC",
+            "KAA",
+            "LAK",
+            "GLGG",
+            "GGLG",
+            "CKIT",
+            "GAGK",
+            "LLSG",
+            "LKK",
+            "FLP",
+            "LSG",
+            "SCK",
+            "LLS",
+            "GETC",
+            "VLG",
+            "GKLL",
+            "LLG",
+            "C",
+            "KCKI",
+            "G",
+            "VGK",
+            "CSC",
+            "TKKC",
+            "GCS",
+            "GKA",
+            "IGK",
+            "GESC",
+            "KVCY",
+            "KKL",
+            "KKI",
+            "KKC",
+            "LGGL",
+            "GLL",
+            "CGE",
+            "GGYC",
+            "GLLS",
+            "GLF",
+            "AKK",
+            "GKAA",
+            "ESCV",
+            "GLP",
+            "CGES",
+            "PCGE",
+            "FL",
+            "CGET",
+            "GLW",
+            "KGAA",
+            "KAAL",
+            "GGY",
+            "GGG",
+            "IKG",
+            "LKG",
+            "GGL",
+            "CK",
+            "GTC",
+            "CG",
+            "SKKC",
+            "CS",
+            "CR",
+            "KC",
+            "AGKA",
+            "KA",
+            "KG",
+            "LKCK",
+            "SCKL",
+            "KK",
+            "KI",
+            "KN",
+            "KL",
+            "SK",
+            "KV",
+            "SL",
+            "SC",
+            "SG",
+            "AAA",
+            "VAK",
+            "AAL",
+            "AAK",
+            "GGGG",
+            "KNVA",
+            "GGGL",
+            "GYG",
+            "LG",
+            "LA",
+            "LL",
+            "LK",
+            "LS",
+            "LP",
+            "GCSC",
+            "TC",
+            "GAA",
+            "AA",
+            "VA",
+            "VC",
+            "AG",
+            "VG",
+            "AI",
+            "AK",
+            "VL",
+            "AL",
+            "TPGC",
+            "IK",
+            "IA",
+            "IG",
+            "YGG",
+            "LGK",
+            "CSCK",
+            "GYGG",
+            "LGG",
+            "KGA",
+        ]
+    )
+
+
+def aa_formulas():
+    """
+    Function returning the molecular formulas of all amino acids. All amino acids are considered in the neutral form
+    (uncharged).
+    """
+    formulas = {
+        "A": {"C": 3, "H": 7, "N": 1, "O": 2, "S": 0},
+        "C": {"C": 3, "H": 7, "N": 1, "O": 2, "S": 1},
+        "D": {"C": 4, "H": 7, "N": 1, "O": 4, "S": 0},
+        "E": {"C": 5, "H": 9, "N": 1, "O": 4, "S": 0},
+        "F": {"C": 9, "H": 11, "N": 1, "O": 2, "S": 0},
+        "G": {"C": 2, "H": 5, "N": 1, "O": 2, "S": 0},
+        "H": {"C": 6, "H": 9, "N": 3, "O": 2, "S": 0},
+        "I": {"C": 6, "H": 13, "N": 1, "O": 2, "S": 0},
+        "K": {"C": 6, "H": 14, "N": 2, "O": 2, "S": 0},
+        "L": {"C": 6, "H": 13, "N": 1, "O": 2, "S": 0},
+        "M": {"C": 5, "H": 11, "N": 1, "O": 2, "S": 1},
+        "N": {"C": 4, "H": 8, "N": 2, "O": 3, "S": 0},
+        "P": {"C": 5, "H": 9, "N": 1, "O": 2, "S": 0},
+        "Q": {"C": 5, "H": 10, "N": 2, "O": 3, "S": 0},
+        "R": {"C": 6, "H": 14, "N": 4, "O": 2, "S": 0},
+        "S": {"C": 3, "H": 7, "N": 1, "O": 3, "S": 0},
+        "T": {"C": 4, "H": 9, "N": 1, "O": 3, "S": 0},
+        "V": {"C": 5, "H": 11, "N": 1, "O": 2, "S": 0},
+        "W": {"C": 11, "H": 12, "N": 2, "O": 2, "S": 0},
+        "Y": {"C": 9, "H": 11, "N": 1, "O": 3, "S": 0},
+    }
+    return formulas