Mercurial > repos > cpt > cpt_helical_wheel
diff plotWheels/descriptors.py @ 1:9b276485c94a draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:44:43 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/plotWheels/descriptors.py Mon Jun 05 02:44:43 2023 +0000 @@ -0,0 +1,1183 @@ +# -*- coding: utf-8 -*- +""" +.. currentmodule:: modlamp.descriptors + +.. moduleauthor:: modlab Alex Mueller ETH Zurich <alex.mueller@pharma.ethz.ch> + +This module incorporates different classes to calculate peptide descriptor values. The following classes are available: + +============================= ============================================================================ +Class Characteristics +============================= ============================================================================ +:py:class:`GlobalDescriptor` Global one-dimensional peptide descriptors calculated from the AA sequence. +:py:class:`PeptideDescriptor` AA scale based global or convoluted descriptors (auto-/cross-correlated). +============================= ============================================================================ + +.. seealso:: :class:`modlamp.core.BaseDescriptor` from which the classes in :mod:`modlamp.descriptors` inherit. +""" + +import sys + +import numpy as np +from scipy import stats +from sklearn.externals.joblib import Parallel, delayed + +from plotWheels.core import ( + BaseDescriptor, + load_scale, + count_aas, + aa_weights, + aa_energies, + aa_formulas, +) + +__author__ = "Alex Müller, Gisela Gabernet" +__docformat__ = "restructuredtext en" + + +def _one_autocorr(seq, window, scale): + """Private function used for calculating auto-correlated descriptors for 1 given sequence, window and an AA scale. + This function is used by the :py:func:`calculate_autocorr` method of :py:class:`PeptideDescriptor`. + + :param seq: {str} amino acid sequence to calculate descriptor for + :param window: {int} correlation-window size + :param scale: {str} amino acid scale to be used to calculate descriptor + :return: {numpy.array} calculated descriptor data + """ + try: + m = list() # list of lists to store translated sequence values + for l in range(len(seq)): # translate AA sequence into values + m.append(scale[str(seq[l])]) + # auto-correlation in defined sequence window + seqdesc = list() + for dist in range(window): # for all correlation distances + for val in range( + len(scale["A"]) + ): # for all features of the descriptor scale + valsum = list() + cntr = 0.0 + for pos in range(len(seq)): # for every position in the sequence + if (pos + dist) < len( + seq + ): # check if corr distance is possible at that sequence position + cntr += 1 # counter to scale sum + valsum.append(m[pos][val] * m[pos + dist][val]) + seqdesc.append( + sum(valsum) / cntr + ) # append scaled correlation distance values + return seqdesc + except ZeroDivisionError: + print( + "ERROR!\nThe chosen correlation window % i is larger than the sequence %s !" + % (window, seq) + ) + + +def _one_crosscorr(seq, window, scale): + """Private function used for calculating cross-correlated descriptors for 1 given sequence, window and an AA scale. + This function is used by the :py:func:`calculate_crosscorr` method of :py:class:`PeptideDescriptor`. + + :param seq: {str} amino acid sequence to calculate descriptor for + :param window: {int} correlation-window size + :param scale: {str} amino acid scale to be used to calculate descriptor + :return: {numpy.array} calculated descriptor data + """ + try: + m = list() # list of lists to store translated sequence values + for l in range(len(seq)): # translate AA sequence into values + m.append(scale[str(seq[l])]) + # auto-correlation in defined sequence window + seqdesc = list() + for val in range(len(scale["A"])): # for all features of the descriptor scale + for cc in range(len(scale["A"])): # for every feature cross correlation + if (val + cc) < len( + scale["A"] + ): # check if corr distance is in range of the num of features + for dist in range(window): # for all correlation distances + cntr = float() + valsum = list() + for pos in range( + len(seq) + ): # for every position in the sequence + if (pos + dist) < len( + seq + ): # check if corr distance is possible at that sequence pos + cntr += 1 # counter to scale sum + valsum.append(m[pos][val] * m[pos + dist][val + cc]) + seqdesc.append( + sum(valsum) / cntr + ) # append scaled correlation distance values + return seqdesc + except ZeroDivisionError: + print( + "ERROR!\nThe chosen correlation window % i is larger than the sequence %s !" + % (window, seq) + ) + + +def _one_arc(seq, modality, scale): + """Privat function used for calculating arc descriptors for one sequence and AA scale. This function is used by + :py:func:`calculate_arc` method method of :py:class:`PeptideDescriptor`. + + :param seq: {str} amino acid sequence to calculate descriptor for + :param scale: {str} amino acid scale to be used to calculate descriptor + :return: {numpy.array} calculated descriptor data + """ + desc_mat = [] + for aa in seq: + desc_mat.append(scale[aa]) + desc_mat = np.asarray(desc_mat) + + # Check descriptor dimension + desc_dim = desc_mat.shape[1] + + # list to store descriptor values for all windows + allwindows_arc = [] + + if len(seq) > 18: + window = 18 + # calculates number of windows in sequence + num_windows = len(seq) - window + else: + window = len(seq) + num_windows = 1 + + # loop through all windows + for j in range(num_windows): + # slices descriptor matrix into current window + window_mat = desc_mat[j : j + window, :] + + # defines order of amino acids in helical projection + order = [0, 11, 4, 15, 8, 1, 12, 5, 16, 9, 2, 13, 6, 17, 10, 3, 14, 7] + + # orders window descriptor matrix into helical projection order + ordered = [] + for pos in order: + try: + ordered.append(window_mat[pos, :]) + except: + # for sequences of len < 18 adding dummy vector with 2s, length of descriptor dimensions + ordered.append([2] * desc_dim) + ordered = np.asarray(ordered) + + window_arc = [] + + # loop through pharmacophoric features + for m in range(desc_dim): + all_arcs = ( + [] + ) # stores all arcs that can be found of a pharmacophoric feature + arc = 0 + + for n in range( + 18 + ): # for all positions in helix, regardless of sequence length + if ( + ordered[n, m] == 0 + ): # if position does not contain pharmacophoric feature + all_arcs.append(arc) # append previous arc to all arcs list + arc = 0 # arc is initialized + elif ( + ordered[n, m] == 1 + ): # if position contains pharmacophoric feature(PF), elongate arc by 20° + arc += 20 + elif ordered[n, m] == 2: # if position doesn't contain amino acid: + if ( + ordered[n - 1, m] == 1 + ): # if previous position contained PF add 10° + arc += 10 + elif ( + ordered[n - 1, m] == 0 + ): # if previous position didn't contain PF don't add anything + arc += 0 + elif ( + ordered[n - 2, m] == 1 + ): # if previous position is empty then check second previous for PF + arc += 10 + if ( + n == 17 + ): # if we are at the last position check for position n=0 instead of next position. + if ordered[0, m] == 1: # if it contains PF add 10° extra + arc += 10 + else: # if next position contains PF add 10° extra + if ordered[n + 1, m] == 1: + arc += 10 + elif ordered[n + 1, m] == 0: + arc += 0 + else: # if next position is empty check for 2nd next position + if n == 16: + if ordered[0, m] == 1: + arc += 10 + else: + if ordered[n + 2, m] == 1: + arc += 10 + + all_arcs.append(arc) + if not arc == 360: + arc0 = all_arcs.pop() + all_arcs[0] # join first and last arc together + all_arcs = [arc0] + all_arcs[1:] + + window_arc.append( + np.max(all_arcs) + ) # append to window arcs the maximum arc of this PF + allwindows_arc.append(window_arc) # append all PF arcs of this window + + allwindows_arc = np.asarray(allwindows_arc) + + if modality == "max": + final_arc = np.max( + allwindows_arc, axis=0 + ) # calculate maximum / mean arc along all windows + elif modality == "mean": + final_arc = np.mean(allwindows_arc, axis=0) + else: + print('modality is unknown, please choose between "max" and "mean"\n.') + sys.exit() + return final_arc + + +def _charge(seq, ph=7.0, amide=False): + """Calculates charge of a single sequence. The method used is first described by Bjellqvist. In the case of + amidation, the value for the 'Cterm' pKa is 15 (and Cterm is added to the pos_pks dictionary. + The pKa scale is extracted from: http://www.hbcpnetbase.com/ (CRC Handbook of Chemistry and Physics, 96th ed). + + **pos_pks** = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04} + + **neg_pks** = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10} + + :param ph: {float} pH at which to calculate peptide charge. + :param amide: {boolean} whether the sequences have an amidated C-terminus. + :return: {array} descriptor values in the attribute :py:attr:`descriptor + """ + + if amide: + pos_pks = {"Nterm": 9.38, "K": 10.67, "R": 12.10, "H": 6.04} + neg_pks = {"Cterm": 15.0, "D": 3.71, "E": 4.15, "C": 8.14, "Y": 10.10} + else: + pos_pks = {"Nterm": 9.38, "K": 10.67, "R": 12.10, "H": 6.04} + neg_pks = {"Cterm": 2.15, "D": 3.71, "E": 4.15, "C": 8.14, "Y": 10.10} + + aa_content = count_aas(seq, scale="absolute") + aa_content["Nterm"] = 1.0 + aa_content["Cterm"] = 1.0 + pos_charge = 0.0 + for aa, pK in pos_pks.items(): + c_r = 10 ** (pK - ph) + partial_charge = c_r / (c_r + 1.0) + pos_charge += aa_content[aa] * partial_charge + neg_charge = 0.0 + for aa, pK in neg_pks.items(): + c_r = 10 ** (ph - pK) + partial_charge = c_r / (c_r + 1.0) + neg_charge += aa_content[aa] * partial_charge + return round(pos_charge - neg_charge, 3) + + +class GlobalDescriptor(BaseDescriptor): + """ + Base class for global, non-amino acid scale dependant descriptors. The following descriptors can be calculated by + the **methods** linked below: + + - `Sequence Length <modlamp.html#modlamp.descriptors.GlobalDescriptor.length>`_ + - `Molecular Formula <modlamp.html#modlamp.descriptors.GlobalDescriptor.formula>`_ + - `Molecular Weight <modlamp.html#modlamp.descriptors.GlobalDescriptor.calculate_MW>`_ + - `Sequence Charge <modlamp.html#modlamp.descriptors.GlobalDescriptor.calculate_charge>`_ + - `Charge Density <modlamp.html#modlamp.descriptors.GlobalDescriptor.charge_density>`_ + - `Isoelectric Point <modlamp.html#modlamp.descriptors.GlobalDescriptor.isoelectric_point>`_ + - `Instability Index <modlamp.html#modlamp.descriptors.GlobalDescriptor.instability_index>`_ + - `Aromaticity <modlamp.html#modlamp.descriptors.GlobalDescriptor.aromaticity>`_ + - `Aliphatic Index <modlamp.html#modlamp.descriptors.GlobalDescriptor.aliphatic_index>`_ + - `Boman Index <modlamp.html#modlamp.descriptors.GlobalDescriptor.boman_index>`_ + - `Hydrophobic Ratio <modlamp.html#modlamp.descriptors.GlobalDescriptor.hydrophobic_ratio>`_ + - `all of the above <modlamp.html#modlamp.descriptors.GlobalDescriptor.calculate_all>`_ + """ + + def length(self, append=False): + """ + Method to calculate the length (total AA count) of every sequence in the attribute :py:attr:`sequences`. + + :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the + attribute :py:attr:`descriptor`. + :return: array of sequence lengths in the attribute :py:attr:`descriptor` + :Example: + + >>> desc = GlobalDescriptor(['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR']) + >>> desc.length() + >>> desc.descriptor + array([[ 8.], [12.], [12.]]) + """ + desc = [] + for seq in self.sequences: + desc.append(float(len(seq.strip()))) + desc = np.asarray(desc).reshape(len(desc), 1) + if append: + self.descriptor = np.hstack((self.descriptor, np.array(desc))) + self.featurenames.append("Length") + else: + self.descriptor = np.array(desc) + self.featurenames = ["Length"] + + def formula(self, amide=False, append=False): + """Method to calculate the molecular formula of every sequence in the attribute :py:attr:`sequences`. + + :param amide: {boolean} whether the sequences are C-terminally amidated. + :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the + attribute :py:attr:`descriptor`. + :return: array of molecular formulas {str} in the attribute :py:attr:`descriptor` + :Example: + + >>> desc = GlobalDescriptor(['KADSFLSADGHSADFSLDKKLKERL', 'ERTILSDFPQWWFASLDFLNC', 'ACDEFGHIKLMNPQRSTVWY']) + >>> desc.formula(amide=True) + >>> for v in desc.descriptor: + ... print(v[0]) + C122 H197 N35 O39 + C121 H168 N28 O33 S + C106 H157 N29 O30 S2 + + .. seealso:: :py:func:`modlamp.core.aa_formulas()` + + .. versionadded:: v2.7.6 + """ + desc = [] + formulas = aa_formulas() + for seq in self.sequences: + f = {"C": 0, "H": 0, "N": 0, "O": 0, "S": 0} + for aa in seq: # loop over all AAs + for k in f.keys(): + f[k] += formulas[aa][k] + + # substract H2O for every peptide bond + f["H"] -= 2 * (len(seq) - 1) + f["O"] -= len(seq) - 1 + + if amide: # add C-terminal amide --> replace OH with NH2 + f["O"] -= 1 + f["H"] += 1 + f["N"] += 1 + + if f["S"] != 0: + val = "C%s H%s N%s O%s %s%s" % ( + f["C"], + f["H"], + f["N"], + f["O"], + "S", + f["S"], + ) + else: + val = "C%s H%s N%s O%s" % (f["C"], f["H"], f["N"], f["O"]) + + desc.append([val]) + + if append: + self.descriptor = np.hstack((self.descriptor, np.array(desc))) + self.featurenames.append("Formula") + else: + self.descriptor = np.array(desc) + self.featurenames = ["Formula"] + + def calculate_MW(self, amide=False, append=False): + """Method to calculate the molecular weight [g/mol] of every sequence in the attribute :py:attr:`sequences`. + + :param amide: {boolean} whether the sequences are C-terminally amidated (subtracts 0.95 from the MW). + :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the + attribute :py:attr:`descriptor`. + :return: array of descriptor values in the attribute :py:attr:`descriptor` + :Example: + + >>> desc = GlobalDescriptor('IAESFKGHIPL') + >>> desc.calculate_MW(amide=True) + >>> desc.descriptor + array([[ 1210.43]]) + + .. seealso:: :py:func:`modlamp.core.aa_weights()` + + .. versionchanged:: v2.1.5 amide option added + """ + desc = [] + weights = aa_weights() + for seq in self.sequences: + mw = [] + for aa in seq: # sum over aa weights + mw.append(weights[aa]) + desc.append( + round(sum(mw) - 18.015 * (len(seq) - 1), 2) + ) # sum over AA MW and subtract H20 MW for every + # peptide bond + desc = np.asarray(desc).reshape(len(desc), 1) + if ( + amide + ): # if sequences are amidated, subtract 0.98 from calculated MW (OH - NH2) + desc = [d - 0.98 for d in desc] + if append: + self.descriptor = np.hstack((self.descriptor, np.array(desc))) + self.featurenames.append("MW") + else: + self.descriptor = np.array(desc) + self.featurenames = ["MW"] + + def calculate_charge(self, ph=7.0, amide=False, append=False): + """Method to overall charge of every sequence in the attribute :py:attr:`sequences`. + + The method used is first described by Bjellqvist. In the case of amidation, the value for the 'Cterm' pKa is 15 + (and Cterm is added to the pos_pKs dictionary. + The pKa scale is extracted from: http://www.hbcpnetbase.com/ (CRC Handbook of Chemistry and Physics, 96th ed). + + **pos_pKs** = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04} + + **neg_pKs** = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10} + + :param ph: {float} ph at which to calculate peptide charge. + :param amide: {boolean} whether the sequences have an amidated C-terminus. + :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the + attribute :py:attr:`descriptor`. + :return: array of descriptor values in the attribute :py:attr:`descriptor` + :Example: + + >>> desc = GlobalDescriptor('KLAKFGKRSELVALSG') + >>> desc.calculate_charge(ph=7.4, amide=True) + >>> desc.descriptor + array([[ 3.989]]) + """ + + desc = [] + for seq in self.sequences: + desc.append( + _charge(seq, ph, amide) + ) # calculate charge with helper function + desc = np.asarray(desc).reshape(len(desc), 1) + if append: + self.descriptor = np.hstack((self.descriptor, np.array(desc))) + self.featurenames.append("Charge") + else: + self.descriptor = np.array(desc) + self.featurenames = ["Charge"] + + def charge_density(self, ph=7.0, amide=False, append=False): + """Method to calculate the charge density (charge / MW) of every sequences in the attributes :py:attr:`sequences` + + :param ph: {float} pH at which to calculate peptide charge. + :param amide: {boolean} whether the sequences have an amidated C-terminus. + :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the + attribute :py:attr:`descriptor`. + :return: array of descriptor values in the attribute :py:attr:`descriptor`. + :Example: + + >>> desc = GlobalDescriptor('GNSDLLIEQRTLLASDEF') + >>> desc.charge_density(ph=6, amide=True) + >>> desc.descriptor + array([[-0.00097119]]) + """ + self.calculate_charge(ph, amide) + charges = self.descriptor + self.calculate_MW(amide) + masses = self.descriptor + desc = charges / masses + desc = np.asarray(desc).reshape(len(desc), 1) + if append: + self.descriptor = np.hstack((self.descriptor, np.array(desc))) + self.featurenames.append("ChargeDensity") + else: + self.descriptor = np.array(desc) + self.featurenames = ["ChargeDensity"] + + def isoelectric_point(self, amide=False, append=False): + """ + Method to calculate the isoelectric point of every sequence in the attribute :py:attr:`sequences`. + The pK scale is extracted from: http://www.hbcpnetbase.com/ (CRC Handbook of Chemistry and Physics, 96th ed). + + **pos_pKs** = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04} + + **neg_pKs** = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10} + + :param amide: {boolean} whether the sequences have an amidated C-terminus. + :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the + attribute :py:attr:`descriptor`. + :return: array of descriptor values in the attribute :py:attr:`descriptor` + :Example: + + >>> desc = GlobalDescriptor('KLFDIKFGHIPQRST') + >>> desc.isoelectric_point() + >>> desc.descriptor + array([[ 10.6796875]]) + """ + ph, ph1, ph2 = float(), float(), float() + desc = [] + for seq in self.sequences: + + # Bracket between ph1 and ph2 + ph = 7.0 + charge = _charge(seq, ph, amide) + if charge > 0.0: + ph1 = ph + charge1 = charge + while charge1 > 0.0: + ph = ph1 + 1.0 + charge = _charge(seq, ph, amide) + if charge > 0.0: + ph1 = ph + charge1 = charge + else: + ph2 = ph + break + else: + ph2 = ph + charge2 = charge + while charge2 < 0.0: + ph = ph2 - 1.0 + charge = _charge(seq, ph, amide) + if charge < 0.0: + ph2 = ph + charge2 = charge + else: + ph1 = ph + break + # Bisection + while ph2 - ph1 > 0.0001 and charge != 0.0: + ph = (ph1 + ph2) / 2.0 + charge = _charge(seq, ph, amide) + if charge > 0.0: + ph1 = ph + else: + ph2 = ph + desc.append(ph) + desc = np.asarray(desc).reshape(len(desc), 1) + if append: + self.descriptor = np.hstack((self.descriptor, np.array(desc))) + self.featurenames.append("pI") + else: + self.descriptor = np.array(desc) + self.featurenames = ["pI"] + + def instability_index(self, append=False): + """ + Method to calculate the instability of every sequence in the attribute :py:attr:`sequences`. + The instability index is a prediction of protein stability based on the amino acid composition. + ([1] K. Guruprasad, B. V Reddy, M. W. Pandit, Protein Eng. 1990, 4, 155–161.) + + :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the + attribute :py:attr:`descriptor`. + :return: array of descriptor values in the attribute :py:attr:`descriptor` + :Example: + + >>> desc = GlobalDescriptor('LLASMNDLLAKRST') + >>> desc.instability_index() + >>> desc.descriptor + array([[ 63.95714286]]) + """ + + desc = [] + dimv = load_scale("instability")[1] + for seq in self.sequences: + stabindex = float() + for i in range(len(seq) - 1): + stabindex += dimv[seq[i]][seq[i + 1]] + desc.append((10.0 / len(seq)) * stabindex) + desc = np.asarray(desc).reshape(len(desc), 1) + if append: + self.descriptor = np.hstack((self.descriptor, np.array(desc))) + self.featurenames.append("InstabilityInd") + else: + self.descriptor = np.array(desc) + self.featurenames = ["InstabilityInd"] + + def aromaticity(self, append=False): + """ + Method to calculate the aromaticity of every sequence in the attribute :py:attr:`sequences`. + According to Lobry, 1994, it is simply the relative frequency of Phe+Trp+Tyr. + + :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the + attribute :py:attr:`descriptor`. + :return: array of descriptor values in the attribute :py:attr:`descriptor` + :Example: + + >>> desc = GlobalDescriptor('GLFYWRFFLQRRFLYWW') + >>> desc.aromaticity() + >>> desc.descriptor + array([[ 0.52941176]]) + """ + desc = [] + for seq in self.sequences: + f = seq.count("F") + w = seq.count("W") + y = seq.count("Y") + desc.append(float(f + w + y) / len(seq)) + desc = np.asarray(desc).reshape(len(desc), 1) + if append: + self.descriptor = np.hstack((self.descriptor, np.array(desc))) + self.featurenames.append("Aromaticity") + else: + self.descriptor = np.array(desc) + self.featurenames = ["Aromaticity"] + + def aliphatic_index(self, append=False): + """ + Method to calculate the aliphatic index of every sequence in the attribute :py:attr:`sequences`. + According to Ikai, 1980, the aliphatic index is a measure of thermal stability of proteins and is dependant + on the relative volume occupied by aliphatic amino acids (A,I,L & V). + ([1] A. Ikai, J. Biochem. 1980, 88, 1895–1898.) + + :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the + attribute :py:attr:`descriptor`. + :return: array of descriptor values in the attribute :py:attr:`descriptor` + :Example: + + >>> desc = GlobalDescriptor('KWLKYLKKLAKLVK') + >>> desc.aliphatic_index() + >>> desc.descriptor + array([[ 139.28571429]]) + """ + desc = [] + aa_dict = aa_weights() + for seq in self.sequences: + d = {aa: seq.count(aa) for aa in aa_dict.keys()} # count aa + d = { + k: (float(d[k]) / len(seq)) * 100 for k in d.keys() + } # get mole percent of all AA + desc.append( + d["A"] + 2.9 * d["V"] + 3.9 * (d["I"] + d["L"]) + ) # formula for calculating the AI (Ikai, 1980) + desc = np.asarray(desc).reshape(len(desc), 1) + if append: + self.descriptor = np.hstack((self.descriptor, np.array(desc))) + self.featurenames.append("AliphaticInd") + else: + self.descriptor = np.array(desc) + self.featurenames = ["AliphaticInd"] + + def boman_index(self, append=False): + """Method to calculate the boman index of every sequence in the attribute :py:attr:`sequences`. + According to Boman, 2003, the boman index is a measure for protein-protein interactions and is calculated by + summing over all amino acid free energy of transfer [kcal/mol] between water and cyclohexane,[2] followed by + dividing by sequence length. + ([1] H. G. Boman, D. Wade, I. a Boman, B. Wåhlin, R. B. Merrifield, *FEBS Lett*. **1989**, *259*, 103–106. + [2] A. Radzick, R. Wolfenden, *Biochemistry* **1988**, *27*, 1664–1670.) + + .. seealso:: :py:func:`modlamp.core.aa_energies()` + + :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the + attribute :py:attr:`descriptor`. + :return: array of descriptor values in the attribute :py:attr:`descriptor` + :Example: + + >>> desc = GlobalDescriptor('GLFDIVKKVVGALGSL') + >>> desc.boman_index() + >>> desc.descriptor + array([[-1.011875]]) + """ + d = aa_energies() + desc = [] + for seq in self.sequences: + val = [] + for a in seq: + val.append(d[a]) + desc.append(sum(val) / len(val)) + desc = np.asarray(desc).reshape(len(desc), 1) + if append: + self.descriptor = np.hstack((self.descriptor, np.array(desc))) + self.featurenames.append("BomanInd") + else: + self.descriptor = np.array(desc) + self.featurenames = ["BomanInd"] + + def hydrophobic_ratio(self, append=False): + """ + Method to calculate the hydrophobic ratio of every sequence in the attribute :py:attr:`sequences`, which is the + relative frequency of the amino acids **A,C,F,I,L,M & V**. + + :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the + attribute :py:attr:`descriptor`. + :return: array of descriptor values in the attribute :py:attr:`descriptor` + :Example: + + >>> desc = GlobalDescriptor('VALLYWRTVLLAIII') + >>> desc.hydrophobic_ratio() + >>> desc.descriptor + array([[ 0.73333333]]) + """ + desc = [] + aa_dict = aa_weights() + for seq in self.sequences: + pa = {aa: seq.count(aa) for aa in aa_dict.keys()} # count aa + # formula for calculating the AI (Ikai, 1980): + desc.append( + (pa["A"] + pa["C"] + pa["F"] + pa["I"] + pa["L"] + pa["M"] + pa["V"]) + / float(len(seq)) + ) + desc = np.asarray(desc).reshape(len(desc), 1) + if append: + self.descriptor = np.hstack((self.descriptor, np.array(desc))) + self.featurenames.append("HydrophRatio") + else: + self.descriptor = np.array(desc) + self.featurenames = ["HydrophRatio"] + + def calculate_all(self, ph=7.4, amide=True): + """Method combining all global descriptors and appending them into the feature matrix in the attribute + :py:attr:`descriptor`. + + :param ph: {float} pH at which to calculate peptide charge + :param amide: {boolean} whether the sequences have an amidated C-terminus. + :return: array of descriptor values in the attribute :py:attr:`descriptor` + :Example: + + >>> desc = GlobalDescriptor('AFGHFKLKKLFIFGHERT') + >>> desc.calculate_all(amide=True) + >>> desc.featurenames + ['Length', 'MW', 'ChargeDensity', 'pI', 'InstabilityInd', 'Aromaticity', 'AliphaticInd', 'BomanInd', 'HydRatio'] + >>> desc.descriptor + array([[ 18., 2.17559000e+03, 1.87167619e-03, 1.16757812e+01, ... 1.10555556e+00, 4.44444444e-01]]) + >>> desc.save_descriptor('/path/to/outputfile.csv') # save the descriptor data (with feature names header) + """ + + # This is a strange way of doing it. However, the append=True option excludes length and charge, no idea why! + fn = [] + self.length() # sequence length + l = self.descriptor + fn.extend(self.featurenames) + self.calculate_MW(amide=amide) # molecular weight + mw = self.descriptor + fn.extend(self.featurenames) + self.calculate_charge(ph=ph, amide=amide) # net charge + c = self.descriptor + fn.extend(self.featurenames) + self.charge_density(ph=ph, amide=amide) # charge density + cd = self.descriptor + fn.extend(self.featurenames) + self.isoelectric_point(amide=amide) # pI + pi = self.descriptor + fn.extend(self.featurenames) + self.instability_index() # instability index + si = self.descriptor + fn.extend(self.featurenames) + self.aromaticity() # global aromaticity + ar = self.descriptor + fn.extend(self.featurenames) + self.aliphatic_index() # aliphatic index + ai = self.descriptor + fn.extend(self.featurenames) + self.boman_index() # Boman index + bi = self.descriptor + fn.extend(self.featurenames) + self.hydrophobic_ratio() # Hydrophobic ratio + hr = self.descriptor + fn.extend(self.featurenames) + + self.descriptor = np.concatenate((l, mw, c, cd, pi, si, ar, ai, bi, hr), axis=1) + self.featurenames = fn + + +class PeptideDescriptor(BaseDescriptor): + """Base class for peptide descriptors. The following **amino acid descriptor scales** are available for descriptor + calculation: + + - **AASI** (An amino acid selectivity index scale for helical antimicrobial peptides, *[1] D. Juretić, D. Vukicević, N. Ilić, N. Antcheva, A. Tossi, J. Chem. Inf. Model. 2009, 49, 2873–2882.*) + - **ABHPRK** (modlabs inhouse physicochemical feature scale (Acidic, Basic, Hydrophobic, Polar, aRomatic, Kink-inducer) + - **argos** (Argos hydrophobicity amino acid scale, *[2] Argos, P., Rao, J. K. M. & Hargrave, P. A., Eur. J. Biochem. 2005, 128, 565–575.*) + - **bulkiness** (Amino acid side chain bulkiness scale, *[3] J. M. Zimmerman, N. Eliezer, R. Simha, J. Theor. Biol. 1968, 21, 170–201.*) + - **charge_phys** (Amino acid charge at pH 7.0 - Hystidine charge +0.1.) + - **charge_acid** (Amino acid charge at acidic pH - Hystidine charge +1.0.) + - **cougar** (modlabs inhouse selection of global peptide descriptors) + - **eisenberg** (the Eisenberg hydrophobicity consensus amino acid scale, *[4] D. Eisenberg, R. M. Weiss, T. C. Terwilliger, W. Wilcox, Faraday Symp. Chem. Soc. 1982, 17, 109.*) + - **Ez** (potential that assesses energies of insertion of amino acid side chains into lipid bilayers, *[5] A. Senes, D. C. Chadi, P. B. Law, R. F. S. Walters, V. Nanda, W. F. DeGrado, J. Mol. Biol. 2007, 366, 436–448.*) + - **flexibility** (amino acid side chain flexibilitiy scale, *[6] R. Bhaskaran, P. K. Ponnuswamy, Int. J. Pept. Protein Res. 1988, 32, 241–255.*) + - **grantham** (amino acid side chain composition, polarity and molecular volume, *[8] Grantham, R. Science. 185, 862–864 (1974).*) + - **gravy** (GRAVY hydrophobicity amino acid scale, *[9] J. Kyte, R. F. Doolittle, J. Mol. Biol. 1982, 157, 105–132.*) + - **hopp-woods** (Hopp-Woods amino acid hydrophobicity scale,*[10] T. P. Hopp, K. R. Woods, Proc. Natl. Acad. Sci. 1981, 78, 3824–3828.*) + - **ISAECI** (Isotropic Surface Area (ISA) and Electronic Charge Index (ECI) of amino acid side chains, *[11] E. R. Collantes, W. J. Dunn, J. Med. Chem. 1995, 38, 2705–2713.*) + - **janin** (Janin hydrophobicity amino acid scale, *[12] J. L. Cornette, K. B. Cease, H. Margalit, J. L. Spouge, J. A. Berzofsky, C. DeLisi, J. Mol. Biol. 1987, 195, 659–685.*) + - **kytedoolittle** (Kyte & Doolittle hydrophobicity amino acid scale, *[13] J. Kyte, R. F. Doolittle, J. Mol. Biol. 1982, 157, 105–132.*) + - **levitt_alpha** (Levitt amino acid alpha-helix propensity scale, extracted from http://web.expasy.org/protscale. *[14] M. Levitt, Biochemistry 1978, 17, 4277-4285.*) + - **MSS** (A graph-theoretical index that reflects topological shape and size of amino acid side chains, *[15] C. Raychaudhury, A. Banerjee, P. Bag, S. Roy, J. Chem. Inf. Comput. Sci. 1999, 39, 248–254.*) + - **MSW** (Amino acid scale based on a PCA of the molecular surface based WHIM descriptor (MS-WHIM), extended to natural amino acids, *[16] A. Zaliani, E. Gancia, J. Chem. Inf. Comput. Sci 1999, 39, 525–533.*) + - **pepArc** (modlabs pharmacophoric feature scale, dimensions are: hydrophobicity, polarity, positive charge, negative charge, proline.) + - **pepcats** (modlabs pharmacophoric feature based PEPCATS scale, *[17] C. P. Koch, A. M. Perna, M. Pillong, N. K. Todoroff, P. Wrede, G. Folkers, J. A. Hiss, G. Schneider, PLoS Comput. Biol. 2013, 9, e1003088.*) + - **polarity** (Amino acid polarity scale, *[18] J. M. Zimmerman, N. Eliezer, R. Simha, J. Theor. Biol. 1968, 21, 170–201.*) + - **PPCALI** (modlabs inhouse scale derived from a PCA of 143 amino acid property scales, *[19] C. P. Koch, A. M. Perna, M. Pillong, N. K. Todoroff, P. Wrede, G. Folkers, J. A. Hiss, G. Schneider, PLoS Comput. Biol. 2013, 9, e1003088.*) + - **refractivity** (Relative amino acid refractivity values, *[20] T. L. McMeekin, M. Wilensky, M. L. Groves, Biochem. Biophys. Res. Commun. 1962, 7, 151–156.*) + - **t_scale** (A PCA derived scale based on amino acid side chain properties calculated with 6 different probes of the GRID program, *[21] M. Cocchi, E. Johansson, Quant. Struct. Act. Relationships 1993, 12, 1–8.*) + - **TM_tend** (Amino acid transmembrane propensity scale, extracted from http://web.expasy.org/protscale, *[22] Zhao, G., London E. Protein Sci. 2006, 15, 1987-2001.*) + - **z3** (The original three dimensional Z-scale, *[23] S. Hellberg, M. Sjöström, B. Skagerberg, S. Wold, J. Med. Chem. 1987, 30, 1126–1135.*) + - **z5** (The extended five dimensional Z-scale, *[24] M. Sandberg, L. Eriksson, J. Jonsson, M. Sjöström, S. Wold, J. Med. Chem. 1998, 41, 2481–2491.*) + + Further, amino acid scale independent methods can be calculated with help of the :class:`GlobalDescriptor` class. + + """ + + def __init__(self, seqs, scalename="Eisenberg"): + """ + :param seqs: a .fasta file with sequences, a list of sequences or a single sequence as string to calculate the + descriptor values for. + :param scalename: {str} name of the amino acid scale (one of the given list above) used to calculate the + descriptor values + :return: initialized attributes :py:attr:`sequences`, :py:attr:`names` and dictionary :py:attr:`scale` with + amino acid scale values of the scale name in :py:attr:`scalename`. + :Example: + + >>> AMP = PeptideDescriptor('KLLKLLKKLLKLLK','pepcats') + >>> AMP.sequences + ['KLLKLLKKLLKLLK'] + >>> seqs = PeptideDescriptor('/Path/to/file.fasta', 'eisenberg') # load sequences from .fasta file + >>> seqs.sequences + ['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR'...] + """ + super(PeptideDescriptor, self).__init__(seqs) + self.scalename, self.scale = load_scale(scalename.lower()) + self.all_moms = list() # for passing hydrophobic moments to calculate_profile + self.all_globs = list() # for passing global to calculate_profile + + def load_scale(self, scalename): + """Method to load amino acid values from a given scale + + :param scalename: {str} name of the amino acid scale to be loaded. + :return: loaded amino acid scale values in a dictionary in the attribute :py:attr:`scale`. + + .. seealso:: :func:`modlamp.core.load_scale()` + """ + self.scalename, self.scale = load_scale(scalename.lower()) + + def calculate_autocorr(self, window, append=False): + """Method for auto-correlating the amino acid values for a given descriptor scale + + :param window: {int} correlation window for descriptor calculation in a sliding window approach + :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the + attribute :py:attr:`descriptor`. + :return: calculated descriptor numpy.array in the attribute :py:attr:`descriptor`. + :Example: + + >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL','PPCALI') + >>> AMP.calculate_autocorr(7) + >>> AMP.descriptor + array([[ 1.28442339e+00, 1.29025116e+00, 1.03240901e+00, .... ]]) + >>> AMP.descriptor.shape + (1, 133) + + .. versionchanged:: v.2.3.0 + """ + desc = Parallel(n_jobs=-1)( + delayed(_one_autocorr)(seq, window, self.scale) for seq in self.sequences + ) + + if append: + self.descriptor = np.hstack((self.descriptor, np.array(desc))) + else: + self.descriptor = np.array(desc) + + def calculate_crosscorr(self, window, append=False): + """Method for cross-correlating the amino acid values for a given descriptor scale + + :param window: {int} correlation window for descriptor calculation in a sliding window approach + :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the + attribute :py:attr:`descriptor`. + :return: calculated descriptor numpy.array in the attribute :py:attr:`descriptor`. + :Example: + + >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL','pepcats') + >>> AMP.calculate_crosscorr(7) + >>> AMP.descriptor + array([[ 0.6875 , 0.46666667, 0.42857143, 0.61538462, 0.58333333, ... ]]) + >>> AMP.descriptor.shape + (1, 147) + """ + desc = Parallel(n_jobs=-1)( + delayed(_one_crosscorr)(seq, window, self.scale) for seq in self.sequences + ) + + if append: + self.descriptor = np.hstack((self.descriptor, np.array(desc))) + else: + self.descriptor = np.array(desc) + + def calculate_moment(self, window=1000, angle=100, modality="max", append=False): + """Method for calculating the maximum or mean moment of the amino acid values for a given descriptor scale and + window. + + :param window: {int} amino acid window in which to calculate the moment. If the sequence is shorter than the + window, the length of the sequence is taken. So if the default window of 1000 is chosen, for all sequences + shorter than 1000, the **global** hydrophobic moment will be calculated. Otherwise, the maximal + hydrophiobic moment for the chosen window size found in the sequence will be returned. + :param angle: {int} angle in which to calculate the moment. **100** for alpha helices, **180** for beta sheets. + :param modality: {'all', 'max' or 'mean'} Calculate respectively maximum or mean hydrophobic moment. If all, + moments for all windows are returned. + :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the + attribute :py:attr:`descriptor`. + :return: Calculated descriptor as a numpy.array in the attribute :py:attr:`descriptor` and all possible global + values in :py:attr:`all_moms` (needed for the :py:func:`calculate_profile` method) + :Example: + + >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL', 'eisenberg') + >>> AMP.calculate_moment() + >>> AMP.descriptor + array([[ 0.48790226]]) + """ + if self.scale["A"] == list: + print( + "\n Descriptor moment calculation is only possible for one dimensional descriptors.\n" + ) + + else: + desc = [] + for seq in self.sequences: + wdw = min( + window, len(seq) + ) # if sequence is shorter than window, take the whole sequence instead + mtrx = [] + mwdw = [] + + for aa in range(len(seq)): + mtrx.append(self.scale[str(seq[aa])]) + + for i in range(len(mtrx) - wdw + 1): + mwdw.append(sum(mtrx[i : i + wdw], [])) + + mwdw = np.asarray(mwdw) + rads = ( + angle * (np.pi / 180) * np.asarray(range(wdw)) + ) # calculate actual moment (radial) + vcos = (mwdw * np.cos(rads)).sum(axis=1) + vsin = (mwdw * np.sin(rads)).sum(axis=1) + moms = np.sqrt(vsin**2 + vcos**2) / wdw + + if modality == "max": # take window with maximal value + moment = np.max(moms) + elif modality == "mean": # take average value over all windows + moment = np.mean(moms) + elif modality == "all": + moment = moms + else: + print( + '\nERROR!\nModality parameter is wrong, please choose between "all", "max" and "mean".\n' + ) + return + desc.append(moment) + self.all_moms.append(moms) + + desc = np.asarray(desc).reshape(len(desc), 1) # final descriptor array + + if append: + self.descriptor = np.hstack((self.descriptor, np.array(desc))) + else: + self.descriptor = np.array(desc) + + def calculate_global(self, window=1000, modality="max", append=False): + """Method for calculating a global / window averaging descriptor value of a given AA scale + + :param window: {int} amino acid window in which to calculate the moment. If the sequence is shorter than the + window, the length of the sequence is taken. + :param modality: {'max' or 'mean'} Calculate respectively maximum or mean hydrophobic moment. + :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the + attribute :py:attr:`descriptor`. + :return: Calculated descriptor as a numpy.array in the attribute :py:attr:`descriptor` and all possible global + values in :py:attr:`all_globs` (needed for the :py:func:`calculate_profile` method) + :Example: + + >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL','eisenberg') + >>> AMP.calculate_global(window=1000, modality='max') + >>> AMP.descriptor + array([[ 0.44875]]) + """ + desc = list() + for n, seq in enumerate(self.sequences): + wdw = min( + window, len(seq) + ) # if sequence is shorter than window, take the whole sequence instead + mtrx = [] + mwdw = [] + + for l in range(len(seq)): # translate AA sequence into values + mtrx.append(self.scale[str(seq[l])]) + + for i in range(len(mtrx) - wdw + 1): + mwdw.append( + sum(mtrx[i : i + wdw], []) + ) # list of all the values for the different windows + + mwdw = np.asarray(mwdw) + glob = np.sum(mwdw, axis=1) / float(wdw) + outglob = float() + + if modality in ["max", "mean"]: + if modality == "max": + outglob = np.max( + glob + ) # returned moment will be the maximum of all windows + elif modality == "mean": + outglob = np.mean( + glob + ) # returned moment will be the mean of all windows + else: + print( + 'Modality parameter is wrong, please choose between "max" and "mean"\n.' + ) + return + desc.append(outglob) + self.all_globs.append(glob) + + desc = np.asarray(desc).reshape(len(desc), 1) + if append: + self.descriptor = np.hstack((self.descriptor, np.array(desc))) + else: + self.descriptor = np.array(desc) + + def calculate_profile(self, prof_type="uH", window=7, append=False): + """Method for calculating hydrophobicity or hydrophobic moment profiles for given sequences and fitting for + slope and intercept. The hydrophobicity scale used is "eisenberg" + + :param prof_type: prof_type of profile, available: 'H' for hydrophobicity or 'uH' for hydrophobic moment + :param window: {int} size of sliding window used (odd-numbered). + :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the + attribute :py:attr:`descriptor`. + :return: Fitted slope and intercept of calculated profile for every given sequence in the attribute + :py:attr:`descriptor`. + :Example: + + >>> AMP = PeptideDescriptor('KLLKLLKKVVGALG','kytedoolittle') + >>> AMP.calculate_profile(prof_type='H') + >>> AMP.descriptor + array([[ 0.03731293, 0.19246599]]) + """ + if prof_type == "uH": + self.calculate_moment(window=window) + y_vals = self.all_moms + elif prof_type == "H": + self.calculate_global(window=window) + y_vals = self.all_globs + else: + print( + 'prof_type parameter is unknown, choose "uH" for hydrophobic moment or "H" for hydrophobicity\n.' + ) + sys.exit() + + desc = list() + for n, seq in enumerate(self.sequences): + x_vals = range(len(seq))[int((window - 1) / 2) : -int((window - 1) / 2)] + if len(seq) <= window: + slope, intercept, r_value, p_value, std_err = [0, 0, 0, 0, 0] + else: + slope, intercept, r_value, p_value, std_err = stats.linregress( + x_vals, y_vals[n] + ) + desc.append([slope, intercept]) + + if append: + self.descriptor = np.hstack((self.descriptor, np.array(desc))) + else: + self.descriptor = np.array(desc) + + def calculate_arc(self, modality="max", append=False): + """Method for calculating property arcs as seen in the helical wheel plot. Use for binary amino acid scales only. + + :param modality: modality of the arc to calculate, to choose between "max" and "mean". + :param append: if true, append to current descriptor stored in the descriptor attribute. + :return: calculated descriptor as numpy.array in the descriptor attribute. + + :Example: + + >>> arc = PeptideDescriptor("KLLKLLKKLLKLLK", scalename="peparc") + >>> arc.calculate_arc(modality="max", append=False) + >>> arc.descriptor + array([[200, 160, 160, 0, 0]]) + """ + desc = Parallel(n_jobs=-1)( + delayed(_one_arc)(seq, modality, self.scale) for seq in self.sequences + ) + + # Converts each of the amino acids to descriptor vector + for seq in self.sequences: + + # desc_mat = [] + # for aa in seq: + # desc_mat.append(self.scale[aa]) + # desc_mat = np.asarray(desc_mat) + # + # # Check descriptor dimension + # desc_dim = desc_mat.shape[1] + # + # # list to store descriptor values for all windows + # allwindows_arc = [] + # + # if len(seq) > 18: + # window = 18 + # # calculates number of windows in sequence + # num_windows = len(seq) - window + # else: + # window = len(seq) + # num_windows = 1 + # + # # loop through all windows + # for j in range(num_windows): + # # slices descriptor matrix into current window + # window_mat = desc_mat[j:j + window, :] + # + # # defines order of amino acids in helical projection + # order = [0, 11, 4, 15, 8, 1, 12, 5, 16, 9, 2, 13, 6, 17, 10, 3, 14, 7] + # + # # orders window descriptor matrix into helical projection order + # ordered = [] + # for pos in order: + # try: + # ordered.append(window_mat[pos, :]) + # except: + # # for sequences of len < 18 adding dummy vector with 2s, length of descriptor dimensions + # ordered.append([2] * desc_dim) + # ordered = np.asarray(ordered) + # + # window_arc = [] + # + # # loop through pharmacophoric features + # for m in range(desc_dim): + # all_arcs = [] # stores all arcs that can be found of a pharmacophoric feature + # arc = 0 + # + # for n in range(18): # for all positions in helix, regardless of sequence length + # if ordered[n, m] == 0: # if position does not contain pharmacophoric feature + # all_arcs.append(arc) # append previous arc to all arcs list + # arc = 0 # arc is initialized + # elif ordered[n, m] == 1: # if position contains pharmacophoric feature(PF), elongate arc by 20° + # arc += 20 + # elif ordered[n, m] == 2: # if position doesn't contain amino acid: + # if ordered[n - 1, m] == 1: # if previous position contained PF add 10° + # arc += 10 + # elif ordered[n - 1, m] == 0: # if previous position didn't contain PF don't add anything + # arc += 0 + # elif ordered[ + # n - 2, m] == 1: # if previous position is empty then check second previous for PF + # arc += 10 + # if n == 17: # if we are at the last position check for position n=0 instead of next position. + # if ordered[0, m] == 1: # if it contains PF add 10° extra + # arc += 10 + # else: # if next position contains PF add 10° extra + # if ordered[n + 1, m] == 1: + # arc += 10 + # elif ordered[n + 1, m] == 0: + # arc += 0 + # else: # if next position is empty check for 2nd next position + # if n == 16: + # if ordered[0, m] == 1: + # arc += 10 + # else: + # if ordered[n + 2, m] == 1: + # arc += 10 + # + # all_arcs.append(arc) + # if not arc == 360: + # arc0 = all_arcs.pop() + all_arcs[0] # join first and last arc together + # all_arcs = [arc0] + all_arcs[1:] + # + # window_arc.append(np.max(all_arcs)) # append to window arcs the maximum arc of this PF + # allwindows_arc.append(window_arc) # append all PF arcs of this window + # + # allwindows_arc = np.asarray(allwindows_arc) + # + # if modality == 'max': + # final_arc = np.max(allwindows_arc, axis=0) # calculate maximum / mean arc along all windows + # elif modality == 'mean': + # final_arc = np.mean(allwindows_arc, axis=0) + # else: + # print('modality is unknown, please choose between "max" and "mean"\n.') + # sys.exit() + + if append: + self.descriptor = np.hstack((self.descriptor, np.array(desc))) + else: + self.descriptor = np.array(desc)