Mercurial > repos > cpt > cpt_helical_wheel
comparison plotWheels/descriptors.py @ 1:9b276485c94a draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
| author | cpt |
|---|---|
| date | Mon, 05 Jun 2023 02:44:43 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:9caa9aa44fd8 | 1:9b276485c94a |
|---|---|
| 1 # -*- coding: utf-8 -*- | |
| 2 """ | |
| 3 .. currentmodule:: modlamp.descriptors | |
| 4 | |
| 5 .. moduleauthor:: modlab Alex Mueller ETH Zurich <alex.mueller@pharma.ethz.ch> | |
| 6 | |
| 7 This module incorporates different classes to calculate peptide descriptor values. The following classes are available: | |
| 8 | |
| 9 ============================= ============================================================================ | |
| 10 Class Characteristics | |
| 11 ============================= ============================================================================ | |
| 12 :py:class:`GlobalDescriptor` Global one-dimensional peptide descriptors calculated from the AA sequence. | |
| 13 :py:class:`PeptideDescriptor` AA scale based global or convoluted descriptors (auto-/cross-correlated). | |
| 14 ============================= ============================================================================ | |
| 15 | |
| 16 .. seealso:: :class:`modlamp.core.BaseDescriptor` from which the classes in :mod:`modlamp.descriptors` inherit. | |
| 17 """ | |
| 18 | |
| 19 import sys | |
| 20 | |
| 21 import numpy as np | |
| 22 from scipy import stats | |
| 23 from sklearn.externals.joblib import Parallel, delayed | |
| 24 | |
| 25 from plotWheels.core import ( | |
| 26 BaseDescriptor, | |
| 27 load_scale, | |
| 28 count_aas, | |
| 29 aa_weights, | |
| 30 aa_energies, | |
| 31 aa_formulas, | |
| 32 ) | |
| 33 | |
| 34 __author__ = "Alex Müller, Gisela Gabernet" | |
| 35 __docformat__ = "restructuredtext en" | |
| 36 | |
| 37 | |
| 38 def _one_autocorr(seq, window, scale): | |
| 39 """Private function used for calculating auto-correlated descriptors for 1 given sequence, window and an AA scale. | |
| 40 This function is used by the :py:func:`calculate_autocorr` method of :py:class:`PeptideDescriptor`. | |
| 41 | |
| 42 :param seq: {str} amino acid sequence to calculate descriptor for | |
| 43 :param window: {int} correlation-window size | |
| 44 :param scale: {str} amino acid scale to be used to calculate descriptor | |
| 45 :return: {numpy.array} calculated descriptor data | |
| 46 """ | |
| 47 try: | |
| 48 m = list() # list of lists to store translated sequence values | |
| 49 for l in range(len(seq)): # translate AA sequence into values | |
| 50 m.append(scale[str(seq[l])]) | |
| 51 # auto-correlation in defined sequence window | |
| 52 seqdesc = list() | |
| 53 for dist in range(window): # for all correlation distances | |
| 54 for val in range( | |
| 55 len(scale["A"]) | |
| 56 ): # for all features of the descriptor scale | |
| 57 valsum = list() | |
| 58 cntr = 0.0 | |
| 59 for pos in range(len(seq)): # for every position in the sequence | |
| 60 if (pos + dist) < len( | |
| 61 seq | |
| 62 ): # check if corr distance is possible at that sequence position | |
| 63 cntr += 1 # counter to scale sum | |
| 64 valsum.append(m[pos][val] * m[pos + dist][val]) | |
| 65 seqdesc.append( | |
| 66 sum(valsum) / cntr | |
| 67 ) # append scaled correlation distance values | |
| 68 return seqdesc | |
| 69 except ZeroDivisionError: | |
| 70 print( | |
| 71 "ERROR!\nThe chosen correlation window % i is larger than the sequence %s !" | |
| 72 % (window, seq) | |
| 73 ) | |
| 74 | |
| 75 | |
| 76 def _one_crosscorr(seq, window, scale): | |
| 77 """Private function used for calculating cross-correlated descriptors for 1 given sequence, window and an AA scale. | |
| 78 This function is used by the :py:func:`calculate_crosscorr` method of :py:class:`PeptideDescriptor`. | |
| 79 | |
| 80 :param seq: {str} amino acid sequence to calculate descriptor for | |
| 81 :param window: {int} correlation-window size | |
| 82 :param scale: {str} amino acid scale to be used to calculate descriptor | |
| 83 :return: {numpy.array} calculated descriptor data | |
| 84 """ | |
| 85 try: | |
| 86 m = list() # list of lists to store translated sequence values | |
| 87 for l in range(len(seq)): # translate AA sequence into values | |
| 88 m.append(scale[str(seq[l])]) | |
| 89 # auto-correlation in defined sequence window | |
| 90 seqdesc = list() | |
| 91 for val in range(len(scale["A"])): # for all features of the descriptor scale | |
| 92 for cc in range(len(scale["A"])): # for every feature cross correlation | |
| 93 if (val + cc) < len( | |
| 94 scale["A"] | |
| 95 ): # check if corr distance is in range of the num of features | |
| 96 for dist in range(window): # for all correlation distances | |
| 97 cntr = float() | |
| 98 valsum = list() | |
| 99 for pos in range( | |
| 100 len(seq) | |
| 101 ): # for every position in the sequence | |
| 102 if (pos + dist) < len( | |
| 103 seq | |
| 104 ): # check if corr distance is possible at that sequence pos | |
| 105 cntr += 1 # counter to scale sum | |
| 106 valsum.append(m[pos][val] * m[pos + dist][val + cc]) | |
| 107 seqdesc.append( | |
| 108 sum(valsum) / cntr | |
| 109 ) # append scaled correlation distance values | |
| 110 return seqdesc | |
| 111 except ZeroDivisionError: | |
| 112 print( | |
| 113 "ERROR!\nThe chosen correlation window % i is larger than the sequence %s !" | |
| 114 % (window, seq) | |
| 115 ) | |
| 116 | |
| 117 | |
| 118 def _one_arc(seq, modality, scale): | |
| 119 """Privat function used for calculating arc descriptors for one sequence and AA scale. This function is used by | |
| 120 :py:func:`calculate_arc` method method of :py:class:`PeptideDescriptor`. | |
| 121 | |
| 122 :param seq: {str} amino acid sequence to calculate descriptor for | |
| 123 :param scale: {str} amino acid scale to be used to calculate descriptor | |
| 124 :return: {numpy.array} calculated descriptor data | |
| 125 """ | |
| 126 desc_mat = [] | |
| 127 for aa in seq: | |
| 128 desc_mat.append(scale[aa]) | |
| 129 desc_mat = np.asarray(desc_mat) | |
| 130 | |
| 131 # Check descriptor dimension | |
| 132 desc_dim = desc_mat.shape[1] | |
| 133 | |
| 134 # list to store descriptor values for all windows | |
| 135 allwindows_arc = [] | |
| 136 | |
| 137 if len(seq) > 18: | |
| 138 window = 18 | |
| 139 # calculates number of windows in sequence | |
| 140 num_windows = len(seq) - window | |
| 141 else: | |
| 142 window = len(seq) | |
| 143 num_windows = 1 | |
| 144 | |
| 145 # loop through all windows | |
| 146 for j in range(num_windows): | |
| 147 # slices descriptor matrix into current window | |
| 148 window_mat = desc_mat[j : j + window, :] | |
| 149 | |
| 150 # defines order of amino acids in helical projection | |
| 151 order = [0, 11, 4, 15, 8, 1, 12, 5, 16, 9, 2, 13, 6, 17, 10, 3, 14, 7] | |
| 152 | |
| 153 # orders window descriptor matrix into helical projection order | |
| 154 ordered = [] | |
| 155 for pos in order: | |
| 156 try: | |
| 157 ordered.append(window_mat[pos, :]) | |
| 158 except: | |
| 159 # for sequences of len < 18 adding dummy vector with 2s, length of descriptor dimensions | |
| 160 ordered.append([2] * desc_dim) | |
| 161 ordered = np.asarray(ordered) | |
| 162 | |
| 163 window_arc = [] | |
| 164 | |
| 165 # loop through pharmacophoric features | |
| 166 for m in range(desc_dim): | |
| 167 all_arcs = ( | |
| 168 [] | |
| 169 ) # stores all arcs that can be found of a pharmacophoric feature | |
| 170 arc = 0 | |
| 171 | |
| 172 for n in range( | |
| 173 18 | |
| 174 ): # for all positions in helix, regardless of sequence length | |
| 175 if ( | |
| 176 ordered[n, m] == 0 | |
| 177 ): # if position does not contain pharmacophoric feature | |
| 178 all_arcs.append(arc) # append previous arc to all arcs list | |
| 179 arc = 0 # arc is initialized | |
| 180 elif ( | |
| 181 ordered[n, m] == 1 | |
| 182 ): # if position contains pharmacophoric feature(PF), elongate arc by 20° | |
| 183 arc += 20 | |
| 184 elif ordered[n, m] == 2: # if position doesn't contain amino acid: | |
| 185 if ( | |
| 186 ordered[n - 1, m] == 1 | |
| 187 ): # if previous position contained PF add 10° | |
| 188 arc += 10 | |
| 189 elif ( | |
| 190 ordered[n - 1, m] == 0 | |
| 191 ): # if previous position didn't contain PF don't add anything | |
| 192 arc += 0 | |
| 193 elif ( | |
| 194 ordered[n - 2, m] == 1 | |
| 195 ): # if previous position is empty then check second previous for PF | |
| 196 arc += 10 | |
| 197 if ( | |
| 198 n == 17 | |
| 199 ): # if we are at the last position check for position n=0 instead of next position. | |
| 200 if ordered[0, m] == 1: # if it contains PF add 10° extra | |
| 201 arc += 10 | |
| 202 else: # if next position contains PF add 10° extra | |
| 203 if ordered[n + 1, m] == 1: | |
| 204 arc += 10 | |
| 205 elif ordered[n + 1, m] == 0: | |
| 206 arc += 0 | |
| 207 else: # if next position is empty check for 2nd next position | |
| 208 if n == 16: | |
| 209 if ordered[0, m] == 1: | |
| 210 arc += 10 | |
| 211 else: | |
| 212 if ordered[n + 2, m] == 1: | |
| 213 arc += 10 | |
| 214 | |
| 215 all_arcs.append(arc) | |
| 216 if not arc == 360: | |
| 217 arc0 = all_arcs.pop() + all_arcs[0] # join first and last arc together | |
| 218 all_arcs = [arc0] + all_arcs[1:] | |
| 219 | |
| 220 window_arc.append( | |
| 221 np.max(all_arcs) | |
| 222 ) # append to window arcs the maximum arc of this PF | |
| 223 allwindows_arc.append(window_arc) # append all PF arcs of this window | |
| 224 | |
| 225 allwindows_arc = np.asarray(allwindows_arc) | |
| 226 | |
| 227 if modality == "max": | |
| 228 final_arc = np.max( | |
| 229 allwindows_arc, axis=0 | |
| 230 ) # calculate maximum / mean arc along all windows | |
| 231 elif modality == "mean": | |
| 232 final_arc = np.mean(allwindows_arc, axis=0) | |
| 233 else: | |
| 234 print('modality is unknown, please choose between "max" and "mean"\n.') | |
| 235 sys.exit() | |
| 236 return final_arc | |
| 237 | |
| 238 | |
| 239 def _charge(seq, ph=7.0, amide=False): | |
| 240 """Calculates charge of a single sequence. The method used is first described by Bjellqvist. In the case of | |
| 241 amidation, the value for the 'Cterm' pKa is 15 (and Cterm is added to the pos_pks dictionary. | |
| 242 The pKa scale is extracted from: http://www.hbcpnetbase.com/ (CRC Handbook of Chemistry and Physics, 96th ed). | |
| 243 | |
| 244 **pos_pks** = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04} | |
| 245 | |
| 246 **neg_pks** = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10} | |
| 247 | |
| 248 :param ph: {float} pH at which to calculate peptide charge. | |
| 249 :param amide: {boolean} whether the sequences have an amidated C-terminus. | |
| 250 :return: {array} descriptor values in the attribute :py:attr:`descriptor | |
| 251 """ | |
| 252 | |
| 253 if amide: | |
| 254 pos_pks = {"Nterm": 9.38, "K": 10.67, "R": 12.10, "H": 6.04} | |
| 255 neg_pks = {"Cterm": 15.0, "D": 3.71, "E": 4.15, "C": 8.14, "Y": 10.10} | |
| 256 else: | |
| 257 pos_pks = {"Nterm": 9.38, "K": 10.67, "R": 12.10, "H": 6.04} | |
| 258 neg_pks = {"Cterm": 2.15, "D": 3.71, "E": 4.15, "C": 8.14, "Y": 10.10} | |
| 259 | |
| 260 aa_content = count_aas(seq, scale="absolute") | |
| 261 aa_content["Nterm"] = 1.0 | |
| 262 aa_content["Cterm"] = 1.0 | |
| 263 pos_charge = 0.0 | |
| 264 for aa, pK in pos_pks.items(): | |
| 265 c_r = 10 ** (pK - ph) | |
| 266 partial_charge = c_r / (c_r + 1.0) | |
| 267 pos_charge += aa_content[aa] * partial_charge | |
| 268 neg_charge = 0.0 | |
| 269 for aa, pK in neg_pks.items(): | |
| 270 c_r = 10 ** (ph - pK) | |
| 271 partial_charge = c_r / (c_r + 1.0) | |
| 272 neg_charge += aa_content[aa] * partial_charge | |
| 273 return round(pos_charge - neg_charge, 3) | |
| 274 | |
| 275 | |
| 276 class GlobalDescriptor(BaseDescriptor): | |
| 277 """ | |
| 278 Base class for global, non-amino acid scale dependant descriptors. The following descriptors can be calculated by | |
| 279 the **methods** linked below: | |
| 280 | |
| 281 - `Sequence Length <modlamp.html#modlamp.descriptors.GlobalDescriptor.length>`_ | |
| 282 - `Molecular Formula <modlamp.html#modlamp.descriptors.GlobalDescriptor.formula>`_ | |
| 283 - `Molecular Weight <modlamp.html#modlamp.descriptors.GlobalDescriptor.calculate_MW>`_ | |
| 284 - `Sequence Charge <modlamp.html#modlamp.descriptors.GlobalDescriptor.calculate_charge>`_ | |
| 285 - `Charge Density <modlamp.html#modlamp.descriptors.GlobalDescriptor.charge_density>`_ | |
| 286 - `Isoelectric Point <modlamp.html#modlamp.descriptors.GlobalDescriptor.isoelectric_point>`_ | |
| 287 - `Instability Index <modlamp.html#modlamp.descriptors.GlobalDescriptor.instability_index>`_ | |
| 288 - `Aromaticity <modlamp.html#modlamp.descriptors.GlobalDescriptor.aromaticity>`_ | |
| 289 - `Aliphatic Index <modlamp.html#modlamp.descriptors.GlobalDescriptor.aliphatic_index>`_ | |
| 290 - `Boman Index <modlamp.html#modlamp.descriptors.GlobalDescriptor.boman_index>`_ | |
| 291 - `Hydrophobic Ratio <modlamp.html#modlamp.descriptors.GlobalDescriptor.hydrophobic_ratio>`_ | |
| 292 - `all of the above <modlamp.html#modlamp.descriptors.GlobalDescriptor.calculate_all>`_ | |
| 293 """ | |
| 294 | |
| 295 def length(self, append=False): | |
| 296 """ | |
| 297 Method to calculate the length (total AA count) of every sequence in the attribute :py:attr:`sequences`. | |
| 298 | |
| 299 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
| 300 attribute :py:attr:`descriptor`. | |
| 301 :return: array of sequence lengths in the attribute :py:attr:`descriptor` | |
| 302 :Example: | |
| 303 | |
| 304 >>> desc = GlobalDescriptor(['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR']) | |
| 305 >>> desc.length() | |
| 306 >>> desc.descriptor | |
| 307 array([[ 8.], [12.], [12.]]) | |
| 308 """ | |
| 309 desc = [] | |
| 310 for seq in self.sequences: | |
| 311 desc.append(float(len(seq.strip()))) | |
| 312 desc = np.asarray(desc).reshape(len(desc), 1) | |
| 313 if append: | |
| 314 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
| 315 self.featurenames.append("Length") | |
| 316 else: | |
| 317 self.descriptor = np.array(desc) | |
| 318 self.featurenames = ["Length"] | |
| 319 | |
| 320 def formula(self, amide=False, append=False): | |
| 321 """Method to calculate the molecular formula of every sequence in the attribute :py:attr:`sequences`. | |
| 322 | |
| 323 :param amide: {boolean} whether the sequences are C-terminally amidated. | |
| 324 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
| 325 attribute :py:attr:`descriptor`. | |
| 326 :return: array of molecular formulas {str} in the attribute :py:attr:`descriptor` | |
| 327 :Example: | |
| 328 | |
| 329 >>> desc = GlobalDescriptor(['KADSFLSADGHSADFSLDKKLKERL', 'ERTILSDFPQWWFASLDFLNC', 'ACDEFGHIKLMNPQRSTVWY']) | |
| 330 >>> desc.formula(amide=True) | |
| 331 >>> for v in desc.descriptor: | |
| 332 ... print(v[0]) | |
| 333 C122 H197 N35 O39 | |
| 334 C121 H168 N28 O33 S | |
| 335 C106 H157 N29 O30 S2 | |
| 336 | |
| 337 .. seealso:: :py:func:`modlamp.core.aa_formulas()` | |
| 338 | |
| 339 .. versionadded:: v2.7.6 | |
| 340 """ | |
| 341 desc = [] | |
| 342 formulas = aa_formulas() | |
| 343 for seq in self.sequences: | |
| 344 f = {"C": 0, "H": 0, "N": 0, "O": 0, "S": 0} | |
| 345 for aa in seq: # loop over all AAs | |
| 346 for k in f.keys(): | |
| 347 f[k] += formulas[aa][k] | |
| 348 | |
| 349 # substract H2O for every peptide bond | |
| 350 f["H"] -= 2 * (len(seq) - 1) | |
| 351 f["O"] -= len(seq) - 1 | |
| 352 | |
| 353 if amide: # add C-terminal amide --> replace OH with NH2 | |
| 354 f["O"] -= 1 | |
| 355 f["H"] += 1 | |
| 356 f["N"] += 1 | |
| 357 | |
| 358 if f["S"] != 0: | |
| 359 val = "C%s H%s N%s O%s %s%s" % ( | |
| 360 f["C"], | |
| 361 f["H"], | |
| 362 f["N"], | |
| 363 f["O"], | |
| 364 "S", | |
| 365 f["S"], | |
| 366 ) | |
| 367 else: | |
| 368 val = "C%s H%s N%s O%s" % (f["C"], f["H"], f["N"], f["O"]) | |
| 369 | |
| 370 desc.append([val]) | |
| 371 | |
| 372 if append: | |
| 373 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
| 374 self.featurenames.append("Formula") | |
| 375 else: | |
| 376 self.descriptor = np.array(desc) | |
| 377 self.featurenames = ["Formula"] | |
| 378 | |
| 379 def calculate_MW(self, amide=False, append=False): | |
| 380 """Method to calculate the molecular weight [g/mol] of every sequence in the attribute :py:attr:`sequences`. | |
| 381 | |
| 382 :param amide: {boolean} whether the sequences are C-terminally amidated (subtracts 0.95 from the MW). | |
| 383 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
| 384 attribute :py:attr:`descriptor`. | |
| 385 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
| 386 :Example: | |
| 387 | |
| 388 >>> desc = GlobalDescriptor('IAESFKGHIPL') | |
| 389 >>> desc.calculate_MW(amide=True) | |
| 390 >>> desc.descriptor | |
| 391 array([[ 1210.43]]) | |
| 392 | |
| 393 .. seealso:: :py:func:`modlamp.core.aa_weights()` | |
| 394 | |
| 395 .. versionchanged:: v2.1.5 amide option added | |
| 396 """ | |
| 397 desc = [] | |
| 398 weights = aa_weights() | |
| 399 for seq in self.sequences: | |
| 400 mw = [] | |
| 401 for aa in seq: # sum over aa weights | |
| 402 mw.append(weights[aa]) | |
| 403 desc.append( | |
| 404 round(sum(mw) - 18.015 * (len(seq) - 1), 2) | |
| 405 ) # sum over AA MW and subtract H20 MW for every | |
| 406 # peptide bond | |
| 407 desc = np.asarray(desc).reshape(len(desc), 1) | |
| 408 if ( | |
| 409 amide | |
| 410 ): # if sequences are amidated, subtract 0.98 from calculated MW (OH - NH2) | |
| 411 desc = [d - 0.98 for d in desc] | |
| 412 if append: | |
| 413 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
| 414 self.featurenames.append("MW") | |
| 415 else: | |
| 416 self.descriptor = np.array(desc) | |
| 417 self.featurenames = ["MW"] | |
| 418 | |
| 419 def calculate_charge(self, ph=7.0, amide=False, append=False): | |
| 420 """Method to overall charge of every sequence in the attribute :py:attr:`sequences`. | |
| 421 | |
| 422 The method used is first described by Bjellqvist. In the case of amidation, the value for the 'Cterm' pKa is 15 | |
| 423 (and Cterm is added to the pos_pKs dictionary. | |
| 424 The pKa scale is extracted from: http://www.hbcpnetbase.com/ (CRC Handbook of Chemistry and Physics, 96th ed). | |
| 425 | |
| 426 **pos_pKs** = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04} | |
| 427 | |
| 428 **neg_pKs** = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10} | |
| 429 | |
| 430 :param ph: {float} ph at which to calculate peptide charge. | |
| 431 :param amide: {boolean} whether the sequences have an amidated C-terminus. | |
| 432 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
| 433 attribute :py:attr:`descriptor`. | |
| 434 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
| 435 :Example: | |
| 436 | |
| 437 >>> desc = GlobalDescriptor('KLAKFGKRSELVALSG') | |
| 438 >>> desc.calculate_charge(ph=7.4, amide=True) | |
| 439 >>> desc.descriptor | |
| 440 array([[ 3.989]]) | |
| 441 """ | |
| 442 | |
| 443 desc = [] | |
| 444 for seq in self.sequences: | |
| 445 desc.append( | |
| 446 _charge(seq, ph, amide) | |
| 447 ) # calculate charge with helper function | |
| 448 desc = np.asarray(desc).reshape(len(desc), 1) | |
| 449 if append: | |
| 450 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
| 451 self.featurenames.append("Charge") | |
| 452 else: | |
| 453 self.descriptor = np.array(desc) | |
| 454 self.featurenames = ["Charge"] | |
| 455 | |
| 456 def charge_density(self, ph=7.0, amide=False, append=False): | |
| 457 """Method to calculate the charge density (charge / MW) of every sequences in the attributes :py:attr:`sequences` | |
| 458 | |
| 459 :param ph: {float} pH at which to calculate peptide charge. | |
| 460 :param amide: {boolean} whether the sequences have an amidated C-terminus. | |
| 461 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
| 462 attribute :py:attr:`descriptor`. | |
| 463 :return: array of descriptor values in the attribute :py:attr:`descriptor`. | |
| 464 :Example: | |
| 465 | |
| 466 >>> desc = GlobalDescriptor('GNSDLLIEQRTLLASDEF') | |
| 467 >>> desc.charge_density(ph=6, amide=True) | |
| 468 >>> desc.descriptor | |
| 469 array([[-0.00097119]]) | |
| 470 """ | |
| 471 self.calculate_charge(ph, amide) | |
| 472 charges = self.descriptor | |
| 473 self.calculate_MW(amide) | |
| 474 masses = self.descriptor | |
| 475 desc = charges / masses | |
| 476 desc = np.asarray(desc).reshape(len(desc), 1) | |
| 477 if append: | |
| 478 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
| 479 self.featurenames.append("ChargeDensity") | |
| 480 else: | |
| 481 self.descriptor = np.array(desc) | |
| 482 self.featurenames = ["ChargeDensity"] | |
| 483 | |
| 484 def isoelectric_point(self, amide=False, append=False): | |
| 485 """ | |
| 486 Method to calculate the isoelectric point of every sequence in the attribute :py:attr:`sequences`. | |
| 487 The pK scale is extracted from: http://www.hbcpnetbase.com/ (CRC Handbook of Chemistry and Physics, 96th ed). | |
| 488 | |
| 489 **pos_pKs** = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04} | |
| 490 | |
| 491 **neg_pKs** = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10} | |
| 492 | |
| 493 :param amide: {boolean} whether the sequences have an amidated C-terminus. | |
| 494 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
| 495 attribute :py:attr:`descriptor`. | |
| 496 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
| 497 :Example: | |
| 498 | |
| 499 >>> desc = GlobalDescriptor('KLFDIKFGHIPQRST') | |
| 500 >>> desc.isoelectric_point() | |
| 501 >>> desc.descriptor | |
| 502 array([[ 10.6796875]]) | |
| 503 """ | |
| 504 ph, ph1, ph2 = float(), float(), float() | |
| 505 desc = [] | |
| 506 for seq in self.sequences: | |
| 507 | |
| 508 # Bracket between ph1 and ph2 | |
| 509 ph = 7.0 | |
| 510 charge = _charge(seq, ph, amide) | |
| 511 if charge > 0.0: | |
| 512 ph1 = ph | |
| 513 charge1 = charge | |
| 514 while charge1 > 0.0: | |
| 515 ph = ph1 + 1.0 | |
| 516 charge = _charge(seq, ph, amide) | |
| 517 if charge > 0.0: | |
| 518 ph1 = ph | |
| 519 charge1 = charge | |
| 520 else: | |
| 521 ph2 = ph | |
| 522 break | |
| 523 else: | |
| 524 ph2 = ph | |
| 525 charge2 = charge | |
| 526 while charge2 < 0.0: | |
| 527 ph = ph2 - 1.0 | |
| 528 charge = _charge(seq, ph, amide) | |
| 529 if charge < 0.0: | |
| 530 ph2 = ph | |
| 531 charge2 = charge | |
| 532 else: | |
| 533 ph1 = ph | |
| 534 break | |
| 535 # Bisection | |
| 536 while ph2 - ph1 > 0.0001 and charge != 0.0: | |
| 537 ph = (ph1 + ph2) / 2.0 | |
| 538 charge = _charge(seq, ph, amide) | |
| 539 if charge > 0.0: | |
| 540 ph1 = ph | |
| 541 else: | |
| 542 ph2 = ph | |
| 543 desc.append(ph) | |
| 544 desc = np.asarray(desc).reshape(len(desc), 1) | |
| 545 if append: | |
| 546 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
| 547 self.featurenames.append("pI") | |
| 548 else: | |
| 549 self.descriptor = np.array(desc) | |
| 550 self.featurenames = ["pI"] | |
| 551 | |
| 552 def instability_index(self, append=False): | |
| 553 """ | |
| 554 Method to calculate the instability of every sequence in the attribute :py:attr:`sequences`. | |
| 555 The instability index is a prediction of protein stability based on the amino acid composition. | |
| 556 ([1] K. Guruprasad, B. V Reddy, M. W. Pandit, Protein Eng. 1990, 4, 155–161.) | |
| 557 | |
| 558 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
| 559 attribute :py:attr:`descriptor`. | |
| 560 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
| 561 :Example: | |
| 562 | |
| 563 >>> desc = GlobalDescriptor('LLASMNDLLAKRST') | |
| 564 >>> desc.instability_index() | |
| 565 >>> desc.descriptor | |
| 566 array([[ 63.95714286]]) | |
| 567 """ | |
| 568 | |
| 569 desc = [] | |
| 570 dimv = load_scale("instability")[1] | |
| 571 for seq in self.sequences: | |
| 572 stabindex = float() | |
| 573 for i in range(len(seq) - 1): | |
| 574 stabindex += dimv[seq[i]][seq[i + 1]] | |
| 575 desc.append((10.0 / len(seq)) * stabindex) | |
| 576 desc = np.asarray(desc).reshape(len(desc), 1) | |
| 577 if append: | |
| 578 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
| 579 self.featurenames.append("InstabilityInd") | |
| 580 else: | |
| 581 self.descriptor = np.array(desc) | |
| 582 self.featurenames = ["InstabilityInd"] | |
| 583 | |
| 584 def aromaticity(self, append=False): | |
| 585 """ | |
| 586 Method to calculate the aromaticity of every sequence in the attribute :py:attr:`sequences`. | |
| 587 According to Lobry, 1994, it is simply the relative frequency of Phe+Trp+Tyr. | |
| 588 | |
| 589 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
| 590 attribute :py:attr:`descriptor`. | |
| 591 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
| 592 :Example: | |
| 593 | |
| 594 >>> desc = GlobalDescriptor('GLFYWRFFLQRRFLYWW') | |
| 595 >>> desc.aromaticity() | |
| 596 >>> desc.descriptor | |
| 597 array([[ 0.52941176]]) | |
| 598 """ | |
| 599 desc = [] | |
| 600 for seq in self.sequences: | |
| 601 f = seq.count("F") | |
| 602 w = seq.count("W") | |
| 603 y = seq.count("Y") | |
| 604 desc.append(float(f + w + y) / len(seq)) | |
| 605 desc = np.asarray(desc).reshape(len(desc), 1) | |
| 606 if append: | |
| 607 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
| 608 self.featurenames.append("Aromaticity") | |
| 609 else: | |
| 610 self.descriptor = np.array(desc) | |
| 611 self.featurenames = ["Aromaticity"] | |
| 612 | |
| 613 def aliphatic_index(self, append=False): | |
| 614 """ | |
| 615 Method to calculate the aliphatic index of every sequence in the attribute :py:attr:`sequences`. | |
| 616 According to Ikai, 1980, the aliphatic index is a measure of thermal stability of proteins and is dependant | |
| 617 on the relative volume occupied by aliphatic amino acids (A,I,L & V). | |
| 618 ([1] A. Ikai, J. Biochem. 1980, 88, 1895–1898.) | |
| 619 | |
| 620 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
| 621 attribute :py:attr:`descriptor`. | |
| 622 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
| 623 :Example: | |
| 624 | |
| 625 >>> desc = GlobalDescriptor('KWLKYLKKLAKLVK') | |
| 626 >>> desc.aliphatic_index() | |
| 627 >>> desc.descriptor | |
| 628 array([[ 139.28571429]]) | |
| 629 """ | |
| 630 desc = [] | |
| 631 aa_dict = aa_weights() | |
| 632 for seq in self.sequences: | |
| 633 d = {aa: seq.count(aa) for aa in aa_dict.keys()} # count aa | |
| 634 d = { | |
| 635 k: (float(d[k]) / len(seq)) * 100 for k in d.keys() | |
| 636 } # get mole percent of all AA | |
| 637 desc.append( | |
| 638 d["A"] + 2.9 * d["V"] + 3.9 * (d["I"] + d["L"]) | |
| 639 ) # formula for calculating the AI (Ikai, 1980) | |
| 640 desc = np.asarray(desc).reshape(len(desc), 1) | |
| 641 if append: | |
| 642 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
| 643 self.featurenames.append("AliphaticInd") | |
| 644 else: | |
| 645 self.descriptor = np.array(desc) | |
| 646 self.featurenames = ["AliphaticInd"] | |
| 647 | |
| 648 def boman_index(self, append=False): | |
| 649 """Method to calculate the boman index of every sequence in the attribute :py:attr:`sequences`. | |
| 650 According to Boman, 2003, the boman index is a measure for protein-protein interactions and is calculated by | |
| 651 summing over all amino acid free energy of transfer [kcal/mol] between water and cyclohexane,[2] followed by | |
| 652 dividing by sequence length. | |
| 653 ([1] H. G. Boman, D. Wade, I. a Boman, B. Wåhlin, R. B. Merrifield, *FEBS Lett*. **1989**, *259*, 103–106. | |
| 654 [2] A. Radzick, R. Wolfenden, *Biochemistry* **1988**, *27*, 1664–1670.) | |
| 655 | |
| 656 .. seealso:: :py:func:`modlamp.core.aa_energies()` | |
| 657 | |
| 658 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
| 659 attribute :py:attr:`descriptor`. | |
| 660 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
| 661 :Example: | |
| 662 | |
| 663 >>> desc = GlobalDescriptor('GLFDIVKKVVGALGSL') | |
| 664 >>> desc.boman_index() | |
| 665 >>> desc.descriptor | |
| 666 array([[-1.011875]]) | |
| 667 """ | |
| 668 d = aa_energies() | |
| 669 desc = [] | |
| 670 for seq in self.sequences: | |
| 671 val = [] | |
| 672 for a in seq: | |
| 673 val.append(d[a]) | |
| 674 desc.append(sum(val) / len(val)) | |
| 675 desc = np.asarray(desc).reshape(len(desc), 1) | |
| 676 if append: | |
| 677 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
| 678 self.featurenames.append("BomanInd") | |
| 679 else: | |
| 680 self.descriptor = np.array(desc) | |
| 681 self.featurenames = ["BomanInd"] | |
| 682 | |
| 683 def hydrophobic_ratio(self, append=False): | |
| 684 """ | |
| 685 Method to calculate the hydrophobic ratio of every sequence in the attribute :py:attr:`sequences`, which is the | |
| 686 relative frequency of the amino acids **A,C,F,I,L,M & V**. | |
| 687 | |
| 688 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
| 689 attribute :py:attr:`descriptor`. | |
| 690 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
| 691 :Example: | |
| 692 | |
| 693 >>> desc = GlobalDescriptor('VALLYWRTVLLAIII') | |
| 694 >>> desc.hydrophobic_ratio() | |
| 695 >>> desc.descriptor | |
| 696 array([[ 0.73333333]]) | |
| 697 """ | |
| 698 desc = [] | |
| 699 aa_dict = aa_weights() | |
| 700 for seq in self.sequences: | |
| 701 pa = {aa: seq.count(aa) for aa in aa_dict.keys()} # count aa | |
| 702 # formula for calculating the AI (Ikai, 1980): | |
| 703 desc.append( | |
| 704 (pa["A"] + pa["C"] + pa["F"] + pa["I"] + pa["L"] + pa["M"] + pa["V"]) | |
| 705 / float(len(seq)) | |
| 706 ) | |
| 707 desc = np.asarray(desc).reshape(len(desc), 1) | |
| 708 if append: | |
| 709 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
| 710 self.featurenames.append("HydrophRatio") | |
| 711 else: | |
| 712 self.descriptor = np.array(desc) | |
| 713 self.featurenames = ["HydrophRatio"] | |
| 714 | |
| 715 def calculate_all(self, ph=7.4, amide=True): | |
| 716 """Method combining all global descriptors and appending them into the feature matrix in the attribute | |
| 717 :py:attr:`descriptor`. | |
| 718 | |
| 719 :param ph: {float} pH at which to calculate peptide charge | |
| 720 :param amide: {boolean} whether the sequences have an amidated C-terminus. | |
| 721 :return: array of descriptor values in the attribute :py:attr:`descriptor` | |
| 722 :Example: | |
| 723 | |
| 724 >>> desc = GlobalDescriptor('AFGHFKLKKLFIFGHERT') | |
| 725 >>> desc.calculate_all(amide=True) | |
| 726 >>> desc.featurenames | |
| 727 ['Length', 'MW', 'ChargeDensity', 'pI', 'InstabilityInd', 'Aromaticity', 'AliphaticInd', 'BomanInd', 'HydRatio'] | |
| 728 >>> desc.descriptor | |
| 729 array([[ 18., 2.17559000e+03, 1.87167619e-03, 1.16757812e+01, ... 1.10555556e+00, 4.44444444e-01]]) | |
| 730 >>> desc.save_descriptor('/path/to/outputfile.csv') # save the descriptor data (with feature names header) | |
| 731 """ | |
| 732 | |
| 733 # This is a strange way of doing it. However, the append=True option excludes length and charge, no idea why! | |
| 734 fn = [] | |
| 735 self.length() # sequence length | |
| 736 l = self.descriptor | |
| 737 fn.extend(self.featurenames) | |
| 738 self.calculate_MW(amide=amide) # molecular weight | |
| 739 mw = self.descriptor | |
| 740 fn.extend(self.featurenames) | |
| 741 self.calculate_charge(ph=ph, amide=amide) # net charge | |
| 742 c = self.descriptor | |
| 743 fn.extend(self.featurenames) | |
| 744 self.charge_density(ph=ph, amide=amide) # charge density | |
| 745 cd = self.descriptor | |
| 746 fn.extend(self.featurenames) | |
| 747 self.isoelectric_point(amide=amide) # pI | |
| 748 pi = self.descriptor | |
| 749 fn.extend(self.featurenames) | |
| 750 self.instability_index() # instability index | |
| 751 si = self.descriptor | |
| 752 fn.extend(self.featurenames) | |
| 753 self.aromaticity() # global aromaticity | |
| 754 ar = self.descriptor | |
| 755 fn.extend(self.featurenames) | |
| 756 self.aliphatic_index() # aliphatic index | |
| 757 ai = self.descriptor | |
| 758 fn.extend(self.featurenames) | |
| 759 self.boman_index() # Boman index | |
| 760 bi = self.descriptor | |
| 761 fn.extend(self.featurenames) | |
| 762 self.hydrophobic_ratio() # Hydrophobic ratio | |
| 763 hr = self.descriptor | |
| 764 fn.extend(self.featurenames) | |
| 765 | |
| 766 self.descriptor = np.concatenate((l, mw, c, cd, pi, si, ar, ai, bi, hr), axis=1) | |
| 767 self.featurenames = fn | |
| 768 | |
| 769 | |
| 770 class PeptideDescriptor(BaseDescriptor): | |
| 771 """Base class for peptide descriptors. The following **amino acid descriptor scales** are available for descriptor | |
| 772 calculation: | |
| 773 | |
| 774 - **AASI** (An amino acid selectivity index scale for helical antimicrobial peptides, *[1] D. Juretić, D. Vukicević, N. Ilić, N. Antcheva, A. Tossi, J. Chem. Inf. Model. 2009, 49, 2873–2882.*) | |
| 775 - **ABHPRK** (modlabs inhouse physicochemical feature scale (Acidic, Basic, Hydrophobic, Polar, aRomatic, Kink-inducer) | |
| 776 - **argos** (Argos hydrophobicity amino acid scale, *[2] Argos, P., Rao, J. K. M. & Hargrave, P. A., Eur. J. Biochem. 2005, 128, 565–575.*) | |
| 777 - **bulkiness** (Amino acid side chain bulkiness scale, *[3] J. M. Zimmerman, N. Eliezer, R. Simha, J. Theor. Biol. 1968, 21, 170–201.*) | |
| 778 - **charge_phys** (Amino acid charge at pH 7.0 - Hystidine charge +0.1.) | |
| 779 - **charge_acid** (Amino acid charge at acidic pH - Hystidine charge +1.0.) | |
| 780 - **cougar** (modlabs inhouse selection of global peptide descriptors) | |
| 781 - **eisenberg** (the Eisenberg hydrophobicity consensus amino acid scale, *[4] D. Eisenberg, R. M. Weiss, T. C. Terwilliger, W. Wilcox, Faraday Symp. Chem. Soc. 1982, 17, 109.*) | |
| 782 - **Ez** (potential that assesses energies of insertion of amino acid side chains into lipid bilayers, *[5] A. Senes, D. C. Chadi, P. B. Law, R. F. S. Walters, V. Nanda, W. F. DeGrado, J. Mol. Biol. 2007, 366, 436–448.*) | |
| 783 - **flexibility** (amino acid side chain flexibilitiy scale, *[6] R. Bhaskaran, P. K. Ponnuswamy, Int. J. Pept. Protein Res. 1988, 32, 241–255.*) | |
| 784 - **grantham** (amino acid side chain composition, polarity and molecular volume, *[8] Grantham, R. Science. 185, 862–864 (1974).*) | |
| 785 - **gravy** (GRAVY hydrophobicity amino acid scale, *[9] J. Kyte, R. F. Doolittle, J. Mol. Biol. 1982, 157, 105–132.*) | |
| 786 - **hopp-woods** (Hopp-Woods amino acid hydrophobicity scale,*[10] T. P. Hopp, K. R. Woods, Proc. Natl. Acad. Sci. 1981, 78, 3824–3828.*) | |
| 787 - **ISAECI** (Isotropic Surface Area (ISA) and Electronic Charge Index (ECI) of amino acid side chains, *[11] E. R. Collantes, W. J. Dunn, J. Med. Chem. 1995, 38, 2705–2713.*) | |
| 788 - **janin** (Janin hydrophobicity amino acid scale, *[12] J. L. Cornette, K. B. Cease, H. Margalit, J. L. Spouge, J. A. Berzofsky, C. DeLisi, J. Mol. Biol. 1987, 195, 659–685.*) | |
| 789 - **kytedoolittle** (Kyte & Doolittle hydrophobicity amino acid scale, *[13] J. Kyte, R. F. Doolittle, J. Mol. Biol. 1982, 157, 105–132.*) | |
| 790 - **levitt_alpha** (Levitt amino acid alpha-helix propensity scale, extracted from http://web.expasy.org/protscale. *[14] M. Levitt, Biochemistry 1978, 17, 4277-4285.*) | |
| 791 - **MSS** (A graph-theoretical index that reflects topological shape and size of amino acid side chains, *[15] C. Raychaudhury, A. Banerjee, P. Bag, S. Roy, J. Chem. Inf. Comput. Sci. 1999, 39, 248–254.*) | |
| 792 - **MSW** (Amino acid scale based on a PCA of the molecular surface based WHIM descriptor (MS-WHIM), extended to natural amino acids, *[16] A. Zaliani, E. Gancia, J. Chem. Inf. Comput. Sci 1999, 39, 525–533.*) | |
| 793 - **pepArc** (modlabs pharmacophoric feature scale, dimensions are: hydrophobicity, polarity, positive charge, negative charge, proline.) | |
| 794 - **pepcats** (modlabs pharmacophoric feature based PEPCATS scale, *[17] C. P. Koch, A. M. Perna, M. Pillong, N. K. Todoroff, P. Wrede, G. Folkers, J. A. Hiss, G. Schneider, PLoS Comput. Biol. 2013, 9, e1003088.*) | |
| 795 - **polarity** (Amino acid polarity scale, *[18] J. M. Zimmerman, N. Eliezer, R. Simha, J. Theor. Biol. 1968, 21, 170–201.*) | |
| 796 - **PPCALI** (modlabs inhouse scale derived from a PCA of 143 amino acid property scales, *[19] C. P. Koch, A. M. Perna, M. Pillong, N. K. Todoroff, P. Wrede, G. Folkers, J. A. Hiss, G. Schneider, PLoS Comput. Biol. 2013, 9, e1003088.*) | |
| 797 - **refractivity** (Relative amino acid refractivity values, *[20] T. L. McMeekin, M. Wilensky, M. L. Groves, Biochem. Biophys. Res. Commun. 1962, 7, 151–156.*) | |
| 798 - **t_scale** (A PCA derived scale based on amino acid side chain properties calculated with 6 different probes of the GRID program, *[21] M. Cocchi, E. Johansson, Quant. Struct. Act. Relationships 1993, 12, 1–8.*) | |
| 799 - **TM_tend** (Amino acid transmembrane propensity scale, extracted from http://web.expasy.org/protscale, *[22] Zhao, G., London E. Protein Sci. 2006, 15, 1987-2001.*) | |
| 800 - **z3** (The original three dimensional Z-scale, *[23] S. Hellberg, M. Sjöström, B. Skagerberg, S. Wold, J. Med. Chem. 1987, 30, 1126–1135.*) | |
| 801 - **z5** (The extended five dimensional Z-scale, *[24] M. Sandberg, L. Eriksson, J. Jonsson, M. Sjöström, S. Wold, J. Med. Chem. 1998, 41, 2481–2491.*) | |
| 802 | |
| 803 Further, amino acid scale independent methods can be calculated with help of the :class:`GlobalDescriptor` class. | |
| 804 | |
| 805 """ | |
| 806 | |
| 807 def __init__(self, seqs, scalename="Eisenberg"): | |
| 808 """ | |
| 809 :param seqs: a .fasta file with sequences, a list of sequences or a single sequence as string to calculate the | |
| 810 descriptor values for. | |
| 811 :param scalename: {str} name of the amino acid scale (one of the given list above) used to calculate the | |
| 812 descriptor values | |
| 813 :return: initialized attributes :py:attr:`sequences`, :py:attr:`names` and dictionary :py:attr:`scale` with | |
| 814 amino acid scale values of the scale name in :py:attr:`scalename`. | |
| 815 :Example: | |
| 816 | |
| 817 >>> AMP = PeptideDescriptor('KLLKLLKKLLKLLK','pepcats') | |
| 818 >>> AMP.sequences | |
| 819 ['KLLKLLKKLLKLLK'] | |
| 820 >>> seqs = PeptideDescriptor('/Path/to/file.fasta', 'eisenberg') # load sequences from .fasta file | |
| 821 >>> seqs.sequences | |
| 822 ['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR'...] | |
| 823 """ | |
| 824 super(PeptideDescriptor, self).__init__(seqs) | |
| 825 self.scalename, self.scale = load_scale(scalename.lower()) | |
| 826 self.all_moms = list() # for passing hydrophobic moments to calculate_profile | |
| 827 self.all_globs = list() # for passing global to calculate_profile | |
| 828 | |
| 829 def load_scale(self, scalename): | |
| 830 """Method to load amino acid values from a given scale | |
| 831 | |
| 832 :param scalename: {str} name of the amino acid scale to be loaded. | |
| 833 :return: loaded amino acid scale values in a dictionary in the attribute :py:attr:`scale`. | |
| 834 | |
| 835 .. seealso:: :func:`modlamp.core.load_scale()` | |
| 836 """ | |
| 837 self.scalename, self.scale = load_scale(scalename.lower()) | |
| 838 | |
| 839 def calculate_autocorr(self, window, append=False): | |
| 840 """Method for auto-correlating the amino acid values for a given descriptor scale | |
| 841 | |
| 842 :param window: {int} correlation window for descriptor calculation in a sliding window approach | |
| 843 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
| 844 attribute :py:attr:`descriptor`. | |
| 845 :return: calculated descriptor numpy.array in the attribute :py:attr:`descriptor`. | |
| 846 :Example: | |
| 847 | |
| 848 >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL','PPCALI') | |
| 849 >>> AMP.calculate_autocorr(7) | |
| 850 >>> AMP.descriptor | |
| 851 array([[ 1.28442339e+00, 1.29025116e+00, 1.03240901e+00, .... ]]) | |
| 852 >>> AMP.descriptor.shape | |
| 853 (1, 133) | |
| 854 | |
| 855 .. versionchanged:: v.2.3.0 | |
| 856 """ | |
| 857 desc = Parallel(n_jobs=-1)( | |
| 858 delayed(_one_autocorr)(seq, window, self.scale) for seq in self.sequences | |
| 859 ) | |
| 860 | |
| 861 if append: | |
| 862 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
| 863 else: | |
| 864 self.descriptor = np.array(desc) | |
| 865 | |
| 866 def calculate_crosscorr(self, window, append=False): | |
| 867 """Method for cross-correlating the amino acid values for a given descriptor scale | |
| 868 | |
| 869 :param window: {int} correlation window for descriptor calculation in a sliding window approach | |
| 870 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
| 871 attribute :py:attr:`descriptor`. | |
| 872 :return: calculated descriptor numpy.array in the attribute :py:attr:`descriptor`. | |
| 873 :Example: | |
| 874 | |
| 875 >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL','pepcats') | |
| 876 >>> AMP.calculate_crosscorr(7) | |
| 877 >>> AMP.descriptor | |
| 878 array([[ 0.6875 , 0.46666667, 0.42857143, 0.61538462, 0.58333333, ... ]]) | |
| 879 >>> AMP.descriptor.shape | |
| 880 (1, 147) | |
| 881 """ | |
| 882 desc = Parallel(n_jobs=-1)( | |
| 883 delayed(_one_crosscorr)(seq, window, self.scale) for seq in self.sequences | |
| 884 ) | |
| 885 | |
| 886 if append: | |
| 887 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
| 888 else: | |
| 889 self.descriptor = np.array(desc) | |
| 890 | |
| 891 def calculate_moment(self, window=1000, angle=100, modality="max", append=False): | |
| 892 """Method for calculating the maximum or mean moment of the amino acid values for a given descriptor scale and | |
| 893 window. | |
| 894 | |
| 895 :param window: {int} amino acid window in which to calculate the moment. If the sequence is shorter than the | |
| 896 window, the length of the sequence is taken. So if the default window of 1000 is chosen, for all sequences | |
| 897 shorter than 1000, the **global** hydrophobic moment will be calculated. Otherwise, the maximal | |
| 898 hydrophiobic moment for the chosen window size found in the sequence will be returned. | |
| 899 :param angle: {int} angle in which to calculate the moment. **100** for alpha helices, **180** for beta sheets. | |
| 900 :param modality: {'all', 'max' or 'mean'} Calculate respectively maximum or mean hydrophobic moment. If all, | |
| 901 moments for all windows are returned. | |
| 902 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
| 903 attribute :py:attr:`descriptor`. | |
| 904 :return: Calculated descriptor as a numpy.array in the attribute :py:attr:`descriptor` and all possible global | |
| 905 values in :py:attr:`all_moms` (needed for the :py:func:`calculate_profile` method) | |
| 906 :Example: | |
| 907 | |
| 908 >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL', 'eisenberg') | |
| 909 >>> AMP.calculate_moment() | |
| 910 >>> AMP.descriptor | |
| 911 array([[ 0.48790226]]) | |
| 912 """ | |
| 913 if self.scale["A"] == list: | |
| 914 print( | |
| 915 "\n Descriptor moment calculation is only possible for one dimensional descriptors.\n" | |
| 916 ) | |
| 917 | |
| 918 else: | |
| 919 desc = [] | |
| 920 for seq in self.sequences: | |
| 921 wdw = min( | |
| 922 window, len(seq) | |
| 923 ) # if sequence is shorter than window, take the whole sequence instead | |
| 924 mtrx = [] | |
| 925 mwdw = [] | |
| 926 | |
| 927 for aa in range(len(seq)): | |
| 928 mtrx.append(self.scale[str(seq[aa])]) | |
| 929 | |
| 930 for i in range(len(mtrx) - wdw + 1): | |
| 931 mwdw.append(sum(mtrx[i : i + wdw], [])) | |
| 932 | |
| 933 mwdw = np.asarray(mwdw) | |
| 934 rads = ( | |
| 935 angle * (np.pi / 180) * np.asarray(range(wdw)) | |
| 936 ) # calculate actual moment (radial) | |
| 937 vcos = (mwdw * np.cos(rads)).sum(axis=1) | |
| 938 vsin = (mwdw * np.sin(rads)).sum(axis=1) | |
| 939 moms = np.sqrt(vsin**2 + vcos**2) / wdw | |
| 940 | |
| 941 if modality == "max": # take window with maximal value | |
| 942 moment = np.max(moms) | |
| 943 elif modality == "mean": # take average value over all windows | |
| 944 moment = np.mean(moms) | |
| 945 elif modality == "all": | |
| 946 moment = moms | |
| 947 else: | |
| 948 print( | |
| 949 '\nERROR!\nModality parameter is wrong, please choose between "all", "max" and "mean".\n' | |
| 950 ) | |
| 951 return | |
| 952 desc.append(moment) | |
| 953 self.all_moms.append(moms) | |
| 954 | |
| 955 desc = np.asarray(desc).reshape(len(desc), 1) # final descriptor array | |
| 956 | |
| 957 if append: | |
| 958 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
| 959 else: | |
| 960 self.descriptor = np.array(desc) | |
| 961 | |
| 962 def calculate_global(self, window=1000, modality="max", append=False): | |
| 963 """Method for calculating a global / window averaging descriptor value of a given AA scale | |
| 964 | |
| 965 :param window: {int} amino acid window in which to calculate the moment. If the sequence is shorter than the | |
| 966 window, the length of the sequence is taken. | |
| 967 :param modality: {'max' or 'mean'} Calculate respectively maximum or mean hydrophobic moment. | |
| 968 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
| 969 attribute :py:attr:`descriptor`. | |
| 970 :return: Calculated descriptor as a numpy.array in the attribute :py:attr:`descriptor` and all possible global | |
| 971 values in :py:attr:`all_globs` (needed for the :py:func:`calculate_profile` method) | |
| 972 :Example: | |
| 973 | |
| 974 >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL','eisenberg') | |
| 975 >>> AMP.calculate_global(window=1000, modality='max') | |
| 976 >>> AMP.descriptor | |
| 977 array([[ 0.44875]]) | |
| 978 """ | |
| 979 desc = list() | |
| 980 for n, seq in enumerate(self.sequences): | |
| 981 wdw = min( | |
| 982 window, len(seq) | |
| 983 ) # if sequence is shorter than window, take the whole sequence instead | |
| 984 mtrx = [] | |
| 985 mwdw = [] | |
| 986 | |
| 987 for l in range(len(seq)): # translate AA sequence into values | |
| 988 mtrx.append(self.scale[str(seq[l])]) | |
| 989 | |
| 990 for i in range(len(mtrx) - wdw + 1): | |
| 991 mwdw.append( | |
| 992 sum(mtrx[i : i + wdw], []) | |
| 993 ) # list of all the values for the different windows | |
| 994 | |
| 995 mwdw = np.asarray(mwdw) | |
| 996 glob = np.sum(mwdw, axis=1) / float(wdw) | |
| 997 outglob = float() | |
| 998 | |
| 999 if modality in ["max", "mean"]: | |
| 1000 if modality == "max": | |
| 1001 outglob = np.max( | |
| 1002 glob | |
| 1003 ) # returned moment will be the maximum of all windows | |
| 1004 elif modality == "mean": | |
| 1005 outglob = np.mean( | |
| 1006 glob | |
| 1007 ) # returned moment will be the mean of all windows | |
| 1008 else: | |
| 1009 print( | |
| 1010 'Modality parameter is wrong, please choose between "max" and "mean"\n.' | |
| 1011 ) | |
| 1012 return | |
| 1013 desc.append(outglob) | |
| 1014 self.all_globs.append(glob) | |
| 1015 | |
| 1016 desc = np.asarray(desc).reshape(len(desc), 1) | |
| 1017 if append: | |
| 1018 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
| 1019 else: | |
| 1020 self.descriptor = np.array(desc) | |
| 1021 | |
| 1022 def calculate_profile(self, prof_type="uH", window=7, append=False): | |
| 1023 """Method for calculating hydrophobicity or hydrophobic moment profiles for given sequences and fitting for | |
| 1024 slope and intercept. The hydrophobicity scale used is "eisenberg" | |
| 1025 | |
| 1026 :param prof_type: prof_type of profile, available: 'H' for hydrophobicity or 'uH' for hydrophobic moment | |
| 1027 :param window: {int} size of sliding window used (odd-numbered). | |
| 1028 :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the | |
| 1029 attribute :py:attr:`descriptor`. | |
| 1030 :return: Fitted slope and intercept of calculated profile for every given sequence in the attribute | |
| 1031 :py:attr:`descriptor`. | |
| 1032 :Example: | |
| 1033 | |
| 1034 >>> AMP = PeptideDescriptor('KLLKLLKKVVGALG','kytedoolittle') | |
| 1035 >>> AMP.calculate_profile(prof_type='H') | |
| 1036 >>> AMP.descriptor | |
| 1037 array([[ 0.03731293, 0.19246599]]) | |
| 1038 """ | |
| 1039 if prof_type == "uH": | |
| 1040 self.calculate_moment(window=window) | |
| 1041 y_vals = self.all_moms | |
| 1042 elif prof_type == "H": | |
| 1043 self.calculate_global(window=window) | |
| 1044 y_vals = self.all_globs | |
| 1045 else: | |
| 1046 print( | |
| 1047 'prof_type parameter is unknown, choose "uH" for hydrophobic moment or "H" for hydrophobicity\n.' | |
| 1048 ) | |
| 1049 sys.exit() | |
| 1050 | |
| 1051 desc = list() | |
| 1052 for n, seq in enumerate(self.sequences): | |
| 1053 x_vals = range(len(seq))[int((window - 1) / 2) : -int((window - 1) / 2)] | |
| 1054 if len(seq) <= window: | |
| 1055 slope, intercept, r_value, p_value, std_err = [0, 0, 0, 0, 0] | |
| 1056 else: | |
| 1057 slope, intercept, r_value, p_value, std_err = stats.linregress( | |
| 1058 x_vals, y_vals[n] | |
| 1059 ) | |
| 1060 desc.append([slope, intercept]) | |
| 1061 | |
| 1062 if append: | |
| 1063 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
| 1064 else: | |
| 1065 self.descriptor = np.array(desc) | |
| 1066 | |
| 1067 def calculate_arc(self, modality="max", append=False): | |
| 1068 """Method for calculating property arcs as seen in the helical wheel plot. Use for binary amino acid scales only. | |
| 1069 | |
| 1070 :param modality: modality of the arc to calculate, to choose between "max" and "mean". | |
| 1071 :param append: if true, append to current descriptor stored in the descriptor attribute. | |
| 1072 :return: calculated descriptor as numpy.array in the descriptor attribute. | |
| 1073 | |
| 1074 :Example: | |
| 1075 | |
| 1076 >>> arc = PeptideDescriptor("KLLKLLKKLLKLLK", scalename="peparc") | |
| 1077 >>> arc.calculate_arc(modality="max", append=False) | |
| 1078 >>> arc.descriptor | |
| 1079 array([[200, 160, 160, 0, 0]]) | |
| 1080 """ | |
| 1081 desc = Parallel(n_jobs=-1)( | |
| 1082 delayed(_one_arc)(seq, modality, self.scale) for seq in self.sequences | |
| 1083 ) | |
| 1084 | |
| 1085 # Converts each of the amino acids to descriptor vector | |
| 1086 for seq in self.sequences: | |
| 1087 | |
| 1088 # desc_mat = [] | |
| 1089 # for aa in seq: | |
| 1090 # desc_mat.append(self.scale[aa]) | |
| 1091 # desc_mat = np.asarray(desc_mat) | |
| 1092 # | |
| 1093 # # Check descriptor dimension | |
| 1094 # desc_dim = desc_mat.shape[1] | |
| 1095 # | |
| 1096 # # list to store descriptor values for all windows | |
| 1097 # allwindows_arc = [] | |
| 1098 # | |
| 1099 # if len(seq) > 18: | |
| 1100 # window = 18 | |
| 1101 # # calculates number of windows in sequence | |
| 1102 # num_windows = len(seq) - window | |
| 1103 # else: | |
| 1104 # window = len(seq) | |
| 1105 # num_windows = 1 | |
| 1106 # | |
| 1107 # # loop through all windows | |
| 1108 # for j in range(num_windows): | |
| 1109 # # slices descriptor matrix into current window | |
| 1110 # window_mat = desc_mat[j:j + window, :] | |
| 1111 # | |
| 1112 # # defines order of amino acids in helical projection | |
| 1113 # order = [0, 11, 4, 15, 8, 1, 12, 5, 16, 9, 2, 13, 6, 17, 10, 3, 14, 7] | |
| 1114 # | |
| 1115 # # orders window descriptor matrix into helical projection order | |
| 1116 # ordered = [] | |
| 1117 # for pos in order: | |
| 1118 # try: | |
| 1119 # ordered.append(window_mat[pos, :]) | |
| 1120 # except: | |
| 1121 # # for sequences of len < 18 adding dummy vector with 2s, length of descriptor dimensions | |
| 1122 # ordered.append([2] * desc_dim) | |
| 1123 # ordered = np.asarray(ordered) | |
| 1124 # | |
| 1125 # window_arc = [] | |
| 1126 # | |
| 1127 # # loop through pharmacophoric features | |
| 1128 # for m in range(desc_dim): | |
| 1129 # all_arcs = [] # stores all arcs that can be found of a pharmacophoric feature | |
| 1130 # arc = 0 | |
| 1131 # | |
| 1132 # for n in range(18): # for all positions in helix, regardless of sequence length | |
| 1133 # if ordered[n, m] == 0: # if position does not contain pharmacophoric feature | |
| 1134 # all_arcs.append(arc) # append previous arc to all arcs list | |
| 1135 # arc = 0 # arc is initialized | |
| 1136 # elif ordered[n, m] == 1: # if position contains pharmacophoric feature(PF), elongate arc by 20° | |
| 1137 # arc += 20 | |
| 1138 # elif ordered[n, m] == 2: # if position doesn't contain amino acid: | |
| 1139 # if ordered[n - 1, m] == 1: # if previous position contained PF add 10° | |
| 1140 # arc += 10 | |
| 1141 # elif ordered[n - 1, m] == 0: # if previous position didn't contain PF don't add anything | |
| 1142 # arc += 0 | |
| 1143 # elif ordered[ | |
| 1144 # n - 2, m] == 1: # if previous position is empty then check second previous for PF | |
| 1145 # arc += 10 | |
| 1146 # if n == 17: # if we are at the last position check for position n=0 instead of next position. | |
| 1147 # if ordered[0, m] == 1: # if it contains PF add 10° extra | |
| 1148 # arc += 10 | |
| 1149 # else: # if next position contains PF add 10° extra | |
| 1150 # if ordered[n + 1, m] == 1: | |
| 1151 # arc += 10 | |
| 1152 # elif ordered[n + 1, m] == 0: | |
| 1153 # arc += 0 | |
| 1154 # else: # if next position is empty check for 2nd next position | |
| 1155 # if n == 16: | |
| 1156 # if ordered[0, m] == 1: | |
| 1157 # arc += 10 | |
| 1158 # else: | |
| 1159 # if ordered[n + 2, m] == 1: | |
| 1160 # arc += 10 | |
| 1161 # | |
| 1162 # all_arcs.append(arc) | |
| 1163 # if not arc == 360: | |
| 1164 # arc0 = all_arcs.pop() + all_arcs[0] # join first and last arc together | |
| 1165 # all_arcs = [arc0] + all_arcs[1:] | |
| 1166 # | |
| 1167 # window_arc.append(np.max(all_arcs)) # append to window arcs the maximum arc of this PF | |
| 1168 # allwindows_arc.append(window_arc) # append all PF arcs of this window | |
| 1169 # | |
| 1170 # allwindows_arc = np.asarray(allwindows_arc) | |
| 1171 # | |
| 1172 # if modality == 'max': | |
| 1173 # final_arc = np.max(allwindows_arc, axis=0) # calculate maximum / mean arc along all windows | |
| 1174 # elif modality == 'mean': | |
| 1175 # final_arc = np.mean(allwindows_arc, axis=0) | |
| 1176 # else: | |
| 1177 # print('modality is unknown, please choose between "max" and "mean"\n.') | |
| 1178 # sys.exit() | |
| 1179 | |
| 1180 if append: | |
| 1181 self.descriptor = np.hstack((self.descriptor, np.array(desc))) | |
| 1182 else: | |
| 1183 self.descriptor = np.array(desc) |
