| 26 | 1 | 
|  | 2 # -*- coding: utf-8 -*- | 
|  | 3 """ | 
This module computes the composition of amino acids, dipeptides and
3-mers (tri-peptides) for a given protein sequence.
|  | 6 References | 
|  | 7 ---------- | 
|  | 8 .. [1] Reczko, M. and Bohr, H. (1994) The DEF data base of sequence based protein | 
|  | 9    fold class predictions. Nucleic Acids Res, 22, 3616-3619. | 
|  | 10 .. [2] Hua, S. and Sun, Z. (2001) Support vector machine approach for protein | 
|  | 11    subcellular localization prediction. Bioinformatics, 17, 721-728. | 
|  | 12 .. [3] Grassmann, J., Reczko, M., Suhai, S. and Edler, L. (1999) Protein fold | 
|  | 13    class prediction: new methods of statistical classification. Proc Int Conf | 
|  | 14    Intell Syst Mol Biol, 106-112. | 
|  | 15 Authors: Dongsheng Cao and Yizeng Liang. | 
|  | 16 Date: 2012.3.27 | 
|  | 17 Email: oriental-cds@163.com | 
|  | 18 """ | 
|  | 19 | 
|  | 20 # Core Library | 
|  | 21 import re | 
|  | 22 from typing import Any, Dict, List | 
|  | 23 | 
# One-letter codes of the 20 amino acids handled by this module. The order of
# this list fixes the key order of every feature dictionary built below.
AALetter: List[str] = [
    "A", "R", "N", "D", "C", "E", "Q", "G", "H", "I",
    "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V",
]

# Reusable description of the ``ProteinSequence`` parameter.
# NOTE(review): not referenced by the code visible here; presumably meant to be
# spliced into function docstrings -- confirm against the rest of the package.
ProteinSequence_docstring = (
    "ProteinSequence: str\n"
    "        a pure protein sequence"
)
|  | 28 | 
|  | 29 | 
def CalculateAAComposition(ProteinSequence: str) -> Dict[str, float]:
    """
    Calculate the percentage composition of each amino acid in a sequence.

    Parameters
    ----------
    ProteinSequence: str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, float]
        Maps every letter in ``AALetter`` (in that order) to its share of the
        sequence in percent, rounded to 3 decimals. Characters that are not
        in ``AALetter`` get no entry of their own but still count towards the
        sequence length. For an empty sequence every value is ``0.0``.
    """
    sequence_length = len(ProteinSequence)
    if sequence_length == 0:
        # Nothing to divide by: report an all-zero composition instead of
        # raising ZeroDivisionError.
        return {aa: 0.0 for aa in AALetter}
    return {
        aa: round(float(ProteinSequence.count(aa)) / sequence_length * 100, 3)
        for aa in AALetter
    }
|  | 36 | 
|  | 37 | 
def CalculateDipeptideComposition(ProteinSequence: str) -> Dict[str, float]:
    """
    Calculate the percentage composition of each dipeptide in a sequence.

    Every pair of adjacent residues is one dipeptide, so a sequence of
    length ``n`` contains ``n - 1`` (overlapping) dipeptides; e.g. ``"AAA"``
    contains ``"AA"`` twice.

    Parameters
    ----------
    ProteinSequence: str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, float]
        Maps each two-letter combination of ``AALetter`` (first letter
        varying slowest) to its share of all adjacent pairs in percent,
        rounded to 2 decimals. Pairs containing a character outside
        ``AALetter`` get no entry but still count towards the total. For
        sequences shorter than two residues every value is ``0.0``.
    """
    counts: Dict[str, int] = dict.fromkeys(
        (first + second for first in AALetter for second in AALetter), 0
    )
    pair_total = len(ProteinSequence) - 1
    if pair_total <= 0:
        # No adjacent pair exists; avoid dividing by zero (or by -1).
        return {dipeptide: 0.0 for dipeptide in counts}

    # Slide a window of width 2 over the sequence so that overlapping
    # occurrences are counted; str.count would skip them.
    for start in range(pair_total):
        dipeptide = ProteinSequence[start:start + 2]
        if dipeptide in counts:
            counts[dipeptide] += 1

    return {
        dipeptide: round(float(occurrences) / pair_total * 100, 2)
        for dipeptide, occurrences in counts.items()
    }
|  | 48 | 
|  | 49 | 
def Getkmers() -> List[str]:
    """
    Build every 3-mer (tri-peptide) that can be formed from ``AALetter``.

    Returns
    -------
    kmers : List[str]
        All three-letter combinations of the letters in ``AALetter``, with
        the first letter varying slowest and the last letter fastest.
    """
    return [
        first + second + third
        for first in AALetter
        for second in AALetter
        for third in AALetter
    ]
|  | 57 | 
|  | 58 | 
def GetSpectrumDict(proteinsequence: str) -> Dict[str, int]:
    """
    Count the occurrences of every 3-mer (tri-peptide) in a sequence.

    Occurrences are counted with a sliding window, so overlapping matches
    are included; e.g. ``"AAAA"`` contains ``"AAA"`` twice.

    Parameters
    ----------
    proteinsequence: str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, int]
        Maps each 3-mer returned by ``Getkmers()`` (in that order) to the
        number of times it occurs in the sequence. 3-mers that do not occur
        map to ``0``; windows containing a character outside the known
        3-mers are ignored.
    """
    spectrum: Dict[str, int] = dict.fromkeys(Getkmers(), 0)
    # One pass over the sequence instead of one regex scan per 3-mer.
    # range() is empty for sequences shorter than three residues.
    for start in range(len(proteinsequence) - 2):
        kmer = proteinsequence[start:start + 3]
        if kmer in spectrum:
            spectrum[kmer] += 1
    return spectrum
|  | 65 | 
|  | 66 | 
def CalculateAADipeptideComposition(ProteinSequence: str) -> Dict[str, float]:
    """
    Combine amino acid, dipeptide and 3-mer features into one dictionary.

    Parameters
    ----------
    ProteinSequence: str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, float]
        The merged output of ``CalculateAAComposition``,
        ``CalculateDipeptideComposition`` and ``GetSpectrumDict``, in that
        order. The one- and two-letter keys hold percentages; the
        three-letter keys hold the occurrence counts from
        ``GetSpectrumDict``.
    """
    feature_calculators = (
        CalculateAAComposition,
        CalculateDipeptideComposition,
        GetSpectrumDict,
    )
    combined: Dict[Any, Any] = {}
    for calculate in feature_calculators:
        combined.update(calculate(ProteinSequence))
    return combined
|  | 73 | 
|  | 74 | 
|  | 75 | 
|  | 76 | 
|  | 77 | 
|  | 78 |