26
|
1
|
|
2 # -*- coding: utf-8 -*-
|
|
3 """
|
|
4 The module is used for computing the composition of amino acids, dipeptides and
|
|
5 3-mers (tri-peptide) for a given protein sequence.
|
|
6 References
|
|
7 ----------
|
|
8 .. [1] Reczko, M. and Bohr, H. (1994) The DEF data base of sequence based protein
|
|
9 fold class predictions. Nucleic Acids Res, 22, 3616-3619.
|
|
10 .. [2] Hua, S. and Sun, Z. (2001) Support vector machine approach for protein
|
|
11 subcellular localization prediction. Bioinformatics, 17, 721-728.
|
|
12 .. [3] Grassmann, J., Reczko, M., Suhai, S. and Edler, L. (1999) Protein fold
|
|
13 class prediction: new methods of statistical classification. Proc Int Conf
|
|
14 Intell Syst Mol Biol, 106-112.
|
|
15 Authors: Dongsheng Cao and Yizeng Liang.
|
|
16 Date: 2012.3.27
|
|
17 Email: oriental-cds@163.com
|
|
18 """
|
|
19
|
|
20 # Core Library
|
|
21 import re
|
|
22 from typing import Any, Dict, List
|
|
23
|
|
# One-letter codes of the 20 standard amino acids. The functions in this
# module iterate over this list, so its order determines the key order of the
# dictionaries they build.
AALetter: List[str] = [
    "A", "R", "N", "D", "C", "E", "Q", "G", "H", "I",
    "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V",
]
|
|
25
|
|
# Reusable parameter description for a ``ProteinSequence`` argument.
# NOTE(review): presumably meant to be spliced into function docstrings; no
# use is visible in this part of the file -- confirm.
# NOTE(review): the leading whitespace of the second line could not be
# recovered from this view of the file -- confirm against the original.
ProteinSequence_docstring = """ProteinSequence: str
a pure protein sequence"""
|
|
28
|
|
29
|
|
def CalculateAAComposition(ProteinSequence: str) -> Dict[str, float]:
    """
    Calculate the percentage composition of amino acids in a protein sequence.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, float]
        Maps every one-letter code in ``AALetter`` to the percentage (0-100,
        rounded to 3 decimals) of positions in the sequence holding that
        letter. All values are 0.0 when the sequence is empty.
    """
    sequence_length = len(ProteinSequence)
    if sequence_length == 0:
        # Nothing to count; returning zeros avoids a ZeroDivisionError.
        return {amino_acid: 0.0 for amino_acid in AALetter}
    return {
        amino_acid: round(
            float(ProteinSequence.count(amino_acid)) / sequence_length * 100, 3
        )
        for amino_acid in AALetter
    }
|
|
36
|
|
37
|
|
def CalculateDipeptideComposition(ProteinSequence: str) -> Dict[str, float]:
    """
    Calculate the percentage composition of dipeptides in a protein sequence.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, float]
        Maps each of the 400 two-letter combinations of ``AALetter`` to the
        percentage (0-100, rounded to 2 decimals) of adjacent residue pairs
        in the sequence that equal it. All values are 0.0 when the sequence
        has fewer than two residues.
    """
    # A sequence of length n contains n - 1 adjacent (overlapping) pairs.
    pair_total = len(ProteinSequence) - 1
    if pair_total <= 0:
        # No pair exists; returning zeros avoids dividing by zero.
        return {first + second: 0.0 for first in AALetter for second in AALetter}

    # Slide a 2-residue window over the sequence. ``str.count`` is not used
    # because it only counts non-overlapping occurrences ("AAA" contains the
    # pair "AA" twice, but ``"AAA".count("AA")`` is 1).
    observed: Dict[str, int] = {}
    for start in range(pair_total):
        pair = ProteinSequence[start:start + 2]
        observed[pair] = observed.get(pair, 0) + 1

    result: Dict[str, float] = {}
    for first in AALetter:
        for second in AALetter:
            dipeptide = first + second
            result[dipeptide] = round(
                float(observed.get(dipeptide, 0)) / pair_total * 100, 2
            )
    return result
|
|
48
|
|
49
|
|
def Getkmers() -> List[str]:
    """
    Build every 3-mer that can be formed from the letters in ``AALetter``.

    Returns
    -------
    kmers : List[str]
        All three-letter strings over ``AALetter``, ordered with the first
        letter varying slowest and the last letter varying fastest.
    """
    return [
        first + second + third
        for first in AALetter
        for second in AALetter
        for third in AALetter
    ]
|
|
57
|
|
58
|
|
def GetSpectrumDict(proteinsequence: str) -> Dict[str, int]:
    """
    Count the occurrences of every 3-mer in a protein sequence.

    Parameters
    ----------
    proteinsequence : str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, int]
        Maps every 3-mer returned by ``Getkmers()`` to the number of
        positions in the sequence at which it starts. 3-mers that do not
        occur map to 0; sequences shorter than three residues give all zeros.
    """
    # One pass with a 3-residue sliding window. This counts overlapping
    # occurrences ("AAAA" contains "AAA" twice), which a non-overlapping
    # pattern search would miss, and avoids one full scan per 3-mer.
    observed: Dict[str, int] = {}
    for start in range(len(proteinsequence) - 2):
        kmer = proteinsequence[start:start + 3]
        observed[kmer] = observed.get(kmer, 0) + 1

    # Report only the 3-mers from Getkmers(), in its order.
    return {kmer: observed.get(kmer, 0) for kmer in Getkmers()}
|
|
65
|
|
66
|
|
def CalculateAADipeptideComposition(ProteinSequence: str) -> Dict[str, float]:
    """
    Gather the amino acid, dipeptide and 3-mer descriptors of a sequence.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, float]
        A single dictionary holding the entries of
        ``CalculateAAComposition``, ``CalculateDipeptideComposition`` and
        ``GetSpectrumDict``, merged in that order.
    """
    merged: Dict[Any, Any] = {}
    descriptor_functions = (
        CalculateAAComposition,
        CalculateDipeptideComposition,
        GetSpectrumDict,
    )
    for compute in descriptor_functions:
        merged.update(compute(ProteinSequence))
    return merged
|
|
73
|
|
74
|
|
75
|
|
76
|
|
77
|
|
78
|