comparison local_AAComposition.py @ 26:52e50de4c005 draft

Uploaded
author jose_duarte
date Sun, 12 Dec 2021 10:49:43 +0000
parents
children
comparison
equal deleted inserted replaced
25:ce0de724097a 26:52e50de4c005
1
2 # -*- coding: utf-8 -*-
3 """
4 The module is used for computing the composition of amino acids, dipeptides and
5 3-mers (tri-peptides) for a given protein sequence.
6 References
7 ----------
8 .. [1] Reczko, M. and Bohr, H. (1994) The DEF data base of sequence based protein
9 fold class predictions. Nucleic Acids Res, 22, 3616-3619.
10 .. [2] Hua, S. and Sun, Z. (2001) Support vector machine approach for protein
11 subcellular localization prediction. Bioinformatics, 17, 721-728.
12 .. [3] Grassmann, J., Reczko, M., Suhai, S. and Edler, L. (1999) Protein fold
13 class prediction: new methods of statistical classification. Proc Int Conf
14 Intell Syst Mol Biol, 106-112.
15 Authors: Dongsheng Cao and Yizeng Liang.
16 Date: 2012.3.27
17 Email: oriental-cds@163.com
18 """
19
20 # Core Library
21 import re
22 from typing import Any, Dict, List
23
# One-letter codes of the 20 amino acids handled by this module. The functions
# below iterate this list, so its order fixes the key order of their results.
24 AALetter: List[str] = list("ARNDCEQGHILKMFPSTWYV")
25
# Reusable description of the ``ProteinSequence`` parameter.
# NOTE(review): not referenced by any function visible in this file; presumably
# meant to be spliced into docstrings elsewhere -- confirm before removing.
26 ProteinSequence_docstring = """ProteinSequence: str
27 a pure protein sequence"""
28
29
def CalculateAAComposition(ProteinSequence: str) -> Dict[str, float]:
    """
    Calculate the amino acid composition of a protein sequence.

    Parameters
    ----------
    ProteinSequence: str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, float]
        Maps every entry of ``AALetter`` to the percentage of the sequence
        it accounts for (occurrences / sequence length * 100), rounded to
        3 decimals. For an empty sequence every value is 0.0.
    """
    sequence_length = len(ProteinSequence)
    if sequence_length == 0:
        # No residues at all: report an all-zero composition instead of
        # failing with ZeroDivisionError in the percentage computation.
        return {residue: 0.0 for residue in AALetter}
    return {
        residue: round(
            float(ProteinSequence.count(residue)) / sequence_length * 100, 3
        )
        for residue in AALetter
    }
36
37
def CalculateDipeptideComposition(ProteinSequence: str) -> Dict[str, float]:
    """
    Calculate the dipeptide composition of a protein sequence.

    Parameters
    ----------
    ProteinSequence: str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, float]
        Maps every ordered pair of ``AALetter`` entries (e.g. ``"AR"``) to
        the percentage of the ``len(ProteinSequence) - 1`` adjacent-residue
        windows that equal that pair, rounded to 2 decimals. For sequences
        shorter than two residues every value is 0.0.
    """
    # Number of adjacent-residue windows; this is the denominator.
    window_total = len(ProteinSequence) - 1

    # Count every window in one pass. ``str.count`` must not be used here:
    # it counts NON-overlapping occurrences, so "AAA" would yield one "AA"
    # although two of the windows are "AA".
    observed: Dict[str, int] = {}
    for start in range(window_total):  # empty range when fewer than 2 residues
        pair = ProteinSequence[start:start + 2]
        observed[pair] = observed.get(pair, 0) + 1

    result: Dict[str, float] = {}
    for i in AALetter:
        for j in AALetter:
            dipeptide = i + j
            if window_total > 0:
                result[dipeptide] = round(
                    float(observed.get(dipeptide, 0)) / window_total * 100, 2
                )
            else:
                # No windows exist: avoid dividing by 0 (one residue) or
                # by -1 (empty sequence).
                result[dipeptide] = 0.0
    return result
48
49
def Getkmers() -> List[str]:
    """
    Enumerate every 3-mer that can be built from ``AALetter``.

    Returns
    -------
    kmers : List[str]
        All concatenations of three ``AALetter`` entries. The first position
        varies slowest and the third position varies fastest.
    """
    return [
        first + second + third
        for first in AALetter
        for second in AALetter
        for third in AALetter
    ]
57
58
def GetSpectrumDict(proteinsequence: str) -> Dict[str, int]:
    """
    Calculate the 3-mer spectrum of a protein sequence.

    Parameters
    ----------
    proteinsequence: str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, int]
        Maps every 3-mer returned by ``Getkmers()`` to the number of
        positions at which it occurs in the sequence. Occurrences may
        overlap, e.g. ``"AAAA"`` contains ``"AAA"`` twice. Sequences shorter
        than three residues give a count of 0 for every 3-mer.
    """
    # Tally all length-3 windows in a single pass. The previous approach ran
    # ``re.findall`` once per 3-mer, which both rescanned the sequence for
    # each of the 3-mers and missed overlapping occurrences, because
    # ``re.findall`` only reports non-overlapping matches.
    window_counts: Dict[str, int] = {}
    for start in range(len(proteinsequence) - 2):
        window = proteinsequence[start:start + 3]
        window_counts[window] = window_counts.get(window, 0) + 1

    # Report counts for exactly the 3-mers from Getkmers(), in its order;
    # windows containing other characters are ignored.
    return {kmer: window_counts.get(kmer, 0) for kmer in Getkmers()}
65
66
def CalculateAADipeptideComposition(ProteinSequence: str) -> Dict[str, float]:
    """
    Combine the amino acid, dipeptide and 3-mer descriptors of a sequence.

    Parameters
    ----------
    ProteinSequence: str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, float]
        A single dict holding, in this order, the entries produced by
        ``CalculateAAComposition``, ``CalculateDipeptideComposition`` and
        ``GetSpectrumDict`` for the given sequence. If a key were produced
        by more than one of them, the later one would win.
    """
    single_residue = CalculateAAComposition(ProteinSequence)
    residue_pairs = CalculateDipeptideComposition(ProteinSequence)
    residue_triples = GetSpectrumDict(ProteinSequence)
    combined: Dict[Any, Any] = {
        **single_residue,
        **residue_pairs,
        **residue_triples,
    }
    return combined
73
74
75
76
77
78