view local_AAComposition.py @ 31:3d94608aea7a draft

Uploaded
author jose_duarte
date Mon, 13 Dec 2021 11:19:23 +0000
parents 52e50de4c005
children
line wrap: on
line source


# -*- coding: utf-8 -*-
"""
This module computes the composition of amino acids, dipeptides and
3-mers (tri-peptides) for a given protein sequence.
References
----------
.. [1] Reczko, M. and Bohr, H. (1994) The DEF data base of sequence based protein
   fold class predictions. Nucleic Acids Res, 22, 3616-3619.
.. [2] Hua, S. and Sun, Z. (2001) Support vector machine approach for protein
   subcellular localization prediction. Bioinformatics, 17, 721-728.
.. [3] Grassmann, J., Reczko, M., Suhai, S. and Edler, L. (1999) Protein fold
   class prediction: new methods of statistical classification. Proc Int Conf
   Intell Syst Mol Biol, 106-112.
Authors: Dongsheng Cao and Yizeng Liang.
Date: 2012.3.27
Email: oriental-cds@163.com
"""

# Core Library
import re
from typing import Any, Dict, List

# One-letter codes used as the residue alphabet by every function in this
# module. The order below fixes the key order of the result dictionaries
# that are built by iterating over this list.
AALetter: List[str] = [
    "A", "R", "N", "D", "C",
    "E", "Q", "G", "H", "I",
    "L", "K", "M", "F", "P",
    "S", "T", "W", "Y", "V",
]

# Description text for a ``ProteinSequence`` parameter.
# NOTE(review): presumably spliced into docstrings elsewhere; no use of it is
# visible in this part of the file -- confirm before changing the text.
ProteinSequence_docstring = """ProteinSequence: str
        a pure protein sequence"""


def CalculateAAComposition(ProteinSequence: str) -> Dict[str, float]:
    """
    Calculate the percentage composition of each amino acid in a sequence.

    Parameters
    ----------
    ProteinSequence: str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, float]
        One entry per letter in ``AALetter`` (in that order) holding
        ``count / len(ProteinSequence) * 100`` rounded to 3 decimals.
        Every value is 0.0 when the sequence is empty.
    """
    sequence_length = len(ProteinSequence)
    if sequence_length == 0:
        # Nothing to count; avoid dividing by zero and report an all-zero
        # composition instead of raising ZeroDivisionError.
        return {amino_acid: 0.0 for amino_acid in AALetter}
    return {
        amino_acid: round(
            ProteinSequence.count(amino_acid) / sequence_length * 100, 3
        )
        for amino_acid in AALetter
    }


def CalculateDipeptideComposition(ProteinSequence: str) -> Dict[str, float]:
    """
    Calculate the percentage composition of each dipeptide in a sequence.

    Every pair of adjacent residues is counted, including overlapping pairs
    (``"AAA"`` contains ``"AA"`` twice). ``str.count`` is deliberately not
    used because it only counts non-overlapping occurrences and therefore
    undercounts runs of a repeated residue.

    Parameters
    ----------
    ProteinSequence: str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, float]
        One entry per ordered pair of letters from ``AALetter`` (400 keys)
        holding ``count / (len(ProteinSequence) - 1) * 100`` rounded to
        2 decimals. Every value is 0.0 when the sequence has fewer than two
        residues, since it then contains no dipeptide.
    """
    counts: Dict[str, int] = {
        first + second: 0 for first in AALetter for second in AALetter
    }
    # Number of adjacent-pair windows in the sequence.
    window_total = len(ProteinSequence) - 1
    if window_total <= 0:
        # Empty or single-residue input: avoid dividing by 0 (or by -1).
        return {dipeptide: 0.0 for dipeptide in counts}

    # Single sliding-window pass; pairs containing a letter outside AALetter
    # have no key and are skipped, but still count towards the denominator,
    # as in the original formula.
    for start in range(window_total):
        dipeptide = ProteinSequence[start:start + 2]
        if dipeptide in counts:
            counts[dipeptide] += 1

    return {
        dipeptide: round(occurrences / window_total * 100, 2)
        for dipeptide, occurrences in counts.items()
    }


def Getkmers() -> List[str]:
    """
    Build every 3-letter string over the ``AALetter`` alphabet.

    Returns
    -------
    kmers : List[str]
        All ordered triples of letters from ``AALetter`` joined into
        strings, with the first letter varying slowest and the third
        letter varying fastest.
    """
    return [
        first + second + third
        for first in AALetter
        for second in AALetter
        for third in AALetter
    ]


def GetSpectrumDict(proteinsequence: str) -> Dict[str, int]:
    """
    Count the occurrences of every 3-mer in a protein sequence.

    Every window of three consecutive residues is counted, including
    overlapping ones (``"AAAA"`` contains ``"AAA"`` twice). A per-k-mer
    ``re.findall`` is deliberately not used: it reports only non-overlapping
    matches and needs one scan of the sequence per k-mer.

    Parameters
    ----------
    proteinsequence: str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, int]
        One entry per k-mer returned by ``Getkmers()`` (in that order)
        holding the number of times it occurs in ``proteinsequence``.
    """
    result: Dict[str, int] = dict.fromkeys(Getkmers(), 0)
    # range() is empty for sequences shorter than 3, leaving all counts at 0.
    for start in range(len(proteinsequence) - 2):
        kmer = proteinsequence[start:start + 3]
        # Windows containing a letter outside the alphabet have no key.
        if kmer in result:
            result[kmer] += 1
    return result


def CalculateAADipeptideComposition(ProteinSequence: str) -> Dict[str, float]:
    """
    Combine the amino acid, dipeptide and 3-mer descriptors of a sequence.

    Parameters
    ----------
    ProteinSequence: str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, float]
        The entries of ``CalculateAAComposition``,
        ``CalculateDipeptideComposition`` and ``GetSpectrumDict`` merged
        into one dictionary, in that order. The 3-mer entries are the
        integer counts produced by ``GetSpectrumDict``.
    """
    # The three calls are evaluated left to right and merged in that order.
    return {
        **CalculateAAComposition(ProteinSequence),
        **CalculateDipeptideComposition(ProteinSequence),
        **GetSpectrumDict(ProteinSequence),
    }