Changeset 14:52cc2b467990 (2023-10-25)

Previous changeset 13:af2d3c8f616b (2023-10-25) Next changeset 15:68d3903df20e (2023-10-25)

Commit message:
Uploaded

added:
matrice_scores.py

diff -r af2d3c8f616b -r 52cc2b467990 matrice_scores.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrice_scores.py Wed Oct 25 15:46:41 2023 +0000

[

@@ -0,0 +1,213 @@
+import math, copy
+import blosum as bl
+import numpy as np
+from Bio.Seq import Seq
+from collections import Counter
+
+
+class SnpMatricesScores:
+    """Class computing the substitution-matrix scores of a single SNP.
+    Covers the BLOSUM, PAM, PFASUM, VTML, GONNET, and NUC44 matrices.
+    """
+
+    def __init__(self, sequence: Seq, cluster: list[Seq], position: int, matrices: dict):
+        """Store the inputs, then derive the nucleotides and amino acids to compare.
+
+        Args:
+            sequence (Seq): Main sequence.
+            cluster (list[Seq]): Cluster of aligned sequences.
+            position (int): Single-Nucleotide Polymorphism position (index into the sequences).
+            matrices (dict): All df matrices loaded, looked up by name (e.g. "nuc44", "PAM250").
+        """
+        # Arguments
+        self.sequence = sequence
+        self.cluster = cluster
+        self.position = position
+        self.matrices = matrices
+
+        # Variables: matrix numbers scored for each family
+        self.blosum_matrices = [45, 50, 62, 80, 90]
+        self.pam_matrices = [250, 200, 160, 120, 100]
+        self.pfasum_matrices = [31, 43, 60]
+        self.vtml_matrices = [160, 80, 20]
+
+        # Construct further variables for the aa substitution analysis
+        self._construct_variables()
+
+    def get_details(self) -> dict:
+        """Method to describe the nucleotide and amino acid substitution as text.
+
+        Returns:
+            dict: "Nucleotide" and "Amino Acid" entries, each formatted as "cluster -> sequence".
+        """
+        nucleotide_subs = f"{self.most_recurrent_nucleotide} -> {self.nucleotide}"
+        aa_subs = f"{self.modified_aa} -> {self.aa}"
+        return {"Nucleotide": nucleotide_subs, "Amino Acid": aa_subs}
+
+    def get_all_scores(self) -> dict:
+        """Method to gather the substitution scores of every supported matrix.
+
+        Returns:
+            dict: Scores keyed by matrix name (BLOSUM*, PAM*, PFASUM*, VTML*, GONNET, NUC44).
+        """
+        res = self.blosum_scores() | self.pam_scores() | self.pfasum_scores() | self.vtml_scores()
+        res["GONNET"] = self.gonnet_score()
+        res["NUC44"] = self.nuc44_score()
+        return res
+
+    def nuc44_score(self) -> float:
+        """Method to look up the nucleotide substitution in the "nuc44" matrix.
+
+        Returns:
+            float: NUC44 matrix substitution value.
+        """
+        df = self.matrices["nuc44"]
+        return df.at[self.nucleotide, self.most_recurrent_nucleotide]
+
+    def gonnet_score(self) -> float:
+        """Method to look up the amino acid substitution in the "gonnet" matrix.
+
+        Returns:
+            float: GONNET matrix substitution value.
+        """
+        df = self.matrices["gonnet"]
+        return df.at[self.aa, self.modified_aa]
+
+    def pam_scores(self) -> dict:
+        """Method to score the substitution with every matrix of self.pam_matrices.
+
+        Returns:
+            dict: PAM substitution values keyed "PAM<number>".
+        """
+        res = {}
+        for pam_value in self.pam_matrices:
+            pam_matrix = f"PAM{pam_value}"
+            res[pam_matrix] = self._get_pam_score(pam_matrix)
+        return res
+
+    def pfasum_scores(self) -> dict:
+        """Method to score the substitution with every matrix of self.pfasum_matrices.
+
+        Returns:
+            dict: PFASUM substitution values keyed "PFASUM<number>".
+        """
+        res = {}
+        for pfasum_value in self.pfasum_matrices:
+            pfasum_matrix = f"PFASUM{pfasum_value}"
+            res[pfasum_matrix] = self._get_pfasum_score(pfasum_matrix)
+        return res
+
+    def vtml_scores(self) -> dict:
+        """Method to score the substitution with every matrix of self.vtml_matrices.
+
+        Returns:
+            dict: VTML substitution values keyed "VTML<number>".
+        """
+        res = {}
+        for vtml_value in self.vtml_matrices:
+            vtml_matrix = f"VTML{vtml_value}"
+            res[vtml_matrix] = self._get_vtml_score(vtml_matrix)
+        return res
+
+    def blosum_scores(self) -> dict:
+        """Method to score the substitution with every matrix of self.blosum_matrices.
+
+        Returns:
+            dict: Blosum substitution values keyed "BLOSUM<number>".
+        """
+        res = {}
+        for blosum_matrice in self.blosum_matrices:
+            res[f"BLOSUM{blosum_matrice}"] = self._get_blosum_score(blosum_matrice)
+        return res
+
+    def _get_pam_score(self, pam_matrix: str) -> int:
+        """Method to look up the amino acid substitution in the PAM matrix inputted.
+
+        Args:
+            pam_matrix (str): Name of the PAM matrix in self.matrices.
+
+        Returns:
+            int: PAM substitution value.
+        """
+        df = self.matrices[pam_matrix]
+        return df.at[self.aa, self.modified_aa]
+
+    def _get_vtml_score(self, vtml_matrix: str) -> int:
+        """Method to look up the amino acid substitution in the VTML matrix inputted.
+
+        Args:
+            vtml_matrix (str): Name of the VTML matrix in self.matrices.
+
+        Returns:
+            int: VTML substitution value.
+        """
+        df = self.matrices[vtml_matrix]
+        return df.at[self.aa, self.modified_aa]
+
+    def _get_pfasum_score(self, pfasum_matrix: str) -> int:
+        """Method to look up the amino acid substitution in the PFASUM matrix inputted.
+
+        Args:
+            pfasum_matrix (str): Name of the PFASUM matrix in self.matrices.
+
+        Returns:
+            int: PFASUM substitution value, read from the mirrored cell when the first is NaN.
+        """
+        df = self.matrices[pfasum_matrix]
+        value = df.at[self.aa, self.modified_aa]
+        if np.isnan(value):  # NOTE(review): presumably only one triangle is filled - confirm
+            value = df.at[self.modified_aa, self.aa]
+        return value
+
+    def _get_blosum_score(self, blosum_matrix: int) -> int:
+        """Method to obtain the blosum matrix value of the given substitution.
+
+        Args:
+            blosum_matrix (int): Blosum matrix number, passed to bl.BLOSUM.
+
+        Returns:
+            int: Blosum substitution value.
+        """
+        matrix = bl.BLOSUM(blosum_matrix)
+        return matrix[self.modified_aa][self.aa]
+
+    def _get_snp_aa(self, sequence: Seq) -> "str | None":
+        """Extract the amino acid encoded at the SNP position of a DNA sequence.
+
+        Args:
+            sequence (Seq): DNA Sequence; "-" characters are removed before translation.
+
+        Returns:
+            str | None: Amino acid, or None when the SNP lies past the translated protein.
+        """
+        number_removed = sequence[: self.position].count("-")
+        sequence_format = str(sequence).replace("-", "")
+        protein = Seq(sequence_format).translate()
+
+        # Get the position of the SNP in the proteic sequence: a codon spans
+        # three ungapped nucleotides, so floor-divide the ungapped index by 3.
+        # Integer division avoids any float rounding, and clamping to 0 before
+        # the bounds check makes a negative index on an empty translation
+        # return None instead of raising IndexError.
+        ungapped_pos = self.position - number_removed
+        protein_pos = max(ungapped_pos // 3, 0)
+
+        if protein_pos >= len(protein):
+            return None
+        return protein[protein_pos]
+
+    def _construct_variables(self):
+        """Method to derive the nucleotides and amino acids compared by the score methods.
+        Used to avoid repetitions in variable creation and package calling.
+        """
+        # DNA
+        self.nucleotide = self.sequence[self.position]
+        occurence_count = Counter(seq[self.position] for seq in self.cluster)
+        # NOTE: an empty cluster raises IndexError here (most_common(1) is empty).
+        self.most_recurrent_nucleotide = occurence_count.most_common(1)[0][0]
+
+        # Protein: slicing and "+" already build a new sequence, no deep copy needed.
+        consensus = self.most_recurrent_nucleotide
+        seq_without_snp = self.sequence[: self.position] + consensus + self.sequence[self.position + 1 :]
+        self.modified_aa = self._get_snp_aa(seq_without_snp)
+        self.aa = self._get_snp_aa(self.sequence)