| Previous changeset 13:af2d3c8f616b (2023-10-25) Next changeset 15:68d3903df20e (2023-10-25) |
|
Commit message:
Uploaded |
|
added:
matrice_scores.py |
| b |
| diff -r af2d3c8f616b -r 52cc2b467990 matrice_scores.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/matrice_scores.py Wed Oct 25 15:46:41 2023 +0000 |
| [ |
import copy
import math
from collections import Counter
from functools import lru_cache
from typing import Optional

import blosum as bl
import numpy as np
from Bio.Seq import Seq


@lru_cache(maxsize=None)
def _load_blosum(number: int):
    """Build a BLOSUM matrix once and reuse it for every later lookup.

    The same few matrix numbers are requested for every SNP analysed, so the
    object returned by ``bl.BLOSUM`` is cached instead of being rebuilt on each
    call. The cached object is only ever read by this module.

    Args:
        number (int): BLOSUM matrix number (for example 62).

    Returns:
        The matrix object returned by ``bl.BLOSUM(number)``.
    """
    return bl.BLOSUM(number)


class SnpMatricesScores:
    """Substitution-matrix scores for one SNP of a sequence against its cluster.

    The nucleotide found at ``position`` in ``sequence`` is compared with the
    most recurrent nucleotide found at the same position in ``cluster``; the
    amino acids encoded at that position, with and without the SNP, are compared
    as well. Scores are read from the BLOSUM, PAM, PFASUM, VTML, GONNET and
    NUC44 matrices.

    Attributes set by the constructor:
        nucleotide: Nucleotide of ``sequence`` at ``position``.
        most_recurrent_nucleotide: Most frequent nucleotide at ``position``
            among the sequences of ``cluster``.
        aa: Amino acid encoded at the SNP position of ``sequence`` as given.
        modified_aa: Amino acid encoded at the same position once the SNP is
            replaced by ``most_recurrent_nucleotide``.
    """

    def __init__(self, sequence: Seq, cluster: list[Seq], position: int, matrices: dict):
        """Store the inputs and derive the nucleotides and amino acids to compare.

        Args:
            sequence (Seq): Main sequence.
            cluster (list[Seq]): Cluster of aligned sequences.
            position (int): Single-Nucleotide Polymorphism position.
            matrices (dict): Loaded matrices, indexed by name. The keys read by
                this class are ``"nuc44"``, ``"gonnet"`` and ``"PAM<n>"``,
                ``"PFASUM<n>"``, ``"VTML<n>"`` for the numbers listed below;
                each value must support ``.at[row, column]`` lookups.
        """
        # Arguments
        self.sequence = sequence
        self.cluster = cluster
        self.position = position
        self.matrices = matrices

        # Matrix numbers scored for each family
        self.blosum_matrices = [45, 50, 62, 80, 90]
        self.pam_matrices = [250, 200, 160, 120, 100]
        self.pfasum_matrices = [31, 43, 60]
        self.vtml_matrices = [160, 80, 20]

        # Derive the nucleotides / amino acids once so every score reuses them
        self._construct_variables()

    def get_details(self) -> dict:
        """Describe the nucleotide and amino acid substitutions.

        Returns:
            dict: ``"Nucleotide"`` and ``"Amino Acid"`` entries, each formatted
            as ``"<cluster value> -> <sequence value>"``.
        """
        nucleotide_subs = f"{self.most_recurrent_nucleotide} -> {self.nucleotide}"
        aa_subs = f"{self.modified_aa} -> {self.aa}"
        return {"Nucleotide": nucleotide_subs, "Amino Acid": aa_subs}

    def get_all_scores(self) -> dict:
        """Collect the substitution score of every supported matrix.

        Returns:
            dict: Scores indexed by matrix name, in the order BLOSUM, PAM,
            PFASUM, VTML, then ``"GONNET"`` and ``"NUC44"``.
        """
        scores = {
            **self.blosum_scores(),
            **self.pam_scores(),
            **self.pfasum_scores(),
            **self.vtml_scores(),
        }
        scores["GONNET"] = self.gonnet_score()
        scores["NUC44"] = self.nuc44_score()
        return scores

    def nuc44_score(self) -> float:
        """Read the NUC44 value of the nucleotide substitution.

        Returns:
            float: NUC44 matrix substitution value.
        """
        return self.matrices["nuc44"].at[self.nucleotide, self.most_recurrent_nucleotide]

    def gonnet_score(self) -> float:
        """Read the GONNET value of the amino acid substitution.

        Returns:
            float: GONNET matrix substitution value.
        """
        return self.matrices["gonnet"].at[self.aa, self.modified_aa]

    def pam_scores(self) -> dict:
        """Read the amino acid substitution value of every PAM matrix.

        Returns:
            dict: Substitution values indexed by ``"PAM<n>"``.
        """
        names = (f"PAM{number}" for number in self.pam_matrices)
        return {name: self._get_pam_score(name) for name in names}

    def pfasum_scores(self) -> dict:
        """Read the amino acid substitution value of every PFASUM matrix.

        Returns:
            dict: Substitution values indexed by ``"PFASUM<n>"``.
        """
        names = (f"PFASUM{number}" for number in self.pfasum_matrices)
        return {name: self._get_pfasum_score(name) for name in names}

    def vtml_scores(self) -> dict:
        """Read the amino acid substitution value of every VTML matrix.

        Returns:
            dict: Substitution values indexed by ``"VTML<n>"``.
        """
        names = (f"VTML{number}" for number in self.vtml_matrices)
        return {name: self._get_vtml_score(name) for name in names}

    def blosum_scores(self) -> dict:
        """Read the amino acid substitution value of every BLOSUM matrix.

        Returns:
            dict: Substitution values indexed by ``"BLOSUM<n>"``.
        """
        return {
            f"BLOSUM{number}": self._get_blosum_score(number)
            for number in self.blosum_matrices
        }

    def _get_pam_score(self, pam_matrix: str) -> int:
        """Read the amino acid substitution value from one PAM matrix.

        Args:
            pam_matrix (str): Key of the PAM matrix in ``self.matrices``.

        Returns:
            int: PAM substitution value.
        """
        return self.matrices[pam_matrix].at[self.aa, self.modified_aa]

    def _get_vtml_score(self, vtml_matrix: str) -> int:
        """Read the amino acid substitution value from one VTML matrix.

        Args:
            vtml_matrix (str): Key of the VTML matrix in ``self.matrices``.

        Returns:
            int: VTML substitution value.
        """
        return self.matrices[vtml_matrix].at[self.aa, self.modified_aa]

    def _get_pfasum_score(self, pfasum_matrix: str) -> int:
        """Read the amino acid substitution value from one PFASUM matrix.

        Args:
            pfasum_matrix (str): Key of the PFASUM matrix in ``self.matrices``.

        Returns:
            int: PFASUM substitution value.
        """
        df = self.matrices[pfasum_matrix]
        value = df.at[self.aa, self.modified_aa]
        if np.isnan(value):
            # The cell is empty in this orientation: read the mirrored cell.
            value = df.at[self.modified_aa, self.aa]
        return value

    def _get_blosum_score(self, blosum_matrix: int) -> int:
        """Read the amino acid substitution value from one BLOSUM matrix.

        Args:
            blosum_matrix (int): Blosum matrix number.

        Returns:
            int: Blosum substitution value.
        """
        # The matrix is loaded through a cache so it is built once per number.
        matrix = _load_blosum(blosum_matrix)
        return matrix[self.modified_aa][self.aa]

    def _get_snp_aa(self, sequence: Seq) -> Optional[str]:
        """Return the amino acid encoded at the SNP position of a DNA sequence.

        Gap characters (``"-"``) are removed before translation, and the SNP
        index is shifted left by the number of gaps found before it.

        Args:
            sequence (Seq): DNA Sequence, possibly containing alignment gaps.

        Returns:
            Optional[str]: Amino acid at the SNP codon, or ``None`` when that
            codon lies outside the translated protein.
        """
        gaps_before_snp = sequence[: self.position].count("-")
        protein = Seq(str(sequence).replace("-", "")).translate()

        # Three nucleotides per codon; a negative offset is clamped to codon 0.
        codon_index = max((self.position - gaps_before_snp) // 3, 0)
        if codon_index >= len(protein):
            return None
        return protein[codon_index]

    def _construct_variables(self):
        """Derive the nucleotides and amino acids compared by the score methods.

        Done once at construction time so the individual score methods do not
        repeat the counting and the translation.
        """
        # DNA: nucleotide of the sequence vs. most frequent one in the cluster.
        # On equal counts, Counter keeps the nucleotide encountered first.
        self.nucleotide = self.sequence[self.position]
        column = [member[self.position] for member in self.cluster]
        self.most_recurrent_nucleotide = Counter(column).most_common(1)[0][0]

        # Protein: slicing already builds a new sequence, so the original one is
        # left untouched without needing an explicit copy.
        seq_without_snp = (
            self.sequence[: self.position]
            + self.most_recurrent_nucleotide
            + self.sequence[self.position + 1 :]
        )
        self.modified_aa = self._get_snp_aa(seq_without_snp)
        self.aa = self._get_snp_aa(self.sequence)