Mercurial > repos > cpt > cpt_sar_finder

diff SAR_functions.py @ 1:112751823323 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author: cpt
date: Mon, 05 Jun 2023 02:52:57 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SAR_functions.py	Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,348 @@
+#!/usr/bin/env python
+
+import sys
+import argparse
+import os
+import re
+from Bio import SeqIO
+
+
+class CheckSequence:
+    """
+    SAR endolysin Verification class, which starts with complete FA file, and is shrunk by each function to reveal best candidates of SAR endolysin proteins
+    """
+
+    def __init__(self, protein_name, protein_data):
+        self.name = protein_name
+        self.seq = protein_data.seq
+        self.description = protein_data.description
+        self.size = len(self.seq)
+        self.store = {}
+
+    def check_sizes(self, min, max):
+        """check the minimum and maximum peptide lengths"""
+        if self.size < min:
+            print("too small")
+        elif self.size > max:
+            print("too large")
+        else:
+            print(f"{self.name} : {self.seq}")
+            return True
+
+    def check_hydrophobicity_and_charge(
+        self, sar_min=15, sar_max=20, perc_residues="SGAT"
+    ):
+        """verifies the existence of a hydrophobic region within the sequence"""
+        hydrophobic_residues = "['FIWLVMYCATGSP']"  # fed through regex
+        hits = self.store
+        pos_res = "RK"
+        neg_res = "DE"
+
+        if self.size > 50:
+            seq = self.seq[0:50]
+        else:
+            seq = self.seq
+        for sar_size in range(sar_min, sar_max, 1):
+            for i in range(0, len(seq) - sar_size, 1):
+                sar_seq = str(seq[i : i + sar_size])
+                if re.search(
+                    (hydrophobic_residues + "{" + str(sar_size) + "}"), sar_seq
+                ):
+                    (
+                        charge_seq,
+                        charge,
+                        perc_cont,
+                        sar_coords,
+                        nterm_coords,
+                        cterm_coords,
+                        sar_start,
+                        sar_end,
+                    ) = rep_funcs(
+                        self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size
+                    )
+                    storage_dict(
+                        self=self,
+                        sar_size=sar_size,
+                        sar_seq=sar_seq,
+                        hits=hits,
+                        charge_seq=charge_seq,
+                        charge=charge,
+                        perc_cont=perc_cont,
+                        nterm_coords=nterm_coords,
+                        sar_coords=sar_coords,
+                        cterm_coords=cterm_coords,
+                        sar_start=sar_start,
+                        sar_end=sar_end,
+                    )
+                    # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
+                elif "K" in sar_seq[0] and re.search(
+                    (hydrophobic_residues + "{" + str(sar_size - 1) + "}"), sar_seq[1:]
+                ):  # check frontend snorkels
+                    (
+                        charge_seq,
+                        charge,
+                        perc_cont,
+                        sar_coords,
+                        nterm_coords,
+                        cterm_coords,
+                        sar_start,
+                        sar_end,
+                    ) = rep_funcs(
+                        self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size
+                    )
+                    storage_dict(
+                        self=self,
+                        sar_size=sar_size,
+                        sar_seq=sar_seq,
+                        hits=hits,
+                        charge_seq=charge_seq,
+                        charge=charge,
+                        perc_cont=perc_cont,
+                        nterm_coords=nterm_coords,
+                        sar_coords=sar_coords,
+                        cterm_coords=cterm_coords,
+                        sar_start=sar_start,
+                        sar_end=sar_end,
+                    )
+                    # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
+                elif "K" in sar_seq[-1] and re.search(
+                    (hydrophobic_residues + "{" + str(sar_size - 1) + "}"), sar_seq[:-1]
+                ):  # check backend snorkels
+                    (
+                        charge_seq,
+                        charge,
+                        perc_cont,
+                        sar_coords,
+                        nterm_coords,
+                        cterm_coords,
+                        sar_start,
+                        sar_end,
+                    ) = rep_funcs(
+                        self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size
+                    )
+                    storage_dict(
+                        self=self,
+                        sar_size=sar_size,
+                        sar_seq=sar_seq,
+                        hits=hits,
+                        charge_seq=charge_seq,
+                        charge=charge,
+                        perc_cont=perc_cont,
+                        nterm_coords=nterm_coords,
+                        sar_coords=sar_coords,
+                        cterm_coords=cterm_coords,
+                        sar_start=sar_start,
+                        sar_end=sar_end,
+                    )
+                    # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
+                continue
+
+        return hits
+
+    def shrink_results(self, sar_min=15, sar_max=20, perc_residues="SGAT"):
+        """removes repetiive hits, keeps only the shortest and longest of each SAR domain"""
+        compare_candidates = {}
+        hits = self.check_hydrophobicity_and_charge(sar_min=sar_min, sar_max=sar_max)
+        for sar_name, data in hits.items():
+            # print(sar_name)
+            compare_candidates[sar_name] = {}
+            # print("\nThese are the values: {}".format(v))
+            # count_of_times = 0
+            tmd_log = []
+            for sar_size in range(sar_max, sar_min - 1, -1):
+                if "TMD_" + str(sar_size) in data:
+                    tmd_log.append(sar_size)
+                    # print(tmd_log)
+                    for idx, the_data in enumerate(data["TMD_" + str(sar_size)]):
+                        # print(the_data[7])
+                        # print(the_data)
+                        # print(f"This is the index: {idx}")
+                        # print(f"This is the list of data at this index: {the_data}")
+                        if (
+                            the_data[7] in compare_candidates[sar_name]
+                        ):  # index to start
+                            compare_candidates[sar_name][the_data[7]]["count"] += 1
+                            compare_candidates[sar_name][the_data[7]]["size"].append(
+                                sar_size
+                            )
+                            compare_candidates[sar_name][the_data[7]]["index"].append(
+                                idx
+                            )
+                        else:
+                            compare_candidates[sar_name][the_data[7]] = {}
+                            compare_candidates[sar_name][the_data[7]]["count"] = 1
+                            compare_candidates[sar_name][the_data[7]]["size"] = [
+                                sar_size
+                            ]
+                            compare_candidates[sar_name][the_data[7]]["index"] = [idx]
+            hits[sar_name]["biggest_sar"] = tmd_log[0]
+        for sar_name, compare_data in compare_candidates.items():
+            for data in compare_data.values():
+                if len(data["size"]) >= 3:
+                    # print(f"{each_size} --> {data}")
+                    minmax = [min(data["size"]), max(data["size"])]
+                    nonminmax = [x for x in data["size"] if x not in minmax]
+                    nonminmax_index = []
+                    for each_nonminmax in nonminmax:
+                        v = data["size"].index(each_nonminmax)
+                        x = data["index"][v]
+                        nonminmax_index.append(x)
+                    nons = zip(nonminmax, nonminmax_index)
+                    for value in nons:
+                        # hits[sar_name]["TMD_"+str(value[0])] = hits[sar_name]["TMD_"+str(value[0])].pop(value[1])
+                        hits[sar_name]["TMD_" + str(value[0])][value[1]] = [""]
+
+        return hits
+
+
+def rep_funcs(self, seq, loc, pos_res, neg_res, sar_seq, perc_residues, sar_size):
+    """run a set of functions together before sending the results to the storage dictionary"""
+
+    charge_seq = str(seq[:loc])
+    charge = charge_check(charge_seq, pos_res, neg_res)
+    perc_cont = percent_calc(sar_seq, perc_residues, int(sar_size))
+    sar_start = loc
+    sar_end = loc + sar_size
+    sar_coords = "{}..{}".format(loc, loc + sar_size)
+    nterm_coords = "{}..{}".format("0", loc - 1)
+    cterm_coords = "{}..{}".format(loc + sar_size + 1, self.size)
+
+    return (
+        charge_seq,
+        charge,
+        perc_cont,
+        sar_coords,
+        nterm_coords,
+        cterm_coords,
+        sar_start,
+        sar_end,
+    )
+
+
+### Extra "helper" functions
+def storage_dict(
+    self,
+    sar_size,
+    sar_seq,
+    hits,
+    charge_seq,
+    charge,
+    perc_cont,
+    nterm_coords,
+    sar_coords,
+    cterm_coords,
+    sar_start,
+    sar_end,
+):  # probably not good to call "self" a param here...definitley not PEP approved...
+    """organize dictionary for hydrophobicity check"""
+    if self.name not in hits:
+        hits[self.name] = {}
+        hits[self.name]["description"] = str(self.description)
+        hits[self.name]["sequence"] = str(self.seq)
+        hits[self.name]["size"] = str(self.size)
+        # GAcont = str((str(self.seq).count("G")+str(self.seq).count("A"))/int(self.size)*100)
+        # hits[self.name]["GAcont"] = "{:.2f}%".format(float(GAcont))
+        if "TMD_" + str(sar_size) not in hits[self.name]:
+            hits[self.name]["TMD_" + str(sar_size)] = []
+            hits[self.name]["TMD_" + str(sar_size)].append(
+                [
+                    sar_seq,
+                    charge_seq,
+                    charge,
+                    perc_cont,
+                    nterm_coords,
+                    sar_coords,
+                    cterm_coords,
+                    sar_start,
+                    sar_end,
+                ]
+            )
+        else:
+            hits[self.name]["TMD_" + str(sar_size)].append(
+                [
+                    sar_seq,
+                    charge_seq,
+                    charge,
+                    perc_cont,
+                    nterm_coords,
+                    sar_coords,
+                    cterm_coords,
+                    sar_start,
+                    sar_end,
+                ]
+            )
+    else:
+        if "TMD_" + str(sar_size) not in hits[self.name]:
+            hits[self.name]["TMD_" + str(sar_size)] = []
+            hits[self.name]["TMD_" + str(sar_size)].append(
+                [
+                    sar_seq,
+                    charge_seq,
+                    charge,
+                    perc_cont,
+                    nterm_coords,
+                    sar_coords,
+                    cterm_coords,
+                    sar_start,
+                    sar_end,
+                ]
+            )
+        else:
+            hits[self.name]["TMD_" + str(sar_size)].append(
+                [
+                    sar_seq,
+                    charge_seq,
+                    charge,
+                    perc_cont,
+                    nterm_coords,
+                    sar_coords,
+                    cterm_coords,
+                    sar_start,
+                    sar_end,
+                ]
+            )
+
+
+def percent_calc(sequence, residues, size):
+    """Calculate the percent of a set of residues within an input sequence"""
+    counted = {}
+    for aa in sequence:
+        # print(aa)
+        if aa in counted:
+            counted[aa] += 1
+        else:
+            counted[aa] = 1
+    residue_amt = 0
+    my_ratios = []
+    for res_of_interest in residues:
+        try:
+            residue_amt = counted[res_of_interest]
+        except KeyError:
+            residue_amt = 0
+        ratio = residue_amt / size
+        my_ratios.append((round(ratio * 100, 2)))
+
+    res_rat = list(zip(residues, my_ratios))
+
+    return res_rat
+
+
+def charge_check(charge_seq, pos_res, neg_res):
+    charge = 0
+    for aa in charge_seq:
+        if aa in pos_res:
+            charge += 1
+        if aa in neg_res:
+            charge -= 1
+    return charge
+
+
+if __name__ == "__main__":
+    sequence = "MAGBYYYTRLCVRKLRKGGGHP"
+    residues = "YL"
+    size = len(sequence)
+    print(size)
+    v = percent_calc(sequence, residues, size)
+    print(v)
+    for i in v:
+        print(i)