changeset 0:9f62910edcc9 draft

Uploaded
author cpt
date Fri, 17 Jun 2022 13:15:55 +0000
parents
children 112751823323
files cpt_sar_finder/SAR_finder.py cpt_sar_finder/SAR_finder.xml cpt_sar_finder/SAR_functions.py cpt_sar_finder/biopython_parsing.py cpt_sar_finder/cpt-macros.xml cpt_sar_finder/file_operations.py cpt_sar_finder/macros.xml cpt_sar_finder/test-data/candidate_SAR.fa cpt_sar_finder/test-data/candidate_SAR.gff3 cpt_sar_finder/test-data/candidate_SAR_stats.tsv cpt_sar_finder/test-data/simple-proteins.fa
diffstat 11 files changed, 583 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/SAR_finder.py	Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,44 @@
+import sys
+import argparse
+import os
+import re
+from biopython_parsing import FASTA_parser
+from file_operations import fasta_from_SAR_dict, gff3_from_SAR_dict, tab_from_SAR_dict
+from SAR_functions import CheckSequence
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="SAR Finder")
+
+    parser.add_argument("fa",type=argparse.FileType("r"),help="organism's multi fasta file")
+
+    parser.add_argument("--min",type=int,default=20,help="minimum size of candidate peptide")
+
+    parser.add_argument("--max",type=int,default=200,help="maximum size of candidate peptide")
+
+    parser.add_argument("--sar_min",type=int,default=15,help="minimum size of candidate peptide TMD domain")
+
+    parser.add_argument("--sar_max",type=int,default=24,help="maximum size of candidate peptide TMD domain")
+    
+    parser.add_argument("--out_fa",type=argparse.FileType("w"),help="multifasta output of candidate SAR proteins",default="candidate_SAR.fa")
+
+    parser.add_argument("--out_stat",type=argparse.FileType("w"),help="summary statistic file for candidate SAR proteins, tab separated",default="candidate_SAR_stats.tsv")
+
+    parser.add_argument("--out_gff3",type=argparse.FileType("w"),help="multigff3 file for candidate SAR proteins",default="candidate_SAR.gff3")
+
+    args = parser.parse_args()
+
+    fa_dict = FASTA_parser(fa=args.fa).multifasta_dict()
+
+    sars = {}
+
+    for protein_name, protein_data in fa_dict.items():
+        sar = CheckSequence(protein_name, protein_data)
+        #sar.check_sizes(min=args.min,max=args.max)
+        hydros = sar.shrink_results(sar_min=args.sar_min, sar_max=args.sar_max)
+        sars.update(hydros)
+    
+
+    gff3_from_SAR_dict(sars, args.out_gff3)
+    tab_from_SAR_dict(sars,args.out_stat,"SGAT",sar_min=args.sar_min, sar_max=args.sar_max)
+    fasta_from_SAR_dict(sars,args.out_fa)
+    #stat_file_from_SAR_dict(sars,args.out_stat,sar_min=args.sar_min,sar_max=args.sar_max) # fix this whenever ready.
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/SAR_finder.xml	Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,73 @@
+<tool id="edu.tamu.cpt.sar.sar_finder" name="SAR Finder" version="1.0">
+    <description>SAR Domain Finder</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements">
+    </expand>
+    <command detect_errors="aggressive"><![CDATA[
+python $__tool_directory__/SAR_finder.py
+$fa
+--sar_min $sar_min
+--sar_max $sar_max
+--out_fa $out_fa
+--out_gff3 $out_gff3
+--out_stat $out_stat
+    ]]></command>
+    <inputs>
+        <param label="Multi FASTA File" name="fa" type="data" format="fasta" />
+        <param label="SAR domain minimal size" name="sar_min" type="integer" value="15" />
+        <param label="SAR domain maximum size" name="sar_max" type="integer" value="20" />
+    </inputs>
+    <outputs>
+        <data format="tabular" name="out_stat" label="candidate_SAR_stats.tsv"/>
+        <data format="fasta" name="out_fa" label="candidate_SAR.fa"/>
+        <data format="gff3" name="out_gff3" label="candidate_SAR.gff3"/>
+    </outputs>
+        <tests>
+            <test>
+                <param name="fa" value="simple-proteins.fa"/>
+                <param name="sar_min" value="15"/>
+                <param name="sar_max" value="20"/>
+                <output name="out_stat" file="candidate_SAR_stats.tsv"/>
+                <output name="out_fa" file="candidate_SAR.fa"/>
+                <output name="out_gff3" file="candidate_SAR.gff3"/>
+            </test>
+        </tests>
+    <help><![CDATA[
+
+**What it does**
+A tool that analyzes the sequence within the first 50 residues of a protein for a weakly hydrophobic domain called Signal-Anchor-Release (aka SAR). 
+The tool finds proteins that contain a stretch (default 15-20 residues) of hydrophobic residues (Ile, Leu, Val, Phe, Tyr, Trp, Met, Gly, Ala, Ser) and 
+calculates the % Gly/Ala/Ser/Thr residues in the hydrophobic stretch. The net charge on the N-terminus is also displayed to aid in determining the 
+SAR orientation in the membrane.[2]
+
+Definition: A Signal-Anchor-Release (SAR) domain is an N-terminal, weakly hydrophobic transmembrane region rich in Gly/Ala and/or Ser (and sometimes Thr) residues. 
+The SAR domain is sometimes found in phage lysis proteins, including endolysins and holins. The SAR domain can be released from the membrane in a proton 
+motive force-dependent manner. Known SAR domains in phage endolysins often have >50-60% Gly/Ala/Ser/Thr content. SAR endolysins are expected to have a net positive
+charge on the N-terminus by the positive-inside rule.
+
+**INPUT** --> Protein Multi FASTA
+
+**OUTPUT** --> 
+
+* Multi FASTA with candidate proteins that pass the SAR domain criteria
+
+* Tabular summary file that lists every subdomain fitting the criteria for each potential SAR domain-containing protein with the following: protein name/sequence/length, SAR length/start/sequence/end, individual and total GAST% content in SAR, and N-terminal sequence/net charge
+
+* Multi GFF3 for unique candidate SAR domain-containing proteins
+
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+        <citation type="doi">https://dx.doi.org/10.1016/bs.aivir.2018.09.003</citation>
+        <citation type="bibtex">
+            @unpublished{galaxyTools,
+            author = {C. Ross},
+            title = {CPT Galaxy Tools},
+            year = {2020-},
+            note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+        </citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/SAR_functions.py	Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,193 @@
+#!/usr/bin/env python
+
+import sys
+import argparse
+import os
+import re
+from Bio import SeqIO
+
+
+class CheckSequence:
+    """ 
+    SAR endolysin Verification class, which starts with complete FA file, and is shrunk by each function to reveal best candidates of SAR endolysin proteins 
+    """
+
+
+    def __init__(self, protein_name, protein_data):
+        self.name = protein_name
+        self.seq = protein_data.seq
+        self.description = protein_data.description
+        self.size = len(self.seq)
+        self.store = {}
+
+
+    def check_sizes(self,min,max):
+        """ check the minimum and maximum peptide lengths """
+        if self.size < min:
+            print("too small")
+        elif self.size > max:
+            print("too large")
+        else:
+            print(f"{self.name} : {self.seq}")
+            return True
+
+
+    def check_hydrophobicity_and_charge(self,sar_min=15,sar_max=20,perc_residues="SGAT"):
+        """ verifies the existence of a hydrophobic region within the sequence """
+        hydrophobic_residues = "['FIWLVMYCATGSP']" # fed through regex
+        hits = self.store
+        pos_res = "RK"
+        neg_res = "DE"
+
+        if self.size > 50:
+            seq = self.seq[0:50]
+        else:
+            seq = self.seq 
+        for sar_size in range(sar_min, sar_max, 1):
+            for i in range(0,len(seq)-sar_size,1):
+                sar_seq = str(seq[i:i+sar_size])
+                if re.search((hydrophobic_residues+"{"+str(sar_size)+"}"),sar_seq):
+                    charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size)
+                    storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end)
+                    #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
+                elif "K" in sar_seq[0] and re.search((hydrophobic_residues+"{"+str(sar_size-1)+"}"),sar_seq[1:]): # check frontend snorkels
+                    charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size)
+                    storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end)
+                    #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
+                elif "K" in sar_seq[-1] and re.search((hydrophobic_residues+"{"+str(sar_size-1)+"}"),sar_seq[:-1]): # check backend snorkels
+                    charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size)
+                    storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end)
+                    #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
+                continue
+        
+        return hits
+
+    def shrink_results(self,sar_min=15,sar_max=20,perc_residues="SGAT"):
+        """ removes repetiive hits, keeps only the shortest and longest of each SAR domain """
+        compare_candidates = {}
+        hits = self.check_hydrophobicity_and_charge(sar_min=sar_min,sar_max=sar_max)
+        for sar_name, data in hits.items():
+            #print(sar_name)
+            compare_candidates[sar_name] = {}
+            #print("\nThese are the values: {}".format(v))
+            #count_of_times = 0
+            tmd_log = []
+            for sar_size in range(sar_max,sar_min-1,-1):
+                if "TMD_"+str(sar_size) in data:
+                    tmd_log.append(sar_size)
+                    #print(tmd_log)
+                    for idx,the_data in enumerate(data["TMD_"+str(sar_size)]):
+                        #print(the_data[7])
+                        #print(the_data)
+                        #print(f"This is the index: {idx}")
+                        #print(f"This is the list of data at this index: {the_data}")
+                        if the_data[7] in compare_candidates[sar_name]: # index to start
+                            compare_candidates[sar_name][the_data[7]]["count"] += 1
+                            compare_candidates[sar_name][the_data[7]]["size"].append(sar_size)
+                            compare_candidates[sar_name][the_data[7]]["index"].append(idx)
+                        else:
+                            compare_candidates[sar_name][the_data[7]] = {}
+                            compare_candidates[sar_name][the_data[7]]["count"] = 1
+                            compare_candidates[sar_name][the_data[7]]["size"] = [sar_size]
+                            compare_candidates[sar_name][the_data[7]]["index"] = [idx]
+            hits[sar_name]["biggest_sar"] = tmd_log[0]
+        for sar_name, compare_data in compare_candidates.items():
+            for data in compare_data.values():
+                if len(data["size"]) >= 3:
+                    #print(f"{each_size} --> {data}")
+                    minmax = [min(data["size"]),max(data["size"])]
+                    nonminmax = [x for x in data["size"] if x not in minmax]
+                    nonminmax_index = []
+                    for each_nonminmax in nonminmax:
+                        v = data["size"].index(each_nonminmax)
+                        x = data["index"][v]
+                        nonminmax_index.append(x)
+                    nons = zip(nonminmax,nonminmax_index)
+                    for value in nons:
+                        #hits[sar_name]["TMD_"+str(value[0])] = hits[sar_name]["TMD_"+str(value[0])].pop(value[1])
+                        hits[sar_name]["TMD_"+str(value[0])][value[1]] = [""]
+
+        return hits
+
+
+def rep_funcs(self,seq,loc,pos_res,neg_res,sar_seq,perc_residues,sar_size):
+    """ run a set of functions together before sending the results to the storage dictionary """
+
+    charge_seq = str(seq[:loc])
+    charge = charge_check(charge_seq,pos_res,neg_res)
+    perc_cont = percent_calc(sar_seq,perc_residues,int(sar_size))
+    sar_start = loc
+    sar_end = loc + sar_size
+    sar_coords = "{}..{}".format(loc,loc+sar_size)
+    nterm_coords = "{}..{}".format("0",loc-1)
+    cterm_coords = "{}..{}".format(loc+sar_size+1,self.size)
+
+    return charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end
+
+
+### Extra "helper" functions
+def storage_dict(self,sar_size,sar_seq,hits,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end): # probably not good to call "self" a param here...definitley not PEP approved...
+    """ organize dictionary for hydrophobicity check """
+    if self.name not in hits:
+        hits[self.name] = {}
+        hits[self.name]["description"] = str(self.description)
+        hits[self.name]["sequence"] = str(self.seq)
+        hits[self.name]["size"] = str(self.size)
+        #GAcont = str((str(self.seq).count("G")+str(self.seq).count("A"))/int(self.size)*100)
+        #hits[self.name]["GAcont"] = "{:.2f}%".format(float(GAcont))
+        if "TMD_"+str(sar_size) not in hits[self.name]:
+            hits[self.name]["TMD_"+str(sar_size)] = []
+            hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end])
+        else:
+            hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end])
+    else:
+        if "TMD_"+str(sar_size) not in hits[self.name]:
+            hits[self.name]["TMD_"+str(sar_size)] = []
+            hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end])
+        else:
+            hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end]) 
+
+
+def percent_calc(sequence,residues,size):
+    """ Calculate the percent of a set of residues within an input sequence """
+    counted = {}
+    for aa in sequence:
+        #print(aa)
+        if aa in counted:
+            counted[aa] += 1
+        else:
+            counted[aa] = 1
+    residue_amt = 0
+    my_ratios = []
+    for res_of_interest in residues:
+        try:
+            residue_amt = counted[res_of_interest]
+        except KeyError:
+            residue_amt = 0
+        ratio = residue_amt/size
+        my_ratios.append((round(ratio*100,2)))
+    
+    res_rat = list(zip(residues,my_ratios))
+
+    return res_rat
+
+
+def charge_check(charge_seq,pos_res,neg_res):
+    charge = 0
+    for aa in charge_seq:
+        if aa in pos_res:
+            charge += 1
+        if aa in neg_res:
+            charge -= 1
+    return charge
+
+if __name__ == "__main__":
+    sequence = "MAGBYYYTRLCVRKLRKGGGHP"
+    residues = "YL"
+    size = len(sequence)
+    print(size)
+    v = percent_calc(sequence,residues,size)
+    print(v)
+    for i in v:
+        print(i)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/biopython_parsing.py	Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# Biopython parsing module. Uses in conjunction with the sar_finder script, and potential future scripts down the line.
+
+from Bio import SeqIO
+
+class FASTA_parser:
+    """ Parses multi fasta file, and zips together header with sequence """
+
+    def __init__(self, fa):
+        self.fa = fa
+    
+    def multifasta_dict(self):
+        """ parses the input multi fasta, and puts results into dictionary """
+
+        return SeqIO.to_dict(SeqIO.parse(self.fa,"fasta"))
+
+
+if __name__ == "__main__":
+    fa_file = "test-data/mu-proteins.fa"
+    d = FASTA_parser(fa_file).multifasta_dict()
+    print(d)
+    for k, v in d.items():
+        print(v.description)
+    
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/cpt-macros.xml	Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,115 @@
+<?xml version="1.0"?>
+<macros>
+	<xml name="gff_requirements">
+		<requirements>
+			<requirement type="package" version="2.7">python</requirement>
+			<requirement type="package" version="1.65">biopython</requirement>
+			<requirement type="package" version="2.12.1">requests</requirement>
+			<yield/>
+		</requirements>
+		<version_command>
+		<![CDATA[
+			cd $__tool_directory__ && git rev-parse HEAD
+		]]>
+		</version_command>
+	</xml>
+	<xml name="citation/mijalisrasche">
+		<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+		<citation type="bibtex">@unpublished{galaxyTools,
+		author = {E. Mijalis, H. Rasche},
+		title = {CPT Galaxy Tools},
+		year = {2013-2017},
+		note = {https://github.com/tamu-cpt/galaxy-tools/}
+		}
+		</citation>
+	</xml>
+	<xml name="citations">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation> 
+		<yield/>
+		</citations>
+	</xml>
+    	<xml name="citations-crr">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Ross},
+				title = {CPT Galaxy Tools},
+				year = {2020-},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+		<yield/>
+		</citations>
+	</xml>
+        <xml name="citations-2020">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="citations-2020-AJC-solo">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+                        <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="citations-clm">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="sl-citations-clm">
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <yield/>
+	</xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/file_operations.py	Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,98 @@
+
+def fasta_from_SAR_dict(sar_dict,fa_file):
+    """ makes a multi fasta with candidates from SAR dictionary """
+    with fa_file as f:
+        for data in sar_dict.values():
+            f.writelines(">{}\n".format(data["description"]))
+            f.writelines("{}\n".format(data["sequence"]))
+
+def gff3_from_SAR_dict(sar_dict,gff3_file):
+    """ make a multi gff3 with candidates from SAR dictionary """
+    gff3_cols = ["Seqid","Source","Type","Start","End","Score","Strand","Phase","Attributes"]
+    with gff3_file as f:
+        f.writelines(f"{gff3_cols[0]}\t{gff3_cols[1]}\t{gff3_cols[2]}\t{gff3_cols[3]}\t{gff3_cols[4]}\t{gff3_cols[5]}\t{gff3_cols[6]}\t{gff3_cols[7]}\t{gff3_cols[8]}\n")
+        if sar_dict:
+            #print(sar_dict)
+            for name, data in sar_dict.items():
+                min_idx = 0
+                f.writelines("##gff-version 3\n")
+                f.writelines(f"##sequence-region {name}\n")
+                n_start, n_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][4])
+                sar_start, sar_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][5])
+                c_start, c_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][6])
+                f.writelines(f'{name}\tSAR_finder\tTopological domain\t{n_start}\t{n_end}\t.\t.\t.\tNote=N-terminal net charge is {data["TMD_"+str(data["biggest_sar"])][min_idx][2]}\n')
+                f.writelines(f'{name}\tSAR_finder\tSAR domain\t{sar_start}\t{sar_end}\t.\t.\t.\tNote=residue % in SAR {[perc for perc in data["TMD_"+str(data["biggest_sar"])][min_idx][3]]},Total % is {round(sum(j for i,j in data["TMD_"+str(data["biggest_sar"])][min_idx][3]),2)}\n')
+                f.writelines(f'{name}\tSAR_finder\tTopological domain\t{c_start}\t{c_end}\t.\t.\t.\tNote=C-terminus\n')
+        else:
+            f.writelines("##gff-version 3\n")
+            f.writelines(f"##sequence-region\n")
+
+
+def tab_from_SAR_dict(sar_dict,stat_file,hydrophillic_res, sar_min, sar_max):
+    """ convert SAR dict to a dataframe """
+    columns = ["Name","Protein Sequence","Protein Length","SAR Length","SAR Start","Putative SAR Sequence","SAR End",[f"{res}%" for res in hydrophillic_res],"% Total","N-term Sequence","N-term net Charge"] # using different residues for percent calc: [f"{res}%" for res in hydrophillic_res]
+    with stat_file as f:
+        f.writelines(f"{columns[0]}\t{columns[1]}\t{columns[2]}\t{columns[3]}\t{columns[4]}\t{columns[5]}\t{columns[6]}\t{columns[7]}\t{columns[8]}\t{columns[9]}\t{columns[10]}\n")
+        if sar_dict:
+            #print(sar_dict)
+            for name, data in sar_dict.items():
+                for tmd_size in range(sar_max, sar_min-1, -1):
+                    if "TMD_"+str(tmd_size) in data:
+                        for each_match in data["TMD_"+str(tmd_size)]:
+                            if each_match != [""]:
+                                #print(f"{name} - {data}")
+                                #print(each_match)
+                                #for perc in each_match[3]:
+                                #    print(perc)
+                                try:
+                                    f.writelines(f'{name}\t{data["sequence"]}\t{data["size"]}\t{tmd_size}\t{int(each_match[7])+1}\t{each_match[0]}\t{int(each_match[8])+1}\t{[perc for perc in each_match[3]]}\t{round(sum(j for i,j in each_match[3]),2)}\t{each_match[1]}\t{each_match[2]}\n')
+                                except IndexError:
+                                    f.writelines(f'ERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\n')
+                            else:
+                                continue
+
+def stat_file_from_SAR_dict(sar_dict, stat_file, sar_min, sar_max):
+    """ summary statistics from SAR finder function """
+    with stat_file as f:
+        f.writelines("..........:::::: Candidate SAR Proteins ::::::..........\n\n")
+        if sar_dict:
+            for data in sar_dict.values():
+                f.writelines("Protein Description and Name: {}\n".format(data["description"]))
+                f.writelines("Protein Sequence: {}\n".format(data["sequence"]))
+                f.writelines("Protein Length: {}\n".format(data["size"]))
+                f.writelines("SAR Criteria matching region(s)\n")
+                for tmd_size in range(sar_max, sar_min-1, -1):
+                    if "TMD_"+str(tmd_size) in data:
+                        f.writelines("\nSAR length of {}:\n".format(tmd_size))
+                        for each_match in data["TMD_"+str(tmd_size)]:
+                            if each_match != ['']:
+                                f.writelines("\nPotential SAR domain sequence: {}\n".format(each_match[0]))
+                                f.writelines("N-term sequence: {}\n".format(each_match[1]))
+                                f.writelines("N-term net charge: {}\n".format(each_match[2]))
+                                for each_perc_calc in each_match[3]:
+                                    f.writelines("Percent {} content: {}%\n".format(each_perc_calc[0],each_perc_calc[1]))
+                                f.writelines("N-term coords: {}\n".format(each_match[4]))
+                                f.writelines("SAR coords: {}\n".format(each_match[5]))
+                                f.writelines("C-term coords: {}\n".format(each_match[6]))
+                                f.writelines("SAR start: {}\n".format(each_match[7]))
+                            else:
+                                continue
+                f.writelines("========================================================\n\n")
+        else:
+            f.writelines("No candidate SAR Proteins found")
+
+def split_seq_string(input_range, python_indexing=True):
+    """ splits a #..# sequence into the two respective starts and ends, if python indexing, adds 1, otherwise keeps """
+    if python_indexing:
+        values = input_range.split("..")
+        start =int(values[0]) + 1
+        end = int(values[1]) + 1
+    else:
+        values = input_range.split("..")
+        start = values[0]
+        end = values[1]
+
+    return start, end
+
+if __name__ == "__main__":
+    pass
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/macros.xml	Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,15 @@
+<?xml version="1.0"?>
+<macros>
+	<xml name="requirements">
+		<requirements>
+			<requirement type="package" version="3.6.1">python</requirement>
+			<requirement type="package" version="1.67">biopython</requirement>
+			<yield/>
+		</requirements>
+        <version_command>
+		<![CDATA[
+			cd $__tool_directory__ && git rev-parse HEAD
+		]]>
+		</version_command>
+	</xml>
+</macros>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/test-data/candidate_SAR.fa	Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,2 @@
+>SAR-endolysin
+MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/test-data/candidate_SAR.gff3	Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,6 @@
+Seqid	Source	Type	Start	End	Score	Strand	Phase	Attributes
+##gff-version 3
+##sequence-region SAR-endolysin
+SAR-endolysin	SAR_finder	Topological domain	1	8	.	.	.	Note=N-terminal net charge is 2
+SAR-endolysin	SAR_finder	SAR domain	9	26	.	.	.	Note=residue % in SAR [('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)],Total % is 58.82
+SAR-endolysin	SAR_finder	Topological domain	27	172	.	.	.	Note=C-terminus
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/test-data/candidate_SAR_stats.tsv	Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,6 @@
+Name	Protein Sequence	Protein Length	SAR Length	SAR Start	Putative SAR Sequence	SAR End	['S%', 'G%', 'A%', 'T%']	% Total	N-term Sequence	N-term net Charge
+SAR-endolysin	MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ	171	17	9	KAALLAVTIAGGGVGGY	26	[('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)]	58.82	MAGIPKKL	2
+SAR-endolysin	MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ	171	16	10	AALLAVTIAGGGVGGY	26	[('S', 0.0), ('G', 31.25), ('A', 25.0), ('T', 6.25)]	62.5	MAGIPKKLK	3
+SAR-endolysin	MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ	171	15	9	KAALLAVTIAGGGVG	24	[('S', 0.0), ('G', 26.67), ('A', 26.67), ('T', 6.67)]	60.01	MAGIPKKL	2
+SAR-endolysin	MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ	171	15	10	AALLAVTIAGGGVGG	25	[('S', 0.0), ('G', 33.33), ('A', 26.67), ('T', 6.67)]	66.67	MAGIPKKLK	3
+SAR-endolysin	MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ	171	15	11	ALLAVTIAGGGVGGY	26	[('S', 0.0), ('G', 33.33), ('A', 20.0), ('T', 6.67)]	60.0	MAGIPKKLKA	3
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/test-data/simple-proteins.fa	Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,7 @@
+>SAR-endolysin
+MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGP
+DIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTL
+LKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ
+>CPT_NC_000929.1_038
+MLKIKPAAGKAIRDPLTMKLLASEGEEKPRNSFWIRRLAAGDVVEVGSTENTADDTDAAP
+KKRSKSK
\ No newline at end of file