changeset 0:10d13d0c37d3 draft

Uploaded
author cpt
date Fri, 17 Jun 2022 13:07:15 +0000
parents
children 4e02e6e9e77d
files cpt_putative_isp/cpt-macros.xml cpt_putative_isp/cpt.py cpt_putative_isp/generate-putative-isp.py cpt_putative_isp/generate-putative-isp.xml cpt_putative_isp/macros.xml cpt_putative_isp/spaninFuncs.py
diffstat 6 files changed, 1387 insertions(+), 0 deletions(-) [+]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_putative_isp/cpt-macros.xml	Fri Jun 17 13:07:15 2022 +0000
@@ -0,0 +1,115 @@
+<?xml version="1.0"?>
+<macros>
+	<xml name="gff_requirements">
+		<requirements>
+			<requirement type="package" version="2.7">python</requirement>
+			<requirement type="package" version="1.65">biopython</requirement>
+			<requirement type="package" version="2.12.1">requests</requirement>
+			<yield/>
+		</requirements>
+		<version_command>
+		<![CDATA[
+			cd $__tool_directory__ && git rev-parse HEAD
+		]]>
+		</version_command>
+	</xml>
+	<xml name="citation/mijalisrasche">
+		<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+		<citation type="bibtex">@unpublished{galaxyTools,
+		author = {E. Mijalis, H. Rasche},
+		title = {CPT Galaxy Tools},
+		year = {2013-2017},
+		note = {https://github.com/tamu-cpt/galaxy-tools/}
+		}
+		</citation>
+	</xml>
+	<xml name="citations">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+		<yield/>
+		</citations>
+	</xml>
+	<xml name="citations-crr">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Ross},
+				title = {CPT Galaxy Tools},
+				year = {2020-},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+		<yield/>
+		</citations>
+	</xml>
+	<xml name="citations-2020">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+		<yield/>
+		</citations>
+	</xml>
+	<xml name="citations-2020-AJC-solo">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+		<yield/>
+		</citations>
+	</xml>
+	<xml name="citations-clm">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+		<yield/>
+		</citations>
+	</xml>
+	<xml name="sl-citations-clm">
+		<citation type="bibtex">
+		@unpublished{galaxyTools,
+			author = {C. Maughmer},
+			title = {CPT Galaxy Tools},
+			year = {2017-2020},
+			note = {https://github.com/tamu-cpt/galaxy-tools/}
+		}
+		</citation>
+		<yield/>
+	</xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_putative_isp/cpt.py	Fri Jun 17 13:07:15 2022 +0000
@@ -0,0 +1,341 @@
+#!/usr/bin/env python
+from Bio.Seq import Seq, reverse_complement, translate
+from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO
+from Bio.Data import CodonTable
+import logging
+import regex as re  # third-party "regex" package (declared in macros.xml); provides finditer(..., overlapped=True) used below
+
+logging.basicConfig()
+log = logging.getLogger()
+
+PHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*phage (?P<phage>.*)$")
+BACTERIOPHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*bacteriophage (?P<phage>.*)$")
+STARTS_WITH_PHAGE = re.compile(
+    r"^(bacterio|vibrio|Bacterio|Vibrio|)?[Pp]hage (?P<phage>.*)$"
+)
+NEW_STYLE_NAMES = re.compile(r"(?P<phage>v[A-Z]_[A-Z][a-z]{2}_.*)")
+
+
+def phage_name_parser(name):
+    host = None
+    phage = None
+    name = name.replace(", complete genome.", "")
+    name = name.replace(", complete genome", "")
+
+    m = BACTERIOPHAGE_IN_MIDDLE.match(name)
+    if m:
+        host = m.group("host")
+        phage = m.group("phage")
+        return (host, phage)
+
+    m = PHAGE_IN_MIDDLE.match(name)
+    if m:
+        host = m.group("host")
+        phage = m.group("phage")
+        return (host, phage)
+
+    m = STARTS_WITH_PHAGE.match(name)
+    if m:
+        phage = m.group("phage")
+        return (host, phage)
+
+    m = NEW_STYLE_NAMES.match(name)
+    if m:
+        phage = m.group("phage")
+        return (host, phage)
+
+    return (host, phage)
+
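+# Illustrative examples (not exhaustive):
+#   phage_name_parser("Phage P22, complete genome") -> (None, "P22")
+#   names matching none of the patterns return (None, None)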
+
+class OrfFinder(object):
+    def __init__(self, table, ftype, ends, min_len, strand):
+        self.table = table
+        self.table_obj = CodonTable.ambiguous_generic_by_id[table]
+        self.ends = ends
+        self.ftype = ftype
+        self.min_len = min_len
+        self.starts = sorted(self.table_obj.start_codons)
+        self.stops = sorted(self.table_obj.stop_codons)
+        self.re_starts = re.compile("|".join(self.starts))
+        self.re_stops = re.compile("|".join(self.stops))
+        self.strand = strand
+
+    def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
+        seq_format = "fasta"
+        log.debug("Genetic code table %i" % self.table)
+        log.debug("Minimum length %i aa" % self.min_len)
+
+        out_count = 0
+
+        out_gff3.write("##gff-version 3\n")
+
+        for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
+            for i, (f_start, f_end, f_strand, n, t) in enumerate(
+                self.get_all_peptides(str(record.seq).upper())
+            ):
+                out_count += 1
+
+                descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
+                    len(t),
+                    len(n),
+                    f_start,
+                    f_end,
+                    f_strand,
+                    record.description,
+                )
+                fid = record.id + "|%s%i" % (self.ftype, i + 1)
+
+                r = SeqRecord(Seq(n), id=fid, name="", description=descr)
+                t = SeqRecord(Seq(t), id=fid, name="", description=descr)
+
+                SeqIO.write(r, out_nuc, "fasta")
+                SeqIO.write(t, out_prot, "fasta")
+
+                nice_strand = "+" if f_strand == +1 else "-"
+
+                out_bed.write(
+                    "\t".join(
+                        map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
+                    )
+                    + "\n"
+                )
+
+                out_gff3.write(
+                    "\t".join(
+                        map(
+                            str,
+                            [
+                                record.id,
+                                "getOrfsOrCds",
+                                "CDS",
+                                f_start + 1,
+                                f_end,
+                                ".",
+                                nice_strand,
+                                0,
+                                "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
+                            ],
+                        )
+                    )
+                    + "\n"
+                )
+        log.info("Found %i %ss", out_count, self.ftype)
+
+    def start_chop_and_trans(self, s, strict=True):
+        """Returns offset, trimmed nuc, protein."""
+        if strict:
+            assert s[-3:] in self.stops, s
+        assert len(s) % 3 == 0
+        for match in self.re_starts.finditer(s, overlapped=True):
+            # Must check the start is in frame
+            start = match.start()
+            if start % 3 == 0:
+                n = s[start:]
+                assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
+                if strict:
+                    t = translate(n, self.table)
+                else:
+                    # Use when missing stop codon,
+                    t = "M" + translate(n[3:], self.table, to_stop=True)
+                yield start, n, t  # Edited by CPT to be a generator
+
+    def break_up_frame(self, s):
+        """Returns offset, nuc, protein."""
+        start = 0
+        for match in self.re_stops.finditer(s, overlapped=True):
+            index = match.start() + 3
+            if index % 3 != 0:
+                continue
+            n = s[start:index]
+            for (offset, n, t) in self.start_chop_and_trans(n):
+                if n and len(t) >= self.min_len:
+                    yield start + offset, n, t
+            start = index
+
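+    # Illustrative: for s = "ATGAAATAG", the stop TAG ends at in-frame index 9, and
+    # start_chop_and_trans finds the in-frame ATG at offset 0, so break_up_frame
+    # yields (0, "ATGAAATAG", "MK*") provided min_len allows a 3-aa product.
+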
+    def putative_genes_in_sequence(self, nuc_seq):
+        """Returns start, end, strand, nucleotides, protein.
+        Co-ordinates are Python style zero-based.
+        """
+        nuc_seq = nuc_seq.upper()
+        # TODO - Refactor to use a generator function (in start order)
+        # rather than making a list and sorting?
+        answer = []
+        full_len = len(nuc_seq)
+
+        for frame in range(0, 3):
+            for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+                start = frame + offset  # zero based
+                answer.append((start, start + len(n), +1, n, t))
+
+        rc = reverse_complement(nuc_seq)
+        for frame in range(0, 3):
+            for offset, n, t in self.break_up_frame(rc[frame:]):
+                start = full_len - frame - offset  # zero based
+                answer.append((start - len(n), start, -1, n, t))  # keep (start, end) ordered, matching get_all_peptides
+        answer.sort()
+        return answer
+
+    def get_all_peptides(self, nuc_seq):
+        """Returns start, end, strand, nucleotides, protein.
+
+        Co-ordinates are Python style zero-based.
+        """
+        # Refactored into generator by CPT
+        full_len = len(nuc_seq)
+        if self.strand != "reverse":
+            for frame in range(0, 3):
+                for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+                    start = frame + offset  # zero based
+                    yield (start, start + len(n), +1, n, t)
+        if self.strand != "forward":
+            rc = reverse_complement(nuc_seq)
+            for frame in range(0, 3):
+                for offset, n, t in self.break_up_frame(rc[frame:]):
+                    start = full_len - frame - offset  # zero based
+                    yield (start - len(n), start, -1, n, t)
+
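+# A minimal usage sketch (illustrative; the file names are assumptions, not part of this module):
+#
+#   finder = OrfFinder(table=11, ftype="ORF", ends="closed", min_len=60, strand="both")
+#   with open("genome.fa") as fa, open("orfs.fna", "w") as nuc, \
+#        open("orfs.fa", "w") as prot, open("orfs.bed", "w") as bed, \
+#        open("orfs.gff3", "w") as gff3:
+#       finder.locate(fa, nuc, prot, bed, gff3)
+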
+
+class MGAFinder(object):
+    def __init__(self, table, ftype, ends, min_len):
+        self.table = table
+        self.table_obj = CodonTable.ambiguous_generic_by_id[table]
+        self.ends = ends
+        self.ftype = ftype
+        self.min_len = min_len
+        self.starts = sorted(self.table_obj.start_codons)
+        self.stops = sorted(self.table_obj.stop_codons)
+        self.re_starts = re.compile("|".join(self.starts))
+        self.re_stops = re.compile("|".join(self.stops))
+
+    def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
+        seq_format = "fasta"
+        log.debug("Genetic code table %i" % self.table)
+        log.debug("Minimum length %i aa" % self.min_len)
+
+        out_count = 0
+
+        out_gff3.write("##gff-version 3\n")
+
+        for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
+            for i, (f_start, f_end, f_strand, n, t) in enumerate(
+                self.get_all_peptides(str(record.seq).upper())
+            ):
+                out_count += 1
+
+                descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
+                    len(t),
+                    len(n),
+                    f_start,
+                    f_end,
+                    f_strand,
+                    record.description,
+                )
+                fid = record.id + "|%s%i" % (self.ftype, i + 1)
+
+                r = SeqRecord(Seq(n), id=fid, name="", description=descr)
+                t = SeqRecord(Seq(t), id=fid, name="", description=descr)
+
+                SeqIO.write(r, out_nuc, "fasta")
+                SeqIO.write(t, out_prot, "fasta")
+
+                nice_strand = "+" if f_strand == +1 else "-"
+
+                out_bed.write(
+                    "\t".join(
+                        map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
+                    )
+                    + "\n"
+                )
+
+                out_gff3.write(
+                    "\t".join(
+                        map(
+                            str,
+                            [
+                                record.id,
+                                "getOrfsOrCds",
+                                "CDS",
+                                f_start + 1,
+                                f_end,
+                                ".",
+                                nice_strand,
+                                0,
+                                "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
+                            ],
+                        )
+                    )
+                    + "\n"
+                )
+        log.info("Found %i %ss", out_count, self.ftype)
+
+    def start_chop_and_trans(self, s, strict=True):
+        """Returns offset, trimmed nuc, protein."""
+        if strict:
+            assert s[-3:] in self.stops, s
+        assert len(s) % 3 == 0
+        for match in self.re_starts.finditer(s, overlapped=True):
+            # Must check the start is in frame
+            start = match.start()
+            if start % 3 == 0:
+                n = s[start:]
+                assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
+                if strict:
+                    t = translate(n, self.table)
+                else:
+                    # Use when missing stop codon,
+                    t = "M" + translate(n[3:], self.table, to_stop=True)
+                yield start, n, t
+
+    def break_up_frame(self, s):
+        """Returns offset, nuc, protein."""
+        start = 0
+        for match in self.re_stops.finditer(s, overlapped=True):
+            index = match.start() + 3
+            if index % 3 != 0:
+                continue
+            n = s[start:index]
+            for (offset, n, t) in self.start_chop_and_trans(n):
+                if n and len(t) >= self.min_len:
+                    yield start + offset, n, t
+            start = index
+
+    def putative_genes_in_sequence(self, nuc_seq):
+        """Returns start, end, strand, nucleotides, protein.
+        Co-ordinates are Python style zero-based.
+        """
+        nuc_seq = nuc_seq.upper()
+        # TODO - Refactor to use a generator function (in start order)
+        # rather than making a list and sorting?
+        answer = []
+        full_len = len(nuc_seq)
+
+        for frame in range(0, 3):
+            for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+                start = frame + offset  # zero based
+                answer.append((start, start + len(n), +1, n, t))
+
+        rc = reverse_complement(nuc_seq)
+        for frame in range(0, 3):
+            for offset, n, t in self.break_up_frame(rc[frame:]):
+                start = full_len - frame - offset  # zero based
+                answer.append((start - len(n), start, -1, n, t))  # keep (start, end) ordered, matching get_all_peptides
+        answer.sort()
+        return answer
+
+    def get_all_peptides(self, nuc_seq):
+        """Returns start, end, strand, nucleotides, protein.
+
+        Co-ordinates are Python style zero-based.
+        """
+        # Refactored into generator by CPT
+
+        full_len = len(nuc_seq)
+        for frame in range(0, 3):
+            for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+                start = frame + offset  # zero based
+                yield (start, start + len(n), +1, n, t)
+        rc = reverse_complement(nuc_seq)
+        for frame in range(0, 3):
+            for offset, n, t in self.break_up_frame(rc[frame:]):
+                start = full_len - frame - offset  # zero based
+                yield (start - len(n), start, -1, n, t)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_putative_isp/generate-putative-isp.py	Fri Jun 17 13:07:15 2022 +0000
@@ -0,0 +1,317 @@
+#!/usr/bin/env python
+
+##### findSpanin.pl --> findSpanin.py
+######### Incorporated from the findSpanin.pl script, but better and more snakey.
+
+import argparse
+from cpt import OrfFinder
+from Bio import SeqIO
+from Bio import Seq
+import re
+from spaninFuncs import *
+import os
+
+###############################################################################
+
+if __name__ == "__main__":
+
+    # Common parameters for both ISP / OSP portion of script
+
+    parser = argparse.ArgumentParser(
+        description="Get putative protein candidates for spanins"
+    )
+    parser.add_argument(
+        "fasta_file", type=argparse.FileType("r"), help="Fasta file"
+    )  # the "input" argument
+
+    parser.add_argument(
+        "-f",
+        "--format",
+        dest="seq_format",
+        default="fasta",
+        help="Sequence format (e.g. fasta, fastq, sff)",
+    )  # optional formats for input, currently just going to do ntFASTA
+
+    parser.add_argument(
+        "--strand",
+        dest="strand",
+        choices=("both", "forward", "reverse"),
+        default="both",
+        help="select strand",
+    )  # Selection of +, -, or both strands
+
+    parser.add_argument(
+        "--table", dest="table", default=11, help="NCBI Translation table", type=int
+    )  # Uses "default" NCBI codon table. This should always (afaik) be what we want...
+
+    parser.add_argument(
+        "-t",
+        "--ftype",
+        dest="ftype",
+        choices=("CDS", "ORF"),
+        default="ORF",
+        help="Find ORF or CDSs",
+    )  # "functional type(?)" --> Finds ORF or CDS, for this we want just the ORF
+
+    parser.add_argument(
+        "-e",
+        "--ends",
+        dest="ends",
+        choices=("open", "closed"),
+        default="closed",
+        help="Open or closed. Closed ensures start/stop codons are present",
+    )  # includes the start and stop codon
+
+    parser.add_argument(
+        "-m",
+        "--mode",
+        dest="mode",
+        choices=("all", "top", "one"),
+        default="all",  # I think we want this to JUST be all...nearly always
+        help="Output all ORFs/CDSs from sequence, all ORFs/CDSs with max length, or first with maximum length",
+    )
+
+    parser.add_argument(
+        "--switch",
+        dest="switch",
+        default="all",
+        help="switch between ALL putative osps, or a range. If not all, insert a range of two integers separated by a colon (:). Eg: 1234:4321",
+    )
+    # isp parameters
+    parser.add_argument(
+        "--isp_min_len",
+        dest="isp_min_len",
+        default=60,
+        help="Minimum ORF length, measured in codons",
+        type=int,
+    )
+    parser.add_argument(
+        "--isp_on",
+        dest="out_isp_nuc",
+        type=argparse.FileType("w"),
+        default="_out_isp.fna",
+        help="Output nucleotide sequences, FASTA",
+    )
+    parser.add_argument(
+        "--isp_op",
+        dest="out_isp_prot",
+        type=argparse.FileType("w"),
+        default="_out_isp.fa",
+        help="Output protein sequences, FASTA",
+    )
+    parser.add_argument(
+        "--isp_ob",
+        dest="out_isp_bed",
+        type=argparse.FileType("w"),
+        default="_out_isp.bed",
+        help="Output BED file",
+    )
+    parser.add_argument(
+        "--isp_og",
+        dest="out_isp_gff3",
+        type=argparse.FileType("w"),
+        default="_out_isp.gff3",
+        help="Output GFF3 file",
+    )
+    parser.add_argument(
+        "--isp_min_dist",
+        dest="isp_min_dist",
+        default=10,
+        help="Minimal distance to first AA of TMD, measured in AA",
+        type=int,
+    )
+    parser.add_argument(
+        "--isp_max_dist",
+        dest="isp_max_dist",
+        default=30,
+        help="Maximum distance to first AA of TMD, measured in AA",
+        type=int,
+    )
+    parser.add_argument(
+        "--putative_isp",
+        dest="putative_isp_fa",
+        type=argparse.FileType("w"),
+        default="_putative_isp.fa",
+        help="Output of putative FASTA file",
+    )
+    parser.add_argument(
+        "--min_tmd_size",
+        dest="min_tmd_size",
+        default=10,
+        help="Minimal size of the TMD domain",
+        type=int,
+    )
+    parser.add_argument(
+        "--max_tmd_size",
+        dest="max_tmd_size",
+        default=20,
+        help="Maximum size of the TMD domain",
+        type=int,
+    )
+    parser.add_argument(
+        "--summary_isp_txt",
+        dest="summary_isp_txt",
+        type=argparse.FileType("w"),
+        default="_summary_isp.txt",
+        help="Summary statistics on putative i-spanins",
+    )
+    parser.add_argument(
+        "--putative_isp_gff",
+        dest="putative_isp_gff",
+        type=argparse.FileType("w"),
+        default="_putative_isp.gff3",
+        help="gff3 output for putative i-spanins",
+    )
+
+    parser.add_argument(
+        "--max_isp",
+        dest="max_isp",
+        default=230,
+        help="Maximum size of the ISP",
+        type=int,
+    )
+
+    parser.add_argument(
+        "--isp_mode",
+        action="store_true",
+        default=True,  # NOTE: with default=True this flag is effectively always on for the ISP script
+    )
+
+    parser.add_argument(
+        "--peri_min",
+        type=int,
+        default=18,
+        help="minimum number of residues after the TMD",
+    )
+
+    parser.add_argument(
+        "--peri_max",
+        type=int,
+        default=206,
+        help="maximum number of residues after the TMD",
+    )
+    # parser.add_argument('-v', action='version', version='0.3.0') # Is this manually updated?
+    args = parser.parse_args()
+
+    ### isp output, naive ORF finding:
+    isps = OrfFinder(args.table, args.ftype, args.ends, args.isp_min_len, args.strand)
+    isps.locate(
+        args.fasta_file,
+        args.out_isp_nuc,
+        args.out_isp_prot,
+        args.out_isp_bed,
+        args.out_isp_gff3,
+    )
+    """
+    >T7_EIS MLEFLRKLIPWVLVGMLFGLGWHLGSDSMDAKWKQEVHNEYVKRVEAAKSTQRAIGAVSAKYQEDLAALEGSTDRIISDLRSDNKRLRVRVKTTGISDGQCGFEPDGRAELDDRDAKRILAVTQKGDAWIRALQDTIRELQRK
+    >lambda_EIS MSRVTAIISALVICIIVCLSWAVNHYRDNAITYKAQRDKNARELKLANAAITDMQMRQRDVAALDAKYTKELADAKAENDALRDDVAAGRRRLHIKAVCQSVREATTASGVDNAASPRLADTAERDYFTLRERLITMQKQLEGTQKYINEQCR
+    """
+    args.fasta_file.close()
+    args.fasta_file = open(args.fasta_file.name, "r")
+    args.out_isp_prot.close()
+    args.out_isp_prot = open(args.out_isp_prot.name, "r")
+
+    pairs = tuple_fasta(fasta_file=args.out_isp_prot)
+
+    # print(pairs)
+
+    have_tmd = []  # empty candidates list to be passed through the user input criteria
+
+    # grab transmembrane domains from spaninFuncs (queries for lysine snorkels
+    # and a range of hydrophobic regions that could be TMDs)
+    for each_pair in pairs:
+        if len(each_pair[1]) <= args.max_isp:
+            try:
+                have_tmd += find_tmd(
+                    pair=each_pair,
+                    minimum=args.isp_min_dist,
+                    maximum=args.isp_max_dist,
+                    TMDmin=args.min_tmd_size,
+                    TMDmax=args.max_tmd_size,
+                    isp_mode=args.isp_mode,
+                    peri_min=args.peri_min,
+                    peri_max=args.peri_max,
+                )
+            except TypeError:
+                continue
+
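+    # have_tmd now holds (description, sequence) tuples; in isp_mode each description
+    # gains a ", peri_count~=N" suffix recording the periplasmic length (see spaninFuncs.find_tmd).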
+    if args.switch == "all":
+        pass
+    else:
+        # for each_pair in have_lipo:
+        range_of = args.switch
+        range_of = re.search(("[\d]+:[\d]+"), range_of).group(0)
+        start = int(range_of.split(":")[0])
+        end = int(range_of.split(":")[1])
+        have_tmd = parse_a_range(pair=have_tmd, start=start, end=end)
+
+    total_isp = len(have_tmd)
+
+    # ORF = [] # mightttttttttttt use eventually
+    length = []  # grabbing length of the sequences
+    candidate_dict = {k: v for k, v in have_tmd}
+    with args.putative_isp_fa as f:
+        for desc, s in candidate_dict.items():  # description / sequence
+            f.write(">" + str(desc))
+            f.write("\n" + lineWrapper(str(s).replace("*","")) + "\n")
+            length.append(len(s))
+            # ORF.append(desc)
+    if not length:
+        raise Exception("Parameters yielded no candidates.")
+    bot_size = min(length)
+    top_size = max(length)
+    avg = sum(length) / total_isp
+    n = len(length)
+    length.sort()  # median needs ordered data; the empty case was already handled above
+    if n % 2 == 1:
+        med = length[n // 2]
+    else:
+        i = n // 2
+        med = (length[i - 1] + length[i]) / 2
+
+    #### Extra statistics
+    args.out_isp_prot.close()
+    all_orfs = open(args.out_isp_prot.name, "r")
+    all_isps = open(args.putative_isp_fa.name, "r")
+
+    n = 0  # total naive ORFs found upstream
+    for line in all_orfs:
+        if line.startswith(">"):
+            n += 1
+    all_orfs_counts = n
+
+    c = 0  # ORFs that survived the ISP filters
+    for line in all_isps:
+        if line.startswith(">"):
+            c += 1
+    all_isps_counts = c
+
+
+    with args.summary_isp_txt as f:
+        f.write("total potential o-spanins: " + str(total_isp) + "\n")
+        f.write("average length (AA): " + str(avg) + "\n")
+        f.write("median length (AA): " + str(med) + "\n")
+        f.write("maximum orf in size (AA): " + str(top_size) + "\n")
+        f.write("minimum orf in size (AA): " + str(bot_size) + "\n")
+        f.write("ratio of isps found from naive orfs: " + str(c) + "/" +str(n))
+
+    # Output the putative list in gff3 format
+    args.putative_isp_fa = open(args.putative_isp_fa.name, "r")
+    gff_data = prep_a_gff3(fa=args.putative_isp_fa, spanin_type="isp", org=args.fasta_file)
+    write_gff3(data=gff_data, output=args.putative_isp_gff)
+
+    """https://docs.python.org/3.4/library/subprocess.html"""
+    """https://github.tamu.edu/CPT/Galaxy-Tools/blob/f0bf4a4b8e5124d4f3082d21b738dfaa8e1a3cf6/tools/phage/transmembrane.py"""
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_putative_isp/generate-putative-isp.xml	Fri Jun 17 13:07:15 2022 +0000
@@ -0,0 +1,82 @@
+<?xml version="1.1"?>
+<tool id="edu.tamu.cpt2.spanin.generate-putative-isp" name="ISP candidates" version="1.0">
+    <description>constructs a putative list of potential i-spanins from an input genomic FASTA</description>
+    <macros>
+        <import>macros.xml</import>
+        <import>cpt-macros.xml</import>
+    </macros>
+    <expand macro="requirements">
+    </expand>
+    <command detect_errors="aggressive"><![CDATA[
+python $__tool_directory__/generate-putative-isp.py
+$fasta_file
+--strand $strand
+--switch $switch
+--isp_on $isp_on
+--isp_op $isp_op
+--isp_ob $isp_ob
+--isp_og $isp_og
+--isp_min_len $isp_min_len
+--isp_min_dist $isp_min_dist
+--isp_max_dist $isp_max_dist
+--min_tmd_size $min_tmd_size
+--max_tmd_size $max_tmd_size
+--putative_isp $putative_isp
+--summary_isp_txt $summary_isp
+--putative_isp_gff $putative_isp_gff
+--max_isp $isp_max
+--peri_min $peri_min
+--peri_max $peri_max
+
+]]></command>
+    <inputs>
+        <param type="select" label="Strand Choice" name="strand">
+            <option value="both">both</option>
+            <option value="forward">+</option>
+            <option value="reverse">-</option>
+        </param>
+        <param label="Single Genome FASTA" name="fasta_file" type="data" format="fasta" />
+        <param label="i-spanin minimal length" name="isp_min_len" type="integer" value="60" />
+        <param label="i-spanin maximum length" name="isp_max" type="integer" value="230" />
+        <param label="Range Selection; default is all; for a specific range to check for a spanin input integers separated by a colon (eg. 1000:2000)" type="text" name="switch" value="all" />
+        <param label="TMD minimal distance from start codon" name="isp_min_dist" type="integer" value="10" />
+        <param label="TMD maximum distance from start codon" name="isp_max_dist" type="integer" value="35" help="Searches for a TMD between TMDmin and TMDmax ie [TMDmin,TMDmax]" />
+        <param label="TMD minimal size" name="min_tmd_size" type="integer" value="10" />
+        <param label="TMD maximum size" name="max_tmd_size" type="integer" value="25" />
+        <param label="Periplasmic minimal residue amount" name="peri_min" type="integer" value="16" />
+        <param label="Periplasmic maximum residue amount" name="peri_max" type="integer" value="206" />
+    </inputs>
+    <outputs>
+        <data format="fasta" name="isp_on" label="NucSequences.fa" hidden = "true"/>
+        <data format="fasta" name="isp_op" label="ProtSequences.fa" hidden = "true"/>
+        <data format="bed" name="isp_ob" label="BED_Output.bed" hidden = "true"/>
+        <data format="gff3" name="isp_og" label="GFF_Output.gff" hidden = "true"/>
+        <data format="fasta" name="putative_isp" label="putative_isp.fa"/>
+        <data format="txt" name="summary_isp" label="summary_isp.txt"/>
+        <data format="gff3" name="putative_isp_gff" label="putative_isp.gff3"/>
+    </outputs>
+    <help><![CDATA[
+
+**What it does**
+
+Searches a genome for candidate i-spanins (ISPs), phage proteins involved in outer membrane disruption during Gram-negative host cell lysis.
+
+**METHODOLOGY**
+
+Locates ALL potential start codons, based on TTG / ATG / GTG (L / M / V). This list is pared down to those within the user-set min/max lengths. The filtered list is then written out as ORFs in FASTA (nt and aa), BED, and GFF3 formats.
+
+With the protein FASTA, the tool next reads each potential sequence and determines whether it has a putative transmembrane domain (TMD) according to the following criteria (a sample invocation of the underlying script is shown after the list):
+
+    1. Presence of a snorkeling lysine residue flanked by the hydrophobic residues described for the TMD below, within the user-specified range.
+    2. A putative transmembrane domain (TMD), defined as a repeated hydrophobic region ([FIWLVMYCATGSP]), with length and position within the user-specified ranges.
+    3. Length of the expected periplasmic region; the user sets minimum and maximum thresholds for the number of residues required after the TMD.
+
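+A minimal command-line sketch of the wrapped script (the input file name here is illustrative; Galaxy supplies real paths via the command block above)::
+
+    python generate-putative-isp.py genome.fa --strand both --isp_min_len 60 \
+        --isp_min_dist 10 --isp_max_dist 30 --min_tmd_size 10 --max_tmd_size 20 \
+        --peri_min 18 --peri_max 206 --putative_isp putative_isp.fa
+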
+**INPUT** --> Genomic FASTA
+*NOTE: This tool only takes a SINGLE genomic fasta. It does not work with multiFASTAs.*
+
+**OUTPUT** --> putative_isp.fa (FASTA) file, putative_isp.gff3, and basic summary statistics as summary_isp.txt.
+
+Protein sequences which passed the above filters are returned as the candidate ISPs.
+
+]]></help>
+        <expand macro="citations-crr" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_putative_isp/macros.xml	Fri Jun 17 13:07:15 2022 +0000
@@ -0,0 +1,63 @@
+<?xml version="1.0"?>
+<macros>
+	<xml name="requirements">
+		<requirements>
+			<requirement type="package" version="2019.06.05">regex</requirement>
+			<requirement type="package" version="3.6">python</requirement>
+			<requirement type="package" version="1.77">biopython</requirement>
+			<requirement type="package" version="1.1.7">cpt_gffparser</requirement>
+			<yield/>
+		</requirements>
+	</xml>
+	<token name="@BLAST_TSV@">
+		"$blast_tsv"
+	</token>
+	<xml name="blast_tsv">
+		<param label="Blast Results" help="TSV/tabular (25 Column)"
+			name="blast_tsv" type="data" format="tabular" />
+	</xml>
+
+	<token name="@BLAST_XML@">
+		"$blast_xml"
+	</token>
+	<xml name="blast_xml">
+		<param label="Blast Results" help="XML format"
+			name="blast_xml" type="data" format="blastxml" />
+	</xml>
+	<xml name="gff3_with_fasta">
+	<param label="Genome Sequences" name="fasta" type="data" format="fasta" />
+	<param label="Genome Annotations" name="gff3" type="data" format="gff3" />
+	</xml>
+	<xml name="genome_selector">
+	    <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
+	</xml>
+	<xml name="gff3_input">
+		<param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
+	</xml>
+	<xml name="input/gff3+fasta">
+		<expand macro="gff3_input" />
+		<expand macro="genome_selector" />
+	</xml>
+	<token name="@INPUT_GFF@">
+	"$gff3_data"
+	</token>
+	<token name="@INPUT_FASTA@">
+		genomeref.fa
+	</token>
+	<token name="@GENOME_SELECTOR_PRE@">
+		ln -s $genome_fasta genomeref.fa;
+	</token>
+	<token name="@GENOME_SELECTOR@">
+		genomeref.fa
+	</token>
+        <xml name="input/fasta">
+		<param label="Fasta file" name="sequences" type="data" format="fasta"/>
+	</xml>
+
+	<token name="@SEQUENCE@">
+		"$sequences"
+	</token>
+	<xml name="input/fasta/protein">
+		<param label="Protein fasta file" name="sequences" type="data" format="fasta"/>
+	</xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_putative_isp/spaninFuncs.py	Fri Jun 17 13:07:15 2022 +0000
@@ -0,0 +1,469 @@
+"""
+PREMISE
+### Functions/Classes that are used in both generate-putative-osp.py and generate-putative-isp.py
+###### Main premise here is to make the above scripts a little more DRY, as well as easily readable for execution.
+###### Documentation will ATTEMPT to be thourough here
+"""
+
+import re
+from Bio import SeqIO
+from Bio import Seq
+from collections import OrderedDict
+
+# Not written in OOP to keep the complication down in case adjustments are needed by someone else.
+# Much of the manipulation is string based, so it should be straightforward as well as moderately quick.
+################## Global Variables
+Lys = "K"
+
+
+def check_back_end_snorkels(seq, tmsize):
+    """
+        Searches the back end of a potential TMD for a snorkeling lysine. This is the 2nd part of a TMD snorkel lysine match.
+        --> seq : the sequence fed from the "search_region" portion of the sequence
+        --> tmsize : size of the potential TMD being investigated
+    """
+    # check positions tmsize-4 .. tmsize-1 for a lysine with a hydrophobic neighbor just before it
+    for offset in range(4, 0, -1):
+        if seq[tmsize - offset] == Lys and re.search("[FIWLVMYCATGS]", seq[tmsize - offset - 1]):
+            return "match"
+    return "NOTmatch"
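+# Illustrative: check_back_end_snorkels("FFFFFFFFKF", 10) returns "match"
+# (K at position tmsize-2 with a hydrophobic F just before it).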
+
+
+def prep_a_gff3(fa, spanin_type, org):
+    """
+        Function parses an input detailed 'fa' file and outputs a 'gff3' file
+        ---> fa = input .fa file
+        ---> output = output a returned list of data, easily portable to a gff3 next
+        ---> spanin_type = 'isp' or 'osp'
+    """
+    with org as f:
+        header = f.readline()
+        orgacc = header.split(" ")
+        orgacc = orgacc[0].split(">")[1].strip()
+    fa_zip = tuple_fasta(fa)
+    data = []
+    for a_pair in fa_zip:
+        # print(a_pair)
+        if re.search(("(\[1\])"), a_pair[0]):
+            strand = "+"
+        elif re.search(("(\[-1\])"), a_pair[0]):
+            strand = "-"  # column 7
+        start = re.search(("[\d]+\.\."), a_pair[0]).group(0).split("..")[0]  # column 4
+        end = re.search(("\.\.[\d]+"), a_pair[0]).group(0).split("..")[1]  # column 5
+        orfid = re.search(("(ORF)[\d]+"), a_pair[0]).group(0)  # column 1
+        if spanin_type == "isp":
+            methodtype = "CDS"  # column 3
+            spanin = "isp"
+        elif spanin_type == "osp":
+            methodtype = "CDS"  # column 3
+            spanin = "osp"
+        elif spanin_type == "usp":
+            methodtype = "CDS"
+            spanin = "usp"
+        else:
+            raise "need to input spanin type"
+        source = "cpt.py|putative-*.py"  # column 2
+        score = "."  # column 6
+        phase = "."  # column 8
+        attributes = "ID=" +orgacc+ "|"+ orfid + ";ALIAS=" + spanin + ";SEQ="+a_pair[1]  # column 9
+        sequence = [[orgacc, source, methodtype, start, end, score, strand, phase, attributes]]
+        data += sequence
+    return data
+
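+# Illustrative example: for an input record whose header was written by cpt.py, e.g.
+# ">NC_001416|ORF1 length 60 aa, 183 bp, from 100..282[1] of ..." with spanin_type="isp"
+# and a genome FASTA starting ">NC_001416 ...", each output row looks like:
+#   ["NC_001416", "cpt.py|putative-*.py", "CDS", "100", "282", ".", "+", ".",
+#    "ID=NC_001416|ORF1;ALIAS=isp;SEQ=<aa sequence>"]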
+
+def write_gff3(data, output="results.gff3"):
+    """
+        Parses results from prep_a_gff3 into a gff3 file
+        ---> input : list from prep_a_gff3
+        ---> output : gff3 file
+    """
+    with output as f:
+        f.write("##gff-version 3\n")
+        for value in data:
+            f.write(
+                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
+                    value[0],
+                    value[1],
+                    value[2],
+                    value[3],
+                    value[4],
+                    value[5],
+                    value[6],
+                    value[7],
+                    value[8],
+                )
+            )
+
+
+def find_tmd(pair, minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, peri_min=18, peri_max=206):
+    """
+        Searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD
+        ---> pair : input tuple of description and AA sequence (str)
+        ---> minimum : how close to the initial start codon the TMD search region begins
+        ---> maximum : how far from the initial start codon the TMD search region extends
+        ---> TMDmin : the minimum size that a transmembrane domain can be (default = 10)
+        ---> TMDmax : the maximum size that a transmembrane domain can be (default = 20)
+        ---> isp_mode / peri_min / peri_max : when isp_mode is set, also require the periplasmic region after the TMD to fall within [peri_min, peri_max]
+    """
+    # hydrophobicAAs = ['P', 'F', 'I', 'W', 'L', 'V', 'M', 'Y', 'C', 'A', 'T', 'G', 'S']
+    tmd = []
+    s = str(pair[1])  # sequence being analyzed
+    if maximum > len(s):
+        maximum = len(s)
+    search_region = s[minimum - 1 : maximum + 1]
+
+    for tmsize in range(TMDmin, TMDmax + 1, 1):
+        pattern = "[PFIWLVMYCATGS]{" + str(tmsize) + "}"  # a run of exactly tmsize hydrophobic residues
+        if re.search("[K]", search_region[1:8]):  # grabbing one below with search_region, so grab one ahead here when querying
+            store_search = re.search("[K]", search_region[1:8])  # store the regex match object
+            where_we_are = store_search.start()  # position of the hit
+            if re.search(
+                "[PFIWLVMYCATGS]", search_region[where_we_are + 1]
+            ) and re.search(
+                "[PFIWLVMYCATGS]", search_region[where_we_are - 1]
+            ):  # hydrophobic neighbor on each side
+                g = re.search(("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]).group()
+                backend = check_back_end_snorkels(search_region, tmsize)
+                if backend == "match":
+                    if isp_mode:
+                        g = re.search((pattern), search_region).group()
+                        end_of_tmd = re.search((g), s).end()+1
+                        amt_peri = len(s) - end_of_tmd
+                        if peri_min <= amt_peri <= peri_max:
+                            pair_desc = pair[0] + ", peri_count~="+str(amt_peri)
+                            new_pair = (pair_desc,pair[1])
+                            tmd.append(new_pair)
+                    else:
+                        tmd.append(pair)
+                else:
+                    continue
+        if re.search(pattern, search_region):
+            if isp_mode:
+                g = re.search((pattern), search_region).group()
+                end_of_tmd = re.search((g), s).end()+1
+                amt_peri = len(s) - end_of_tmd
+                if peri_min <= amt_peri <= peri_max:
+                    pair_desc = pair[0] + ", peri_count~="+str(amt_peri)
+                    new_pair = (pair_desc,pair[1])
+                    tmd.append(new_pair)
+            else:
+                tmd.append(pair)
+        else:
+            continue
+
+        return tmd  # NOTE: returns inside the loop, on the first TMD size whose pattern matches
+
+
+def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False):
+    """
+        Function that takes an input tuple, and will return pairs of sequences to their description that have a lipoobox
+        ---> minimum - min distance from start codon to first AA of lipobox
+        ---> maximum - max distance from start codon to first AA of lipobox
+        ---> regex - option 1 (default) => more strict regular expression ; option 2 => looser selection, imported from LipoRy
+        
+    """
+    if regex == 1:
+        pattern = "[ILMFTV][^REKD][GAS]C"  # regex for Lipobox from findSpanin.pl
+    elif regex == 2:
+        pattern = "[ACGSILMFTV][^REKD][GAS]C"  # regex for Lipobox from LipoRy
+
+    candidates = []
+    s = str(pair[1])
+    search_region = s[minimum - 1 : maximum + 5]  # pad the slice so a lipobox starting near `maximum` is still caught
+    # NOTE: the list below supersedes the `regex` selection above; both patterns are always tried
+    patterns = ["[ILMFTV][^REKD][GAS]C", "AW[AGS]C"]
+    for pattern in patterns:
+        if re.search((pattern), search_region):  # lipobox must be WITHIN the range...
+            # searches the sequence with the input RegEx AND omits if
+            g = re.search((pattern), search_region).group() # find the exact group match
+            amt_peri = len(s) - re.search((g), s).end() + 1
+            if min_after <= amt_peri <= max_after: # find the lipobox end region
+                if osp_mode:
+                    pair_desc = pair[0] + ", peri_count~="+str(amt_peri)
+                    new_pair = (pair_desc,pair[1])
+                    candidates.append(new_pair)
+                else:
+                    candidates.append(pair)
+
+                return candidates  # NOTE: returns on the first pattern that yields a hit; returns None if nothing matches
+
+
+def tuple_fasta(fasta_file):
+    """
+        #### INPUT: Fasta File
+        #### OUTPUT: zipped (zip) : pairwise relationship of description to sequence
+        #### 
+    """
+    fasta = SeqIO.parse(fasta_file, "fasta")
+    descriptions = []
+    sequences = []
+    for r in fasta:  # iterates and stores each description and sequence
+        description = r.description
+        sequence = str(r.seq)
+        if sequence[0] != "I":  # the translation table currently has I as a potential start codon ==> skip ORFs that start with I
+            descriptions.append(description)
+            sequences.append(sequence)
+        else:
+            continue
+
+    return zip(descriptions, sequences)
+
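+# Illustrative: on the protein FASTA written by cpt.py, tuple_fasta yields pairs like
+# ("NC_001416|ORF1 length 60 aa, 183 bp, from 100..282[1] of ...", "MKT...").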
+
+def lineWrapper(text, charactersize=60):
+    """Recursively wraps a sequence string to `charactersize` characters per line."""
+    if len(text) <= charactersize:
+        return text
+    else:
+        return (
+            text[:charactersize]
+            + "\n"
+            + lineWrapper(text[charactersize:], charactersize)
+        )
+
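+# Illustrative: lineWrapper("A" * 130) -> two lines of 60 characters followed by one of 10.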
+
+def getDescriptions(fasta):
+    """
+    Takes an output FASTA file, and parses retrieves the description headers. These headers contain information needed
+    for finding locations of a potential i-spanin and o-spanin proximity to one another.
+    """
+    desc = []
+    with fasta as f:
+        for line in f:
+            if line.startswith(">"):
+                desc.append(line)
+    return desc
+
+
+def splitStrands(text, strand="+"):
+    """Returns the text if its strand tag ([1] or [-1]) matches the requested strand, else None."""
+    if strand == "+":
+        if re.search(r"(\[1\])", text):
+            return text
+    elif strand == "-":
+        if re.search(r"(\[-1\])", text):
+            return text
+
+
+def parse_a_range(pair, start, end):
+    """
+        Takes fasta tuple pairs and keeps only those whose coordinates fall within the input range
+        ---> pair : list of fasta tuple data
+        ---> start : start of the range to keep
+        ---> end : end of the range to keep (padded by 1)
+    """
+    matches = []
+    for each_pair in pair:
+        s = re.search(r"[\d]+\.\.", each_pair[0]).group(0)  # start of the sequence
+        s = int(s.split("..")[0])
+        e = re.search(r"\.\.[\d]+", each_pair[0]).group(0)
+        e = int(e.split("..")[1])
+        if start - 1 <= s and e <= end + 1:
+            matches.append(each_pair)
+        else:
+            continue
+    return matches
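+# Illustrative: a pair described as "... from 100..282[1] ..." is kept by
+# parse_a_range([pair], start=50, end=300), since 49 <= 100 and 282 <= 301.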
+
+
+def grabLocs(text):
+    """
+    Grabs the locations of the spanin based on NT location (seen from ORF). Grabs the ORF name, as per named from the ORF class/module
+    from cpt.py
+    """
+    start = re.search(("[\d]+\.\."), text).group(0)  # Start of the sequence ; looks for [numbers]..
+    end = re.search(("\.\.[\d]+"), text).group(0)  # End of the sequence ; Looks for ..[numbers]
+    orf = re.search(("(ORF)[\d]+"), text).group(0)  # Looks for ORF and the numbers that are after it
+    if re.search(("(\[1\])"), text): # stores strand
+        strand = "+"
+    elif re.search(("(\[-1\])"), text): # stores strand
+        strand = "-"
+    start = int(start.split("..")[0])
+    end = int(end.split("..")[1])
+    vals = [start, end, orf, strand]
+
+    return vals
+
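+# Illustrative: grabLocs("NC_001416|ORF1 length 60 aa, 183 bp, from 100..282[1] of ...")
+# returns [100, 282, "ORF1", "+"].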
+
+def spaninProximity(isp, osp, max_dist=30):
+    """
+    _NOTE THIS FUNCTION COULD BE MODIFIED TO RETURN SEQUENCES_
+    Compares the locations of i-spanins and o-spanins. max_dist is the distance in NT measurement from i-spanin END site
+    to o-spanin START. The user will be inputting AA distance, so a conversion will be necessary (<user_input> * 3)
+    I modified this on 07.30.2020 to bypass picking the + or - strand.
+    INPUT: list of OSP and ISP candidates
+    OUTPUT: Return (improved) candidates for overlapping, embedded, and separate list
+    """
+
+    embedded = {}
+    overlap = {}
+    separate = {}
+    for iseq in isp:
+        embedded[iseq[2]] = []
+        overlap[iseq[2]] = []
+        separate[iseq[2]] = []
+        for oseq in osp:
+            if iseq[3] == "+":
+                if oseq[3] == "+":
+                    if iseq[0] < oseq[0] < iseq[1] and oseq[1] < iseq[1]:
+                        ### EMBEDDED ###
+                        combo = [
+                            iseq[0],
+                            iseq[1],
+                            oseq[2],
+                            oseq[0],
+                            oseq[1],
+                            iseq[3],
+                        ]  # ordering a return for dict
+                        embedded[iseq[2]] += [combo]
+                    elif iseq[0] < oseq[0] <= iseq[1] and oseq[1] > iseq[1]:
+                        ### OVERLAP / SEPARATE ###
+                        if (iseq[1] - oseq[0]) < 6:
+                            combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
+                            separate[iseq[2]] += [combo]
+                        else:
+                            combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
+                            overlap[iseq[2]] += [combo]
+                    elif iseq[1] <= oseq[0] <= iseq[1] + max_dist:
+                        combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
+                        separate[iseq[2]] += [combo]
+                    else:
+                        continue
+            if iseq[3] == "-":
+                if oseq[3] == "-":
+                    if iseq[0] <= oseq[1] <= iseq[1] and oseq[0] > iseq[0]:
+                        ### EMBEDDED ###
+                        combo = [
+                            iseq[0],
+                            iseq[1],
+                            oseq[2],
+                            oseq[0],
+                            oseq[1],
+                            iseq[3],
+                        ]  # ordering a return for dict
+                        embedded[iseq[2]] += [combo]
+                    elif iseq[0] <= oseq[1] <= iseq[1] and oseq[0] < iseq[0]:
+                        if (oseq[1] - iseq[0]) < 6:
+                            combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
+                            separate[iseq[2]] += [combo]
+                        else:
+                            combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
+                            overlap[iseq[2]] += [combo]
+                    elif iseq[0] - 10 < oseq[1] < iseq[0]:  # NOTE: hardcoded 10 nt window here, unlike the + strand which uses max_dist
+                        combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
+                        separate[iseq[2]] += [combo]
+                    else:
+                        continue
+
+    embedded = {k: embedded[k] for k in embedded if embedded[k]}
+    overlap = {k: overlap[k] for k in overlap if overlap[k]}
+    separate = {k: separate[k] for k in separate if separate[k]}
+
+    return embedded, overlap, separate
+
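+# Illustrative: with an isp entry [100, 300, "ORF1", "+"] and an osp entry [250, 400, "ORF2", "+"],
+# 100 < 250 <= 300 and 400 > 300 with (300 - 250) >= 6, so the pair is filed under overlap["ORF1"].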
+
+def check_for_usp():
+    pass  # placeholder; u-spanin handling is not implemented yet
+
+############################################### TEST RANGE #########################################################################
+####################################################################################################################################
+if __name__ == "__main__":
+
+    #### TMD TEST
+    test_desc = ["one", "two", "three", "four", "five"]
+    test_seq = [
+        "XXXXXXXXXXXXXXXFMCFMCFMCFMCFMCXXXXXXXXXXXXXXXXXXXXXXXXXX",
+        "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
+        "XXXXXXX",
+        "XXXXXXXXXXXKXXXXXXXXXX",
+        "XXXXXXXXXXAKXXXXXXXXXXAKXXXXXXXX",
+    ]
+    pairs = zip(test_desc, test_seq)
+    tmd = []
+    for each_pair in pairs:
+        # print(each_pair)
+        try:
+            tmd += find_tmd(pair=each_pair)
+        except (IndexError, TypeError):
+            continue
+    # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
+    # print(tmd)
+    # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
+
+    #### tuple-fasta TEST
+    # fasta_file = 'out_isp.fa'
+    # ret = tuple_fasta(fasta_file)
+    # print('=============')
+    # for i in ret:
+    # print(i[1])
+
+    #### LipoBox TEST
+    test_desc = ["one", "two", "three", "four", "five", "six", "seven"]
+    test_seq = [
+        "XXXXXXXXXTGGCXXXXXXXXXXXXXXXX",
+        "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
+        "XXXXXXX",
+        "AGGCXXXXXXXXXXXXXXXXXXXXTT",
+        "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
+        "XXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
+        "MSTLRELRLRRALKEQSMRYLLSIKKTLPRWKGALIGLFLICVATISGCASESKLPEPPMVSVDSSLMVEPNLTTEMLNVFSQ*",
+    ]
+    pairs = zip(test_desc, test_seq)
+    lipo = []
+    for each_pair in pairs:
+        #print(each_pair)
+        # try:
+        try:
+            lipo += find_lipobox(pair=each_pair, regex=2)  # , minimum=8)
+        except TypeError:  # catches sequences that don't meet the min/max requirements (something is too small)
+            continue
+        # except:
+        # continue
+    # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
+    #############################
+    # g = prep_a_gff3(fa='putative_isp.fa', spanin_type='isp')
+    # print(g)
+    # write_gff3(data=g)