changeset 1:5081ba961953 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:40:58 +0000
parents 6661bb42b5a9
children 16c3654c9dbb
files cpt-macros.xml cpt_disruptin_proximity/cpt-macros.xml cpt_disruptin_proximity/disruptin_proximity_2_lysis_genes.py cpt_disruptin_proximity/disruptin_proximity_2_lysis_genes.xml cpt_disruptin_proximity/gff3.py cpt_disruptin_proximity/macros.xml disruptin_proximity_2_lysis_genes.py disruptin_proximity_2_lysis_genes.xml gff3.py macros.xml
diffstat 10 files changed, 963 insertions(+), 915 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt-macros.xml	Mon Jun 05 02:40:58 2023 +0000
@@ -0,0 +1,115 @@
+<macros>
+    <xml name="gff_requirements">
+        <requirements>
+            <requirement type="package" version="2.7">python</requirement>
+            <requirement type="package" version="1.65">biopython</requirement>
+            <requirement type="package" version="2.12.1">requests</requirement>
+			<requirement type="package" version="1.2.2">cpt_gffparser</requirement>
+            <yield/>
+        </requirements>
+        <version_command>
+		<![CDATA[
+			cd '$__tool_directory__' && git rev-parse HEAD
+		]]>
+		</version_command>
+    </xml>
+    <xml name="citation/mijalisrasche">
+        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+        <citation type="bibtex">@unpublished{galaxyTools,
+		author = {E. Mijalis, H. Rasche},
+		title = {CPT Galaxy Tools},
+		year = {2013-2017},
+		note = {https://github.com/tamu-cpt/galaxy-tools/}
+		}
+		</citation>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-crr">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Ross},
+				title = {CPT Galaxy Tools},
+				year = {2020-},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-2020">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-2020-AJC-solo">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-clm">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="sl-citations-clm">
+        <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+        <yield/>
+    </xml>
+</macros>
--- a/cpt_disruptin_proximity/cpt-macros.xml	Fri Jun 17 12:24:06 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,115 +0,0 @@
-<?xml version="1.0"?>
-<macros>
-	<xml name="gff_requirements">
-		<requirements>
-			<requirement type="package" version="2.7">python</requirement>
-			<requirement type="package" version="1.65">biopython</requirement>
-			<requirement type="package" version="2.12.1">requests</requirement>
-			<yield/>
-		</requirements>
-		<version_command>
-		<![CDATA[
-			cd $__tool_directory__ && git rev-parse HEAD
-		]]>
-		</version_command>
-	</xml>
-	<xml name="citation/mijalisrasche">
-		<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-		<citation type="bibtex">@unpublished{galaxyTools,
-		author = {E. Mijalis, H. Rasche},
-		title = {CPT Galaxy Tools},
-		year = {2013-2017},
-		note = {https://github.com/tamu-cpt/galaxy-tools/}
-		}
-		</citation>
-	</xml>
-	<xml name="citations">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {E. Mijalis, H. Rasche},
-				title = {CPT Galaxy Tools},
-				year = {2013-2017},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation> 
-		<yield/>
-		</citations>
-	</xml>
-    	<xml name="citations-crr">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {C. Ross},
-				title = {CPT Galaxy Tools},
-				year = {2020-},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-		<yield/>
-		</citations>
-	</xml>
-        <xml name="citations-2020">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {E. Mijalis, H. Rasche},
-				title = {CPT Galaxy Tools},
-				year = {2013-2017},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-                        <citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {A. Criscione},
-				title = {CPT Galaxy Tools},
-				year = {2019-2021},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-                        </citation>
-                        <yield/>
-		</citations>
-	</xml>
-        <xml name="citations-2020-AJC-solo">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-                        <citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {A. Criscione},
-				title = {CPT Galaxy Tools},
-				year = {2019-2021},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-                        </citation>
-                        <yield/>
-		</citations>
-	</xml>
-        <xml name="citations-clm">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {C. Maughmer},
-				title = {CPT Galaxy Tools},
-				year = {2017-2020},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-                        <yield/>
-		</citations>
-	</xml>
-        <xml name="sl-citations-clm">
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {C. Maughmer},
-				title = {CPT Galaxy Tools},
-				year = {2017-2020},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-                        <yield/>
-	</xml>
-</macros>
--- a/cpt_disruptin_proximity/disruptin_proximity_2_lysis_genes.py	Fri Jun 17 12:24:06 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,375 +0,0 @@
-#!/usr/bin/env python
-"""
-This program is intended to identify protein coding sequences within a certain window (number of base pairs) of genes encoding recognized endolysin domains and genes encoding transmembrane domains. The goal is narrow a list of disruptin candidates by identifying the sequences close to the other lysis genes in the phage genome.
-Inputs for this program include a .fasta file with protein sequences of lysis gene candidates from the phage genome, a .gff3 file with the tmhmm results from the genome, a .gff3 file with the results from interproscan of the genome, a .gff3 file of the genome, window size in number of base pairs, a tab separated list of endolysin domains, and optional names of output files.
-The program outputs lists of lysis gene candidates that are close to protein codings sequences with endolysin domains or to sequences with transmembrane domains and lists of the proteins in proximity to the lysis gene candidates (one list for proteins with endolysin domains and one list for TMD-containing proteins).
-"""
-
-from Bio import SeqIO
-import argparse
-import sys
-from CPT_GFFParser import gffParse, gffWrite
-from Bio.SeqRecord import SeqRecord
-from Bio.SeqFeature import SeqFeature
-from Bio.Seq import Seq
-from intervaltree import IntervalTree, Interval
-from gff3 import feature_lambda, feature_test_type
-
-# Used for genome in fasta format
-# outputs the start and end coordinates from a record in fasta format
-# Should work for seqrecords from NCBI database
-# def location_start_end(seqrec):
-#    F = seqrec.description
-#    description_subparts = F.split(' ')
-#    for i in range(len(description_subparts)):
-#        if description_subparts[i].startswith('[location'):
-#            location = description_subparts[i][10:-1]
-#    location_se = location.split('..')
-#    location_start = location_se[0]
-#    location_end = location_se[1]
-#
-#    return location_start, location_end
-
-# adapted from intersect_and_adjacent.py
-def treeFeatures(features):
-    for feat in features:
-        # used with genome in fasta format
-        # start, end = location_start_end(feat)
-
-        # Interval(begin, end, data)
-        yield Interval(int(feat.location.start), int(feat.location.end), feat.id)
-
-
-# Function to read enzyme domain names and ids from the enzyme list
-# Enzyme list must be a tab separated txt file with the format with four columns: Predicted catalytic or binding domain, Abbreviation, Conserved domain, Phage example
-# The first column is used here as the domain name, and the 3rd column is the domain id
-def read_enzyme_list(enzyme_file=None):
-    enzyme_file.seek(0)
-    domains = []
-    #domain_names = []
-
-    for line in enzyme_file:
-        if not line.startswith("*"):
-            words = line.split("\t")
-            if len(words) > 3:
-                domains += [words[2]]
-                #domain_names += [words[0]]
-
-    return (domains[1:])
-
-
-# adapted from intersect_and_adjacent.py
-def intersect(rec_a, rec_b, window):
-    if len(rec_a) > 0 and len(rec_b) > 0:
-
-        # builds interval tree from Interval objects of form (start, end, id) for each feature
-        tree_a = IntervalTree(list(treeFeatures(rec_a)))
-        tree_b = IntervalTree(list(treeFeatures(rec_b)))
-
-        # Used to map ids back to features later
-        rec_a_map = {f.id: f for f in rec_a}
-        rec_b_map = {f.id: f for f in rec_b}
-
-        rec_a_hits_in_b = []
-        rec_b_hits_in_a = []
-
-        for feature in rec_a:
-            # Used with genome in fasta format
-            # start, end = location_start_end(feature)
-            # Save each feature in rec_a that overlaps a feature in rec_b
-            # hits = tree_b.find_range((int(feature.location.start), int(feature.location.end)))
-            hits = tree_b[
-                (int(feature.location.start) - window) : (
-                    int(feature.location.end) + window
-                )
-            ]
-            # feature id is saved in interval result.data, use map to get full feature
-            for hit in hits:
-                rec_a_hits_in_b.append(rec_b_map[hit.data])
-
-        for feature in rec_b:
-            # Used with genome in fasta format
-            # start, end = location_start_end(feature)
-            # Save each feature in rec_a that overlaps a feature in rec_b
-            # hits = tree_a.find_range((int(feature.location.start), int(feature.location.end)))
-            hits = tree_a[
-                (int(feature.location.start) - window) : (
-                    int(feature.location.end) + window
-                )
-            ]
-            # feature id is saved in interval result.data, use map to get full feature
-            for hit in hits:
-                rec_b_hits_in_a.append(rec_a_map[hit.data])
-
-        # Remove duplicate features using sets
-        rec_a = set(rec_a_hits_in_b)
-        rec_b = set(rec_b_hits_in_a)
-
-    else:
-        # If one input is empty, output two empty result files.
-        rec_a = SeqRecord(Seq(""), "none")
-        rec_b = SeqRecord(Seq(""), "none")
-    return rec_a, rec_b
-
-
-# Function to identify enzyme domains associated with endolysin function from the interproscan results file
-def find_endolysins(rec_ipro, enzyme_domain_ids):
-
-    # print(rec_ipro)
-
-    if len(rec_ipro) > 0:
-        endo_rec_names = []
-        endo_rec_domain_ids = []
-        rec_domain_name = []
-
-        # Check each feature in the ipro file if the domain id/feature qualifier "Name" is included in the domain list.
-        for seq in rec_ipro:
-            for i in range(len(seq.features)):
-                f = seq.features[i]
-                if f.type == "protein_match":
-                    # print(f)
-                    unwanted = ["TRANS", "SIGNAL", "CYTO", "Coil", "Signal"]
-                    # Ignores feature with unwanted key words in the feature name
-                    if all(x not in f.qualifiers["Name"][0] for x in unwanted):
-                        # If feature is included in the given enzyme domain list, the protein name, domain id, and domain name are stored
-                        domain_description = [str(f.qualifiers['Name'][0])]
-                        if 'signature_desc' in f.qualifiers:
-                            domain_description += [str(f.qualifiers['signature_desc'][0])]
-
-                        for i in enzyme_domain_ids:
-                            for y in domain_description:
-                                if i in y:
-                                    endo_rec_domain_ids += [f.qualifiers["Name"][0]]
-
-                                    # e_index = enzyme_domain_ids.index(i)
-                                    # rec_domain_name += [enzyme_domain_names[e_index]]
-
-                                    target = f.qualifiers["Target"][0]
-                                    target = target.split(" ")
-                                    protein_name = str(target[0]) + '**'
-                                    endo_rec_names += [protein_name]
-
-        return endo_rec_names, endo_rec_domain_ids
-
-
-def adjacent_lgc(lgc, tmhmm, ipro, genome, enzyme, window):
-    rec_lgc = list(SeqIO.parse(lgc, "fasta"))
-    rec_tmhmm = gffParse(tmhmm)
-    rec_ipro = gffParse(ipro)
-    recTemp = gffParse(genome)[0]
-    tempFeats = feature_lambda(
-                recTemp.features,
-                feature_test_type,
-                {"types": ["CDS"]},
-                subfeatures=True,
-                )
-    recTemp.features = tempFeats
-    rec_genome_ini = [recTemp]
-
-    # genome.seek(0)
-    # examiner = GFFExaminer()
-    # print(examiner.available_limits(genome))
-
-    enzyme_domain_ids = read_enzyme_list(enzyme)
-
-    if len(rec_lgc) > 0 and len(rec_tmhmm) > 0 and len(rec_genome_ini) > 0:
-
-        # find names of the proteins containing endolysin associated domains
-        endo_names, endo_domain_ids = find_endolysins(
-            rec_ipro, list(enzyme_domain_ids)
-        )
-
-        # find names of proteins containing transmembrane domains
-        tmhmm_protein_names = []
-        for seq in rec_tmhmm:
-            tmhmm_protein_names += [str(seq.id) + '**']
-
-        lgc_names = []
-        for seq in rec_lgc:
-            lgc_names += [str(seq.id) + '**']
-
-        adjacent_endo = {}
-        adjacent_lgc_to_endo = {}
-        adjacent_tm = {}
-        adjacent_lgc_to_tm = {}
-
-        # print(len(tmhmm_protein_names), len(endo_names))
-        # print(rec_genome_ini)
-        # print(len(rec_genome_ini))
-
-        for i in range(len(rec_genome_ini)):
-            rec_genome = rec_genome_ini[i]
-
-            # find records for proteins containing endolysin domains and tmds from genome fasta file
-            tm_seqrec = []
-            endolysin_seqrec = []
-            lgc_seqrec = []
-
-            # print(tmhmm_protein_names)
-            # print(endo_names)
-            # print(lgc_names)
-            # print(rec_genome)
-
-            for feat in rec_genome.features:
-                # print(feat)
-                # searches for synonyms and
-                if feat.type == "CDS":
-                    feat_names = []
-                    feat_names.append(str(feat.id) + '**')
-                    if "locus_tag" in feat.qualifiers:
-                        feat_names.append(str(feat.qualifiers["locus_tag"][0]) + '**')
-                    if "protein_id" in feat.qualifiers:
-                        feat_names.append(str(feat.qualifiers["protein_id"][0]) + '**')
-                    if "Name" in feat.qualifiers:
-                        if len(str(feat.qualifiers["Name"][0])) > 5:
-                            feat_names.append(str(feat.qualifiers["Name"][0]) + '**')
-
-                    # print(str(feat_names))
-                    # print(str(feat.qualifiers))
-                    for i in range(len(feat_names)):
-                        if str(feat_names[i]) in str(lgc_names):
-                            lgc_seqrec += [feat]
-                    # check if gene annotated as holin using key words/synonyms
-                    holin_annotations = ["holin"]
-                    if "product" in feat.qualifiers:
-                        if any(
-                            x
-                            for x in holin_annotations
-                            if (x in str(feat.qualifiers))
-                        ):
-                            tm_seqrec += [feat]
-                    # check if protein contains a TMD
-                    for i in range(len(feat_names)):
-                        if str(feat_names[i]) in str(tmhmm_protein_names):
-                            # print(feat_names[i])
-                            tm_seqrec += [feat]
-
-                    # check if gene annotated as endolysin using key words/synonyms
-                    endolysin_annotations = ["lysin", "lysozyme"]
-                    if "product" in feat.qualifiers:
-                        if any(
-                            x
-                            for x in endolysin_annotations
-                            if (x in str(feat.qualifiers))
-                        ):
-                            endolysin_seqrec += [feat]
-                    # check if protein contains an endolysin-associated domain
-                    for i in range(len(feat_names)):
-                        if str(feat_names[i]) in str(endo_names):
-                            endolysin_seqrec += [feat]
-
-            # print(endolysin_seqrec, tm_seqrec, lgc_seqrec)
-            # find possible endolysins that are adjacent to (or within window length away from) the lysis gene, or disruptin, candidates
-            # if len(endolysin_seqrec) > 0:
-            adjacent_lgc_to_endo_i, adjacent_endo_i = intersect(
-                endolysin_seqrec, lgc_seqrec, window
-            )
-            # find TMD-containing proteins that are adjacent to (or within window length away from) the lysis gene, or disruptin, candidates
-            # if len(tm_seqrec) > 0:
-            adjacent_lgc_to_tm_i, adjacent_tm_i = intersect(
-                tm_seqrec, lgc_seqrec, window
-            )
-
-            # print(len(endolysin_seqrec), len(lgc_seqrec), len(tm_seqrec))
-            adjacent_endo[rec_genome.id] = adjacent_endo_i
-            adjacent_lgc_to_endo[rec_genome.id] = adjacent_lgc_to_endo_i
-            adjacent_tm[rec_genome.id] = adjacent_tm_i
-            adjacent_lgc_to_tm[rec_genome.id] = adjacent_lgc_to_tm_i
-            # print(rec_genome.id)
-    else:
-      return 0, 0, 0, 0
-    # print(adjacent_endo)
-    return adjacent_endo, adjacent_lgc_to_endo, adjacent_tm, adjacent_lgc_to_tm
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Identify lysis gene candidates next to possible endolysin or holin genes",
-        epilog="",
-    )
-    parser.add_argument(
-        "lgc",
-        type=argparse.FileType("r"),
-        help="fasta file with protein coding sequences of lysis gene candidates",
-    )
-    parser.add_argument(
-        "tmhmm",
-        type=argparse.FileType("r"),
-        help="gff3 file with tmhmm results from the coding sequences in the genome",
-    )
-    parser.add_argument(
-        "ipro",
-        type=argparse.FileType("r"),
-        help="gff3 file with interpro results from protein sequences in the genome",
-    )
-    parser.add_argument(
-        "genome",
-        type=argparse.FileType("r"),
-        help="fasta file with protein coding sequences for all genes in the genome",
-    )
-    parser.add_argument(
-        "window",
-        type=int,
-        default=1000,
-        help="Allows features this far away to still be considered 'adjacent'",
-    )
-    parser.add_argument(
-        "enzyme",
-        type=argparse.FileType("r"),
-        help="tab delimited text file including list of conserved protein domains linked to endolysin function",
-    )
-    parser.add_argument("--oa", type=str, default="possible_endolysin.gff3")
-    parser.add_argument(
-        "--ob", type=str, default="lysis_gene_candidates_near_endolysin.gff3"
-    )
-    parser.add_argument("--oc", type=str, default="possible_holin.gff3")
-    parser.add_argument(
-        "--od", type=str, default="lysis_gene_candidates_near_holin.gff3"
-    )
-    args = parser.parse_args()
-
-    endo, lgc_endo, tm, lgc_tm = adjacent_lgc(
-        args.lgc, args.tmhmm, args.ipro, args.genome, args.enzyme, args.window
-    )
-
-    if endo == 0:
-      with open(args.oa, "w") as handle:
-        handle.write("##gff-version 3")
-      with open(args.ob, "w") as handle:
-        handle.write("##gff-version 3")
-      with open(args.oc, "w") as handle:
-        handle.write("##gff-version 3")
-      with open(args.od, "w") as handle:
-        handle.write("##gff-version 3")
-      return
-
-    args.genome.seek(0)
-    rec = list(gffParse(args.genome))
-    
-    with open(args.oa, "w") as handle:
-        for i in range(len(rec)):
-            rec_i = rec[i]
-            if endo.get(rec_i.id, "") is not "":
-                rec_i.features = endo[rec_i.id]
-                gffWrite([rec_i], handle)
-
-    with open(args.ob, "w") as handle:
-        for i in range(len(rec)):
-            rec_i = rec[i]
-            if lgc_endo.get(rec_i.id, "") is not "":
-                rec_i.features = lgc_endo[rec_i.id]
-                gffWrite([rec_i], handle)
-
-    with open(args.oc, "w") as handle:
-        for i in range(len(rec)):
-            rec_i = rec[i]
-            if tm.get(rec_i.id, "") is not "":
-                rec_i.features = tm[rec_i.id]
-                gffWrite([rec_i], handle)
-
-    with open(args.od, "w") as handle:
-        for i in range(len(rec)):
-            rec_i = rec[i]
-            if lgc_tm.get(rec_i.id, "") is not "":
-                rec_i.features = lgc_tm[rec_i.id]
-                gffWrite([rec_i], handle)
--- a/cpt_disruptin_proximity/disruptin_proximity_2_lysis_genes.xml	Fri Jun 17 12:24:06 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,56 +0,0 @@
-<?xml version="1.1"?>
-<tool id="edu.tamu.cpt2.phage.disruptin_prox" name="Disruptin Proximity to Lysis Genes Tool" version="1.1">
-    <description>identifies lysis gene candidates close to genes encoding endolysin domains and genes encoding transmembrane domains</description>
-    <macros>
-		<import>macros.xml</import>
-		<import>cpt-macros.xml</import>
-    </macros>
-    <expand macro="requirements">
-      	<requirement type="package" version="3.0.2">intervaltree</requirement>
-    </expand>
-    <command detect_errors="aggressive"><![CDATA[
-python $__tool_directory__/disruptin_proximity_2_lysis_genes.py
-$lgc
-$tmhmm
-$ipro
-$genome
-$window
-$enzyme
---oa $oa
---ob $ob
---oc $oc
---od $od
-
-]]></command>
-    <inputs>
-        <param label="Lysis Gene Candidates" name="lgc" type="data" format="fasta" />
-	<param label="TMHMM Results" name="tmhmm" type="data" format="gff3" />
-        <param label="InterProScan Results" name="ipro" type="data" format="gff3" />
-	<param label="Phage Genome" name="genome" type="data" format="gff3" />
-        <param label="Adjacency Window Size" name="window" type="integer" value="1000" />
-        <param label="Enzyme Domain List" name="enzyme" type="data" format="tabular" />
-    </inputs>
-    <outputs>
-	<data format="gff3" name="oa" label="Protein with endolysin-associated domain and adjacent to lysis gene candidate"/>
-        <data format="gff3" name="ob" label="Lysis gene candidates near possible endolysin"/>
-        <data format="gff3" name="oc" label="Protein with transmembrane domain and adjacent to lysis gene candidate"/>
-        <data format="gff3" name="od" label="Lysis gene candidates near TMD-containing protein"/>
-    </outputs>
-    <help><![CDATA[
-**What it does**
-This program is intended to identify protein coding sequences within a certain window (number of base pairs) of genes encoding recognized endolysin domains and others encoding transmembrane domains.
-The goal is to identify possible lysis genes or to narrow a list of disruptin candidates.
-
-        ]]></help>
-    <citations>
-        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
-        <citation type="bibtex">
-        @unpublished{galaxyTools, 
-            author = {A. Holt},
-            title = {CPT Galaxy Tools},
-            year = {2020},
-            note = {https://github.com/tamu-cpt/galaxy-tools/}
-        }
-        </citation>
-    </citations>
-</tool>
--- a/cpt_disruptin_proximity/gff3.py	Fri Jun 17 12:24:06 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,346 +0,0 @@
-import copy
-import logging
-
-log = logging.getLogger()
-log.setLevel(logging.WARN)
-
-
-def feature_lambda(
-    feature_list,
-    test,
-    test_kwargs,
-    subfeatures=True,
-    parent=None,
-    invert=False,
-    recurse=True,
-):
-    """Recursively search through features, testing each with a test function, yielding matches.
-
-    GFF3 is a hierachical data structure, so we need to be able to recursively
-    search through features. E.g. if you're looking for a feature with
-    ID='bob.42', you can't just do a simple list comprehension with a test
-    case. You don't know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in.
-
-    :type feature_list: list
-    :param feature_list: an iterable of features
-
-    :type test: function reference
-    :param test: a closure with the method signature (feature, **kwargs) where
-                 the kwargs are those passed in the next argument. This
-                 function should return True or False, True if the feature is
-                 to be yielded as part of the main feature_lambda function, or
-                 False if it is to be ignored. This function CAN mutate the
-                 features passed to it (think "apply").
-
-    :type test_kwargs: dictionary
-    :param test_kwargs: kwargs to pass to your closure when it is called.
-
-    :type subfeatures: boolean
-    :param subfeatures: when a feature is matched, should just that feature be
-                        yielded to the caller, or should the entire sub_feature
-                        tree for that feature be included? subfeatures=True is
-                        useful in cases such as searching for a gene feature,
-                        and wanting to know what RBS/Shine_Dalgarno_sequences
-                        are in the sub_feature tree (which can be accomplished
-                        with two feature_lambda calls). subfeatures=False is
-                        useful in cases when you want to process (and possibly
-                        return) the entire feature tree, such as applying a
-                        qualifier to every single feature.
-
-    :type invert: boolean
-    :param invert: Negate/invert the result of the filter.
-
-    :rtype: yielded list
-    :return: Yields a list of matching features.
-    """
-    # Either the top level set of [features] or the subfeature attribute
-    for feature in feature_list:
-        feature._parent = parent
-        if not parent:
-            # Set to self so we cannot go above root.
-            feature._parent = feature
-        test_result = test(feature, **test_kwargs)
-        # if (not invert and test_result) or (invert and not test_result):
-        if invert ^ test_result:
-            if not subfeatures:
-                feature_copy = copy.deepcopy(feature)
-                feature_copy.sub_features = list()
-                yield feature_copy
-            else:
-                yield feature
-
-        if recurse and hasattr(feature, "sub_features"):
-            for x in feature_lambda(
-                feature.sub_features,
-                test,
-                test_kwargs,
-                subfeatures=subfeatures,
-                parent=feature,
-                invert=invert,
-                recurse=recurse,
-            ):
-                yield x
-
-
-def fetchParent(feature):
-    if not hasattr(feature, "_parent") or feature._parent is None:
-        return feature
-    else:
-        return fetchParent(feature._parent)
-
-
-def feature_test_true(feature, **kwargs):
-    return True
-
-
-def feature_test_type(feature, **kwargs):
-    if "type" in kwargs:
-        return str(feature.type).upper() == str(kwargs["type"]).upper()
-    elif "types" in kwargs:
-      for x in kwargs["types"]:
-        if str(feature.type).upper() == str(x).upper():
-          return True
-      return False
-    raise Exception("Incorrect feature_test_type call, need type or types")
-
-
-def feature_test_qual_value(feature, **kwargs):
-    """Test qualifier values.
-
-    For every feature, check that at least one value in
-    feature.quailfiers(kwargs['qualifier']) is in kwargs['attribute_list']
-    """
-    if isinstance(kwargs["qualifier"], list):
-        for qualifier in kwargs["qualifier"]:
-            for attribute_value in feature.qualifiers.get(qualifier, []):
-                if attribute_value in kwargs["attribute_list"]:
-                    return True
-    else:
-        for attribute_value in feature.qualifiers.get(kwargs["qualifier"], []):
-            if attribute_value in kwargs["attribute_list"]:
-                return True
-    return False
-
-
-def feature_test_location(feature, **kwargs):
-    if "strand" in kwargs:
-        if feature.location.strand != kwargs["strand"]:
-            return False
-
-    return feature.location.start <= kwargs["loc"] <= feature.location.end
-
-
-def feature_test_quals(feature, **kwargs):
-    """
-    Example::
-
-        a = Feature(qualifiers={'Note': ['Some notes', 'Aasdf']})
-
-        # Check if a contains a Note
-        feature_test_quals(a, {'Note': None})  # Returns True
-        feature_test_quals(a, {'Product': None})  # Returns False
-
-        # Check if a contains a note with specific value
-        feature_test_quals(a, {'Note': ['ome']})  # Returns True
-
-        # Check if a contains a note with specific value
-        feature_test_quals(a, {'Note': ['other']})  # Returns False
-    """
-    for key in kwargs:
-        if key not in feature.qualifiers:
-            return False
-
-        # Key is present, no value specified
-        if kwargs[key] is None:
-            return True
-
-        # Otherwise there is a key value we're looking for.
-        # so we make a list of matches
-        matches = []
-        # And check all of the feature qualifier valuse
-        for value in feature.qualifiers[key]:
-            # For that kwargs[key] value
-            for x in kwargs[key]:
-                matches.append(x in value)
-
-        # If none matched, then we return false.
-        if not any(matches):
-            return False
-
-    return True
-
-
-def feature_test_contains(feature, **kwargs):
-    if "index" in kwargs:
-        return feature.location.start < kwargs["index"] < feature.location.end
-    elif "range" in kwargs:
-        return (
-            feature.location.start < kwargs["range"]["start"] < feature.location.end
-            and feature.location.start < kwargs["range"]["end"] < feature.location.end
-        )
-    else:
-        raise RuntimeError("Must use index or range keyword")
-
-
-def get_id(feature=None, parent_prefix=None):
-    result = ""
-    if parent_prefix is not None:
-        result += parent_prefix + "|"
-    if "locus_tag" in feature.qualifiers:
-        result += feature.qualifiers["locus_tag"][0]
-    elif "gene" in feature.qualifiers:
-        result += feature.qualifiers["gene"][0]
-    elif "Gene" in feature.qualifiers:
-        result += feature.qualifiers["Gene"][0]
-    elif "product" in feature.qualifiers:
-        result += feature.qualifiers["product"][0]
-    elif "Product" in feature.qualifiers:
-        result += feature.qualifiers["Product"][0]
-    elif "Name" in feature.qualifiers:
-        result += feature.qualifiers["Name"][0]
-    else:
-        return feature.id
-        # Leaving in case bad things happen.
-        # result += '%s_%s_%s_%s' % (
-        # feature.id,
-        # feature.location.start,
-        # feature.location.end,
-        # feature.location.strand
-        # )
-    return result
-
-
-def get_gff3_id(gene):
-    return gene.qualifiers.get("Name", [gene.id])[0]
-
-
-def ensure_location_in_bounds(start=0, end=0, parent_length=0):
-    # This prevents frameshift errors
-    while start < 0:
-        start += 3
-    while end < 0:
-        end += 3
-    while start > parent_length:
-        start -= 3
-    while end > parent_length:
-        end -= 3
-    return (start, end)
-
-
-def coding_genes(feature_list):
-    for x in genes(feature_list):
-        if (
-            len(
-                list(
-                    feature_lambda(
-                        x.sub_features,
-                        feature_test_type,
-                        {"type": "CDS"},
-                        subfeatures=False,
-                    )
-                )
-            )
-            > 0
-        ):
-            yield x
-
-
-def genes(feature_list, feature_type="gene", sort=False):
-    """
-    Simple filter to extract gene features from the feature set.
-    """
-
-    if not sort:
-        for x in feature_lambda(
-            feature_list, feature_test_type, {"type": feature_type}, subfeatures=True
-        ):
-            yield x
-    else:
-        data = list(genes(feature_list, feature_type=feature_type, sort=False))
-        data = sorted(data, key=lambda feature: feature.location.start)
-        for x in data:
-            yield x
-
-
-def wa_unified_product_name(feature):
-    """
-    Try and figure out a name. We gave conflicting instructions, so
-    this isn't as trivial as it should be. Sometimes it will be in
-    'product' or 'Product', othertimes in 'Name'
-    """
-    # Manually applied tags.
-    protein_product = feature.qualifiers.get(
-        "product", feature.qualifiers.get("Product", [None])
-    )[0]
-
-    # If neither of those are available ...
-    if protein_product is None:
-        # And there's a name...
-        if "Name" in feature.qualifiers:
-            if not is_uuid(feature.qualifiers["Name"][0]):
-                protein_product = feature.qualifiers["Name"][0]
-
-    return protein_product
-
-
-def is_uuid(name):
-    return name.count("-") == 4 and len(name) == 36
-
-
-def get_rbs_from(gene):
-    # Normal RBS annotation types
-    rbs_rbs = list(
-        feature_lambda(
-            gene.sub_features, feature_test_type, {"type": "RBS"}, subfeatures=False
-        )
-    )
-    rbs_sds = list(
-        feature_lambda(
-            gene.sub_features,
-            feature_test_type,
-            {"type": "Shine_Dalgarno_sequence"},
-            subfeatures=False,
-        )
-    )
-    # Fraking apollo
-    apollo_exons = list(
-        feature_lambda(
-            gene.sub_features, feature_test_type, {"type": "exon"}, subfeatures=False
-        )
-    )
-    apollo_exons = [x for x in apollo_exons if len(x) < 10]
-    # These are more NCBI's style
-    regulatory_elements = list(
-        feature_lambda(
-            gene.sub_features,
-            feature_test_type,
-            {"type": "regulatory"},
-            subfeatures=False,
-        )
-    )
-    rbs_regulatory = list(
-        feature_lambda(
-            regulatory_elements,
-            feature_test_quals,
-            {"regulatory_class": ["ribosome_binding_site"]},
-            subfeatures=False,
-        )
-    )
-    # Here's hoping you find just one ;)
-    return rbs_rbs + rbs_sds + rbs_regulatory + apollo_exons
-
-
-def nice_name(record):
-    """
-    get the real name rather than NCBI IDs and so on. If fails, will return record.id
-    """
-    name = record.id
-    likely_parental_contig = list(genes(record.features, feature_type="contig"))
-    if len(likely_parental_contig) == 1:
-        name = likely_parental_contig[0].qualifiers.get("organism", [name])[0]
-    return name
-
-
-def fsort(it):
-    for i in sorted(it, key=lambda x: int(x.location.start)):
-        yield i
--- a/cpt_disruptin_proximity/macros.xml	Fri Jun 17 12:24:06 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,23 +0,0 @@
-<?xml version="1.0"?>
-<macros>
-  <xml name="requirements">
-    <requirements>
-		<requirement type="package" version="3.8.13">python</requirement>
-		<requirement type="package" version="1.79">biopython</requirement>
-		<requirement type="package" version="1.2.2">cpt_gffparser</requirement>  
-		<yield/>
-    </requirements>
-  </xml>
-  <xml name="genome_selector">
-	    <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
-  </xml>
-  <xml name="gff3_input">
-    <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
-  </xml>
-  <token name="@GENOME_SELECTOR_PRE@">
-		ln -s $genome_fasta genomeref.fa;
-	</token>
-	<token name="@GENOME_SELECTOR@">
-		genomeref.fa
-	</token>
-</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/disruptin_proximity_2_lysis_genes.py	Mon Jun 05 02:40:58 2023 +0000
@@ -0,0 +1,373 @@
+#!/usr/bin/env python
+"""
+This program is intended to identify protein coding sequences within a certain window (number of base pairs) of genes encoding recognized endolysin domains and genes encoding transmembrane domains. The goal is narrow a list of disruptin candidates by identifying the sequences close to the other lysis genes in the phage genome.
+Inputs for this program include a .fasta file with protein sequences of lysis gene candidates from the phage genome, a .gff3 file with the tmhmm results from the genome, a .gff3 file with the results from interproscan of the genome, a .gff3 file of the genome, window size in number of base pairs, a tab separated list of endolysin domains, and optional names of output files.
+The program outputs lists of lysis gene candidates that are close to protein codings sequences with endolysin domains or to sequences with transmembrane domains and lists of the proteins in proximity to the lysis gene candidates (one list for proteins with endolysin domains and one list for TMD-containing proteins).
+"""
+
+from Bio import SeqIO
+import argparse
+import sys
+from CPT_GFFParser import gffParse, gffWrite
+from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import SeqFeature
+from Bio.Seq import Seq
+from intervaltree import IntervalTree, Interval
+from gff3 import feature_lambda, feature_test_type
+
+# Used for genome in fasta format
+# outputs the start and end coordinates from a record in fasta format
+# Should work for seqrecords from NCBI database
+# def location_start_end(seqrec):
+#    F = seqrec.description
+#    description_subparts = F.split(' ')
+#    for i in range(len(description_subparts)):
+#        if description_subparts[i].startswith('[location'):
+#            location = description_subparts[i][10:-1]
+#    location_se = location.split('..')
+#    location_start = location_se[0]
+#    location_end = location_se[1]
+#
+#    return location_start, location_end
+
+# adapted from intersect_and_adjacent.py
+def treeFeatures(features):
+    for feat in features:
+        # used with genome in fasta format
+        # start, end = location_start_end(feat)
+
+        # Interval(begin, end, data)
+        yield Interval(int(feat.location.start), int(feat.location.end), feat.id)
+
+
+# Function to read enzyme domain names and ids from the enzyme list
+# Enzyme list must be a tab separated txt file with the format with four columns: Predicted catalytic or binding domain, Abbreviation, Conserved domain, Phage example
+# The first column is used here as the domain name, and the 3rd column is the domain id
+def read_enzyme_list(enzyme_file=None):
+    enzyme_file.seek(0)
+    domains = []
+    # domain_names = []
+
+    for line in enzyme_file:
+        if not line.startswith("*"):
+            words = line.split("\t")
+            if len(words) > 3:
+                domains += [words[2]]
+                # domain_names += [words[0]]
+
+    return domains[1:]
+
+
+# adapted from intersect_and_adjacent.py
+def intersect(rec_a, rec_b, window):
+    if len(rec_a) > 0 and len(rec_b) > 0:
+
+        # builds interval tree from Interval objects of form (start, end, id) for each feature
+        tree_a = IntervalTree(list(treeFeatures(rec_a)))
+        tree_b = IntervalTree(list(treeFeatures(rec_b)))
+
+        # Used to map ids back to features later
+        rec_a_map = {f.id: f for f in rec_a}
+        rec_b_map = {f.id: f for f in rec_b}
+
+        rec_a_hits_in_b = []
+        rec_b_hits_in_a = []
+
+        for feature in rec_a:
+            # Used with genome in fasta format
+            # start, end = location_start_end(feature)
+            # Save each feature in rec_a that overlaps a feature in rec_b
+            # hits = tree_b.find_range((int(feature.location.start), int(feature.location.end)))
+            hits = tree_b[
+                (int(feature.location.start) - window) : (
+                    int(feature.location.end) + window
+                )
+            ]
+            # feature id is saved in interval result.data, use map to get full feature
+            for hit in hits:
+                rec_a_hits_in_b.append(rec_b_map[hit.data])
+
+        for feature in rec_b:
+            # Used with genome in fasta format
+            # start, end = location_start_end(feature)
+            # Save each feature in rec_a that overlaps a feature in rec_b
+            # hits = tree_a.find_range((int(feature.location.start), int(feature.location.end)))
+            hits = tree_a[
+                (int(feature.location.start) - window) : (
+                    int(feature.location.end) + window
+                )
+            ]
+            # feature id is saved in interval result.data, use map to get full feature
+            for hit in hits:
+                rec_b_hits_in_a.append(rec_a_map[hit.data])
+
+        # Remove duplicate features using sets
+        rec_a = set(rec_a_hits_in_b)
+        rec_b = set(rec_b_hits_in_a)
+
+    else:
+        # If one input is empty, output two empty result files.
+        rec_a = set()
+        rec_b = set()
+    return rec_a, rec_b
+
+
+# Function to identify enzyme domains associated with endolysin function from the interproscan results file
+def find_endolysins(rec_ipro, enzyme_domain_ids):
+
+    # print(rec_ipro)
+
+    if len(rec_ipro) > 0:
+        endo_rec_names = []
+        endo_rec_domain_ids = []
+        rec_domain_name = []
+
+        # Check each feature in the ipro file if the domain id/feature qualifier "Name" is included in the domain list.
+        for seq in rec_ipro:
+            for i in range(len(seq.features)):
+                f = seq.features[i]
+                if f.type == "protein_match":
+                    # print(f)
+                    unwanted = ["TRANS", "SIGNAL", "CYTO", "Coil", "Signal"]
+                    # Ignores feature with unwanted key words in the feature name
+                    if all(x not in f.qualifiers["Name"][0] for x in unwanted):
+                        # If feature is included in the given enzyme domain list, the protein name, domain id, and domain name are stored
+                        domain_description = [str(f.qualifiers["Name"][0])]
+                        if "signature_desc" in f.qualifiers:
+                            domain_description += [
+                                str(f.qualifiers["signature_desc"][0])
+                            ]
+
+                        for i in enzyme_domain_ids:
+                            for y in domain_description:
+                                if i in y:
+                                    endo_rec_domain_ids += [f.qualifiers["Name"][0]]
+
+                                    # e_index = enzyme_domain_ids.index(i)
+                                    # rec_domain_name += [enzyme_domain_names[e_index]]
+
+                                    target = f.qualifiers["Target"][0]
+                                    target = target.split(" ")
+                                    protein_name = str(target[0]) + "**"
+                                    endo_rec_names += [protein_name]
+
+    return (endo_rec_names, endo_rec_domain_ids) if len(rec_ipro) > 0 else ([], [])
+
+
+def adjacent_lgc(lgc, tmhmm, ipro, genome, enzyme, window):
+    rec_lgc = list(SeqIO.parse(lgc, "fasta"))
+    rec_tmhmm = gffParse(tmhmm)
+    rec_ipro = gffParse(ipro)
+    recTemp = gffParse(genome)[0]
+    tempFeats = feature_lambda(
+        recTemp.features,
+        feature_test_type,
+        {"types": ["CDS"]},
+        subfeatures=True,
+    )
+    recTemp.features = tempFeats
+    rec_genome_ini = [recTemp]
+
+    # genome.seek(0)
+    # examiner = GFFExaminer()
+    # print(examiner.available_limits(genome))
+
+    enzyme_domain_ids = read_enzyme_list(enzyme)
+
+    if len(rec_lgc) > 0 and len(rec_tmhmm) > 0 and len(rec_genome_ini) > 0:
+
+        # find names of the proteins containing endolysin associated domains
+        endo_names, endo_domain_ids = find_endolysins(rec_ipro, list(enzyme_domain_ids))
+
+        # find names of proteins containing transmembrane domains
+        tmhmm_protein_names = []
+        for seq in rec_tmhmm:
+            tmhmm_protein_names += [str(seq.id) + "**"]
+
+        lgc_names = []
+        for seq in rec_lgc:
+            lgc_names += [str(seq.id) + "**"]
+
+        adjacent_endo = {}
+        adjacent_lgc_to_endo = {}
+        adjacent_tm = {}
+        adjacent_lgc_to_tm = {}
+
+        # print(len(tmhmm_protein_names), len(endo_names))
+        # print(rec_genome_ini)
+        # print(len(rec_genome_ini))
+
+        for i in range(len(rec_genome_ini)):
+            rec_genome = rec_genome_ini[i]
+
+            # find records for proteins containing endolysin domains and tmds from genome fasta file
+            tm_seqrec = []
+            endolysin_seqrec = []
+            lgc_seqrec = []
+
+            # print(tmhmm_protein_names)
+            # print(endo_names)
+            # print(lgc_names)
+            # print(rec_genome)
+
+            for feat in rec_genome.features:
+                # print(feat)
+                # searches for synonyms and
+                if feat.type == "CDS":
+                    feat_names = []
+                    feat_names.append(str(feat.id) + "**")
+                    if "locus_tag" in feat.qualifiers:
+                        feat_names.append(str(feat.qualifiers["locus_tag"][0]) + "**")
+                    if "protein_id" in feat.qualifiers:
+                        feat_names.append(str(feat.qualifiers["protein_id"][0]) + "**")
+                    if "Name" in feat.qualifiers:
+                        if len(str(feat.qualifiers["Name"][0])) > 5:
+                            feat_names.append(str(feat.qualifiers["Name"][0]) + "**")
+
+                    # print(str(feat_names))
+                    # print(str(feat.qualifiers))
+                    for i in range(len(feat_names)):
+                        if str(feat_names[i]) in str(lgc_names):
+                            lgc_seqrec += [feat]
+                    # check if gene annotated as holin using key words/synonyms
+                    holin_annotations = ["holin"]
+                    if "product" in feat.qualifiers:
+                        if any(
+                            x for x in holin_annotations if (x in str(feat.qualifiers))
+                        ):
+                            tm_seqrec += [feat]
+                    # check if protein contains a TMD
+                    for i in range(len(feat_names)):
+                        if str(feat_names[i]) in str(tmhmm_protein_names):
+                            # print(feat_names[i])
+                            tm_seqrec += [feat]
+
+                    # check if gene annotated as endolysin using key words/synonyms
+                    endolysin_annotations = ["lysin", "lysozyme"]
+                    if "product" in feat.qualifiers:
+                        if any(
+                            x
+                            for x in endolysin_annotations
+                            if (x in str(feat.qualifiers))
+                        ):
+                            endolysin_seqrec += [feat]
+                    # check if protein contains an endolysin-associated domain
+                    for i in range(len(feat_names)):
+                        if str(feat_names[i]) in str(endo_names):
+                            endolysin_seqrec += [feat]
+
+            # print(endolysin_seqrec, tm_seqrec, lgc_seqrec)
+            # find possible endolysins that are adjacent to (or within window length away from) the lysis gene, or disruptin, candidates
+            # if len(endolysin_seqrec) > 0:
+            adjacent_lgc_to_endo_i, adjacent_endo_i = intersect(
+                endolysin_seqrec, lgc_seqrec, window
+            )
+            # find TMD-containing proteins that are adjacent to (or within window length away from) the lysis gene, or disruptin, candidates
+            # if len(tm_seqrec) > 0:
+            adjacent_lgc_to_tm_i, adjacent_tm_i = intersect(
+                tm_seqrec, lgc_seqrec, window
+            )
+
+            # print(len(endolysin_seqrec), len(lgc_seqrec), len(tm_seqrec))
+            adjacent_endo[rec_genome.id] = adjacent_endo_i
+            adjacent_lgc_to_endo[rec_genome.id] = adjacent_lgc_to_endo_i
+            adjacent_tm[rec_genome.id] = adjacent_tm_i
+            adjacent_lgc_to_tm[rec_genome.id] = adjacent_lgc_to_tm_i
+            # print(rec_genome.id)
+    else:
+        return 0, 0, 0, 0
+    # print(adjacent_endo)
+    return adjacent_endo, adjacent_lgc_to_endo, adjacent_tm, adjacent_lgc_to_tm
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Identify lysis gene candidates next to possible endolysin or holin genes",
+        epilog="",
+    )
+    parser.add_argument(
+        "lgc",
+        type=argparse.FileType("r"),
+        help="fasta file with protein coding sequences of lysis gene candidates",
+    )
+    parser.add_argument(
+        "tmhmm",
+        type=argparse.FileType("r"),
+        help="gff3 file with tmhmm results from the coding sequences in the genome",
+    )
+    parser.add_argument(
+        "ipro",
+        type=argparse.FileType("r"),
+        help="gff3 file with interpro results from protein sequences in the genome",
+    )
+    parser.add_argument(
+        "genome",
+        type=argparse.FileType("r"),
+        help="fasta file with protein coding sequences for all genes in the genome",
+    )
+    parser.add_argument(
+        "window",
+        type=int,
+        default=1000,
+        help="Allows features this far away to still be considered 'adjacent'",
+    )
+    parser.add_argument(
+        "enzyme",
+        type=argparse.FileType("r"),
+        help="tab delimited text file including list of conserved protein domains linked to endolysin function",
+    )
+    parser.add_argument("--oa", type=str, default="possible_endolysin.gff3")
+    parser.add_argument(
+        "--ob", type=str, default="lysis_gene_candidates_near_endolysin.gff3"
+    )
+    parser.add_argument("--oc", type=str, default="possible_holin.gff3")
+    parser.add_argument(
+        "--od", type=str, default="lysis_gene_candidates_near_holin.gff3"
+    )
+    args = parser.parse_args()
+
+    endo, lgc_endo, tm, lgc_tm = adjacent_lgc(
+        args.lgc, args.tmhmm, args.ipro, args.genome, args.enzyme, args.window
+    )
+
+    if endo == 0:
+        with open(args.oa, "w") as handle:
+            handle.write("##gff-version 3")
+        with open(args.ob, "w") as handle:
+            handle.write("##gff-version 3")
+        with open(args.oc, "w") as handle:
+            handle.write("##gff-version 3")
+        with open(args.od, "w") as handle:
+            handle.write("##gff-version 3")
+        sys.exit(0)
+
+    args.genome.seek(0)
+    rec = list(gffParse(args.genome))
+
+    with open(args.oa, "w") as handle:
+        for i in range(len(rec)):
+            rec_i = rec[i]
+            if endo.get(rec_i.id, "") != "":
+                rec_i.features = endo[rec_i.id]
+                gffWrite([rec_i], handle)
+
+    with open(args.ob, "w") as handle:
+        for i in range(len(rec)):
+            rec_i = rec[i]
+            if lgc_endo.get(rec_i.id, "") != "":
+                rec_i.features = lgc_endo[rec_i.id]
+                gffWrite([rec_i], handle)
+
+    with open(args.oc, "w") as handle:
+        for i in range(len(rec)):
+            rec_i = rec[i]
+            if tm.get(rec_i.id, "") != "":
+                rec_i.features = tm[rec_i.id]
+                gffWrite([rec_i], handle)
+
+    with open(args.od, "w") as handle:
+        for i in range(len(rec)):
+            rec_i = rec[i]
+            if lgc_tm.get(rec_i.id, "") != "":
+                rec_i.features = lgc_tm[rec_i.id]
+                gffWrite([rec_i], handle)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/disruptin_proximity_2_lysis_genes.xml	Mon Jun 05 02:40:58 2023 +0000
@@ -0,0 +1,55 @@
+<tool id="edu.tamu.cpt2.phage.disruptin_prox" name="Disruptin Proximity to Lysis Genes Tool" version="1.1">
+    <description>identifies lysis gene candidates close to genes encoding endolysin domains and genes encoding transmembrane domains</description>
+    <macros>
+        <import>macros.xml</import>
+        <import>cpt-macros.xml</import>
+    </macros>
+    <expand macro="requirements">
+        <requirement type="package" version="3.0.2">intervaltree</requirement>
+    </expand>
+    <command detect_errors="aggressive"><![CDATA[
+python '$__tool_directory__/disruptin_proximity_2_lysis_genes.py'
+'$lgc'
+'$tmhmm'
+'$ipro'
+'$genome'
+'$window'
+'$enzyme'
+--oa '$oa'
+--ob '$ob'
+--oc '$oc'
+--od '$od'
+
+]]></command>
+    <inputs>
+        <param label="Lysis Gene Candidates" name="lgc" type="data" format="fasta"/>
+        <param label="TMHMM Results" name="tmhmm" type="data" format="gff3"/>
+        <param label="InterProScan Results" name="ipro" type="data" format="gff3"/>
+        <param label="Phage Genome" name="genome" type="data" format="gff3"/>
+        <param label="Adjacency Window Size" name="window" type="integer" value="1000"/>
+        <param label="Enzyme Domain List" name="enzyme" type="data" format="tabular"/>
+    </inputs>
+    <outputs>
+        <data format="gff3" name="oa" label="Protein with endolysin-associated domain and adjacent to lysis gene candidate"/>
+        <data format="gff3" name="ob" label="Lysis gene candidates near possible endolysin"/>
+        <data format="gff3" name="oc" label="Protein with transmembrane domain and adjacent to lysis gene candidate"/>
+        <data format="gff3" name="od" label="Lysis gene candidates near TMD-containing protein"/>
+    </outputs>
+    <help><![CDATA[
+**What it does**
+This program is intended to identify protein coding sequences within a certain window (number of base pairs) of genes encoding recognized endolysin domains and others encoding transmembrane domains.
+The goal is to identify possible lysis genes or to narrow a list of disruptin candidates.
+
+        ]]></help>
+    <citations>
+        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+        <citation type="bibtex">
+        @unpublished{galaxyTools, 
+            author = {A. Holt},
+            title = {CPT Galaxy Tools},
+            year = {2020},
+            note = {https://github.com/tamu-cpt/galaxy-tools/}
+        }
+        </citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff3.py	Mon Jun 05 02:40:58 2023 +0000
@@ -0,0 +1,346 @@
+import copy
+import logging
+
+log = logging.getLogger()
+log.setLevel(logging.WARN)
+
+
+def feature_lambda(
+    feature_list,
+    test,
+    test_kwargs,
+    subfeatures=True,
+    parent=None,
+    invert=False,
+    recurse=True,
+):
+    """Recursively search through features, testing each with a test function, yielding matches.
+
+    GFF3 is a hierarchical data structure, so we need to be able to recursively
+    search through features. E.g. if you're looking for a feature with
+    ID='bob.42', you can't just do a simple list comprehension with a test
+    case. You don't know how deeply buried bob.42 will be in the feature tree. This is where feature_lambda steps in.
+
+    :type feature_list: list
+    :param feature_list: an iterable of features
+
+    :type test: function reference
+    :param test: a closure with the method signature (feature, **kwargs) where
+                 the kwargs are those passed in the next argument. This
+                 function should return True or False, True if the feature is
+                 to be yielded as part of the main feature_lambda function, or
+                 False if it is to be ignored. This function CAN mutate the
+                 features passed to it (think "apply").
+
+    :type test_kwargs: dictionary
+    :param test_kwargs: kwargs to pass to your closure when it is called.
+
+    :type subfeatures: boolean
+    :param subfeatures: when a feature is matched, should just that feature be
+                        yielded to the caller, or should the entire sub_feature
+                        tree for that feature be included? subfeatures=True is
+                        useful in cases such as searching for a gene feature,
+                        and wanting to know what RBS/Shine_Dalgarno_sequences
+                        are in the sub_feature tree (which can be accomplished
+                        with two feature_lambda calls). subfeatures=False is
+                        useful in cases when you want to process (and possibly
+                        return) the entire feature tree, such as applying a
+                        qualifier to every single feature.
+
+    :type invert: boolean
+    :param invert: Negate/invert the result of the filter.
+
+    :rtype: yielded list
+    :return: Yields a list of matching features.
+    """
+    # Either the top level set of [features] or the subfeature attribute
+    for feature in feature_list:
+        feature._parent = parent
+        if not parent:
+            # Set to self so we cannot go above root.
+            feature._parent = feature
+        test_result = test(feature, **test_kwargs)
+        # if (not invert and test_result) or (invert and not test_result):
+        if invert ^ test_result:
+            if not subfeatures:
+                feature_copy = copy.deepcopy(feature)
+                feature_copy.sub_features = list()
+                yield feature_copy
+            else:
+                yield feature
+
+        if recurse and hasattr(feature, "sub_features"):
+            for x in feature_lambda(
+                feature.sub_features,
+                test,
+                test_kwargs,
+                subfeatures=subfeatures,
+                parent=feature,
+                invert=invert,
+                recurse=recurse,
+            ):
+                yield x
+
+
+def fetchParent(feature):
+    if not hasattr(feature, "_parent") or feature._parent is None:
+        return feature
+    else:
+        return fetchParent(feature._parent)
+
+
+def feature_test_true(feature, **kwargs):
+    return True
+
+
+def feature_test_type(feature, **kwargs):
+    if "type" in kwargs:
+        return str(feature.type).upper() == str(kwargs["type"]).upper()
+    elif "types" in kwargs:
+        for x in kwargs["types"]:
+            if str(feature.type).upper() == str(x).upper():
+                return True
+        return False
+    raise Exception("Incorrect feature_test_type call, need type or types")
+
+
+def feature_test_qual_value(feature, **kwargs):
+    """Test qualifier values.
+
+    For every feature, check that at least one value in
+    feature.qualifiers(kwargs['qualifier']) is in kwargs['attribute_list']
+    """
+    if isinstance(kwargs["qualifier"], list):
+        for qualifier in kwargs["qualifier"]:
+            for attribute_value in feature.qualifiers.get(qualifier, []):
+                if attribute_value in kwargs["attribute_list"]:
+                    return True
+    else:
+        for attribute_value in feature.qualifiers.get(kwargs["qualifier"], []):
+            if attribute_value in kwargs["attribute_list"]:
+                return True
+    return False
+
+
+def feature_test_location(feature, **kwargs):
+    if "strand" in kwargs:
+        if feature.location.strand != kwargs["strand"]:
+            return False
+
+    return feature.location.start <= kwargs["loc"] <= feature.location.end
+
+
+def feature_test_quals(feature, **kwargs):
+    """
+    Example::
+
+        a = Feature(qualifiers={'Note': ['Some notes', 'Aasdf']})
+
+        # Check if a contains a Note
+        feature_test_quals(a, {'Note': None})  # Returns True
+        feature_test_quals(a, {'Product': None})  # Returns False
+
+        # Check if a contains a note with specific value
+        feature_test_quals(a, {'Note': ['ome']})  # Returns True
+
+        # Check if a contains a note with specific value
+        feature_test_quals(a, {'Note': ['other']})  # Returns False
+    """
+    for key in kwargs:
+        if key not in feature.qualifiers:
+            return False
+
+        # Key is present, no value specified
+        if kwargs[key] is None:
+            return True
+
+        # Otherwise there is a key value we're looking for.
+        # so we make a list of matches
+        matches = []
+        # And check all of the feature qualifier values
+        for value in feature.qualifiers[key]:
+            # For that kwargs[key] value
+            for x in kwargs[key]:
+                matches.append(x in value)
+
+        # If none matched, then we return false.
+        if not any(matches):
+            return False
+
+    return True
+
+
+def feature_test_contains(feature, **kwargs):
+    if "index" in kwargs:
+        return feature.location.start < kwargs["index"] < feature.location.end
+    elif "range" in kwargs:
+        return (
+            feature.location.start < kwargs["range"]["start"] < feature.location.end
+            and feature.location.start < kwargs["range"]["end"] < feature.location.end
+        )
+    else:
+        raise RuntimeError("Must use index or range keyword")
+
+
+def get_id(feature=None, parent_prefix=None):
+    result = ""
+    if parent_prefix is not None:
+        result += parent_prefix + "|"
+    if "locus_tag" in feature.qualifiers:
+        result += feature.qualifiers["locus_tag"][0]
+    elif "gene" in feature.qualifiers:
+        result += feature.qualifiers["gene"][0]
+    elif "Gene" in feature.qualifiers:
+        result += feature.qualifiers["Gene"][0]
+    elif "product" in feature.qualifiers:
+        result += feature.qualifiers["product"][0]
+    elif "Product" in feature.qualifiers:
+        result += feature.qualifiers["Product"][0]
+    elif "Name" in feature.qualifiers:
+        result += feature.qualifiers["Name"][0]
+    else:
+        return feature.id
+        # Leaving in case bad things happen.
+        # result += '%s_%s_%s_%s' % (
+        # feature.id,
+        # feature.location.start,
+        # feature.location.end,
+        # feature.location.strand
+        # )
+    return result
+
+
+def get_gff3_id(gene):
+    return gene.qualifiers.get("Name", [gene.id])[0]
+
+
+def ensure_location_in_bounds(start=0, end=0, parent_length=0):
+    # This prevents frameshift errors
+    while start < 0:
+        start += 3
+    while end < 0:
+        end += 3
+    while start > parent_length:
+        start -= 3
+    while end > parent_length:
+        end -= 3
+    return (start, end)
+
+
+def coding_genes(feature_list):
+    for x in genes(feature_list):
+        if (
+            len(
+                list(
+                    feature_lambda(
+                        x.sub_features,
+                        feature_test_type,
+                        {"type": "CDS"},
+                        subfeatures=False,
+                    )
+                )
+            )
+            > 0
+        ):
+            yield x
+
+
+def genes(feature_list, feature_type="gene", sort=False):
+    """
+    Simple filter to extract gene features from the feature set.
+    """
+
+    if not sort:
+        for x in feature_lambda(
+            feature_list, feature_test_type, {"type": feature_type}, subfeatures=True
+        ):
+            yield x
+    else:
+        data = list(genes(feature_list, feature_type=feature_type, sort=False))
+        data = sorted(data, key=lambda feature: feature.location.start)
+        for x in data:
+            yield x
+
+
+def wa_unified_product_name(feature):
+    """
+    Try and figure out a name. We gave conflicting instructions, so
+    this isn't as trivial as it should be. Sometimes it will be in
+    'product' or 'Product', other times in 'Name'
+    """
+    # Manually applied tags.
+    protein_product = feature.qualifiers.get(
+        "product", feature.qualifiers.get("Product", [None])
+    )[0]
+
+    # If neither of those are available ...
+    if protein_product is None:
+        # And there's a name...
+        if "Name" in feature.qualifiers:
+            if not is_uuid(feature.qualifiers["Name"][0]):
+                protein_product = feature.qualifiers["Name"][0]
+
+    return protein_product
+
+
+def is_uuid(name):
+    return name.count("-") == 4 and len(name) == 36
+
+
+def get_rbs_from(gene):
+    # Normal RBS annotation types
+    rbs_rbs = list(
+        feature_lambda(
+            gene.sub_features, feature_test_type, {"type": "RBS"}, subfeatures=False
+        )
+    )
+    rbs_sds = list(
+        feature_lambda(
+            gene.sub_features,
+            feature_test_type,
+            {"type": "Shine_Dalgarno_sequence"},
+            subfeatures=False,
+        )
+    )
+    # Fraking apollo
+    apollo_exons = list(
+        feature_lambda(
+            gene.sub_features, feature_test_type, {"type": "exon"}, subfeatures=False
+        )
+    )
+    apollo_exons = [x for x in apollo_exons if len(x) < 10]
+    # These are more NCBI's style
+    regulatory_elements = list(
+        feature_lambda(
+            gene.sub_features,
+            feature_test_type,
+            {"type": "regulatory"},
+            subfeatures=False,
+        )
+    )
+    rbs_regulatory = list(
+        feature_lambda(
+            regulatory_elements,
+            feature_test_quals,
+            {"regulatory_class": ["ribosome_binding_site"]},
+            subfeatures=False,
+        )
+    )
+    # Here's hoping you find just one ;)
+    return rbs_rbs + rbs_sds + rbs_regulatory + apollo_exons
+
+
+def nice_name(record):
+    """
+    Get the real name rather than NCBI IDs and so on. If it fails, record.id is returned.
+    """
+    name = record.id
+    likely_parental_contig = list(genes(record.features, feature_type="contig"))
+    if len(likely_parental_contig) == 1:
+        name = likely_parental_contig[0].qualifiers.get("organism", [name])[0]
+    return name
+
+
+def fsort(it):
+    for i in sorted(it, key=lambda x: int(x.location.start)):
+        yield i
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Mon Jun 05 02:40:58 2023 +0000
@@ -0,0 +1,74 @@
+<macros>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package">progressivemauve</requirement>
+            <!--<requirement type="package" version="2.7">python</requirement>-->
+            <requirement type="package" version="0.6.4">bcbiogff</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <token name="@WRAPPER_VERSION@">2.4.0</token>
+    <xml name="citation/progressive_mauve">
+        <citation type="doi">10.1371/journal.pone.0011147</citation>
+    </xml>
+    <xml name="citation/gepard">
+        <citation type="doi">10.1093/bioinformatics/btm039</citation>
+    </xml>
+    <token name="@XMFA_INPUT@">
+		'$xmfa'
+	</token>
+    <xml name="xmfa_input" token_formats="xmfa">
+        <param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA"/>
+    </xml>
+    <token name="@XMFA_FA_INPUT@">
+		'$sequences'
+	</token>
+    <xml name="xmfa_fa_input">
+        <param type="data" format="fasta" name="sequences" label="Sequences in alignment" help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. Failing that, they should be provided in the same order as in original progressiveMauve run"/>
+    </xml>
+    <xml name="genome_selector">
+        <conditional name="reference_genome">
+            <param name="reference_genome_source" type="select" label="Reference Genome">
+                <option value="history" selected="True">From History</option>
+                <option value="cached">Locally Cached</option>
+            </param>
+            <when value="cached">
+                <param name="fasta_indexes" type="select" label="Source FASTA Sequence">
+                    <options from_data_table="all_fasta"/>
+                </param>
+            </when>
+            <when value="history">
+                <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
+            </when>
+        </conditional>
+    </xml>
+    <xml name="gff3_input">
+        <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
+    </xml>
+    <xml name="input/gff3+fasta">
+        <expand macro="gff3_input"/>
+        <expand macro="genome_selector"/>
+    </xml>
+    <token name="@INPUT_GFF@">
+	    '$gff3_data'
+	</token>
+    <token name="@INPUT_FASTA@">
+    #if str($reference_genome.reference_genome_source) == 'cached':
+            '${reference_genome.fasta_indexes.fields.path}'
+    #else if str($reference_genome.reference_genome_source) == 'history':
+            genomeref.fa
+    #end if
+	</token>
+    <token name="@GENOME_SELECTOR_PRE@">
+    #if $reference_genome.reference_genome_source == 'history':
+            ln -s '$reference_genome.genome_fasta' genomeref.fa;
+    #end if
+	</token>
+    <token name="@GENOME_SELECTOR@">
+    #if str($reference_genome.reference_genome_source) == 'cached':
+            '${reference_genome.fasta_indexes.fields.path}'
+    #else if str($reference_genome.reference_genome_source) == 'history':
+            genomeref.fa
+    #end if
+	</token>
+</macros>