Mercurial > repos > cpt > cpt_prep_for_apollo

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gff_apollo_prep/cpt-macros.xml	Fri May 13 04:55:55 2022 +0000
@@ -0,0 +1,115 @@
+<?xml version="1.0"?>
+<macros>
+	<xml name="gff_requirements">
+		<requirements>
+			<requirement type="package" version="2.7">python</requirement>
+			<requirement type="package" version="1.65">biopython</requirement>
+			<requirement type="package" version="2.12.1">requests</requirement>
+			<yield/>
+		</requirements>
+		<version_command>
+		<![CDATA[
+			cd $__tool_directory__ && git rev-parse HEAD
+		]]>
+		</version_command>
+	</xml>
+	<xml name="citation/mijalisrasche">
+		<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+		<citation type="bibtex">@unpublished{galaxyTools,
+		author = {E. Mijalis, H. Rasche},
+		title = {CPT Galaxy Tools},
+		year = {2013-2017},
+		note = {https://github.com/tamu-cpt/galaxy-tools/}
+		}
+		</citation>
+	</xml>
+	<xml name="citations">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+		<yield/>
+		</citations>
+	</xml>
+    	<xml name="citations-crr">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Ross},
+				title = {CPT Galaxy Tools},
+				year = {2020-},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+		<yield/>
+		</citations>
+	</xml>
+        <xml name="citations-2020">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="citations-2020-AJC-solo">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+                        <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="citations-clm">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="sl-citations-clm">
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <yield/>
+	</xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gff_apollo_prep/gff3.py	Fri May 13 04:55:55 2022 +0000
@@ -0,0 +1,346 @@
+import copy
+import logging
+
+# NOTE(review): getLogger() called without a name returns the root logger, so
+# importing this module sets the root logger's level to WARN -- confirm that
+# this process-wide effect is intended.
+log = logging.getLogger()
+log.setLevel(logging.WARN)
+
+
+def feature_lambda(
+    feature_list,
+    test,
+    test_kwargs,
+    subfeatures=True,
+    parent=None,
+    invert=False,
+    recurse=True,
+):
+    """Recursively search through features, testing each with a test function, yielding matches.
+
+    GFF3 is a hierachical data structure, so we need to be able to recursively
+    search through features. E.g. if you're looking for a feature with
+    ID='bob.42', you can't just do a simple list comprehension with a test
+    case. You don't know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in.
+
+    :type feature_list: list
+    :param feature_list: an iterable of features
+
+    :type test: function reference
+    :param test: a closure with the method signature (feature, **kwargs) where
+                 the kwargs are those passed in the next argument. This
+                 function should return True or False, True if the feature is
+                 to be yielded as part of the main feature_lambda function, or
+                 False if it is to be ignored. This function CAN mutate the
+                 features passed to it (think "apply").
+
+    :type test_kwargs: dictionary
+    :param test_kwargs: kwargs to pass to your closure when it is called.
+
+    :type subfeatures: boolean
+    :param subfeatures: when a feature is matched, should just that feature be
+                        yielded to the caller, or should the entire sub_feature
+                        tree for that feature be included? subfeatures=True is
+                        useful in cases such as searching for a gene feature,
+                        and wanting to know what RBS/Shine_Dalgarno_sequences
+                        are in the sub_feature tree (which can be accomplished
+                        with two feature_lambda calls). subfeatures=False is
+                        useful in cases when you want to process (and possibly
+                        return) the entire feature tree, such as applying a
+                        qualifier to every single feature.
+
+    :type invert: boolean
+    :param invert: Negate/invert the result of the filter.
+
+    :rtype: yielded list
+    :return: Yields a list of matching features.
+    """
+    # Either the top level set of [features] or the subfeature attribute
+    for feature in feature_list:
+        feature._parent = parent
+        if not parent:
+            # Set to self so we cannot go above root.
+            feature._parent = feature
+        test_result = test(feature, **test_kwargs)
+        # if (not invert and test_result) or (invert and not test_result):
+        if invert ^ test_result:
+            if not subfeatures:
+                feature_copy = copy.deepcopy(feature)
+                feature_copy.sub_features = list()
+                yield feature_copy
+            else:
+                yield feature
+
+        if recurse and hasattr(feature, "sub_features"):
+            for x in feature_lambda(
+                feature.sub_features,
+                test,
+                test_kwargs,
+                subfeatures=subfeatures,
+                parent=feature,
+                invert=invert,
+                recurse=recurse,
+            ):
+                yield x
+
+
+def fetchParent(feature):
+    if not hasattr(feature, "_parent") or feature._parent is None:
+        return feature
+    else:
+        return fetchParent(feature._parent)
+
+
+def feature_test_true(feature, **kwargs):
+    """Test that accepts every feature; the keyword arguments are ignored."""
+    return True
+
+
+def feature_test_type(feature, **kwargs):
+    if "type" in kwargs:
+        return str(feature.type).upper() == str(kwargs["type"]).upper()
+    elif "types" in kwargs:
+      for x in kwargs["types"]:
+        if str(feature.type).upper() == str(x).upper():
+          return True
+      return False
+    raise Exception("Incorrect feature_test_type call, need type or types")
+
+
+def feature_test_qual_value(feature, **kwargs):
+    """Test qualifier values.
+
+    For every feature, check that at least one value in
+    feature.quailfiers(kwargs['qualifier']) is in kwargs['attribute_list']
+    """
+    if isinstance(kwargs["qualifier"], list):
+        for qualifier in kwargs["qualifier"]:
+            for attribute_value in feature.qualifiers.get(qualifier, []):
+                if attribute_value in kwargs["attribute_list"]:
+                    return True
+    else:
+        for attribute_value in feature.qualifiers.get(kwargs["qualifier"], []):
+            if attribute_value in kwargs["attribute_list"]:
+                return True
+    return False
+
+
+def feature_test_location(feature, **kwargs):
+    if "strand" in kwargs:
+        if feature.location.strand != kwargs["strand"]:
+            return False
+
+    return feature.location.start <= kwargs["loc"] <= feature.location.end
+
+
+def feature_test_quals(feature, **kwargs):
+    """
+    Example::
+
+        a = Feature(qualifiers={'Note': ['Some notes', 'Aasdf']})
+
+        # Check if a contains a Note
+        feature_test_quals(a, {'Note': None})  # Returns True
+        feature_test_quals(a, {'Product': None})  # Returns False
+
+        # Check if a contains a note with specific value
+        feature_test_quals(a, {'Note': ['ome']})  # Returns True
+
+        # Check if a contains a note with specific value
+        feature_test_quals(a, {'Note': ['other']})  # Returns False
+    """
+    for key in kwargs:
+        if key not in feature.qualifiers:
+            return False
+
+        # Key is present, no value specified
+        if kwargs[key] is None:
+            return True
+
+        # Otherwise there is a key value we're looking for.
+        # so we make a list of matches
+        matches = []
+        # And check all of the feature qualifier valuse
+        for value in feature.qualifiers[key]:
+            # For that kwargs[key] value
+            for x in kwargs[key]:
+                matches.append(x in value)
+
+        # If none matched, then we return false.
+        if not any(matches):
+            return False
+
+    return True
+
+
+def feature_test_contains(feature, **kwargs):
+    if "index" in kwargs:
+        return feature.location.start < kwargs["index"] < feature.location.end
+    elif "range" in kwargs:
+        return (
+            feature.location.start < kwargs["range"]["start"] < feature.location.end
+            and feature.location.start < kwargs["range"]["end"] < feature.location.end
+        )
+    else:
+        raise RuntimeError("Must use index or range keyword")
+
+
+def get_id(feature=None, parent_prefix=None):
+    result = ""
+    if parent_prefix is not None:
+        result += parent_prefix + "|"
+    if "locus_tag" in feature.qualifiers:
+        result += feature.qualifiers["locus_tag"][0]
+    elif "gene" in feature.qualifiers:
+        result += feature.qualifiers["gene"][0]
+    elif "Gene" in feature.qualifiers:
+        result += feature.qualifiers["Gene"][0]
+    elif "product" in feature.qualifiers:
+        result += feature.qualifiers["product"][0]
+    elif "Product" in feature.qualifiers:
+        result += feature.qualifiers["Product"][0]
+    elif "Name" in feature.qualifiers:
+        result += feature.qualifiers["Name"][0]
+    else:
+        return feature.id
+        # Leaving in case bad things happen.
+        # result += '%s_%s_%s_%s' % (
+        # feature.id,
+        # feature.location.start,
+        # feature.location.end,
+        # feature.location.strand
+        # )
+    return result
+
+
+def get_gff3_id(gene):
+    return gene.qualifiers.get("Name", [gene.id])[0]
+
+
+def ensure_location_in_bounds(start=0, end=0, parent_length=0):
+    # This prevents frameshift errors
+    while start < 0:
+        start += 3
+    while end < 0:
+        end += 3
+    while start > parent_length:
+        start -= 3
+    while end > parent_length:
+        end -= 3
+    return (start, end)
+
+
+def coding_genes(feature_list):
+    for x in genes(feature_list):
+        if (
+            len(
+                list(
+                    feature_lambda(
+                        x.sub_features,
+                        feature_test_type,
+                        {"type": "CDS"},
+                        subfeatures=False,
+                    )
+                )
+            )
+            > 0
+        ):
+            yield x
+
+
+def genes(feature_list, feature_type="gene", sort=False):
+    """
+    Simple filter to extract gene features from the feature set.
+    """
+
+    if not sort:
+        for x in feature_lambda(
+            feature_list, feature_test_type, {"type": feature_type}, subfeatures=True
+        ):
+            yield x
+    else:
+        data = list(genes(feature_list, feature_type=feature_type, sort=False))
+        data = sorted(data, key=lambda feature: feature.location.start)
+        for x in data:
+            yield x
+
+
+def wa_unified_product_name(feature):
+    """
+    Try and figure out a name. We gave conflicting instructions, so
+    this isn't as trivial as it should be. Sometimes it will be in
+    'product' or 'Product', othertimes in 'Name'
+    """
+    # Manually applied tags.
+    protein_product = feature.qualifiers.get(
+        "product", feature.qualifiers.get("Product", [None])
+    )[0]
+
+    # If neither of those are available ...
+    if protein_product is None:
+        # And there's a name...
+        if "Name" in feature.qualifiers:
+            if not is_uuid(feature.qualifiers["Name"][0]):
+                protein_product = feature.qualifiers["Name"][0]
+
+    return protein_product
+
+
+def is_uuid(name):
+    return name.count("-") == 4 and len(name) == 36
+
+
+def get_rbs_from(gene):
+    # Normal RBS annotation types
+    rbs_rbs = list(
+        feature_lambda(
+            gene.sub_features, feature_test_type, {"type": "RBS"}, subfeatures=False
+        )
+    )
+    rbs_sds = list(
+        feature_lambda(
+            gene.sub_features,
+            feature_test_type,
+            {"type": "Shine_Dalgarno_sequence"},
+            subfeatures=False,
+        )
+    )
+    # Fraking apollo
+    apollo_exons = list(
+        feature_lambda(
+            gene.sub_features, feature_test_type, {"type": "exon"}, subfeatures=False
+        )
+    )
+    apollo_exons = [x for x in apollo_exons if len(x) < 10]
+    # These are more NCBI's style
+    regulatory_elements = list(
+        feature_lambda(
+            gene.sub_features,
+            feature_test_type,
+            {"type": "regulatory"},
+            subfeatures=False,
+        )
+    )
+    rbs_regulatory = list(
+        feature_lambda(
+            regulatory_elements,
+            feature_test_quals,
+            {"regulatory_class": ["ribosome_binding_site"]},
+            subfeatures=False,
+        )
+    )
+    # Here's hoping you find just one ;)
+    return rbs_rbs + rbs_sds + rbs_regulatory + apollo_exons
+
+
+def nice_name(record):
+    """
+    get the real name rather than NCBI IDs and so on. If fails, will return record.id
+    """
+    name = record.id
+    likely_parental_contig = list(genes(record.features, feature_type="contig"))
+    if len(likely_parental_contig) == 1:
+        name = likely_parental_contig[0].qualifiers.get("organism", [name])[0]
+    return name
+
+
+def fsort(it):
+    for i in sorted(it, key=lambda x: int(x.location.start)):
+        yield i
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gff_apollo_prep/gff3_prep_for_apollo.py	Fri May 13 04:55:55 2022 +0000
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+import sys
+import logging
+import argparse
+import copy
+from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
+from gff3 import feature_lambda, feature_test_type
+from Bio.SeqFeature import FeatureLocation
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+
+ALLOWED_FEATURES = [
+        "mRNA",
+        "exon",
+        "transposable_element",
+        "tRNA",
+        "transcript",
+        "terminator",
+        "Shine_Dalgarno_Sequence",
+        "pseudogene",
+        "stop_codon_read_through",
+        "repeat_region",
+        "CDS",
+        "gene",
+        "rRNA",
+        "ncRNA",
+        "snRNA",
+        "snoRNA",
+        "miRNA",
+        ]
+
+SPECIAL_REMOVED_FEATURES = ["gene_component_region", "sequence_difference"]
+
+
+
+def add_exons(features):
+    for gene in feature_lambda(
+        features, feature_test_type, {"type": "gene"}, subfeatures=True
+    ):
+        clean_gene = copy.deepcopy(gene)
+        exon_start = None
+        exon_end = None
+        exon_strand = None
+        cds_list = []
+
+        #for mRNA in gene.sub_features:
+        #    for x in mRNA.sub_features:
+        #        x.qualifiers["Parent"] = [gene.id]
+        #        gene.sub_features.append(x)
+
+        for exon in feature_lambda(gene.sub_features, feature_test_type, {"type": "exon"}, subfeatures=False,recurse=False):
+            #if the gene contains an exon, skip.
+            continue
+        hasMRNA = False
+        for x in gene.sub_features:
+          if x.type == "mRNA":
+            hasMRNA = True
+            mRNA = x
+        """
+        if not hasMRNA:
+          mRNA = gffSeqFeature(
+                   location=FeatureLocation(gene.location.start, gene.location.end, gene.location.strand),
+                   type="mRNA",
+                   source = "cpt.prepApollo",
+                   qualifiers={
+                       "ID": ["%s.mRNA" % clean_gene.qualifiers["ID"][0]],
+                       "Parent": clean_gene.qualifiers["ID"],
+                   },
+                   sub_features=gene.sub_features,
+                   strand=exon_strand
+                 )
+          for x in mRNA.sub_features:
+            x.qualifiers["Parent"] = mRNA["ID"]
+          clean_gene.sub_features = [mRNA]
+        else:
+          for x in clean_gene.sub_features:
+            if x.type != "mRNA":
+              x.qualifiers["Parent"] = [mRNA.id] """
+
+        # check for CDS child features of the gene, do not go a further step (this should skip any CDS children of exon child features)
+        for cds in feature_lambda(
+            gene.sub_features,
+            feature_test_type,
+            {"type": "CDS"},
+            subfeatures=False,
+            recurse=False,
+            ):
+            # check all CDS features for min/max boundaries
+            if exon_start is None:
+                exon_start = cds.location.start
+                exon_strand = cds.location.strand
+            if exon_end is None:
+                exon_end = cds.location.end
+            exon_start = min(exon_start, cds.location.start)
+            exon_end = max(exon_end, cds.location.end)
+            cds_list.append(cds)
+        if cds_list:
+            # we found a CDS to adopt
+            new_exon = gffSeqFeature(
+                location=FeatureLocation(exon_start, exon_end),
+                type="exon",
+                source = "cpt.prepApollo",
+                qualifiers={
+                    "ID": ["%s.exon" % clean_gene.qualifiers["ID"][0]],
+                    "Parent": [clean_gene.id],
+                    "ApolloExon": ["True"],
+                },
+                sub_features=[],
+                strand=exon_strand
+            )
+            for cds in cds_list:
+                cds.qualifiers["Parent"] = new_exon.qualifiers["ID"]
+                new_exon.sub_features.append(cds)
+            #gene.sub_features.append(new_exon)
+            # get all the other children of gene that AREN'T a CDS including the new exon
+            clean_gene.sub_features.append(copy.deepcopy(new_exon))
+            #clean_gene.sub_features.append(gffSeqFeature(location=FeatureLocation(exon_start, exon_end, exon_strand), type="exon", source = "cpt.prepApollo", qualifiers={"ID": ["%s.exon" % clean_gene.qualifiers["ID"][0]], "Parent": clean_gene.qualifiers["ID"]}, sub_features=[], strand=exon_strand))
+            """
+            for sf in feature_lambda(
+                gene.sub_features,
+                feature_test_type,
+                {"type": "CDS"},
+                subfeatures=True,
+                recurse=False,
+                invert=True,
+            ):
+                child = copy.deepcopy(sf)
+                child.qualifiers["Parent"] = new_exon.qualifiers["ID"]
+                clean_gene.sub_features.append(child)
+            """
+            # add them to the new Exon feature
+        # return the cleaned gene with new exon
+        yield clean_gene
+
+def process_features(features):
+    # change RBS to 'Shine_Dalgarno_sequence'
+    for rbs in feature_lambda(features, feature_test_type, {'type': "RBS"}):
+        rbs.type = "Shine_Dalgarno_sequence"
+
+    # Filter top level features
+    for feature in feature_lambda(features, feature_test_type, {"types": ALLOWED_FEATURES}, subfeatures=True):
+        cleaned_subfeatures = []
+        for sf in feature.sub_features:
+            if sf.type in SPECIAL_REMOVED_FEATURES:
+                # 'gene_component_region' is uncaught by feature_test_type as it contains `gene`
+                continue
+            else:
+                cleaned_subfeatures.append(sf)
+        feature.sub_features = copy.deepcopy(cleaned_subfeatures)
+        yield feature
+
+def gff_filter(gff3):
+    for rec in gffParse(gff3):
+        cleaned_features = sorted(list(process_features(rec.features)), key=lambda x: x.location.start)
+        rec.features = sorted(list(add_exons(cleaned_features)), key=lambda x: x.location.start)
+        rec.annotations = {}
+        gffWrite([rec], sys.stdout)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="add parent exon features to CDSs for Apollo"
+    )
+    parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 annotations")
+    args = parser.parse_args()
+    gff_filter(**vars(args))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gff_apollo_prep/gff3_prep_for_apollo.xml	Fri May 13 04:55:55 2022 +0000
@@ -0,0 +1,40 @@
+<?xml version="1.0"?>
+<tool id="edu.tamu.cpt.gff3.prepForApollo" name="Prep GFF3 Input for Apollo" version="20.8.0.0">
+  <description>by ensuring that CDS features have a wrapping exon feature</description>
+  <macros>
+    <import>macros.xml</import>
+    <import>cpt-macros.xml</import>
+  </macros>
+  <expand macro="requirements"/>
+  <command detect_errors="aggressive"><![CDATA[
+$__tool_directory__/gff3_prep_for_apollo.py
+@INPUT_GFF@
+> $output]]></command>
+  <inputs>
+      <expand macro="gff3_input" />
+  </inputs>
+  <outputs>
+    <data format="gff3" name="output"/>
+  </outputs>
+  <tests>
+                <test>
+			<param name="gff3_data" value="ApolloPrep_In.gff3" />
+			<output name="output" value="ApolloPrep_Out.gff3" />
+		</test>
+  </tests>
+  <help><![CDATA[
+**What it does**
+
+This tool updates the gene model in a GFF3 so that it can be added into Apollo
+and be used to promote annotations with the correct CDS calculation. It finds any
+CDS feature whose direct parent is a gene feature, and creates an exon feature
+that is the child of the gene feature and the parent of the CDS. If a gene has
+multiple CDS features, they will be wrapped under the same exon feature.
+
+All other features present in the GFF under the gene will be kept the same.
+
+Warning: Use this tool only when it is absolutely necessary to fix a gene model.
+
+      ]]></help>
+		<expand macro="citations-clm" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gff_apollo_prep/macros.xml	Fri May 13 04:55:55 2022 +0000
@@ -0,0 +1,85 @@
+<?xml version="1.0"?>
+<macros>
+	<xml name="requirements">
+		<requirements>
+			<requirement type="package" version="3.6">python</requirement>
+			<requirement type="package" version="1.77">biopython</requirement>
+			<requirement type="package" version="1.1.3">cpt_gffparser</requirement>
+			<yield/>
+		</requirements>
+	</xml>
+	<token name="@BLAST_TSV@">
+		"$blast_tsv"
+	</token>
+	<xml name="blast_tsv">
+		<param label="Blast Results" help="TSV/tabular (25 Column)"
+			name="blast_tsv" type="data" format="tabular" />
+	</xml>
+
+	<token name="@BLAST_XML@">
+		"$blast_xml"
+	</token>
+	<xml name="blast_xml">
+		<param label="Blast Results" help="XML format"
+			name="blast_xml" type="data" format="blastxml" />
+	</xml>
+	<xml name="gff3_with_fasta">
+	<param label="Genome Sequences" name="fasta" type="data" format="fasta" />
+	<param label="Genome Annotations" name="gff3" type="data" format="gff3" />
+	</xml>
+	<xml name="genome_selector">
+		<conditional name="reference_genome">
+			<param name="reference_genome_source" type="select" label="Reference Genome">
+				<option value="history" selected="True">From History</option>
+				<option value="cached">Locally Cached</option>
+			</param>
+			<when value="cached">
+				<param name="fasta_indexes" type="select" label="Source FASTA Sequence">
+					<options from_data_table="all_fasta"/>
+				</param>
+			</when>
+			<when value="history">
+				<param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
+			</when>
+		</conditional>
+	</xml>
+	<xml name="gff3_input">
+		<param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
+	</xml>
+	<xml name="input/gff3+fasta">
+		<expand macro="gff3_input" />
+		<expand macro="genome_selector" />
+	</xml>
+	<token name="@INPUT_GFF@">
+	"$gff3_data"
+	</token>
+	<token name="@INPUT_FASTA@">
+#if str($reference_genome.reference_genome_source) == 'cached':
+		"${reference_genome.fasta_indexes.fields.path}"
+#else if str($reference_genome.reference_genome_source) == 'history':
+		genomeref.fa
+#end if
+	</token>
+	<token name="@GENOME_SELECTOR_PRE@">
+#if $reference_genome.reference_genome_source == 'history':
+		ln -s $reference_genome.genome_fasta genomeref.fa;
+#end if
+	</token>
+	<token name="@GENOME_SELECTOR@">
+#if str($reference_genome.reference_genome_source) == 'cached':
+		"${reference_genome.fasta_indexes.fields.path}"
+#else if str($reference_genome.reference_genome_source) == 'history':
+		genomeref.fa
+#end if
+	</token>
+        <xml name="input/fasta">
+		<param label="Fasta file" name="sequences" type="data" format="fasta"/>
+	</xml>
+
+	<token name="@SEQUENCE@">
+		"$sequences"
+	</token>
+	<xml name="input/fasta/protein">
+		<param label="Protein fasta file" name="sequences" type="data" format="fasta"/>
+	</xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gff_apollo_prep/test-data/ApolloPrep_In.gff3	Fri May 13 04:55:55 2022 +0000
@@ -0,0 +1,14 @@
+##gff-version 3
+##sequence-region testseq_2018-03-08 1 9216
+testseq_2018-03-08	feature	gene	154	297	.	-	.	ID=testseq_2018-03-08.gene_1;source2=MGA;uniqueID=offset-159323
+testseq_2018-03-08	feature	CDS	154	297	.	-	0	ID=testseq_2018-03-08.cds_gene_1;Parent=testseq_2018-03-08.gene_1;source2=MGA;uniqueID=offset-159409
+testseq_2018-03-08	feature	gene	314	507	.	-	.	ID=testseq_2018-03-08.gene_2;source2=MGA;uniqueID=offset-159674
+testseq_2018-03-08	feature	CDS	314	490	.	-	0	ID=testseq_2018-03-08.cds_gene_2;Parent=testseq_2018-03-08.gene_2;source2=MGA;uniqueID=offset-159760
+testseq_2018-03-08	CPT_ShineFind	Shine_Dalgarno_sequence	504	507	.	-	.	ID=testseq_2018-03-08.cds_gene_2.rbs-0;Parent=testseq_2018-03-08.gene_2;uniqueID=offset-160090
+testseq_2018-03-08	feature	gene	487	670	.	-	.	ID=testseq_2018-03-08.gene_3;source2=MGA;uniqueID=offset-159882
+testseq_2018-03-08	feature	exon	487	657	.	-	0	ID=testseq_2018-03-08.exon_gene_3;Parent=testseq_2018-03-08.gene_3;source2=MGA;uniqueID=offset-159968
+testseq_2018-03-08	feature	CDS	487	657	.	-	0	ID=testseq_2018-03-08.cds_gene_3;Parent=testseq_2018-03-08.exon_gene_3;source2=MGA;uniqueID=offset-159969
+testseq_2018-03-08	CPT_ShineFind	Shine_Dalgarno_sequence	665	670	.	-	.	ID=testseq_2018-03-08.cds_gene_3.rbs-0;Parent=testseq_2018-03-08.gene_3;uniqueID=offset-160441
+testseq_2018-03-08	feature	gene	700	900	.	-	.	ID=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159874
+testseq_2018-03-08	feature	CDS	700	790	.	-	0	ID=testseq_2018-03-08.cds_gene_4;Parent=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159765
+testseq_2018-03-08	feature	CDS	820	900	.	-	0	ID=testseq_2018-03-08.cds2_gene_4;Parent=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159762
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gff_apollo_prep/test-data/ApolloPrep_Out.gff3	Fri May 13 04:55:55 2022 +0000
@@ -0,0 +1,20 @@
+##gff-version 3
+testseq_2018-03-08	feature	gene	154	297	.	-	.	ID=testseq_2018-03-08.gene_1;source2=MGA;uniqueID=offset-159323;
+testseq_2018-03-08	feature	CDS	154	297	.	-	0	ID=testseq_2018-03-08.cds_gene_1;Parent=testseq_2018-03-08.gene_1;source2=MGA;uniqueID=offset-159409;
+testseq_2018-03-08	cpt.prepApollo	exon	154	297	.	-	.	ID=testseq_2018-03-08.gene_1.exon;Parent=testseq_2018-03-08.gene_1;ApolloExon=True;
+testseq_2018-03-08	feature	CDS	154	297	.	-	0	ID=testseq_2018-03-08.cds_gene_1;Parent=testseq_2018-03-08.gene_1.exon;source2=MGA;uniqueID=offset-159409;
+testseq_2018-03-08	feature	gene	314	507	.	-	.	ID=testseq_2018-03-08.gene_2;source2=MGA;uniqueID=offset-159674;
+testseq_2018-03-08	feature	CDS	314	490	.	-	0	ID=testseq_2018-03-08.cds_gene_2;Parent=testseq_2018-03-08.gene_2;source2=MGA;uniqueID=offset-159760;
+testseq_2018-03-08	CPT_ShineFind	Shine_Dalgarno_sequence	504	507	.	-	.	ID=testseq_2018-03-08.cds_gene_2.rbs-0;Parent=testseq_2018-03-08.gene_2;uniqueID=offset-160090;
+testseq_2018-03-08	cpt.prepApollo	exon	314	490	.	-	.	ID=testseq_2018-03-08.gene_2.exon;Parent=testseq_2018-03-08.gene_2;ApolloExon=True;
+testseq_2018-03-08	feature	CDS	314	490	.	-	0	ID=testseq_2018-03-08.cds_gene_2;Parent=testseq_2018-03-08.gene_2.exon;source2=MGA;uniqueID=offset-159760;
+testseq_2018-03-08	feature	gene	487	670	.	-	.	ID=testseq_2018-03-08.gene_3;source2=MGA;uniqueID=offset-159882;
+testseq_2018-03-08	feature	exon	487	657	.	-	.	ID=testseq_2018-03-08.exon_gene_3;Parent=testseq_2018-03-08.gene_3;source2=MGA;uniqueID=offset-159968;
+testseq_2018-03-08	feature	CDS	487	657	.	-	0	ID=testseq_2018-03-08.cds_gene_3;Parent=testseq_2018-03-08.exon_gene_3;source2=MGA;uniqueID=offset-159969;
+testseq_2018-03-08	CPT_ShineFind	Shine_Dalgarno_sequence	665	670	.	-	.	ID=testseq_2018-03-08.cds_gene_3.rbs-0;Parent=testseq_2018-03-08.gene_3;uniqueID=offset-160441;
+testseq_2018-03-08	feature	gene	700	900	.	-	.	ID=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159874;
+testseq_2018-03-08	feature	CDS	700	790	.	-	0	ID=testseq_2018-03-08.cds_gene_4;Parent=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159765;
+testseq_2018-03-08	feature	CDS	820	900	.	-	0	ID=testseq_2018-03-08.cds2_gene_4;Parent=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159762;
+testseq_2018-03-08	cpt.prepApollo	exon	700	900	.	-	.	ID=testseq_2018-03-08.gene_4.exon;Parent=testseq_2018-03-08.gene_4;ApolloExon=True;
+testseq_2018-03-08	feature	CDS	700	790	.	-	0	ID=testseq_2018-03-08.cds_gene_4;Parent=testseq_2018-03-08.gene_4.exon;source2=MGA;uniqueID=offset-159765;
+testseq_2018-03-08	feature	CDS	820	900	.	-	0	ID=testseq_2018-03-08.cds2_gene_4;Parent=testseq_2018-03-08.gene_4.exon;source2=MGA;uniqueID=offset-159762;