# HG changeset patch
# User cpt
# Date 1652417755 0
# Node ID eb0c42719156ffe82382e8b3c0c4bb789487c740
Uploaded
diff -r 000000000000 -r eb0c42719156 cpt_gff_apollo_prep/cpt-macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gff_apollo_prep/cpt-macros.xml Fri May 13 04:55:55 2022 +0000
@@ -0,0 +1,115 @@
+
+
+
+
+ python
+ biopython
+ requests
+
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Ross},
+ title = {CPT Galaxy Tools},
+ year = {2020-},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
diff -r 000000000000 -r eb0c42719156 cpt_gff_apollo_prep/gff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gff_apollo_prep/gff3.py Fri May 13 04:55:55 2022 +0000
@@ -0,0 +1,346 @@
+import copy
+import logging
+
+# logging.getLogger() called without a name returns the root logger, so the
+# level set on the next line applies to the root logger rather than to a
+# logger private to this module.
+log = logging.getLogger()
+log.setLevel(logging.WARN)
+
+
+def feature_lambda(
+    feature_list,
+    test,
+    test_kwargs,
+    subfeatures=True,
+    parent=None,
+    invert=False,
+    recurse=True,
+):
+    """Recursively search through features, yielding those that pass a test.
+
+    GFF3 is a hierarchical data structure, so a plain list comprehension
+    over the top-level features cannot find a feature that is nested inside
+    another one. This generator walks ``feature_list`` and, for every feature
+    that has a ``sub_features`` attribute, its descendants as well.
+
+    As a side effect every visited feature gets a ``_parent`` attribute: the
+    feature it was reached from, or the feature itself for top-level entries.
+
+    :type feature_list: list
+    :param feature_list: an iterable of features
+
+    :type test: function reference
+    :param test: a callable with the signature (feature, **kwargs) where the
+                 kwargs are those passed in the next argument. Its result is
+                 interpreted by truthiness: truthy means the feature matches.
+                 This function CAN mutate the features passed to it (think
+                 "apply").
+
+    :type test_kwargs: dictionary
+    :param test_kwargs: kwargs to pass to ``test`` when it is called.
+
+    :type subfeatures: boolean
+    :param subfeatures: when True, a matching feature is yielded as-is (with
+                        its sub_features tree attached). When False, a deep
+                        copy of the feature with an empty ``sub_features``
+                        list is yielded instead, leaving the original intact.
+
+    :type parent: feature or None
+    :param parent: the feature that owns ``feature_list``; used internally
+                   during recursion. Leave as None for a top-level call.
+
+    :type invert: boolean
+    :param invert: Negate/invert the result of the filter.
+
+    :type recurse: boolean
+    :param recurse: when True (default) descend into ``sub_features``; when
+                    False only the entries of ``feature_list`` are tested.
+
+    :rtype: generator
+    :return: Yields matching features, parents before their descendants.
+    """
+    for feature in feature_list:
+        # A top-level feature is recorded as its own parent so that walking
+        # up the tree can never go above the root.
+        feature._parent = feature if parent is None else parent
+
+        # Coerce to bool: a test that returns None, or a truthy non-bool such
+        # as 2, must not raise or be misread when combined with ``invert``.
+        matched = bool(test(feature, **test_kwargs))
+        if matched != bool(invert):
+            if subfeatures:
+                yield feature
+            else:
+                feature_copy = copy.deepcopy(feature)
+                feature_copy.sub_features = list()
+                yield feature_copy
+
+        # Descend regardless of whether this feature itself matched.
+        if recurse and hasattr(feature, "sub_features"):
+            for x in feature_lambda(
+                feature.sub_features,
+                test,
+                test_kwargs,
+                subfeatures=subfeatures,
+                parent=feature,
+                invert=invert,
+                recurse=recurse,
+            ):
+                yield x
+
+
+def fetchParent(feature):
+    """Return the top-most ancestor of ``feature`` by following ``_parent``.
+
+    The walk stops at a feature that has no ``_parent`` attribute, whose
+    ``_parent`` is None, or whose ``_parent`` is the feature itself.
+    feature_lambda marks top-level features as their own parent, so without
+    the self-reference check the walk would never terminate on such a tree.
+
+    :param feature: any object, optionally carrying a ``_parent`` attribute
+    :return: the root reached from ``feature`` (``feature`` itself when it
+             has no parent)
+    """
+    current = feature
+    while True:
+        parent = getattr(current, "_parent", None)
+        if parent is None or parent is current:
+            return current
+        current = parent
+
+
+def feature_test_true(feature, **kwargs):
+ """Test function that accepts every feature: always returns True and ignores its arguments."""
+ return True
+
+
+def feature_test_type(feature, **kwargs):
+    """Compare ``feature.type`` case-insensitively against the requested type(s).
+
+    Pass ``type`` for a single comparison or ``types`` for an iterable of
+    candidates; ``type`` wins when both are given.
+
+    :return: True when the feature's type equals the requested type, or any
+             of the requested types; False otherwise.
+    :raises Exception: when neither ``type`` nor ``types`` is supplied.
+    """
+
+    def _norm(value):
+        # Both sides are stringified and upper-cased before comparing.
+        return str(value).upper()
+
+    if "type" in kwargs:
+        return _norm(feature.type) == _norm(kwargs["type"])
+    if "types" not in kwargs:
+        raise Exception("Incorrect feature_test_type call, need type or types")
+    return any(
+        _norm(feature.type) == _norm(candidate) for candidate in kwargs["types"]
+    )
+
+
+def feature_test_qual_value(feature, **kwargs):
+    """Test qualifier values.
+
+    Returns True when at least one value stored under
+    ``feature.qualifiers[kwargs['qualifier']]`` is contained in
+    ``kwargs['attribute_list']``. ``kwargs['qualifier']`` may be a single
+    key or a list of keys; with a list, a hit under any key is enough.
+    Keys missing from the feature contribute no values.
+    """
+    requested = kwargs["qualifier"]
+    keys = requested if isinstance(requested, list) else [requested]
+    return any(
+        value in kwargs["attribute_list"]
+        for key in keys
+        for value in feature.qualifiers.get(key, [])
+    )
+
+
+def feature_test_location(feature, **kwargs):
+    """Test whether ``kwargs['loc']`` lies within the feature, bounds included.
+
+    When ``strand`` is supplied the feature's location strand must equal it,
+    otherwise the feature is rejected before the position is looked at.
+    """
+    if "strand" in kwargs and feature.location.strand != kwargs["strand"]:
+        return False
+
+    position = kwargs["loc"]
+    return feature.location.start <= position <= feature.location.end
+
+
+def feature_test_quals(feature, **kwargs):
+    """Test that a feature satisfies every requested qualifier condition.
+
+    Each keyword names a qualifier key. Its value is either None (the key
+    only has to be present) or an iterable of strings, at least one of which
+    must occur as a substring of at least one of the feature's values for
+    that key. All keywords must be satisfied for the test to pass; a
+    None-valued keyword no longer short-circuits the remaining checks.
+
+    Example::
+
+        a = Feature(qualifiers={'Note': ['Some notes', 'Aasdf']})
+
+        # Check if a contains a Note
+        feature_test_quals(a, **{'Note': None})  # Returns True
+        feature_test_quals(a, **{'Product': None})  # Returns False
+
+        # Check if a contains a note with specific value
+        feature_test_quals(a, **{'Note': ['ome']})  # Returns True
+
+        # Check if a contains a note with specific value
+        feature_test_quals(a, **{'Note': ['other']})  # Returns False
+
+    :return: True when every keyword is satisfied (also when no keywords are
+             given), False as soon as one is not.
+    """
+    for key, wanted in kwargs.items():
+        if key not in feature.qualifiers:
+            return False
+
+        # Key is present and no value was requested: this key is satisfied,
+        # but the remaining keys still have to be checked.
+        if wanted is None:
+            continue
+
+        # Otherwise at least one requested string must appear inside at
+        # least one of the feature's values for this key.
+        if not any(x in value for value in feature.qualifiers[key] for x in wanted):
+            return False
+
+    return True
+
+
+def feature_test_contains(feature, **kwargs):
+    """Test whether a position or a range lies strictly inside the feature.
+
+    Use ``index`` for a single position, or ``range`` for a mapping with
+    ``start`` and ``end`` keys, both of which must fall strictly between the
+    feature's start and end. ``index`` takes precedence when both are given.
+
+    :raises RuntimeError: when neither ``index`` nor ``range`` is supplied.
+    """
+
+    def _strictly_inside(position):
+        return feature.location.start < position < feature.location.end
+
+    if "index" in kwargs:
+        return _strictly_inside(kwargs["index"])
+    if "range" in kwargs:
+        return _strictly_inside(kwargs["range"]["start"]) and _strictly_inside(
+            kwargs["range"]["end"]
+        )
+    raise RuntimeError("Must use index or range keyword")
+
+
+def get_id(feature=None, parent_prefix=None):
+    """Build a display identifier for ``feature`` from its qualifiers.
+
+    The first qualifier present, in the order locus_tag, gene, Gene, product,
+    Product, Name, supplies the identifier (its first value). When
+    ``parent_prefix`` is given it is prepended followed by "|".
+
+    When none of those qualifiers exists, ``feature.id`` is returned on its
+    own, without the prefix.
+    """
+    prefix = "" if parent_prefix is None else parent_prefix + "|"
+    # Order matters: earlier keys take precedence over later ones.
+    for key in ("locus_tag", "gene", "Gene", "product", "Product", "Name"):
+        if key in feature.qualifiers:
+            return prefix + feature.qualifiers[key][0]
+    return feature.id
+
+
+def get_gff3_id(gene):
+    """Return the first ``Name`` qualifier value of ``gene``, or ``gene.id`` when it has no ``Name``."""
+    names = gene.qualifiers.get("Name", [gene.id])
+    return names[0]
+
+
+def ensure_location_in_bounds(start=0, end=0, parent_length=0):
+    """Shift ``start`` and ``end`` in steps of 3 toward the range [0, parent_length].
+
+    Each coordinate is first raised by 3 until it is no longer negative and
+    then lowered by 3 until it no longer exceeds ``parent_length``. Moving in
+    steps of 3 is what prevents frameshift errors.
+
+    :return: tuple ``(start, end)`` with the adjusted coordinates
+    """
+
+    def _shift(position):
+        while position < 0:
+            position += 3
+        while position > parent_length:
+            position -= 3
+        return position
+
+    return (_shift(start), _shift(end))
+
+
+def coding_genes(feature_list):
+    """Yield the gene features that have at least one CDS below them.
+
+    Genes come from genes(); for each one its sub_features are searched
+    with feature_lambda for features of type "CDS".
+    """
+    for gene in genes(feature_list):
+        cds_found = list(
+            feature_lambda(
+                gene.sub_features,
+                feature_test_type,
+                {"type": "CDS"},
+                subfeatures=False,
+            )
+        )
+        if cds_found:
+            yield gene
+
+
+def genes(feature_list, feature_type="gene", sort=False):
+    """
+    Simple filter yielding the features of ``feature_type`` (default "gene")
+    found anywhere in the feature set. With ``sort=True`` the matches are
+    collected first and yielded in ascending order of location start.
+    """
+    matches = feature_lambda(
+        feature_list, feature_test_type, {"type": feature_type}, subfeatures=True
+    )
+    if sort:
+        matches = sorted(matches, key=lambda feature: feature.location.start)
+    for match in matches:
+        yield match
+
+
+def wa_unified_product_name(feature):
+    """
+    Try and figure out a product name. Conflicting instructions were given
+    to annotators, so it may live in 'product', 'Product' or 'Name'.
+
+    'product' is preferred over 'Product'. 'Name' is only consulted when
+    neither yields a value, and is ignored when is_uuid() accepts it.
+    Returns None when nothing usable is found.
+    """
+    quals = feature.qualifiers
+
+    # Manually applied tags.
+    if "product" in quals:
+        candidates = quals["product"]
+    elif "Product" in quals:
+        candidates = quals["Product"]
+    else:
+        candidates = [None]
+    protein_product = candidates[0]
+
+    # Fall back on the name, unless it looks like an auto-generated UUID.
+    if (
+        protein_product is None
+        and "Name" in quals
+        and not is_uuid(quals["Name"][0])
+    ):
+        protein_product = quals["Name"][0]
+
+    return protein_product
+
+
+def is_uuid(name):
+    """Heuristic UUID check: True when ``name`` has exactly four "-" and is 36 characters long.
+
+    Only the hyphen count and the length are checked, not the characters.
+    """
+    hyphens = name.count("-")
+    return hyphens == 4 and len(name) == 36
+
+
+def get_rbs_from(gene):
+    """Collect the sub-features of ``gene`` that may represent its RBS.
+
+    Four sources are searched below ``gene.sub_features`` and concatenated
+    in this order: features typed "RBS", features typed
+    "Shine_Dalgarno_sequence", "regulatory" features whose regulatory_class
+    qualifier contains "ribosome_binding_site", and "exon" features whose
+    len() is below 10 (how Apollo may represent an RBS).
+
+    Every returned feature is a copy with an empty sub_features list, as
+    produced by feature_lambda(..., subfeatures=False).
+    """
+
+    def _children_of_type(feature_type):
+        return list(
+            feature_lambda(
+                gene.sub_features,
+                feature_test_type,
+                {"type": feature_type},
+                subfeatures=False,
+            )
+        )
+
+    # Normal RBS annotation types
+    rbs_rbs = _children_of_type("RBS")
+    rbs_sds = _children_of_type("Shine_Dalgarno_sequence")
+    # Apollo: keep only short exons
+    short_exons = [exon for exon in _children_of_type("exon") if len(exon) < 10]
+    # These are more NCBI's style
+    regulatory_elements = _children_of_type("regulatory")
+    rbs_regulatory = list(
+        feature_lambda(
+            regulatory_elements,
+            feature_test_quals,
+            {"regulatory_class": ["ribosome_binding_site"]},
+            subfeatures=False,
+        )
+    )
+    # Ideally exactly one candidate is found.
+    return rbs_rbs + rbs_sds + rbs_regulatory + short_exons
+
+
+def nice_name(record):
+    """
+    Get a readable name for ``record`` rather than an NCBI ID and so on.
+
+    When the record holds exactly one feature of type "contig", the first
+    value of that feature's "organism" qualifier is returned. In every other
+    case, or when the qualifier is absent, ``record.id`` is returned.
+    """
+    fallback = record.id
+    contigs = list(genes(record.features, feature_type="contig"))
+    if len(contigs) != 1:
+        return fallback
+    return contigs[0].qualifiers.get("organism", [fallback])[0]
+
+
+def fsort(it):
+    """Yield the features of ``it`` ordered by ``int(feature.location.start)``, ascending."""
+    ordered = sorted(it, key=lambda feature: int(feature.location.start))
+    for feature in ordered:
+        yield feature
diff -r 000000000000 -r eb0c42719156 cpt_gff_apollo_prep/gff3_prep_for_apollo.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gff_apollo_prep/gff3_prep_for_apollo.py Fri May 13 04:55:55 2022 +0000
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+import sys
+import logging
+import argparse
+import copy
+from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
+from gff3 import feature_lambda, feature_test_type
+from Bio.SeqFeature import FeatureLocation
+
+# Configure the root logger at INFO and create this module's own logger.
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+
+# Feature types that process_features() keeps. The list is handed to
+# gff3.feature_test_type as "types", which compares upper-cased strings, so
+# the spelling "Shine_Dalgarno_Sequence" below still matches features typed
+# "Shine_Dalgarno_sequence".
+ALLOWED_FEATURES = [
+ "mRNA",
+ "exon",
+ "transposable_element",
+ "tRNA",
+ "transcript",
+ "terminator",
+ "Shine_Dalgarno_Sequence",
+ "pseudogene",
+ "stop_codon_read_through",
+ "repeat_region",
+ "CDS",
+ "gene",
+ "rRNA",
+ "ncRNA",
+ "snRNA",
+ "snoRNA",
+ "miRNA",
+ ]
+
+# Sub-feature types that process_features() removes from the direct
+# sub_features of every feature it keeps (exact, case-sensitive match).
+SPECIAL_REMOVED_FEATURES = ["gene_component_region", "sequence_difference"]
+
+
+
+def add_exons(features):
+ """Yield a deep copy of every gene feature, adding an exon that wraps its CDS children.
+
+ For each feature of type "gene" found by feature_lambda, the CDS features
+ that are direct children of the gene (recurse=False) are collected. When
+ there is at least one, a new "exon" feature spanning the smallest CDS start
+ to the largest CDS end is appended to the copied gene; it holds copies of
+ those CDS features with their Parent qualifier pointing at the exon.
+
+ :param features: iterable of features to search for genes
+ :return: generator of copied gene features
+ """
+ for gene in feature_lambda(
+ features, feature_test_type, {"type": "gene"}, subfeatures=True
+ ):
+ # Work on a copy so the gene taken from `features` is not given the new exon.
+ clean_gene = copy.deepcopy(gene)
+ exon_start = None
+ exon_end = None
+ exon_strand = None
+ cds_list = []
+
+ #for mRNA in gene.sub_features:
+ # for x in mRNA.sub_features:
+ # x.qualifiers["Parent"] = [gene.id]
+ # gene.sub_features.append(x)
+
+ # NOTE(review): the `continue` below is the whole body of this inner loop,
+ # so it only advances the exon iteration. It does not skip the gene in the
+ # enclosing loop; genes that already have an exon are processed like any other.
+ for exon in feature_lambda(gene.sub_features, feature_test_type, {"type": "exon"}, subfeatures=False,recurse=False):
+ #if the gene contains an exon, skip.
+ continue
+ # NOTE(review): hasMRNA and mRNA are only referenced inside the string
+ # literal that follows, which is a bare expression and is never executed.
+ hasMRNA = False
+ for x in gene.sub_features:
+ if x.type == "mRNA":
+ hasMRNA = True
+ mRNA = x
+ """
+ if not hasMRNA:
+ mRNA = gffSeqFeature(
+ location=FeatureLocation(gene.location.start, gene.location.end, gene.location.strand),
+ type="mRNA",
+ source = "cpt.prepApollo",
+ qualifiers={
+ "ID": ["%s.mRNA" % clean_gene.qualifiers["ID"][0]],
+ "Parent": clean_gene.qualifiers["ID"],
+ },
+ sub_features=gene.sub_features,
+ strand=exon_strand
+ )
+ for x in mRNA.sub_features:
+ x.qualifiers["Parent"] = mRNA["ID"]
+ clean_gene.sub_features = [mRNA]
+ else:
+ for x in clean_gene.sub_features:
+ if x.type != "mRNA":
+ x.qualifiers["Parent"] = [mRNA.id] """
+
+ # check for CDS child features of the gene, do not go a further step (this should skip any CDS children of exon child features)
+ # With subfeatures=False, feature_lambda yields deep copies with an empty
+ # sub_features list, so the changes made to `cds` below do not touch the
+ # CDS objects that remain attached to the gene.
+ for cds in feature_lambda(
+ gene.sub_features,
+ feature_test_type,
+ {"type": "CDS"},
+ subfeatures=False,
+ recurse=False,
+ ):
+ # check all CDS features for min/max boundaries
+ if exon_start is None:
+ exon_start = cds.location.start
+ # The exon strand is taken from the first CDS encountered.
+ exon_strand = cds.location.strand
+ if exon_end is None:
+ exon_end = cds.location.end
+ exon_start = min(exon_start, cds.location.start)
+ exon_end = max(exon_end, cds.location.end)
+ cds_list.append(cds)
+ if cds_list:
+ # we found a CDS to adopt
+ # The FeatureLocation is built without a strand; the strand is passed
+ # to gffSeqFeature separately through the `strand` keyword.
+ new_exon = gffSeqFeature(
+ location=FeatureLocation(exon_start, exon_end),
+ type="exon",
+ source = "cpt.prepApollo",
+ qualifiers={
+ "ID": ["%s.exon" % clean_gene.qualifiers["ID"][0]],
+ "Parent": [clean_gene.id],
+ "ApolloExon": ["True"],
+ },
+ sub_features=[],
+ strand=exon_strand
+ )
+ # Re-parent the copied CDS features to the new exon.
+ for cds in cds_list:
+ cds.qualifiers["Parent"] = new_exon.qualifiers["ID"]
+ new_exon.sub_features.append(cds)
+ #gene.sub_features.append(new_exon)
+ # The exon is appended to the children the copied gene already has, so
+ # the gene's original CDS children are kept alongside the new exon.
+ clean_gene.sub_features.append(copy.deepcopy(new_exon))
+ #clean_gene.sub_features.append(gffSeqFeature(location=FeatureLocation(exon_start, exon_end, exon_strand), type="exon", source = "cpt.prepApollo", qualifiers={"ID": ["%s.exon" % clean_gene.qualifiers["ID"][0]], "Parent": clean_gene.qualifiers["ID"]}, sub_features=[], strand=exon_strand))
+ """
+ for sf in feature_lambda(
+ gene.sub_features,
+ feature_test_type,
+ {"type": "CDS"},
+ subfeatures=True,
+ recurse=False,
+ invert=True,
+ ):
+ child = copy.deepcopy(sf)
+ child.qualifiers["Parent"] = new_exon.qualifiers["ID"]
+ clean_gene.sub_features.append(child)
+ """
+ # return the copied gene, with the new exon when one was created
+ yield clean_gene
+
+def process_features(features):
+    """Retype RBS features and yield the features whose type is allowed.
+
+    Every feature typed "RBS" anywhere in ``features`` is renamed in place to
+    "Shine_Dalgarno_sequence". Then each feature, at any depth, whose type
+    is in ALLOWED_FEATURES is yielded after its direct sub_features have been
+    replaced by a deep copy that omits the SPECIAL_REMOVED_FEATURES types.
+    """
+    # change RBS to 'Shine_Dalgarno_sequence'
+    for rbs in feature_lambda(features, feature_test_type, {'type': "RBS"}):
+        rbs.type = "Shine_Dalgarno_sequence"
+
+    allowed = feature_lambda(
+        features, feature_test_type, {"types": ALLOWED_FEATURES}, subfeatures=True
+    )
+    for feature in allowed:
+        # The type test above only decides which features are yielded; it
+        # does not prune their children, so unwanted ones are removed here.
+        kept = [
+            child
+            for child in feature.sub_features
+            if child.type not in SPECIAL_REMOVED_FEATURES
+        ]
+        # Assign before yielding: feature_lambda descends into
+        # feature.sub_features once this generator is resumed.
+        feature.sub_features = copy.deepcopy(kept)
+        yield feature
+
+def gff_filter(gff3):
+    """Rewrite each record of a GFF3 input for Apollo and write it to stdout.
+
+    For every record parsed from ``gff3`` the features are passed through
+    process_features() and then add_exons(), sorted by location start after
+    each step. The record's annotations are cleared before it is written
+    with gffWrite.
+    """
+
+    def _by_start(feature):
+        return feature.location.start
+
+    for rec in gffParse(gff3):
+        cleaned_features = sorted(process_features(rec.features), key=_by_start)
+        rec.features = sorted(add_exons(cleaned_features), key=_by_start)
+        rec.annotations = {}
+        gffWrite([rec], sys.stdout)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: one positional argument, the GFF3 file, which
+    # argparse opens for reading and which is passed to gff_filter by name.
+    cli = argparse.ArgumentParser(
+        description="add parent exon features to CDSs for Apollo"
+    )
+    cli.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 annotations")
+    options = cli.parse_args()
+    gff_filter(**vars(options))
diff -r 000000000000 -r eb0c42719156 cpt_gff_apollo_prep/gff3_prep_for_apollo.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gff_apollo_prep/gff3_prep_for_apollo.xml Fri May 13 04:55:55 2022 +0000
@@ -0,0 +1,40 @@
+
+
+ by ensuring that CDS features have a wrapping exon feature
+
+ macros.xml
+ cpt-macros.xml
+
+
+ $output]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r eb0c42719156 cpt_gff_apollo_prep/macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gff_apollo_prep/macros.xml Fri May 13 04:55:55 2022 +0000
@@ -0,0 +1,85 @@
+
+
+
+
+ python
+ biopython
+ cpt_gffparser
+
+
+
+
+ "$blast_tsv"
+
+
+
+
+
+
+ "$blast_xml"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ "$gff3_data"
+
+
+#if str($reference_genome.reference_genome_source) == 'cached':
+ "${reference_genome.fasta_indexes.fields.path}"
+#else if str($reference_genome.reference_genome_source) == 'history':
+ genomeref.fa
+#end if
+
+
+#if $reference_genome.reference_genome_source == 'history':
+ ln -s $reference_genome.genome_fasta genomeref.fa;
+#end if
+
+
+#if str($reference_genome.reference_genome_source) == 'cached':
+ "${reference_genome.fasta_indexes.fields.path}"
+#else if str($reference_genome.reference_genome_source) == 'history':
+ genomeref.fa
+#end if
+
+
+
+
+
+
+ "$sequences"
+
+
+
+
+
diff -r 000000000000 -r eb0c42719156 cpt_gff_apollo_prep/test-data/ApolloPrep_In.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gff_apollo_prep/test-data/ApolloPrep_In.gff3 Fri May 13 04:55:55 2022 +0000
@@ -0,0 +1,14 @@
+##gff-version 3
+##sequence-region testseq_2018-03-08 1 9216
+testseq_2018-03-08 feature gene 154 297 . - . ID=testseq_2018-03-08.gene_1;source2=MGA;uniqueID=offset-159323
+testseq_2018-03-08 feature CDS 154 297 . - 0 ID=testseq_2018-03-08.cds_gene_1;Parent=testseq_2018-03-08.gene_1;source2=MGA;uniqueID=offset-159409
+testseq_2018-03-08 feature gene 314 507 . - . ID=testseq_2018-03-08.gene_2;source2=MGA;uniqueID=offset-159674
+testseq_2018-03-08 feature CDS 314 490 . - 0 ID=testseq_2018-03-08.cds_gene_2;Parent=testseq_2018-03-08.gene_2;source2=MGA;uniqueID=offset-159760
+testseq_2018-03-08 CPT_ShineFind Shine_Dalgarno_sequence 504 507 . - . ID=testseq_2018-03-08.cds_gene_2.rbs-0;Parent=testseq_2018-03-08.gene_2;uniqueID=offset-160090
+testseq_2018-03-08 feature gene 487 670 . - . ID=testseq_2018-03-08.gene_3;source2=MGA;uniqueID=offset-159882
+testseq_2018-03-08 feature exon 487 657 . - 0 ID=testseq_2018-03-08.exon_gene_3;Parent=testseq_2018-03-08.gene_3;source2=MGA;uniqueID=offset-159968
+testseq_2018-03-08 feature CDS 487 657 . - 0 ID=testseq_2018-03-08.cds_gene_3;Parent=testseq_2018-03-08.exon_gene_3;source2=MGA;uniqueID=offset-159969
+testseq_2018-03-08 CPT_ShineFind Shine_Dalgarno_sequence 665 670 . - . ID=testseq_2018-03-08.cds_gene_3.rbs-0;Parent=testseq_2018-03-08.gene_3;uniqueID=offset-160441
+testseq_2018-03-08 feature gene 700 900 . - . ID=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159874
+testseq_2018-03-08 feature CDS 700 790 . - 0 ID=testseq_2018-03-08.cds_gene_4;Parent=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159765
+testseq_2018-03-08 feature CDS 820 900 . - 0 ID=testseq_2018-03-08.cds2_gene_4;Parent=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159762
diff -r 000000000000 -r eb0c42719156 cpt_gff_apollo_prep/test-data/ApolloPrep_Out.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gff_apollo_prep/test-data/ApolloPrep_Out.gff3 Fri May 13 04:55:55 2022 +0000
@@ -0,0 +1,20 @@
+##gff-version 3
+testseq_2018-03-08 feature gene 154 297 . - . ID=testseq_2018-03-08.gene_1;source2=MGA;uniqueID=offset-159323;
+testseq_2018-03-08 feature CDS 154 297 . - 0 ID=testseq_2018-03-08.cds_gene_1;Parent=testseq_2018-03-08.gene_1;source2=MGA;uniqueID=offset-159409;
+testseq_2018-03-08 cpt.prepApollo exon 154 297 . - . ID=testseq_2018-03-08.gene_1.exon;Parent=testseq_2018-03-08.gene_1;ApolloExon=True;
+testseq_2018-03-08 feature CDS 154 297 . - 0 ID=testseq_2018-03-08.cds_gene_1;Parent=testseq_2018-03-08.gene_1.exon;source2=MGA;uniqueID=offset-159409;
+testseq_2018-03-08 feature gene 314 507 . - . ID=testseq_2018-03-08.gene_2;source2=MGA;uniqueID=offset-159674;
+testseq_2018-03-08 feature CDS 314 490 . - 0 ID=testseq_2018-03-08.cds_gene_2;Parent=testseq_2018-03-08.gene_2;source2=MGA;uniqueID=offset-159760;
+testseq_2018-03-08 CPT_ShineFind Shine_Dalgarno_sequence 504 507 . - . ID=testseq_2018-03-08.cds_gene_2.rbs-0;Parent=testseq_2018-03-08.gene_2;uniqueID=offset-160090;
+testseq_2018-03-08 cpt.prepApollo exon 314 490 . - . ID=testseq_2018-03-08.gene_2.exon;Parent=testseq_2018-03-08.gene_2;ApolloExon=True;
+testseq_2018-03-08 feature CDS 314 490 . - 0 ID=testseq_2018-03-08.cds_gene_2;Parent=testseq_2018-03-08.gene_2.exon;source2=MGA;uniqueID=offset-159760;
+testseq_2018-03-08 feature gene 487 670 . - . ID=testseq_2018-03-08.gene_3;source2=MGA;uniqueID=offset-159882;
+testseq_2018-03-08 feature exon 487 657 . - . ID=testseq_2018-03-08.exon_gene_3;Parent=testseq_2018-03-08.gene_3;source2=MGA;uniqueID=offset-159968;
+testseq_2018-03-08 feature CDS 487 657 . - 0 ID=testseq_2018-03-08.cds_gene_3;Parent=testseq_2018-03-08.exon_gene_3;source2=MGA;uniqueID=offset-159969;
+testseq_2018-03-08 CPT_ShineFind Shine_Dalgarno_sequence 665 670 . - . ID=testseq_2018-03-08.cds_gene_3.rbs-0;Parent=testseq_2018-03-08.gene_3;uniqueID=offset-160441;
+testseq_2018-03-08 feature gene 700 900 . - . ID=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159874;
+testseq_2018-03-08 feature CDS 700 790 . - 0 ID=testseq_2018-03-08.cds_gene_4;Parent=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159765;
+testseq_2018-03-08 feature CDS 820 900 . - 0 ID=testseq_2018-03-08.cds2_gene_4;Parent=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159762;
+testseq_2018-03-08 cpt.prepApollo exon 700 900 . - . ID=testseq_2018-03-08.gene_4.exon;Parent=testseq_2018-03-08.gene_4;ApolloExon=True;
+testseq_2018-03-08 feature CDS 700 790 . - 0 ID=testseq_2018-03-08.cds_gene_4;Parent=testseq_2018-03-08.gene_4.exon;source2=MGA;uniqueID=offset-159765;
+testseq_2018-03-08 feature CDS 820 900 . - 0 ID=testseq_2018-03-08.cds2_gene_4;Parent=testseq_2018-03-08.gene_4.exon;source2=MGA;uniqueID=offset-159762;