Mercurial > repos > cpt > cpt_prep_for_apollo
changeset 0:eb0c42719156 draft
Uploaded
author | cpt |
---|---|
date | Fri, 13 May 2022 04:55:55 +0000 |
parents | |
children | 7017f2143262 |
files | cpt_gff_apollo_prep/cpt-macros.xml cpt_gff_apollo_prep/gff3.py cpt_gff_apollo_prep/gff3_prep_for_apollo.py cpt_gff_apollo_prep/gff3_prep_for_apollo.xml cpt_gff_apollo_prep/macros.xml cpt_gff_apollo_prep/test-data/ApolloPrep_In.gff3 cpt_gff_apollo_prep/test-data/ApolloPrep_Out.gff3 |
diffstat | 7 files changed, 787 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gff_apollo_prep/cpt-macros.xml Fri May 13 04:55:55 2022 +0000 @@ -0,0 +1,115 @@ +<?xml version="1.0"?> +<macros> + <xml name="gff_requirements"> + <requirements> + <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="1.65">biopython</requirement> + <requirement type="package" version="2.12.1">requests</requirement> + <yield/> + </requirements> + <version_command> + <![CDATA[ + cd $__tool_directory__ && git rev-parse HEAD + ]]> + </version_command> + </xml> + <xml name="citation/mijalisrasche"> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex">@unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-crr"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. 
Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020-AJC-solo"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-clm"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="sl-citations-clm"> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </xml> +</macros>
# Source file: cpt_gff_apollo_prep/gff3.py
"""Helpers for walking, testing and naming hierarchical GFF3 features.

Features are expected to look like Biopython ``SeqFeature`` objects that
carry a ``sub_features`` list: the helpers below read ``type``, ``id``,
``qualifiers``, ``location`` and ``sub_features``.
"""
import copy
import logging

# NOTE: this configures the *root* logger at import time (original behavior).
log = logging.getLogger()
log.setLevel(logging.WARN)


def feature_lambda(
    feature_list,
    test,
    test_kwargs,
    subfeatures=True,
    parent=None,
    invert=False,
    recurse=True,
):
    """Recursively search through features, testing each with a test function, yielding matches.

    GFF3 is a hierarchical data structure, so we need to be able to recursively
    search through features. E.g. if you're looking for a feature with
    ID='bob.42', you can't just do a simple list comprehension with a test
    case. You don't know how deeply buried bob.42 will be in the feature tree.
    This is where feature_lambda steps in.

    As a side effect every visited feature gets a ``_parent`` attribute: the
    feature it was reached from, or itself when it is at the top of the list
    being searched.

    :type feature_list: list
    :param feature_list: an iterable of features

    :type test: function reference
    :param test: a closure with the method signature (feature, **kwargs) where
                 the kwargs are those passed in the next argument. This
                 function should return True or False, True if the feature is
                 to be yielded as part of the main feature_lambda function, or
                 False if it is to be ignored. This function CAN mutate the
                 features passed to it (think "apply"). Any truthy/falsy
                 return value is accepted.

    :type test_kwargs: dictionary
    :param test_kwargs: kwargs to pass to your closure when it is called.

    :type subfeatures: boolean
    :param subfeatures: when True, a matched feature is yielded as-is, with
                        its whole sub_feature tree attached. When False, a
                        deep copy of the matched feature with an empty
                        sub_features list is yielded instead.

    :type parent: feature or None
    :param parent: the feature whose sub_features are being searched; used
                   internally by the recursion. Leave as None for top-level
                   calls.

    :type invert: boolean
    :param invert: Negate/invert the result of the filter.

    :type recurse: boolean
    :param recurse: when True, descend into each feature's sub_features;
                    when False only the items of feature_list are tested.

    :rtype: yielded list
    :return: Yields matching features one at a time.
    """
    # Either the top level set of [features] or the subfeature attribute
    for feature in feature_list:
        # Roots point at themselves so walkers cannot go above root.
        feature._parent = feature if parent is None else parent
        test_result = test(feature, **test_kwargs)
        # Compare as booleans: `invert ^ test_result` breaks on non-bool results.
        if bool(test_result) != bool(invert):
            if not subfeatures:
                feature_copy = copy.deepcopy(feature)
                feature_copy.sub_features = list()
                yield feature_copy
            else:
                yield feature

        if recurse and hasattr(feature, "sub_features"):
            for x in feature_lambda(
                feature.sub_features,
                test,
                test_kwargs,
                subfeatures=subfeatures,
                parent=feature,
                invert=invert,
                recurse=recurse,
            ):
                yield x


def fetchParent(feature):
    """Follow ``_parent`` links upwards and return the top-most feature.

    A feature without a ``_parent`` attribute, with ``_parent`` set to None,
    or whose ``_parent`` is itself (how feature_lambda marks a root) is
    treated as the top of the chain.
    """
    current = feature
    while True:
        parent = getattr(current, "_parent", None)
        # The self-reference check prevents endless recursion on roots.
        if parent is None or parent is current:
            return current
        current = parent


def feature_test_true(feature, **kwargs):
    """Test that accepts every feature."""
    return True


def feature_test_type(feature, **kwargs):
    """Case-insensitively compare ``feature.type`` against ``type`` or ``types``.

    Pass ``type`` for a single name, or ``types`` for an iterable of names
    (``type`` wins when both are given).

    :raises Exception: when neither ``type`` nor ``types`` is supplied.
    """
    if "type" not in kwargs and "types" not in kwargs:
        raise Exception("Incorrect feature_test_type call, need type or types")

    feature_type = str(feature.type).upper()
    if "type" in kwargs:
        return feature_type == str(kwargs["type"]).upper()
    return any(feature_type == str(x).upper() for x in kwargs["types"])


def feature_test_qual_value(feature, **kwargs):
    """Test qualifier values.

    For every feature, check that at least one value in
    feature.qualifiers(kwargs['qualifier']) is in kwargs['attribute_list'].
    kwargs['qualifier'] may be a single qualifier name or a list of names.
    """
    qualifier_names = kwargs["qualifier"]
    if not isinstance(qualifier_names, list):
        qualifier_names = [qualifier_names]

    for qualifier in qualifier_names:
        for attribute_value in feature.qualifiers.get(qualifier, []):
            if attribute_value in kwargs["attribute_list"]:
                return True
    return False


def feature_test_location(feature, **kwargs):
    """Return True when ``loc`` lies within the feature (bounds inclusive).

    If ``strand`` is given, the feature must also be on that strand.
    """
    if "strand" in kwargs:
        if feature.location.strand != kwargs["strand"]:
            return False

    return feature.location.start <= kwargs["loc"] <= feature.location.end


def feature_test_quals(feature, **kwargs):
    """Test that a feature carries every requested qualifier.

    Each keyword is a qualifier name. A value of None only requires the
    qualifier to be present; otherwise the value is a list of strings, at
    least one of which must be a substring of one of the qualifier's values.
    All keywords must be satisfied.

    Example::

        a = Feature(qualifiers={'Note': ['Some notes', 'Aasdf']})

        # Check if a contains a Note
        feature_test_quals(a, **{'Note': None})  # Returns True
        feature_test_quals(a, **{'Product': None})  # Returns False

        # Check if a contains a note with specific value
        feature_test_quals(a, **{'Note': ['ome']})  # Returns True

        # Check if a contains a note with specific value
        feature_test_quals(a, **{'Note': ['other']})  # Returns False
    """
    for key in kwargs:
        if key not in feature.qualifiers:
            return False

        wanted = kwargs[key]
        # Key is present, no value specified: this key is satisfied, but the
        # remaining keys still have to be checked.
        if wanted is None:
            continue

        # Otherwise at least one wanted string must occur inside at least one
        # of the feature's values for this qualifier.
        if not any(x in value for value in feature.qualifiers[key] for x in wanted):
            return False

    return True


def feature_test_contains(feature, **kwargs):
    """Return True when ``index`` or ``range`` falls strictly inside the feature.

    ``range`` is a dict with ``start`` and ``end`` keys; both ends must be
    strictly between the feature's start and end.

    :raises RuntimeError: when neither ``index`` nor ``range`` is supplied.
    """
    if "index" in kwargs:
        return feature.location.start < kwargs["index"] < feature.location.end
    elif "range" in kwargs:
        return (
            feature.location.start < kwargs["range"]["start"] < feature.location.end
            and feature.location.start < kwargs["range"]["end"] < feature.location.end
        )
    else:
        raise RuntimeError("Must use index or range keyword")


def get_id(feature=None, parent_prefix=None):
    """Build a display identifier for a feature from its qualifiers.

    The first qualifier found among locus_tag, gene, Gene, product, Product
    and Name supplies the identifier, prefixed with ``parent_prefix|`` when a
    prefix is given. When none is present, ``feature.id`` is returned as-is
    (without the prefix).
    """
    prefix = "" if parent_prefix is None else parent_prefix + "|"
    for key in ("locus_tag", "gene", "Gene", "product", "Product", "Name"):
        if key in feature.qualifiers:
            return prefix + feature.qualifiers[key][0]
    return feature.id


def get_gff3_id(gene):
    """Return the first ``Name`` qualifier value, falling back to ``gene.id``."""
    return gene.qualifiers.get("Name", [gene.id])[0]


def ensure_location_in_bounds(start=0, end=0, parent_length=0):
    """Shift ``start`` and ``end`` in steps of 3 towards ``[0, parent_length]``.

    Negative values are raised first, then values above ``parent_length`` are
    lowered. Stepping by 3 prevents frameshift errors.

    :return: ``(start, end)`` tuple.
    """
    while start < 0:
        start += 3
    while end < 0:
        end += 3
    while start > parent_length:
        start -= 3
    while end > parent_length:
        end -= 3
    return (start, end)


def coding_genes(feature_list):
    """Yield the gene features that have at least one CDS among their sub_features."""
    for x in genes(feature_list):
        cds_children = list(
            feature_lambda(
                x.sub_features,
                feature_test_type,
                {"type": "CDS"},
                subfeatures=False,
            )
        )
        if cds_children:
            yield x


def genes(feature_list, feature_type="gene", sort=False):
    """
    Simple filter to extract gene features from the feature set.

    ``feature_type`` selects which type to extract; with ``sort=True`` the
    matches are yielded ordered by ``location.start``.
    """
    if not sort:
        for x in feature_lambda(
            feature_list, feature_test_type, {"type": feature_type}, subfeatures=True
        ):
            yield x
    else:
        data = list(genes(feature_list, feature_type=feature_type, sort=False))
        data = sorted(data, key=lambda feature: feature.location.start)
        for x in data:
            yield x


def wa_unified_product_name(feature):
    """
    Try and figure out a name. We gave conflicting instructions, so
    this isn't as trivial as it should be. Sometimes it will be in
    'product' or 'Product', other times in 'Name'.

    Returns None when no usable name is found.
    """
    # Manually applied tags.
    protein_product = feature.qualifiers.get(
        "product", feature.qualifiers.get("Product", [None])
    )[0]

    # If neither of those are available ...
    if protein_product is None:
        # And there's a name that is not just a UUID-shaped placeholder...
        if "Name" in feature.qualifiers:
            if not is_uuid(feature.qualifiers["Name"][0]):
                protein_product = feature.qualifiers["Name"][0]

    return protein_product


def is_uuid(name):
    """Cheap shape check: 36 characters containing exactly four dashes."""
    return name.count("-") == 4 and len(name) == 36


def get_rbs_from(gene):
    """Collect the features under ``gene`` that may represent its RBS.

    Returns a list made of, in order: RBS features, Shine_Dalgarno_sequence
    features, ``regulatory`` features whose ``regulatory_class`` contains
    ``ribosome_binding_site``, and exons shorter than 10.
    """
    # Normal RBS annotation types
    rbs_rbs = list(
        feature_lambda(
            gene.sub_features, feature_test_type, {"type": "RBS"}, subfeatures=False
        )
    )
    rbs_sds = list(
        feature_lambda(
            gene.sub_features,
            feature_test_type,
            {"type": "Shine_Dalgarno_sequence"},
            subfeatures=False,
        )
    )
    # Apollo stores these as short exons
    apollo_exons = list(
        feature_lambda(
            gene.sub_features, feature_test_type, {"type": "exon"}, subfeatures=False
        )
    )
    apollo_exons = [x for x in apollo_exons if len(x) < 10]
    # These are more NCBI's style
    regulatory_elements = list(
        feature_lambda(
            gene.sub_features,
            feature_test_type,
            {"type": "regulatory"},
            subfeatures=False,
        )
    )
    rbs_regulatory = list(
        feature_lambda(
            regulatory_elements,
            feature_test_quals,
            {"regulatory_class": ["ribosome_binding_site"]},
            subfeatures=False,
        )
    )
    # Here's hoping you find just one ;)
    return rbs_rbs + rbs_sds + rbs_regulatory + apollo_exons


def nice_name(record):
    """
    Get the real name rather than NCBI IDs and so on. If that fails, return record.id.

    The name is taken from the ``organism`` qualifier of the record's single
    ``contig`` feature, when exactly one such feature exists.
    """
    name = record.id
    likely_parental_contig = list(genes(record.features, feature_type="contig"))
    if len(likely_parental_contig) == 1:
        name = likely_parental_contig[0].qualifiers.get("organism", [name])[0]
    return name


def fsort(it):
    """Yield the features of ``it`` ordered by integer ``location.start``."""
    for i in sorted(it, key=lambda x: int(x.location.start)):
        yield i
#!/usr/bin/env python
# Source file: cpt_gff_apollo_prep/gff3_prep_for_apollo.py
"""Add a wrapping exon feature around gene-level CDS features for Apollo.

Reads a GFF3 file, keeps only genes (and their children) of allowed types,
and for every gene whose direct children include CDS features appends one
exon that spans those CDSs and holds copies of them. The result is written
to stdout as GFF3.
"""
import sys
import logging
import argparse
import copy
from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
from gff3 import feature_lambda, feature_test_type
from Bio.SeqFeature import FeatureLocation

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

# Feature types kept by process_features (compared case-insensitively by
# feature_test_type).
ALLOWED_FEATURES = [
    "mRNA",
    "exon",
    "transposable_element",
    "tRNA",
    "transcript",
    "terminator",
    "Shine_Dalgarno_Sequence",
    "pseudogene",
    "stop_codon_read_through",
    "repeat_region",
    "CDS",
    "gene",
    "rRNA",
    "ncRNA",
    "snRNA",
    "snoRNA",
    "miRNA",
]

# Sub-feature types that are always dropped from their parent.
SPECIAL_REMOVED_FEATURES = ["gene_component_region", "sequence_difference"]


def add_exons(features):
    """Yield a deep copy of every gene, with an exon wrapping its direct CDSs.

    Only CDS features that are *direct* children of the gene are adopted, so
    a CDS that already sits under an exon is left alone and such a gene is
    yielded unchanged. When at least one direct CDS exists, a single new exon
    (source ``cpt.prepApollo``, qualifier ``ApolloExon=True``) spanning the
    lowest CDS start to the highest CDS end is appended to the gene copy; it
    holds copies of the CDSs re-parented to the exon. The gene copy's
    original children, including the original CDSs, are kept as they were.
    """
    for gene in feature_lambda(
        features, feature_test_type, {"type": "gene"}, subfeatures=True
    ):
        clean_gene = copy.deepcopy(gene)

        # Direct CDS children only (recurse=False); subfeatures=False hands
        # back deep copies, so re-parenting below never touches `gene`.
        cds_list = list(
            feature_lambda(
                gene.sub_features,
                feature_test_type,
                {"type": "CDS"},
                subfeatures=False,
                recurse=False,
            )
        )

        if cds_list:
            # We found CDSs to adopt: the exon covers all of them and takes
            # the strand of the first one.
            exon_start = min(cds.location.start for cds in cds_list)
            exon_end = max(cds.location.end for cds in cds_list)
            exon_strand = cds_list[0].location.strand

            new_exon = gffSeqFeature(
                location=FeatureLocation(exon_start, exon_end),
                type="exon",
                source="cpt.prepApollo",
                qualifiers={
                    "ID": ["%s.exon" % clean_gene.qualifiers["ID"][0]],
                    "Parent": [clean_gene.id],
                    "ApolloExon": ["True"],
                },
                sub_features=[],
                strand=exon_strand,
            )
            for cds in cds_list:
                cds.qualifiers["Parent"] = new_exon.qualifiers["ID"]
                new_exon.sub_features.append(cds)
            clean_gene.sub_features.append(copy.deepcopy(new_exon))

        # Return the (possibly extended) copy of the gene.
        yield clean_gene


def process_features(features):
    """Normalise RBS features and yield every feature of an allowed type.

    RBS features anywhere in the tree are renamed in place to
    ``Shine_Dalgarno_sequence``. Every feature (at any depth) whose type is in
    ALLOWED_FEATURES is then yielded after its direct children of a type in
    SPECIAL_REMOVED_FEATURES have been dropped; the remaining children are
    replaced by deep copies.
    """
    # change RBS to 'Shine_Dalgarno_sequence'
    for rbs in feature_lambda(features, feature_test_type, {"type": "RBS"}):
        rbs.type = "Shine_Dalgarno_sequence"

    for feature in feature_lambda(
        features, feature_test_type, {"types": ALLOWED_FEATURES}, subfeatures=True
    ):
        # The type filter only decides which features are yielded; unwanted
        # children stay attached to their parent unless removed here.
        kept = [
            sf
            for sf in feature.sub_features
            if sf.type not in SPECIAL_REMOVED_FEATURES
        ]
        feature.sub_features = copy.deepcopy(kept)
        yield feature


def gff_filter(gff3):
    """Parse ``gff3``, clean each record's features and write GFF3 to stdout.

    :param gff3: open GFF3 file handle passed to gffParse.
    """
    for rec in gffParse(gff3):
        cleaned_features = sorted(
            process_features(rec.features), key=lambda x: x.location.start
        )
        rec.features = sorted(
            add_exons(cleaned_features), key=lambda x: x.location.start
        )
        rec.annotations = {}
        gffWrite([rec], sys.stdout)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="add parent exon features to CDSs for Apollo"
    )
    parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 annotations")
    args = parser.parse_args()
    gff_filter(**vars(args))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gff_apollo_prep/gff3_prep_for_apollo.xml Fri May 13 04:55:55 2022 +0000 @@ -0,0 +1,40 @@ +<?xml version="1.0"?> +<tool id="edu.tamu.cpt.gff3.prepForApollo" name="Prep GFF3 Input for Apollo" version="20.8.0.0"> + <description>by ensuring that CDS features have a wrapping exon feature</description> + <macros> + <import>macros.xml</import> + <import>cpt-macros.xml</import> + </macros> + <expand macro="requirements"/> + <command detect_errors="aggressive"><![CDATA[ +$__tool_directory__/gff3_prep_for_apollo.py +@INPUT_GFF@ +> $output]]></command> + <inputs> + <expand macro="gff3_input" /> + </inputs> + <outputs> + <data format="gff3" name="output"/> + </outputs> + <tests> + <test> + <param name="gff3_data" value="ApolloPrep_In.gff3" /> + <output name="output" value="ApolloPrep_Out.gff3" /> + </test> + </tests> + <help><![CDATA[ +**What it does** + +This tool updates the gene model in a GFF3 so that it can be added into Apollo +and be used to promote annotations with the correct CDS calculation. It finds any +CDS feature whose direct parent is a gene feature, and creates an exon feature +that is the child of the gene feature and the parent of the CDS. If a gene has +multiple CDS features, they will be wrapped under the same exon feature. + +All other features present in the GFF under the gene will be kept the same. + +Warning: Use this tool only when it is absolutely necessary to fix a gene model. + + ]]></help> + <expand macro="citations-clm" /> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gff_apollo_prep/macros.xml Fri May 13 04:55:55 2022 +0000 @@ -0,0 +1,85 @@ +<?xml version="1.0"?> +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="3.6">python</requirement> + <requirement type="package" version="1.77">biopython</requirement> + <requirement type="package" version="1.1.3">cpt_gffparser</requirement> + <yield/> + </requirements> + </xml> + <token name="@BLAST_TSV@"> + "$blast_tsv" + </token> + <xml name="blast_tsv"> + <param label="Blast Results" help="TSV/tabular (25 Column)" + name="blast_tsv" type="data" format="tabular" /> + </xml> + + <token name="@BLAST_XML@"> + "$blast_xml" + </token> + <xml name="blast_xml"> + <param label="Blast Results" help="XML format" + name="blast_xml" type="data" format="blastxml" /> + </xml> + <xml name="gff3_with_fasta"> + <param label="Genome Sequences" name="fasta" type="data" format="fasta" /> + <param label="Genome Annotations" name="gff3" type="data" format="gff3" /> + </xml> + <xml name="genome_selector"> + <conditional name="reference_genome"> + <param name="reference_genome_source" type="select" label="Reference Genome"> + <option value="history" selected="True">From History</option> + <option value="cached">Locally Cached</option> + </param> + <when value="cached"> + <param name="fasta_indexes" type="select" label="Source FASTA Sequence"> + <options from_data_table="all_fasta"/> + </param> + </when> + <when value="history"> + <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/> + </when> + </conditional> + </xml> + <xml name="gff3_input"> + <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/> + </xml> + <xml name="input/gff3+fasta"> + <expand macro="gff3_input" /> + <expand macro="genome_selector" /> + </xml> + <token name="@INPUT_GFF@"> + "$gff3_data" + </token> + <token name="@INPUT_FASTA@"> +#if str($reference_genome.reference_genome_source) == 
'cached': + "${reference_genome.fasta_indexes.fields.path}" +#else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa +#end if + </token> + <token name="@GENOME_SELECTOR_PRE@"> +#if $reference_genome.reference_genome_source == 'history': + ln -s $reference_genome.genome_fasta genomeref.fa; +#end if + </token> + <token name="@GENOME_SELECTOR@"> +#if str($reference_genome.reference_genome_source) == 'cached': + "${reference_genome.fasta_indexes.fields.path}" +#else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa +#end if + </token> + <xml name="input/fasta"> + <param label="Fasta file" name="sequences" type="data" format="fasta"/> + </xml> + + <token name="@SEQUENCE@"> + "$sequences" + </token> + <xml name="input/fasta/protein"> + <param label="Protein fasta file" name="sequences" type="data" format="fasta"/> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gff_apollo_prep/test-data/ApolloPrep_In.gff3 Fri May 13 04:55:55 2022 +0000 @@ -0,0 +1,14 @@ +##gff-version 3 +##sequence-region testseq_2018-03-08 1 9216 +testseq_2018-03-08 feature gene 154 297 . - . ID=testseq_2018-03-08.gene_1;source2=MGA;uniqueID=offset-159323 +testseq_2018-03-08 feature CDS 154 297 . - 0 ID=testseq_2018-03-08.cds_gene_1;Parent=testseq_2018-03-08.gene_1;source2=MGA;uniqueID=offset-159409 +testseq_2018-03-08 feature gene 314 507 . - . ID=testseq_2018-03-08.gene_2;source2=MGA;uniqueID=offset-159674 +testseq_2018-03-08 feature CDS 314 490 . - 0 ID=testseq_2018-03-08.cds_gene_2;Parent=testseq_2018-03-08.gene_2;source2=MGA;uniqueID=offset-159760 +testseq_2018-03-08 CPT_ShineFind Shine_Dalgarno_sequence 504 507 . - . ID=testseq_2018-03-08.cds_gene_2.rbs-0;Parent=testseq_2018-03-08.gene_2;uniqueID=offset-160090 +testseq_2018-03-08 feature gene 487 670 . - . ID=testseq_2018-03-08.gene_3;source2=MGA;uniqueID=offset-159882 +testseq_2018-03-08 feature exon 487 657 . - 0 ID=testseq_2018-03-08.exon_gene_3;Parent=testseq_2018-03-08.gene_3;source2=MGA;uniqueID=offset-159968 +testseq_2018-03-08 feature CDS 487 657 . - 0 ID=testseq_2018-03-08.cds_gene_3;Parent=testseq_2018-03-08.exon_gene_3;source2=MGA;uniqueID=offset-159969 +testseq_2018-03-08 CPT_ShineFind Shine_Dalgarno_sequence 665 670 . - . ID=testseq_2018-03-08.cds_gene_3.rbs-0;Parent=testseq_2018-03-08.gene_3;uniqueID=offset-160441 +testseq_2018-03-08 feature gene 700 900 . - . ID=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159874 +testseq_2018-03-08 feature CDS 700 790 . - 0 ID=testseq_2018-03-08.cds_gene_4;Parent=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159765 +testseq_2018-03-08 feature CDS 820 900 . - 0 ID=testseq_2018-03-08.cds2_gene_4;Parent=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159762
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gff_apollo_prep/test-data/ApolloPrep_Out.gff3 Fri May 13 04:55:55 2022 +0000 @@ -0,0 +1,20 @@ +##gff-version 3 +testseq_2018-03-08 feature gene 154 297 . - . ID=testseq_2018-03-08.gene_1;source2=MGA;uniqueID=offset-159323; +testseq_2018-03-08 feature CDS 154 297 . - 0 ID=testseq_2018-03-08.cds_gene_1;Parent=testseq_2018-03-08.gene_1;source2=MGA;uniqueID=offset-159409; +testseq_2018-03-08 cpt.prepApollo exon 154 297 . - . ID=testseq_2018-03-08.gene_1.exon;Parent=testseq_2018-03-08.gene_1;ApolloExon=True; +testseq_2018-03-08 feature CDS 154 297 . - 0 ID=testseq_2018-03-08.cds_gene_1;Parent=testseq_2018-03-08.gene_1.exon;source2=MGA;uniqueID=offset-159409; +testseq_2018-03-08 feature gene 314 507 . - . ID=testseq_2018-03-08.gene_2;source2=MGA;uniqueID=offset-159674; +testseq_2018-03-08 feature CDS 314 490 . - 0 ID=testseq_2018-03-08.cds_gene_2;Parent=testseq_2018-03-08.gene_2;source2=MGA;uniqueID=offset-159760; +testseq_2018-03-08 CPT_ShineFind Shine_Dalgarno_sequence 504 507 . - . ID=testseq_2018-03-08.cds_gene_2.rbs-0;Parent=testseq_2018-03-08.gene_2;uniqueID=offset-160090; +testseq_2018-03-08 cpt.prepApollo exon 314 490 . - . ID=testseq_2018-03-08.gene_2.exon;Parent=testseq_2018-03-08.gene_2;ApolloExon=True; +testseq_2018-03-08 feature CDS 314 490 . - 0 ID=testseq_2018-03-08.cds_gene_2;Parent=testseq_2018-03-08.gene_2.exon;source2=MGA;uniqueID=offset-159760; +testseq_2018-03-08 feature gene 487 670 . - . ID=testseq_2018-03-08.gene_3;source2=MGA;uniqueID=offset-159882; +testseq_2018-03-08 feature exon 487 657 . - . ID=testseq_2018-03-08.exon_gene_3;Parent=testseq_2018-03-08.gene_3;source2=MGA;uniqueID=offset-159968; +testseq_2018-03-08 feature CDS 487 657 . - 0 ID=testseq_2018-03-08.cds_gene_3;Parent=testseq_2018-03-08.exon_gene_3;source2=MGA;uniqueID=offset-159969; +testseq_2018-03-08 CPT_ShineFind Shine_Dalgarno_sequence 665 670 . - . 
ID=testseq_2018-03-08.cds_gene_3.rbs-0;Parent=testseq_2018-03-08.gene_3;uniqueID=offset-160441; +testseq_2018-03-08 feature gene 700 900 . - . ID=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159874; +testseq_2018-03-08 feature CDS 700 790 . - 0 ID=testseq_2018-03-08.cds_gene_4;Parent=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159765; +testseq_2018-03-08 feature CDS 820 900 . - 0 ID=testseq_2018-03-08.cds2_gene_4;Parent=testseq_2018-03-08.gene_4;source2=MGA;uniqueID=offset-159762; +testseq_2018-03-08 cpt.prepApollo exon 700 900 . - . ID=testseq_2018-03-08.gene_4.exon;Parent=testseq_2018-03-08.gene_4;ApolloExon=True; +testseq_2018-03-08 feature CDS 700 790 . - 0 ID=testseq_2018-03-08.cds_gene_4;Parent=testseq_2018-03-08.gene_4.exon;source2=MGA;uniqueID=offset-159765; +testseq_2018-03-08 feature CDS 820 900 . - 0 ID=testseq_2018-03-08.cds2_gene_4;Parent=testseq_2018-03-08.gene_4.exon;source2=MGA;uniqueID=offset-159762;