Mercurial > repos > earlhaminst > gstf_preparation
diff gstf_preparation.xml @ 11:dbe37a658cd2 draft
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
author | earlhaminst |
---|---|
date | Sun, 27 Sep 2020 18:54:31 +0000 |
parents | e8e75a79de59 |
children | 99bae410128c |
line wrap: on
line diff
--- a/gstf_preparation.xml Thu Oct 31 08:16:51 2019 -0400
+++ b/gstf_preparation.xml Sun Sep 27 18:54:31 2020 +0000
@@ -1,5 +1,8 @@
-<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.1">
+<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.2">
     <description>converts data for the workflow</description>
+    <requirements>
+        <requirement type="package" version="3.7">python</requirement>
+    </requirements>
     <command detect_errors="exit_code"><![CDATA[
 python '$__tool_directory__/gstf_preparation.py'
 #for $q in $queries
@@ -14,10 +17,10 @@
     --fasta '${fasta_input}'
 #end for
 #if $headers
-    --headers
+    --headers $headers
 #end if
-#if $longestCDS
-    -l
+#if $filter
+    --filter $filter
 #end if
 #if $regions
     --regions '$regions'
@@ -36,8 +39,18 @@
         </repeat>
         <param name="json" type="data" format="json" multiple="true" optional="true" label="Gene features in JSON format generated by 'Get features by Ensembl ID' tool" />
         <param name="fasta_inputs" type="data" format="fasta" multiple="true" label="Corresponding CDS datasets in FASTA format" help="Each FASTA header line should start with a transcript id" />
-        <param name="longestCDS" type="boolean" checked="false" label="Keep only the longest CDS per gene" />
-        <param name="headers" type="boolean" checked="true" label="Change the header line of the FASTA sequences to the >TranscriptId_species format" help="As required by TreeBest, part of the GeneSeqToFamily workflow" />
+        <param name="filter" type="select" display="radio" label="Which transcripts to keep">
+            <option value="canonical" selected="true">Only canonical transcripts (or longest CDS per gene)</option>
+            <option value="coding">Only protein-coding transcripts</option>
+            <option value="">All transcripts</option>
+        </param>
+
+        <param name="headers" type="select" display="radio" label="Change the header line of the FASTA sequences to the following format" help="As required by TreeBest, part of the GeneSeqToFamily workflow, only TranscriptId_species is acceptable format by Aequatus visualisation">
+            <option value="TranscriptId_species" selected="true">TranscriptId_species</option>
+            <option value="GeneSymbol-TranscriptID_species">GeneSymbol-TranscriptID_species</option>
+            <option value="TranscriptSymbol-TranscriptID_species">TranscriptSymbol-TranscriptID_species</option>
+            <option value="">Don't change</option>
+        </param>
         <param name="regions" type="text" optional="true" label="Comma-separated list of region IDs (e.g. chromosomes or scaffolds) for which FASTA sequences should be filtered out" help="Region IDs are in the `seqid` column for GFF3 and in the `seq_region_name` field in JSON. This is typically used to filter out chromosomes with a non-standard genetic code, like mitochondria, to be analysed separately" />
     </inputs>
 
@@ -51,49 +64,52 @@
 
     <tests>
         <test expect_num_outputs="2">
+            <repeat name="queries">
+                <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
+                <param name="genome" value="caenorhabditis_elegans" />
+            </repeat>
             <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
-            <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
-            <param name="genome" value="caenorhabditis_elegans" />
-            <param name="longestCDS" value="false" />
-            <param name="headers" value="true" />
+            <param name="filter" value="coding" />
+            <param name="headers" value="TranscriptId_species" />
 
             <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" />
             <output name="output_fasta" file="test1.fasta" />
         </test>
         <test expect_num_outputs="2">
+            <repeat name="queries">
+                <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
+                <param name="genome" value="caenorhabditis_elegans" />
+            </repeat>
             <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
-            <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
-            <param name="genome" value="caenorhabditis_elegans" />
-            <param name="longestCDS" value="true" />
-            <param name="headers" value="true" />
+            <param name="filter" value="canonical" />
+            <param name="headers" value="TranscriptId_species" />
 
             <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" />
             <output name="output_fasta" file="test1_longest.fasta" />
         </test>
         <test expect_num_outputs="2">
-            <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
-            <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
-            <param name="genome" value="caenorhabditis_elegans" />
-            <param name="longestCDS" value="false" />
-            <param name="headers" value="false" />
+            <param name="json" ftype="gff3" value="gene.json" />
+            <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />
+            <param name="filter" value="" />
+            <param name="headers" value="" />
 
-            <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" />
-            <output name="output_fasta" file="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
+            <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" />
+            <output name="output_fasta" file="CDS.fasta" />
         </test>
         <test expect_num_outputs="2">
+            <param name="json" ftype="json" value="gene.json" />
             <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />
-            <param name="json" ftype="json" value="gene.json" />
-            <param name="longestCDS" value="false" />
-            <param name="headers" value="true" />
+            <param name="filter" value="coding" />
+            <param name="headers" value="TranscriptId_species" />
 
             <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" />
             <output name="output_fasta" file="test4.fasta" />
         </test>
         <test>
+            <param name="json" ftype="json" value="gene.json" />
             <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />
-            <param name="json" ftype="json" value="gene.json" />
-            <param name="longestCDS" value="false" />
-            <param name="headers" value="true" />
+            <param name="filter" value="coding" />
+            <param name="headers" value="TranscriptId_species" />
             <param name="regions" value="X" />
 
             <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" />
@@ -101,11 +117,13 @@
             <output name="filtered_fasta" file="test5.ns.fasta" />
         </test>
         <test expect_num_outputs="2">
+            <repeat name="queries">
+                <param name="gff3_input" ftype="gff3" value="MGP_PahariEiJ_G0008413.1.gff3" />
+                <param name="genome" value="mus_pahari" />
+            </repeat>
             <param name="fasta_inputs" ftype="fasta" value="Mus_pahari.PAHARI_EIJ_v1.1.cds.all.shortened.fa" />
-            <param name="gff3_input" ftype="gff3" value="MGP_PahariEiJ_G0008413.1.gff3" />
-            <param name="genome" value="mus_pahari" />
-            <param name="longestCDS" value="true" />
-            <param name="headers" value="true" />
+            <param name="filter" value="canonical" />
+            <param name="headers" value="TranscriptId_species" />
 
             <output name="output_db" file="test6.sqlite" compare="sim_size" delta="30000" />
             <output name="output_fasta" file="test6.fasta" />
@@ -116,12 +134,12 @@
 
 This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format.
 
-It also filters the CDS FASTA datasets to:
+It also filters the CDS FASTA datasets to keep only the transcripts present in the gene feature information.
 
-- remove coding sequences whose length is not a multiple of 3
-- keep only the transcripts present in the gene feature information.
-
-Optionally it can also keep only the longest CDS per gene and/or change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow).
+Optionally it can also:
+- keep only canonical transcripts (or the longest CDS per gene, if this attribute is not provided)
+- remove sequences which are annotated as non protein-coding or whose length is not a multiple of 3
+- change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow).
 
 Example GFF3 file::
 