gstf_preparation: gstf_preparation.xml comparison

comparison gstf_preparation.xml @ 11:dbe37a658cd2 draft

"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"

author	earlhaminst
date	Sun, 27 Sep 2020 18:54:31 +0000
parents	e8e75a79de59
children	99bae410128c

comparison

equal deleted inserted replaced

-:e8e75a79de59
+:dbe37a658cd2
-<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.1">
+<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.2">
 <description>converts data for the workflow</description>
+<requirements>
+<requirement type="package" version="3.7">python</requirement>
+</requirements>
 <command detect_errors="exit_code"><![CDATA[
 python '$__tool_directory__/gstf_preparation.py'
 #for $q in $queries
 --gff3 '${q.genome}:${q.gff3_input}'
 #end for
 #end if
 #for $fasta_input in $fasta_inputs
 --fasta '${fasta_input}'
 #end for
 #if $headers
---headers
+--headers $headers
 #end if
-#if $longestCDS
+#if $filter
--l
+--filter $filter
 #end if
 #if $regions
 --regions '$regions'
 --ff '$filtered_fasta'
 #end if
 <validator type="empty_field" />
 </param>
 </repeat>
 <param name="json" type="data" format="json" multiple="true" optional="true" label="Gene features in JSON format generated by 'Get features by Ensembl ID' tool" />
 <param name="fasta_inputs" type="data" format="fasta" multiple="true" label="Corresponding CDS datasets in FASTA format" help="Each FASTA header line should start with a transcript id" />
-<param name="longestCDS" type="boolean" checked="false" label="Keep only the longest CDS per gene" />
+<param name="filter" type="select" display="radio" label="Which transcripts to keep">
-<param name="headers" type="boolean" checked="true" label="Change the header line of the FASTA sequences to the &gt;TranscriptId_species format" help="As required by TreeBest, part of the GeneSeqToFamily workflow" />
+<option value="canonical" selected="true">Only canonical transcripts (or longest CDS per gene)</option>
+<option value="coding">Only protein-coding transcripts</option>
+<option value="">All transcripts</option>
+</param>
+<param name="headers" type="select" display="radio" label="Change the header line of the FASTA sequences to the following format" help="As required by TreeBest, part of the GeneSeqToFamily workflow. Only the TranscriptId_species format is accepted by the Aequatus visualisation">
+<option value="TranscriptId_species" selected="true">TranscriptId_species</option>
+<option value="GeneSymbol-TranscriptID_species">GeneSymbol-TranscriptID_species</option>
+<option value="TranscriptSymbol-TranscriptID_species">TranscriptSymbol-TranscriptID_species</option>
+<option value="">Don't change</option>
+</param>
 <param name="regions" type="text" optional="true" label="Comma-separated list of region IDs (e.g. chromosomes or scaffolds) for which FASTA sequences should be filtered out" help="Region IDs are in the `seqid` column for GFF3 and in the `seq_region_name` field in JSON. This is typically used to filter out chromosomes with a non-standard genetic code, like mitochondria, to be analysed separately" />
 </inputs>
 <outputs>
 <data name="output_db" format="sqlite" label="${tool.name} on ${on_string}: SQLite" />
 </data>
 </outputs>
 <tests>
 <test expect_num_outputs="2">
+<repeat name="queries">
+<param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
+<param name="genome" value="caenorhabditis_elegans" />
+</repeat>
 <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
-<param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
+<param name="filter" value="coding" />
-<param name="genome" value="caenorhabditis_elegans" />
+<param name="headers" value="TranscriptId_species" />
-<param name="longestCDS" value="false" />
-<param name="headers" value="true" />
 <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" />
 <output name="output_fasta" file="test1.fasta" />
 </test>
 <test expect_num_outputs="2">
+<repeat name="queries">
+<param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
+<param name="genome" value="caenorhabditis_elegans" />
+</repeat>
 <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
-<param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
+<param name="filter" value="canonical" />
-<param name="genome" value="caenorhabditis_elegans" />
+<param name="headers" value="TranscriptId_species" />
-<param name="longestCDS" value="true" />
-<param name="headers" value="true" />
 <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" />
 <output name="output_fasta" file="test1_longest.fasta" />
 </test>
 <test expect_num_outputs="2">
-<param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
+<param name="json" ftype="json" value="gene.json" />
-<param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
+<param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />
-<param name="genome" value="caenorhabditis_elegans" />
+<param name="filter" value="" />
-<param name="longestCDS" value="false" />
+<param name="headers" value="" />
-<param name="headers" value="false" />
-<output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" />
+<output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" />
-<output name="output_fasta" file="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
+<output name="output_fasta" file="CDS.fasta" />
 </test>
 <test expect_num_outputs="2">
+<param name="json" ftype="json" value="gene.json" />
 <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />
-<param name="json" ftype="json" value="gene.json" />
+<param name="filter" value="coding" />
-<param name="longestCDS" value="false" />
+<param name="headers" value="TranscriptId_species" />
-<param name="headers" value="true" />
 <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" />
 <output name="output_fasta" file="test4.fasta" />
 </test>
 <test>
+<param name="json" ftype="json" value="gene.json" />
 <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />
-<param name="json" ftype="json" value="gene.json" />
+<param name="filter" value="coding" />
-<param name="longestCDS" value="false" />
+<param name="headers" value="TranscriptId_species" />
-<param name="headers" value="true" />
 <param name="regions" value="X" />
 <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" />
 <output name="output_fasta" file="test5_filtered.fasta" />
 <output name="filtered_fasta" file="test5.ns.fasta" />
 </test>
 <test expect_num_outputs="2">
+<repeat name="queries">
+<param name="gff3_input" ftype="gff3" value="MGP_PahariEiJ_G0008413.1.gff3" />
+<param name="genome" value="mus_pahari" />
+</repeat>
 <param name="fasta_inputs" ftype="fasta" value="Mus_pahari.PAHARI_EIJ_v1.1.cds.all.shortened.fa" />
-<param name="gff3_input" ftype="gff3" value="MGP_PahariEiJ_G0008413.1.gff3" />
+<param name="filter" value="canonical" />
-<param name="genome" value="mus_pahari" />
+<param name="headers" value="TranscriptId_species" />
-<param name="longestCDS" value="true" />
-<param name="headers" value="true" />
 <output name="output_db" file="test6.sqlite" compare="sim_size" delta="30000" />
 <output name="output_fasta" file="test6.fasta" />
 </test>
 </tests>
 <help><![CDATA[
 **What it does**
 This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format.
-It also filters the CDS FASTA datasets to:
+It also filters the CDS FASTA datasets to keep only the transcripts present in the gene feature information.
-- remove coding sequences whose length is not a multiple of 3
+Optionally it can also:
-- keep only the transcripts present in the gene feature information.
+- keep only canonical transcripts (or the longest CDS per gene, if this attribute is not provided)
+- remove sequences which are annotated as non-protein-coding or whose length is not a multiple of 3
-Optionally it can also keep only the longest CDS per gene and/or change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow).
+- change the header line of the FASTA sequences to one of the supported formats, e.g. >TranscriptId_species (as required by TreeBest, part of the GeneSeqToFamily workflow).
 Example GFF3 file::
 scaffold_0  MYZPE13164_Clone_G006_v1.0  gene            44968   69413   .   -   .   ID=MYZPE13164_G006_v1.0_000000030;Name=MYZPE13164_G006_v1.0_000000030;biotype=protein_coding
 scaffold_0  MYZPE13164_Clone_G006_v1.0  mRNA            44968   69413   .   -   .   ID=MYZPE13164_G006_v1.0_000000030.1;Parent=MYZPE13164_G006_v1.0_000000030;Name=MYZPE13164_G006_v1.0_000000030.1;biotype=protein_coding;_AED=0.31

Mercurial > repos > earlhaminst > gstf_preparation

comparison gstf_preparation.xml @ 11:dbe37a658cd2 draft