Mercurial > repos > earlhaminst > gstf_preparation
diff gstf_preparation.xml @ 4:284f64ad9d43 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
author | earlhaminst |
---|---|
date | Fri, 08 Dec 2017 05:32:12 -0500 |
parents | 19644996bc2a |
children | 56bbdbfe3eaa |
line wrap: on
line diff
```diff
--- a/gstf_preparation.xml Fri Nov 24 12:32:39 2017 -0500
+++ b/gstf_preparation.xml Fri Dec 08 05:32:12 2017 -0500
@@ -1,4 +1,4 @@
-<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.3.0">
+<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.0">
     <description>converts data for the workflow</description>
     <command detect_errors="exit_code">
 <![CDATA[
@@ -14,6 +14,12 @@
 #for $fasta_input in $fasta_inputs
     --fasta '${fasta_input}'
 #end for
+#if $headers
+    --headers
+#end if
+#if $longestCDS
+    -l
+#end if
 -o '$output_db'
 --of '$output_fasta'
 ]]>
@@ -28,6 +34,8 @@
         </repeat>
         <param name="json" type="data" format="json" multiple="true" optional="true" label="Gene features in JSON format generated by 'Get features by Ensembl ID' tool" />
         <param name="fasta_inputs" type="data" format="fasta" multiple="true" label="Corresponding FASTA datasets" help="Each FASTA header line should start with a transcript id" />
+        <param name="longestCDS" type="boolean" checked="false" label="Keep only the longest CDS per gene" />
+        <param name="headers" type="boolean" checked="true" label="Change the header line of the FASTA sequences to the >TranscriptId_species format" help="As required by TreeBest, part of the GeneSeqToFamily workflow" />
     </inputs>
 
     <outputs>
@@ -40,12 +48,37 @@
             <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
             <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
             <param name="genome" value="caenorhabditis_elegans" />
+            <param name="longestCDS" value="false" />
+            <param name="headers" value="true" />
+
             <output name="output_db" file="test1.sqlite" compare="sim_size" />
             <output name="output_fasta" file="test1.fasta" />
         </test>
         <test>
+            <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
+            <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
+            <param name="genome" value="caenorhabditis_elegans" />
+            <param name="longestCDS" value="true" />
+            <param name="headers" value="true" />
+
+            <output name="output_db" file="test1.sqlite" compare="sim_size" />
+            <output name="output_fasta" file="test1_longest.fasta" />
+        </test>
+        <test>
+            <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
+            <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
+            <param name="genome" value="caenorhabditis_elegans" />
+            <param name="longestCDS" value="false" />
+            <param name="headers" value="false" />
+
+            <output name="output_db" file="test1.sqlite" compare="sim_size" />
+            <output name="output_fasta" file="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
+        </test>
+        <test>
             <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />
             <param name="json" ftype="json" value="gene.json" />
+            <param name="longestCDS" value="false" />
+            <param name="headers" value="true" />
 
             <output name="output_db" file="test2.sqlite" compare="sim_size" />
             <output name="output_fasta" file="test2.fasta" />
@@ -55,7 +88,9 @@
 <![CDATA[
 **What it does**
 
-This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format and modify the header lines of a corresponding CDS FASTA to be used with the GeneSeqToFamily workflow.
+This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format.
+
+It also filters a CDS FASTA dataset to keep only the transcripts present in the gene feature information. Optionally it can also keep only the longest CDS per gene and/or change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow).
 
 Example GFF3 file::
 
```