diff gstf_preparation.xml @ 8:92f3966d5bc3 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 88ba62ae8c3d9587a0015c72209242ad0c1df0c2
author earlhaminst
date Wed, 16 May 2018 20:03:57 -0400
parents 9ef7661e8e9c
children e8e75a79de59
line wrap: on
line diff
--- a/gstf_preparation.xml	Wed Apr 25 11:06:03 2018 -0400
+++ b/gstf_preparation.xml	Wed May 16 20:03:57 2018 -0400
@@ -37,7 +37,7 @@
             </param>
         </repeat>
         <param name="json" type="data" format="json" multiple="true" optional="true" label="Gene features in JSON format generated by 'Get features by Ensembl ID' tool" />
-        <param name="fasta_inputs" type="data" format="fasta" multiple="true" label="Corresponding FASTA datasets" help="Each FASTA header line should start with a transcript id" />
+        <param name="fasta_inputs" type="data" format="fasta" multiple="true" label="Corresponding CDS datasets in FASTA format" help="Each FASTA header line should start with a transcript id" />
         <param name="longestCDS" type="boolean" checked="false" label="Keep only the longest CDS per gene" />
         <param name="headers" type="boolean" checked="true" label="Change the header line of the FASTA sequences to the &gt;TranscriptId_species format" help="As required by TreeBest, part of the GeneSeqToFamily workflow" />
         <param name="regions" type="text" optional="true" label="Comma-separated list of region IDs (e.g. chromosomes or scaffolds) for which FASTA sequences should be filtered" help="Region IDs are in the `seqid` column for GFF3 and in the `seq_region_name` field in JSON. This is typically used to filter chromosomes with a non-standard genetic code, like mitochondria, to be analysed separately" />
@@ -111,7 +111,12 @@
 
 This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format.
 
-It also filters a CDS FASTA dataset to keep only the transcripts present in the gene feature information. Optionally it can also keep only the longest CDS per gene and/or change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow).
+It also filters the CDS FASTA datasets to:
+
+- remove coding sequences whose length is not a multiple of 3
+- keep only the transcripts present in the gene feature information
+
+Optionally it can also keep only the longest CDS per gene and/or change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow).
 
 Example GFF3 file::