diff sailfish.xml @ 0:3b4ed0e473dc draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/sailfish commit bd2dd2419ea52f30cd7de2f7109a12b49b5d0dba-dirty
author bgruening
date Fri, 16 Oct 2015 15:09:03 -0400
parents
children 06646e81c543
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sailfish.xml	Fri Oct 16 15:09:03 2015 -0400
@@ -0,0 +1,324 @@
+<tool id="sailfish" name="Sailfish" version="0.7.6.0">
+    <description>transcript quantification from RNA-seq data</description>
+    <requirements>
+        <requirement type="package" version="0.7.6">sailfish</requirement>
+    </requirements>
+    <macros>
+        <xml name="strandedness">  <!-- shared select; expanded in both the single-end and paired-end branches of single_or_paired -->
+            <param name="strandedness" type="select" label="Specify the strandedness of the reads">  <!-- the option values are inserted into the libType string by the command template -->
+                <option value="U" selected="True">Not stranded</option>
+                <option value="SF">read 1 (or single-end read) comes from the forward strand</option>
+                <option value="SR">read 1 (or single-end read) comes from the reverse strand</option>
+            </param>
+        </xml>
+    </macros>
+    <stdio>  <!-- error detection rules; no level or source attributes are set, so Galaxy's defaults apply -->
+        <exit_code range="1:" />  <!-- exit status of 1 or greater -->
+        <exit_code range=":-1" />  <!-- exit status of -1 or lower -->
+        <regex match="Error:" />  <!-- the text "Error:" in the tool's output -->
+        <regex match="Exception:" />  <!-- the text "Exception:" in the tool's output -->
+    </stdio>
+    <version_command>sailfish -version</version_command>  <!-- NOTE(review): long option written with a single dash; confirm this sailfish release accepts it -->
+    <command>
+<![CDATA[
+        ## Stage 1: build an index from the history FASTA, or look up the path of a built-in index.
+        #if $refTranscriptSource.TranscriptSource == "history":
+            sailfish index
+                --transcripts $refTranscriptSource.ownFile
+                --kmerSize $refTranscriptSource.kmerSize
+                --out ./index_dir
+                --threads "\${GALAXY_SLOTS:-4}"
+            &&
+            #set $index_path = './index_dir'
+        #else:
+            #set $index_path = $refTranscriptSource.index.fields.path
+        #end if
+
+        ## Stage 2: symlink the inputs under names that end in the dataset extension.
+        #if $single_or_paired.single_or_paired_opts == 'single':
+            ln -s $single_or_paired.input_singles ./single.$single_or_paired.input_singles.ext &&
+        #else:
+            ln -s $single_or_paired.input_mate1 ./mate1.$single_or_paired.input_mate1.ext &&
+            ln -s $single_or_paired.input_mate2 ./mate2.$single_or_paired.input_mate2.ext &&
+        #end if
+        #if $geneMap:
+            ln -s "$geneMap" ./geneMap.$geneMap.ext &&
+        #end if
+
+        ## Stage 3: quantify against the index chosen above; optional values are passed only when set.
+        sailfish quant
+            --index $index_path
+            #if $single_or_paired.single_or_paired_opts == 'single':
+                ## Single-end libraries define no orientation parameter; libType is the strandedness code alone.
+                --libType "${single_or_paired.strandedness}"
+                --unmated_reads ./single.$single_or_paired.input_singles.ext
+            #else:
+                --mates1 ./mate1.$single_or_paired.input_mate1.ext
+                --mates2 ./mate2.$single_or_paired.input_mate2.ext
+                --libType "${single_or_paired.orientation}${single_or_paired.strandedness}"
+            #end if
+            --output ./
+            $biasCorrect
+            --threads "\${GALAXY_SLOTS:-4}"
+
+            #if $fldMean:
+                --fldMean $fldMean
+            #end if
+
+            #if $fldSD:
+                --fldSD $fldSD
+            #end if
+
+            #if $maxReadOcc:
+                --maxReadOcc $maxReadOcc
+            #end if
+
+            #if $geneMap:
+                --geneMap ./geneMap.${geneMap.ext}
+            #end if
+
+            $noEffectiveLengthCorrection
+            $useVBOpt
+            $allowOrphans
+            $unsmoothedFLD
+            #if $maxFragLen:
+                --maxFragLen ${maxFragLen}
+            #end if
+            --txpAggregationKey "${txpAggregationKey}"
+]]>
+    </command>
+    <inputs>
+        <conditional name="refTranscriptSource">  <!-- choose between a built-in index and a FASTA from the history -->
+            <param name="TranscriptSource" type="select" label="Select a reference transcriptome from your history or use a built-in index?" help="Built-ins were indexed using default options">
+                <option value="indexed">Use a built-in index</option>
+                <option value="history" selected="True">Use one from the history</option>
+            </param>
+            <when value="indexed">
+                <param name="index" type="select" label="Select a reference transcriptome" help="If your transcriptome of interest is not listed, contact your Galaxy admin">
+                    <options from_data_table="sailfish_indexes">
+                        <filter type="sort_by" column="2"/>
+                        <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+                    </options>
+                </param>
+            </when>  <!-- built-in -->
+            <when value="history">
+                <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select the reference transcriptome" />
+                <param argument="--kmerSize" type="integer" value="21" min="1" max="32" label="The size of the k-mer on which the index is built"
+                    help="There is a tradeoff here between the distinctiveness of the k-mers and their robustness to errors.
+                        The shorter the k-mers, the more robust they will be to errors in the reads, but the longer the k-mers,
+                        the more distinct they will be.  We generally recommend using a k-mer size of at least 20."/>
+            </when>  <!-- history -->
+        </conditional>  <!-- refTranscriptSource -->
+
+        <conditional name="single_or_paired">  <!-- read layout; selects which read inputs and library-type selectors are offered -->
+            <param name="single_or_paired_opts" type="select" label="Is this library mate-paired?">
+                <option value="single">Single-end</option>
+                <option value="paired">Paired-end</option>
+            </param>
+            <when value="single">  <!-- only strandedness is offered here; this branch has no orientation selector -->
+                <param name="input_singles" type="data" format="fastq,fasta" label="FASTQ/FASTA file" help="FASTQ or FASTA file." />
+                <expand macro="strandedness" />
+            </when>
+            <when value="paired">
+                <param name="input_mate1" type="data" format="fastq,fasta" label="Mate pair 1" help="FASTQ or FASTA file." />
+                <param name="input_mate2" type="data" format="fastq,fasta" label="Mate pair 2" help="FASTQ or FASTA file." />
+                <param name="orientation" type="select" label="Relative orientation of reads within a pair">
+                    <option value="M">Mates are oriented in the same direction (M = matching)</option>
+                    <option value="O">Mates are oriented away from each other (O = outward)</option>
+                    <option value="I" selected="True">Mates are oriented toward each other (I = inward)</option>
+                </param>
+                <expand macro="strandedness" />
+            </when>
+        </conditional>
+
+        <param argument="--geneMap" type="data" format="tabular,gff,gtf" optional="True" label="File containing a mapping of transcripts to genes"
+            help="Calculates the aggregated gene-level abundance estimations. This file should be either a GTF file or tab-delimited format
+            where each line contains the name of a transcript and the gene to which it belongs separated by a tab." />
+
+        <param argument="--biasCorrect" type="boolean" truevalue="--biasCorrect" falsevalue="" checked="False"
+                    label="Perform bias correction" help=""/>
+
+        <param argument="--fldMean" type="integer" value="200" optional="True" label="Calculate effective lengths"
+            help="If single end reads are being used for quantification, or there are an insufficient number of uniquely mapping reads when performing paired-end quantification
+                    to estimate the empirical fragment length distribution, then use this value to calculate effective lengths."/>
+
+        <param argument="--fldSD" type="integer" value="80" optional="True" label="Standard deviation"
+            help="The standard deviation used in the fragment length distribution for single-end quantification or when an empirical distribution cannot be learned."/>
+
+        <param argument="--maxReadOcc" type="integer" value="200" optional="True" label="Maximal read mapping occurrence"
+            help="Reads mapping to more than this many places won't be considered."/>
+        <!-- Boolean switches: each truevalue is the literal flag that the command template places on the sailfish quant call. -->
+        <param argument="--noEffectiveLengthCorrection" type="boolean" truevalue="--noEffectiveLengthCorrection" falsevalue="" checked="False"
+            label="Disable effective length correction" help="Disables effective length correction when computing the probability that a fragment was generated from a transcript.
+            If this flag is passed in, the fragment length distribution is not taken into account when computing this probability."/>
+
+        <param argument="--useVBOpt" type="boolean" truevalue="--useVBOpt" falsevalue="" checked="False"
+            label="Use Variational Bayesian EM algorithm for optimization" help=""/>
+
+        <param argument="--allowOrphans" type="boolean" truevalue="--allowOrphans" falsevalue="" checked="False"
+            label="Consider orphaned reads as valid hits when performing lightweight-alignment"
+            help="This option will increase sensitivity (allow more reads to map and more transcripts to be detected), but may decrease specificity as orphaned alignments are more likely to be spurious."/>
+
+        <param argument="--unsmoothedFLD" type="boolean" truevalue="--unsmoothedFLD" falsevalue="" checked="False"
+            label="Use the un-smoothed approach to effective length correction" help="This traditional approach works by convolving the FLD with the characteristic function over each transcript."/>
+
+        <param argument="--maxFragLen" type="integer" value="1000" optional="True"
+            label="The maximum length of a fragment to consider when building the empirical fragment length distribution"
+            help=""/>
+
+        <param argument="--txpAggregationKey" value="gene_id" type="text" label="The key for aggregating transcripts during gene-level estimates"
+            help="The default is the gene_id field, but other fields (e.g. gene_name) might be useful depending on the specifics of the annotation being used." />
+
+    </inputs>
+    <outputs>  <!-- each filter is a Python expression over the input parameters, referenced by parameter name -->
+        <data name="output_quant" format="tabular" from_work_dir="quant.sf" label="${tool.name} on ${on_string} (Quantification)" />
+        <data name="output_bias_corrected_quant" format="tabular" from_work_dir="quant_bias_corrected.sf" label="${tool.name} on ${on_string} (Bias corrected Quantification)">
+            <filter>biasCorrect</filter>  <!-- kept only when the biasCorrect checkbox is ticked -->
+        </data>
+        <data name="output_gene_quant" format="tabular" from_work_dir="quant.genes.sf" label="${tool.name} on ${on_string} (Gene Quantification)">
+            <filter>geneMap</filter>  <!-- kept only when the optional transcript-to-gene mapping dataset is supplied -->
+        </data>
+    </outputs>
+    <tests>
+        <test>  <!-- paired-end reads, index built from a history FASTA, bias correction switched on -->
+            <param name="single_or_paired_opts" value="paired" />
+            <param name="input_mate1" value="reads_1.fastq" />
+            <param name="input_mate2" value="reads_2.fastq" />
+            <param name="biasCorrect" value="True" />
+            <param name="TranscriptSource" value="history" />
+            <param name="ownFile" value="transcripts.fasta" ftype="fasta" />
+            <output file="sailfish_quant_result1.tab" ftype="tabular" name="output_quant" />
+            <output file="sailfish_bias_result1.tab" ftype="tabular" name="output_bias_corrected_quant" />
+        </test>
+    </tests>
+    <help>
+<![CDATA[
+**What it does**
+
+Sailfish is a tool for transcript quantification from RNA-seq data.  It
+requires a set of target transcripts (either from a reference or *de-novo*
+assembly) to quantify.  All you need to run Sailfish is a fasta file containing
+your reference transcripts and a (set of) fasta/fastq file(s) containing your
+reads.  Sailfish runs in two phases; indexing and quantification.  The indexing
+step is independent of the reads, and only needs to be run once for a particular
+set of reference transcripts and choice of k (the k-mer size). The
+quantification step, obviously, is specific to the set of RNA-seq reads and is
+thus run more frequently.
+
+The quantification output contains a number of columns:
+(1) Transcript ID,
+(2) Transcript Length,
+(3) Transcripts per Million (TPM) and
+(4) Estimated number of reads (an estimate of the number of reads drawn from this transcript given the transcript’s relative abundance and length).
+
+The first two columns are self-explanatory, the third is a measure of transcript abundance and the final is a commonly used input for differential expression tools.
+The Transcripts per Million quantification number is computed as described in [1], and is meant as an estimate of the number of transcripts, per million observed transcripts,
+originating from each isoform. Its benefit over the F/RPKM measure is that it is independent of the mean expressed transcript length
+(i.e. if the mean expressed transcript length varies between samples, for example, this alone can affect differential analysis based on the F/RPKM).
+
+
+
+Fragment Library Types
+======================
+
+There are numerous library preparation protocols for RNA-seq that result in
+sequencing reads with different characteristics.  For example, reads can be
+single end (only one side of a fragment is recorded as a read) or paired-end
+(reads are generated from both ends of a fragment).  Further, the sequencing
+reads themselves may be unstranded or strand-specific.  Finally, paired-end
+protocols will have a specified relative orientation.  To characterize the
+various different types of sequencing libraries, we've created a miniature
+"language" that allows for the succinct description of the many different types
+of possible fragment libraries.  For paired-end reads, the possible
+orientations, along with a graphical description of what they mean, are
+illustrated below:
+
+.. image:: ReadLibraryIllustration.png
+
+The library type string consists of three parts: the relative orientation of
+the reads, the strandedness of the library, and the directionality of the
+reads.
+
+The first part of the library string (relative orientation) is only provided if
+the library is paired-end. The possible options are:
+
+::
+
+    I = inward
+    O = outward
+    M = matching
+
+The second part of the read library string specifies whether the protocol is
+stranded or unstranded; the options are:
+
+::
+
+    S = stranded
+    U = unstranded
+
+If the protocol is unstranded, then we're done.  The final part of the library
+string specifies the strand from which the read originates in a strand-specific
+protocol — it is only provided if the library is stranded (i.e. if the
+library format string is of the form S).  The possible values are:
+
+::
+
+    F = read 1 (or single-end read) comes from the forward strand
+    R = read 1 (or single-end read) comes from the reverse strand
+
+So, for example, if you wanted to specify a fragment library of strand-specific
+paired-end reads, oriented toward each other, where read 1 comes from the
+forward strand and read 2 comes from the reverse strand, you would specify ``-l
+ISF`` on the command line.  This designates that the library being processed has
+the type "ISF" meaning, **I**\ nward (the relative orientation), **S**\ tranded
+(the protocol is strand-specific), **F**\ orward (read 1 comes from the forward
+strand).
+
+The single end library strings are a bit simpler than their paired-end
+counterparts, since there is no relative orientation of which to speak.  Thus, the
+only possible library format types for single-end reads are ``U`` (for
+unstranded), ``SF`` (for strand-specific reads coming from the forward strand)
+and ``SR`` (for strand-specific reads coming from the reverse strand).
+
+A few more examples of some library format strings and their interpretations are:
+
+::
+
+    IU (an unstranded paired-end library where the reads face each other)
+
+::
+
+    SF (a stranded single-end protocol where the reads come from the forward strand)
+
+::
+
+    OSR (a stranded paired-end protocol where the reads face away from each other,
+         read1 comes from reverse strand and read2 comes from the forward strand)
+
+.. note:: Correspondence to TopHat library types 
+
+   The popular `TopHat <http://ccb.jhu.edu/software/tophat/index.shtml>`_ RNA-seq 
+   read aligner has a different convention for specifying the format of the library.
+   Below is a table that provides the corresponding sailfish/salmon library format
+   string for each of the potential TopHat library types:
+
+
+   +---------------------+-------------------------+  
+   | TopHat              | Salmon (and Sailfish)   |
+   +=====================+============+============+
+   |                     | Paired-end | Single-end | 
+   +---------------------+------------+------------+
+   |``-fr-unstranded``   |``-l IU``   |``-l U``    |
+   +---------------------+------------+------------+
+   |``-fr-firststrand``  |``-l ISR``  |``-l SR``   |
+   +---------------------+------------+------------+
+   |``-fr-secondstrand`` |``-l ISF``  |``-l SF``   |
+   +---------------------+------------+------------+
+
+   The remaining salmon library format strings are not directly expressible in terms
+   of the TopHat library types, and so there is no direct mapping for them.
+
+
+]]>
+    </help>
+</tool>