Mercurial > repos > bgruening > sailfish
diff sailfish.xml @ 0:3b4ed0e473dc draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/sailfish commit bd2dd2419ea52f30cd7de2f7109a12b49b5d0dba-dirty
author | bgruening |
---|---|
date | Fri, 16 Oct 2015 15:09:03 -0400 |
parents | |
children | 06646e81c543 |
line wrap: on
line diff
<tool id="sailfish" name="Sailfish" version="0.7.6.0">
    <!--
        Galaxy wrapper for Sailfish.

        The command optionally builds an index from a transcriptome in the
        history (or uses a built-in index from the sailfish_indexes data
        table), symlinks the inputs into the working directory under names
        that carry their datatype extension, and runs "sailfish quant" with
        the working directory as output directory. Outputs are collected
        from the working directory via from_work_dir.
    -->
    <description>transcript quantification from RNA-seq data</description>
    <requirements>
        <requirement type="package" version="0.7.6">sailfish</requirement>
    </requirements>
    <macros>
        <!-- Strandedness part of the library type string; shared by the single-end and paired-end branches. -->
        <xml name="strandedness">
            <param name="strandedness" type="select" label="Specify the strandedness of the reads">
                <option value="U" selected="True">Not stranded</option>
                <option value="SF">read 1 (or single-end read) comes from the forward strand</option>
                <option value="SR">read 1 (or single-end read) comes from the reverse strand</option>
            </param>
        </xml>
    </macros>
    <stdio>
        <exit_code range="1:" />
        <exit_code range=":-1" />
        <regex match="Error:" />
        <regex match="Exception:" />
    </stdio>
    <version_command>sailfish -version</version_command>
    <command>
<![CDATA[

        ## Step 1: build an index only when the transcriptome comes from the history.
        ## The command separator lives inside this branch so that the built-in
        ## index branch does not start the command line with a dangling separator.
        #if $refTranscriptSource.TranscriptSource == "history":
            sailfish index
                --transcripts "${refTranscriptSource.ownFile}"
                --kmerSize $refTranscriptSource.kmerSize
                --out ./index_dir
                --threads "\${GALAXY_SLOTS:-4}"
            &&
            #set $index_path = './index_dir'
        #else:
            #set $index_path = $refTranscriptSource.index.fields.path
        #end if

        ## Step 2: link the inputs under names that end in their datatype extension.
        #if $single_or_paired.single_or_paired_opts == 'single':
            ln -s "${single_or_paired.input_singles}" ./single.$single_or_paired.input_singles.ext &&
        #else:
            ln -s "${single_or_paired.input_mate1}" ./mate1.$single_or_paired.input_mate1.ext &&
            ln -s "${single_or_paired.input_mate2}" ./mate2.$single_or_paired.input_mate2.ext &&
        #end if

        #if $geneMap:
            ln -s "$geneMap" ./geneMap.$geneMap.ext &&
        #end if

        ## Step 3: quantification.
        sailfish quant
            --index "$index_path"
            #if $single_or_paired.single_or_paired_opts == 'single':
                ## Single-end libraries have no relative orientation; the
                ## library type is the strandedness alone (U, SF or SR).
                --libType "${single_or_paired.strandedness}"
                --unmated_reads ./single.$single_or_paired.input_singles.ext
            #else:
                --mates1 ./mate1.$single_or_paired.input_mate1.ext
                --mates2 ./mate2.$single_or_paired.input_mate2.ext
                --libType "${single_or_paired.orientation}${single_or_paired.strandedness}"
            #end if
            --output ./
            $biasCorrect
            --threads "\${GALAXY_SLOTS:-4}"

            #if $fldMean:
                --fldMean $fldMean
            #end if

            #if $fldSD:
                --fldSD $fldSD
            #end if

            #if $maxReadOcc:
                --maxReadOcc $maxReadOcc
            #end if

            #if $geneMap:
                --geneMap ./geneMap.${geneMap.ext}
            #end if

            $noEffectiveLengthCorrection
            $useVBOpt
            $allowOrphans

            $unsmoothedFLD

            ## maxFragLen is optional; skip the flag entirely when it is left empty.
            #if $maxFragLen:
                --maxFragLen ${maxFragLen}
            #end if
            --txpAggregationKey "${txpAggregationKey}"

]]>
    </command>
    <inputs>
        <conditional name="refTranscriptSource">
            <param name="TranscriptSource" type="select" label="Select a reference transcriptome from your history or use a built-in index?" help="Built-ins were indexed using default options">
                <option value="indexed">Use a built-in index</option>
                <option value="history" selected="True">Use one from the history</option>
            </param>
            <when value="indexed">
                <param name="index" type="select" label="Select a reference transcriptome" help="If your transcriptome of interest is not listed, contact your Galaxy admin">
                    <options from_data_table="sailfish_indexes">
                        <filter type="sort_by" column="2"/>
                        <validator type="no_options" message="No indexes are available for the selected input dataset"/>
                    </options>
                </param>
            </when> <!-- built-in -->
            <when value="history">
                <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select the reference transcriptome" />
                <param argument="--kmerSize" type="integer" value="21" max="32" label="The size of the k-mer on which the index is built"
                    help="There is a tradeoff here between the distinctiveness of the k-mers and their robustness to errors.
                        The shorter the k-mers, the more robust they will be to errors in the reads, but the longer the k-mers,
                        the more distinct they will be. We generally recommend using a k-mer size of at least 20."/>
            </when> <!-- history -->
        </conditional> <!-- refTranscriptSource -->

        <conditional name="single_or_paired">
            <param name="single_or_paired_opts" type="select" label="Is this library mate-paired?">
                <option value="single">Single-end</option>
                <option value="paired">Paired-end</option>
            </param>
            <when value="single">
                <param name="input_singles" type="data" format="fastq,fasta" label="FASTQ/FASTA file" help="FASTQ or FASTA file." />
                <expand macro="strandedness" />
            </when>
            <when value="paired">
                <param name="input_mate1" type="data" format="fastq,fasta" label="Mate pair 1" help="FASTQ or FASTA file." />
                <param name="input_mate2" type="data" format="fastq,fasta" label="Mate pair 2" help="FASTQ or FASTA file." />
                <param name="orientation" type="select" label="Relative orientation of reads within a pair">
                    <option value="M">Mates are oriented in the same direction (M = matching)</option>
                    <option value="O">Mates are oriented away from each other (O = outward)</option>
                    <option value="I" selected="True">Mates are oriented toward each other (I = inward)</option>
                </param>
                <expand macro="strandedness" />
            </when>
        </conditional>

        <param argument="--geneMap" type="data" format="tabular,gff,gtf" optional="True" label="File containing a mapping of transcripts to genes"
            help="Calculates the aggregated gene-level abundance estimations. This file should be either a GTF file or tab-delimited format
                where each line contains the name of a transcript and the gene to which it belongs separated by a tab." />

        <param argument="--biasCorrect" type="boolean" truevalue="--biasCorrect" falsevalue="" checked="False"
            label="Perform bias correction" help=""/>

        <param argument="--fldMean" type="integer" value="200" optional="True" label="Calculate effective lengths"
            help="If single end reads are being used for quantification, or there are an insufficient number of uniquely mapping reads when performing paired-end quantification
                to estimate the empirical fragment length distribution, then use this value to calculate effective lengths."/>

        <param argument="--fldSD" type="integer" value="80" optional="True" label="Standard deviation"
            help="The standard deviation used in the fragment length distribution for single-end quantification or when an empirical distribution cannot be learned."/>

        <param argument="--maxReadOcc" type="integer" value="200" optional="True" label="Maximal read mapping occurrence"
            help="Reads mapping to more than this many places won't be considered."/>

        <param argument="--noEffectiveLengthCorrection" type="boolean" truevalue="--noEffectiveLengthCorrection" falsevalue="" checked="False"
            label="Disable effective length correction" help="Disables effective length correction when computing the probability that a fragment was generated from a transcript.
                If this flag is passed in, the fragment length distribution is not taken into account when computing this probability."/>

        <param argument="--useVBOpt" type="boolean" truevalue="--useVBOpt" falsevalue="" checked="False"
            label="Use Variational Bayesian EM algorithm for optimization" help=""/>

        <param argument="--allowOrphans" type="boolean" truevalue="--allowOrphans" falsevalue="" checked="False"
            label="Consider orphaned reads as valid hits when performing lightweight-alignment"
            help="This option will increase sensitivity (allow more reads to map and more transcripts to be detected), but may decrease specificity as orphaned alignments are more likely to be spurious."/>

        <param argument="--unsmoothedFLD" type="boolean" truevalue="--unsmoothedFLD" falsevalue="" checked="False"
            label="Use the un-smoothed approach to effective length correction" help="This traditional approach works by convolving the FLD with the characteristic function over each transcript."/>

        <param argument="--maxFragLen" type="integer" value="1000" optional="True"
            label="The maximum length of a fragment to consider when building the empirical fragment length distribution"
            help=""/>

        <param argument="--txpAggregationKey" value="gene_id" type="text" label="The key for aggregating transcripts during gene-level estimates"
            help="The default is the gene_id field, but other fields (e.g. gene_name) might be useful depending on the specifics of the annotation being used." />

    </inputs>
    <outputs>
        <data name="output_quant" format="tabular" from_work_dir="quant.sf" label="${tool.name} on ${on_string} (Quantification)" />
        <!-- Only collected when the biasCorrect checkbox is ticked. -->
        <data name="output_bias_corrected_quant" format="tabular" from_work_dir="quant_bias_corrected.sf" label="${tool.name} on ${on_string} (Bias corrected Quantification)">
            <filter>biasCorrect</filter>
        </data>
        <!-- Only collected when a transcript-to-gene mapping dataset was supplied. -->
        <data name="output_gene_quant" format="tabular" from_work_dir="quant.genes.sf" label="${tool.name} on ${on_string} (Gene Quantification)">
            <filter>geneMap</filter>
        </data>
    </outputs>
    <tests>
        <test>
            <param name="single_or_paired_opts" value="paired" />
            <param name="input_mate1" value="reads_1.fastq" />
            <param name="input_mate2" value="reads_2.fastq" />
            <param name="biasCorrect" value="True" />
            <param name="TranscriptSource" value="history" />
            <param name="ownFile" value="transcripts.fasta" ftype="fasta" />
            <output file="sailfish_quant_result1.tab" ftype="tabular" name="output_quant" />
            <output file="sailfish_bias_result1.tab" ftype="tabular" name="output_bias_corrected_quant" />
        </test>
    </tests>
    <help>
<![CDATA[
**What it does**

Sailfish is a tool for transcript quantification from RNA-seq data. It
requires a set of target transcripts (either from a reference or *de novo*
assembly) to quantify. All you need to run Sailfish is a fasta file containing
your reference transcripts and a (set of) fasta/fastq file(s) containing your
reads. Sailfish runs in two phases; indexing and quantification. The indexing
step is independent of the reads, and only needs to be run once for a particular
set of reference transcripts and choice of k (the k-mer size). The
quantification step, obviously, is specific to the set of RNA-seq reads and is
thus run more frequently.

The quantification output contains a number of columns:
(1) Transcript ID,
(2) Transcript Length,
(3) Transcripts per Million (TPM) and
(4) Estimated number of reads (an estimate of the number of reads drawn from this transcript given the transcript’s relative abundance and length).

The first two columns are self-explanatory, the third is a measure of transcript abundance and the last is a commonly used input for differential expression tools.
The Transcripts per Million quantification number is computed as described in [1], and is meant as an estimate of the number of transcripts, per million observed transcripts,
originating from each isoform. Its benefit over the F/RPKM measure is that it is independent of the mean expressed transcript length
(i.e. if the mean expressed transcript length varies between samples, for example, this alone can affect differential analysis based on the F/RPKM.).



Fragment Library Types
======================

There are numerous library preparation protocols for RNA-seq that result in
sequencing reads with different characteristics. For example, reads can be
single end (only one side of a fragment is recorded as a read) or paired-end
(reads are generated from both ends of a fragment). Further, the sequencing
reads themselves may be unstranded or strand-specific. Finally, paired-end
protocols will have a specified relative orientation. To characterize the
various different types of sequencing libraries, we've created a miniature
"language" that allows for the succinct description of the many different types
of possible fragment libraries. For paired-end reads, the possible
orientations, along with a graphical description of what they mean, are
illustrated below:

.. image:: ReadLibraryIllustration.png

The library type string consists of three parts: the relative orientation of
the reads, the strandedness of the library, and the directionality of the
reads.

The first part of the library string (relative orientation) is only provided if
the library is paired-end. The possible options are:

::

    I = inward
    O = outward
    M = matching

The second part of the read library string specifies whether the protocol is
stranded or unstranded; the options are:

::

    S = stranded
    U = unstranded

If the protocol is unstranded, then we're done. The final part of the library
string specifies the strand from which the read originates in a strand-specific
protocol — it is only provided if the library is stranded (i.e. if the
library format string is of the form S). The possible values are:

::

    F = read 1 (or single-end read) comes from the forward strand
    R = read 1 (or single-end read) comes from the reverse strand

So, for example, if you wanted to specify a fragment library of strand-specific
paired-end reads, oriented toward each other, where read 1 comes from the
forward strand and read 2 comes from the reverse strand, you would specify
``-l ISF`` on the command line. This designates that the library being processed has
the type "ISF" meaning, **I**\ nward (the relative orientation), **S**\ tranded
(the protocol is strand-specific), **F**\ orward (read 1 comes from the forward
strand).

The single end library strings are a bit simpler than their paired-end
counterparts, since there is no relative orientation of which to speak. Thus, the
only possible library format types for single-end reads are ``U`` (for
unstranded), ``SF`` (for strand-specific reads coming from the forward strand)
and ``SR`` (for strand-specific reads coming from the reverse strand).

A few more examples of some library format strings and their interpretations are:

::

    IU (an unstranded paired-end library where the reads face each other)

::

    SF (a stranded single-end protocol where the reads come from the forward strand)

::

    OSR (a stranded paired-end protocol where the reads face away from each other,
         read1 comes from reverse strand and read2 comes from the forward strand)

.. note:: Correspondence to TopHat library types

    The popular `TopHat <http://ccb.jhu.edu/software/tophat/index.shtml>`_ RNA-seq
    read aligner has a different convention for specifying the format of the library.
    Below is a table that provides the corresponding sailfish/salmon library format
    string for each of the potential TopHat library types:


    +---------------------+-------------------------+
    | TopHat              | Salmon (and Sailfish)   |
    +=====================+============+============+
    |                     | Paired-end | Single-end |
    +---------------------+------------+------------+
    |``-fr-unstranded``   |``-l IU``   |``-l U``    |
    +---------------------+------------+------------+
    |``-fr-firststrand``  |``-l ISR``  |``-l SR``   |
    +---------------------+------------+------------+
    |``-fr-secondstrand`` |``-l ISF``  |``-l SF``   |
    +---------------------+------------+------------+

    The remaining salmon library format strings are not directly expressible in terms
    of the TopHat library types, and so there is no direct mapping for them.


]]>
    </help>
</tool>