Mercurial > repos > bgruening > sailfish
view sailfish.xml @ 3:85d0d479f05b draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sailfish commit af1a2810e59d86e039983060d998625a5410bb61
author | bgruening |
---|---|
date | Mon, 21 Dec 2015 18:18:03 -0500 |
parents | 20eab032389f |
children | 03c74355227f |
line wrap: on
line source
<tool id="sailfish" name="Sailfish" version="0.7.6.0">
    <description>transcript quantification from RNA-seq data</description>
    <requirements>
        <requirement type="package" version="0.7.6">sailfish</requirement>
    </requirements>
    <macros>
        <!-- Strandedness selector shared by the single-end and paired-end branches. -->
        <xml name="strandedness">
            <param name="strandedness" type="select" label="Specify the strandedness of the reads">
                <option value="U" selected="True">Not stranded (U)</option>
                <option value="SF">read 1 (or single-end read) comes from the forward strand (SF)</option>
                <option value="SR">read 1 (or single-end read) comes from the reverse strand (SR)</option>
            </param>
        </xml>
    </macros>
    <!-- Error detection: non-zero exit codes and the listed text patterns. -->
    <stdio>
        <exit_code range="1:" />
        <exit_code range=":-1" />
        <regex match="Error:" />
        <regex match="Exception:" />
        <regex match="Exception :" />
    </stdio>
    <version_command>sailfish -version</version_command>
    <command>
<![CDATA[
        ## Step 1: obtain an index.
        ## A transcriptome from the history is indexed on the fly; otherwise the
        ## path of a built-in index is looked up in the data table.
        ## The "&&" is emitted inside the history branch only, so that the
        ## built-in branch does not start the shell command with a dangling "&&".
        #if $refTranscriptSource.TranscriptSource == "history":
            sailfish index
                --transcripts "$refTranscriptSource.ownFile"
                --kmerSize $refTranscriptSource.kmerSize
                --out ./index_dir
                --threads "\${GALAXY_SLOTS:-4}"
            &&
            #set $index_path = './index_dir'
        #else:
            #set $index_path = $refTranscriptSource.index.fields.path
        #end if

        ## Step 2: link the reads into the working directory under a name that
        ## carries a fasta/fastq extension matching the dataset type.
        #if $single_or_paired.single_or_paired_opts == 'single':
            #if $single_or_paired.input_singles.ext == 'fasta':
                #set $ext = 'fasta'
            #else:
                #set $ext = 'fastq'
            #end if
            ln -s "$single_or_paired.input_singles" ./single.$ext &&
        #else:
            #if $single_or_paired.input_mate1.ext == 'fasta':
                #set $ext = 'fasta'
            #else:
                #set $ext = 'fastq'
            #end if
            ln -s "$single_or_paired.input_mate1" ./mate1.$ext &&
            ln -s "$single_or_paired.input_mate2" ./mate2.$ext &&
        #end if

        ## The optional transcript-to-gene map is linked with its own extension.
        #if $geneMap:
            ln -s "$geneMap" ./geneMap.${geneMap.ext} &&
        #end if

        ## Step 3: quantification.
        sailfish quant
            --index "$index_path"
            #if $single_or_paired.single_or_paired_opts == 'single':
                --libType "${single_or_paired.strandedness}"
                --unmatedReads ./single.$ext
            #else:
                --mates1 ./mate1.$ext
                --mates2 ./mate2.$ext
                --libType "${single_or_paired.orientation}${single_or_paired.strandedness}"
            #end if
            --output ./
            $biasCorrect
            --threads "\${GALAXY_SLOTS:-4}"
            ## Optional numeric parameters are only passed when a value was given.
            #if $fldMean:
                --fldMean $fldMean
            #end if
            #if $fldSD:
                --fldSD $fldSD
            #end if
            #if $maxReadOcc:
                --maxReadOcc $maxReadOcc
            #end if
            #if $geneMap:
                --geneMap ./geneMap.${geneMap.ext}
            #end if
            $noEffectiveLengthCorrection
            $useVBOpt
            $allowOrphans
            $unsmoothedFLD
            ## maxFragLen is optional as well; without this guard an empty field
            ## would put the flag on the command line without a value.
            #if $maxFragLen:
                --maxFragLen $maxFragLen
            #end if
            --txpAggregationKey "${txpAggregationKey}"
]]>
    </command>
    <inputs>
        <conditional name="refTranscriptSource">
            <param name="TranscriptSource" type="select" label="Select a reference transcriptome from your history or use a built-in index?" help="Built-ins were indexed using default options">
                <option value="indexed">Use a built-in index</option>
                <option value="history" selected="True">Use one from the history</option>
            </param>
            <when value="indexed">
                <param name="index" type="select" label="Select a reference transcriptome" help="If your transcriptome of interest is not listed, contact your Galaxy admin">
                    <options from_data_table="sailfish_indexes">
                        <filter type="sort_by" column="2"/>
                        <validator type="no_options" message="No indexes are available for the selected input dataset"/>
                    </options>
                </param>
            </when> <!-- built-in -->
            <when value="history">
                <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select the reference transcriptome" />
                <param argument="--kmerSize" type="integer" value="21" max="32" label="The size of the k-mer on which the index is built"
                    help="There is a tradeoff here between the distinctiveness of the k-mers and their robustness to errors. The shorter the k-mers, the more robust they will be to errors in the reads, but the longer the k-mers, the more distinct they will be. We generally recommend using a k-mer size of at least 20."/>
            </when> <!-- history -->
        </conditional> <!-- refTranscriptSource -->
        <conditional name="single_or_paired">
            <param name="single_or_paired_opts" type="select" label="Is this library mate-paired?">
                <option value="single">Single-end</option>
                <option value="paired">Paired-end</option>
            </param>
            <when value="single">
                <param name="input_singles" type="data" format="fastq,fasta" label="FASTQ/FASTA file" help="FASTQ file." />
                <expand macro="strandedness" />
            </when>
            <when value="paired">
                <param name="input_mate1" type="data" format="fastq,fasta" label="Mate pair 1" help="FASTQ file." />
                <param name="input_mate2" type="data" format="fastq,fasta" label="Mate pair 2" help="FASTQ file." />
                <param name="orientation" type="select" label="Relative orientation of reads within a pair">
                    <option value="M">Mates are oriented in the same direction (M = matching)</option>
                    <option value="O">Mates are oriented away from each other (O = outward)</option>
                    <option value="I" selected="True">Mates are oriented toward each other (I = inward)</option>
                </param>
                <expand macro="strandedness" />
            </when>
        </conditional>
        <param argument="--geneMap" type="data" format="tabular,gff,gtf" optional="True" label="File containing a mapping of transcripts to genes"
            help="Calculates the aggregated gene-level abundance estimations. This file should be either a GTF file or tab-delimited format where each line contains the name of a transcript and the gene to which it belongs separated by a tab." />
        <param argument="--biasCorrect" type="boolean" truevalue="--biasCorrect" falsevalue="" checked="False" label="Perform bias correction" help=""/>
        <param argument="--fldMean" type="integer" value="200" optional="True" label="Calculate effective lengths"
            help="If single end reads are being used for quantification, or there are an insufficient number of uniquely mapping reads when performing paired-end quantification to estimate the empirical fragment length distribution, then use this value to calculate effective lengths."/>
        <param argument="--fldSD" type="integer" value="80" optional="True" label="Standard deviation"
            help="The standard deviation used in the fragment length distribution for single-end quantification or when an empirical distribution cannot be learned."/>
        <param argument="--maxReadOcc" type="integer" value="200" optional="True" label="Maximal read mapping occurrence"
            help="Reads mapping to more than this many places won't be considered."/>
        <param argument="--noEffectiveLengthCorrection" type="boolean" truevalue="--noEffectiveLengthCorrection" falsevalue="" checked="False" label="Disable effective length correction"
            help="Disables effective length correction when computing the probability that a fragment was generated from a transcript. If this flag is passed in, the fragment length distribution is not taken into account when computing this probability."/>
        <param argument="--useVBOpt" type="boolean" truevalue="--useVBOpt" falsevalue="" checked="False" label="Use Variational Bayesian EM algorithm for optimization" help=""/>
        <param argument="--allowOrphans" type="boolean" truevalue="--allowOrphans" falsevalue="" checked="False" label="Consider orphaned reads as valid hits when performing lightweight-alignment"
            help="This option will increase sensitivity (allow more reads to map and more transcripts to be detected), but may decrease specificity as orphaned alignments are more likely to be spurious."/>
        <param argument="--unsmoothedFLD" type="boolean" truevalue="--unsmoothedFLD" falsevalue="" checked="False" label="Use the un-smoothed approach to effective length correction"
            help="This traditional approach works by convolving the FLD with the characteristic function over each transcript."/>
        <param argument="--maxFragLen" type="integer" value="1000" optional="True" label="The maximum length of a fragment to consider when building the empirical fragment length distribution" help=""/>
        <param argument="--txpAggregationKey" value="gene_id" type="text" label="The key for aggregating transcripts during gene-level estimates"
            help="The default is the gene_id field, but other fields (e.g. gene_name) might be useful depending on the specifics of the annotation being used." />
    </inputs>
    <outputs>
        <data name="output_quant" format="tabular" from_work_dir="quant.sf" label="${tool.name} on ${on_string} (Quantification)" />
        <data name="output_bias_corrected_quant" format="tabular" from_work_dir="quant_bias_corrected.sf" label="${tool.name} on ${on_string} (Bias corrected Quantification)">
            <filter>biasCorrect is True</filter>
        </data>
        <data name="output_gene_quant" format="tabular" from_work_dir="quant.genes.sf" label="${tool.name} on ${on_string} (Gene Quantification)">
            <!-- geneMap is an optional dataset: its value is a dataset or None, never
                 the literal True, so the filter tests truthiness rather than identity. -->
            <filter>geneMap</filter>
        </data>
    </outputs>
    <tests>
        <!-- Paired-end reads, transcriptome from the history, bias correction enabled. -->
        <test>
            <param name="single_or_paired_opts" value="paired" />
            <param name="input_mate1" value="reads_1.fastq" />
            <param name="input_mate2" value="reads_2.fastq" />
            <param name="biasCorrect" value="True" />
            <param name="TranscriptSource" value="history" />
            <param name="ownFile" value="transcripts.fasta" ftype="fasta" />
            <output file="sailfish_quant_result1.tab" ftype="tabular" name="output_quant" />
            <output file="sailfish_bias_result1.tab" ftype="tabular" name="output_bias_corrected_quant" />
        </test>
    </tests>
    <help>
<![CDATA[

**What it does**

Sailfish is a tool for transcript quantification from RNA-seq data. It requires a set of target transcripts
(either from a reference or de-novo assembly) to quantify. All you need to run Sailfish is a fasta file containing
your reference transcripts and a (set of) fasta/fastq file(s) containing your reads. Sailfish runs in two phases;
indexing and quantification. The indexing step is independent of the reads, and only needs to be run once for a
particular set of reference transcripts and choice of k (the k-mer size). The quantification step, obviously, is
specific to the set of RNA-seq reads and is thus run more frequently.

The quantification output contains a number of columns: (1) Transcript ID, (2) Transcript Length,
(3) Transcripts per Million (TPM) and (4) Estimated number of reads (an estimate of the number of reads drawn from
this transcript given the transcript’s relative abundance and length). The first two columns are self-explanatory,
the third is a measure of transcript abundance and the final is a commonly used input for differential expression
tools. The Transcripts per Million quantification number is computed as described in [1], and is meant as an
estimate of the number of transcripts, per million observed transcripts, originating from each isoform. Its benefit
over the F/RPKM measure is that it is independent of the mean expressed transcript length (i.e. if the mean
expressed transcript length varies between samples, for example, this alone can affect differential analysis based
on the K/RPKM.).


Fragment Library Types
======================

There are numerous library preparation protocols for RNA-seq that result in
sequencing reads with different characteristics. For example, reads can be
single end (only one side of a fragment is recorded as a read) or paired-end
(reads are generated from both ends of a fragment). Further, the sequencing
reads themselves may be unstranded or strand-specific. Finally, paired-end
protocols will have a specified relative orientation. To characterize the
various different types of sequencing libraries, we've created a miniature
"language" that allows for the succinct description of the many different types
of possible fragment libraries. For paired-end reads, the possible
orientations, along with a graphical description of what they mean, are
illustrated below:

.. image:: ReadLibraryIllustration.png

The library type string consists of three parts: the relative orientation of
the reads, the strandedness of the library, and the directionality of the
reads.

The first part of the library string (relative orientation) is only provided if
the library is paired-end. The possible options are:

::

    I = inward
    O = outward
    M = matching

The second part of the read library string specifies whether the protocol is
stranded or unstranded; the options are:

::

    S = stranded
    U = unstranded

If the protocol is unstranded, then we're done. The final part of the library
string specifies the strand from which the read originates in a strand-specific
protocol — it is only provided if the library is stranded (i.e. if the
library format string is of the form S). The possible values are:

::

    F = read 1 (or single-end read) comes from the forward strand
    R = read 1 (or single-end read) comes from the reverse strand

So, for example, if you wanted to specify a fragment library of strand-specific
paired-end reads, oriented toward each other, where read 1 comes from the
forward strand and read 2 comes from the reverse strand, you would specify ``-l
ISF`` on the command line. This designates that the library being processed has
the type "ISF" meaning, **I**\ nward (the relative orientation), **S**\ tranded
(the protocol is strand-specific), **F**\ orward (read 1 comes from the forward
strand).

The single end library strings are a bit simpler than their paired-end
counterparts, since there is no relative orientation of which to speak. Thus,
the only possible library format types for single-end reads are ``U`` (for
unstranded), ``SF`` (for strand-specific reads coming from the forward strand)
and ``SR`` (for strand-specific reads coming from the reverse strand).

A few more examples of some library format strings and their interpretations are:

::

    IU (an unstranded paired-end library where the reads face each other)

::

    SF (a stranded single-end protocol where the reads come from the forward strand)

::

    OSR (a stranded paired-end protocol where the reads face away from each other,
         read1 comes from reverse strand and read2 comes from the forward strand)

.. note:: Correspondence to TopHat library types

   The popular `TopHat <http://ccb.jhu.edu/software/tophat/index.shtml>`_ RNA-seq
   read aligner has a different convention for specifying the format of the library.
   Below is a table that provides the corresponding sailfish/salmon library format
   string for each of the potential TopHat library types:

   +---------------------+-------------------------+
   | TopHat              | Salmon (and Sailfish)   |
   +=====================+============+============+
   |                     | Paired-end | Single-end |
   +---------------------+------------+------------+
   |``-fr-unstranded``   |``-l IU``   |``-l U``    |
   +---------------------+------------+------------+
   |``-fr-firststrand``  |``-l ISR``  |``-l SR``   |
   +---------------------+------------+------------+
   |``-fr-secondstrand`` |``-l ISF``  |``-l SF``   |
   +---------------------+------------+------------+

   The remaining salmon library format strings are not directly expressible in terms
   of the TopHat library types, and so there is no direct mapping for them.

]]>
    </help>
</tool>