Mercurial > repos > aaronpetkau > flash
view FLASH.xml @ 2:6889442b27dc draft default tip
Uploaded
author | aaronpetkau |
---|---|
date | Sat, 04 Jul 2015 08:58:21 -0400 |
parents | |
children |
line wrap: on
line source
<tool id="FLASH" name="FLASH" version="1.3.0">
    <description>merge paired-end reads from fragments that are shorter than twice the length of reads</description>

    <!--
        Command line handed to the FLASH.sh wrapper script.
        The first six positional arguments are the Galaxy output dataset paths;
        everything after them is a FLASH option, followed by the two input files.
        NOTE(review): FLASH.sh itself is not part of this file; presumably it moves
        FLASH's "out.*" files onto these paths and forwards the remaining
        arguments to flash unchanged. Confirm against the script.
    -->
    <command interpreter="bash"><![CDATA[
        FLASH.sh
            $extendedFrags
            $notCombined1
            $notCombined2
            $interNotCombined
            $readsAndPairs
            $log_file

            -o out

            ## Use the number of slots Galaxy allocated to the job; fall back to
            ## the previously hard-coded 4 threads when GALAXY_SLOTS is unset.
            -t \${GALAXY_SLOTS:-4}

            #if $min_overlap
                -m $min_overlap
            #end if

            #if $max_overlap
                -M $max_overlap
            #else
                -M 250
            #end if

            #if $outputs.output_type == "Interleaved_fastq"
                --interleaved-output
            #else if $outputs.output_type == "tab"
                -To
            #end if

            #if $options.options_select == "advanced"
                ## Test for "no value" explicitly: a mismatch density of 0 is a
                ## legitimate setting and must not be dropped as if it were unset.
                #if str($options.max_mismatch_density) not in ("", "None")
                    -x $options.max_mismatch_density
                #end if
                #if $options.phred_offset
                    -p $options.phred_offset
                #end if
                #if $options.read_length
                    -r $options.read_length
                #end if
                #if $options.fragment_length
                    -f $options.fragment_length
                #end if
                #if $options.fragment_stdev
                    -s $options.fragment_stdev
                #end if
                ## Boolean parameters expand to their flag when checked and to
                ## the empty string (falsevalue) when unchecked.
                $options.cap_mismatch_quals
                $options.quiet
            #end if

            #if $input_type.sPaired == "paired"
                $input_type.pInput1
                $input_type.pInput2
            #else if $input_type.sPaired == "collections"
                $input_type.fastq_collection.forward
                $input_type.fastq_collection.reverse
            #end if
    ]]></command>

    <inputs>
        <!-- Input reads: either two individual datasets or one paired collection. -->
        <conditional name="input_type">
            <param name="sPaired" type="select" label="Single Pair or Collection">
                <option value="collections">Paired-end Collections</option>
                <option value="paired">Paired-end</option>
            </param>
            <when value="paired">
                <param name="pInput1" type="data"
                       format="fastq,fastqsanger,fastqillumina,fastqsolexa"
                       label="Forward FASTQ file"
                       help="Must have ASCII encoded quality scores"/>
                <param name="pInput2" type="data"
                       format="fastq,fastqsanger,fastqillumina,fastqsolexa"
                       label="Reverse FASTQ file"
                       help="File format must match the Forward FASTQ file"/>
            </when>
            <when value="collections">
                <!-- NOTE(review): format="txt" is broader than the FASTQ formats
                     required by the single-pair inputs above; consider restricting
                     it to the same list once existing histories are checked. -->
                <param name="fastq_collection" type="data_collection"
                       label="Paired-end Fastq collection"
                       help=""
                       optional="false"
                       format="txt"
                       collection_type="paired"/>
            </when>
        </conditional>

        <param name="min_overlap" type="integer" label="Minimum overlap" optional="true"/>
        <param name="max_overlap" type="integer" label="Maximum overlap" value="250" optional="true"/>

        <!-- Output layout; the selected value drives both the FLASH flags in the
             command and the filters on the output datasets. -->
        <conditional name="outputs">
            <param name="output_type" type="select" label="Output type">
                <option value="Non-interleaved_fastq">Non-interleaved fastq</option>
                <option value="Interleaved_fastq">Interleaved fastq</option>
                <option value="tab">Tab-delimited</option>
            </param>
            <!-- No extra parameters for any output type; the cases are declared
                 explicitly so every select option has a matching "when". -->
            <when value="Non-interleaved_fastq"/>
            <when value="Interleaved_fastq"/>
            <when value="tab"/>
        </conditional>

        <conditional name="options">
            <param name="options_select" type="select" label="Options Type">
                <option value="basic">Basic</option>
                <option value="advanced">Advanced</option>
            </param>
            <when value="basic"/>
            <when value="advanced">
                <param name="max_mismatch_density" type="float" label="Maximum mismatch density" optional="true"/>
                <param name="phred_offset" type="select" label="Phred-offset" optional="true">
                    <option value="33">33</option>
                    <option value="64">64</option>
                </param>
                <param name="read_length" type="integer" label="Average read length" optional="true"/>
                <param name="fragment_length" type="integer" label="Fragment length" optional="true"/>
                <param name="fragment_stdev" type="integer" label="Fragment length standard deviation" optional="true"/>
                <!-- falsevalue is empty so an unchecked box contributes nothing to the command line. -->
                <param name="cap_mismatch_quals" type="boolean" label="Cap mismatch quality scores"
                       truevalue="--cap-mismatch-quals" falsevalue="" optional="true"/>
                <!-- Compression options are intentionally disabled
                     (Phil says the compression options aren't needed):
                <param name="compress" type="boolean" label="Compress output files with gzip" optional="true"/>
                <param name="compress_prog" type="text" label="Compression program" optional="true"/>
                <param name="compress_prog_args" type="text" label="Compression program arguments" optional="true"/>
                -->
                <param name="quiet" type="boolean" label="Do not print informational messages"
                       truevalue="-q" falsevalue="" optional="true"/>
            </when>
        </conditional>
    </inputs>

    <outputs>
        <!-- Each dataset is only created for the output type that produces it. -->
        <data format="fastqsanger" name="extendedFrags" label="Merged reads">
            <filter>outputs['output_type'] != "tab"</filter>
        </data>
        <data format="fastqsanger" name="notCombined1" label="Read 1 of mate pairs not merged">
            <filter>outputs['output_type'] == "Non-interleaved_fastq"</filter>
        </data>
        <data format="fastqsanger" name="notCombined2" label="Read 2 of mate pairs not merged">
            <filter>outputs['output_type'] == "Non-interleaved_fastq"</filter>
        </data>
        <data format="fastqsanger" name="interNotCombined" label="Interleaved non-combined pairs">
            <filter>outputs['output_type'] == "Interleaved_fastq"</filter>
        </data>
        <data format="tabular" name="readsAndPairs" label="Merged and non-merged pairs">
            <filter>outputs['output_type'] == "tab"</filter>
        </data>
        <data format="txt" name="log_file" label="Log file"/>
        <!-- Histogram outputs are currently disabled:
        <data format="txt" name="numericHistogram" label="Numeric histogram of merged read lengths"/>
        <data format="txt" name="visualHistogram" label="Visual histogram of merged read lengths"/>
        -->
    </outputs>

    <requirements>
        <requirement type="package" version="1.2.9">FLASH</requirement>
    </requirements>

    <help><![CDATA[
----------------------------------------------------------------------------
DESCRIPTION
----------------------------------------------------------------------------

FLASH (Fast Length Adjustment of SHort reads) is an accurate and fast tool
to merge paired-end reads that were generated from DNA fragments whose
lengths are shorter than twice the length of reads. Merged read pairs result
in unpaired longer reads, which are generally more desired in genome
assembly and genome analysis processes.

Briefly, the FLASH algorithm considers all possible overlaps at or above a
minimum length between the reads in a pair and chooses the overlap that
results in the lowest mismatch density (proportion of mismatched bases in
the overlapped region). Ties between multiple overlaps are broken by
considering quality scores at mismatch sites. When building the merged
sequence, FLASH computes a consensus sequence in the overlapped region.
More details can be found in the original publication
(http://bioinformatics.oxfordjournals.org/content/27/21/2957.full).

Limitations of FLASH include:

- FLASH cannot merge paired-end reads that do not overlap.
- FLASH cannot merge read pairs that have an outward orientation, either
  due to being "jumping" reads or due to excessive trimming.
- FLASH is not designed for data that has a significant amount of indel
  errors (such as Sanger sequencing data). It is best suited for Illumina
  data.

----------------------------------------------------------------------------
MANDATORY INPUT
----------------------------------------------------------------------------

The most common input to FLASH is two FASTQ files containing read 1 and
read 2 of each mate pair, respectively, in the same order.

Alternatively, you may provide one FASTQ file, which may be standard input,
containing paired-end reads in either interleaved FASTQ (see the
--interleaved-input option) or tab-delimited (see the --tab-delimited-input
option) format.

In all cases, gzip compressed input is autodetected. Also, in all cases,
the PHRED offset is, by default, assumed to be 33; use the --phred-offset
option to change it.

----------------------------------------------------------------------------
OUTPUT
----------------------------------------------------------------------------

The default output of FLASH consists of the following files:

- out.extendedFrags.fastq      The merged reads.
- out.notCombined_1.fastq      Read 1 of mate pairs that were not merged.
- out.notCombined_2.fastq      Read 2 of mate pairs that were not merged.
- out.hist                     Numeric histogram of merged read lengths.
- out.histogram                Visual histogram of merged read lengths.

FLASH also logs informational messages to standard output. These can be
redirected to a file, as in the following example::

    $ flash reads_1.fq reads_2.fq | tee flash.log

In addition, FLASH supports several features affecting the output:

- Writing the merged reads directly to standard output (--to-stdout)
- Writing gzip compressed output files (-z) or using an external
  compression program (--compress-prog)
- Writing the uncombined read pairs in interleaved FASTQ format
  (--interleaved-output)
- Writing all output reads to a single file in tab-delimited format
  (--tab-delimited-output)

----------------------------------------------------------------------------
OPTIONS
----------------------------------------------------------------------------

-m, --min-overlap=NUM
    The minimum required overlap length between two reads to provide a
    confident overlap. Default: 10bp.

-M, --max-overlap=NUM
    Maximum overlap length expected in approximately 90% of read pairs. It
    is by default set to 70bp, which works well for 100bp reads generated
    from a 180bp library, assuming a normal distribution of fragment
    lengths. Overlaps longer than the maximum overlap parameter are still
    considered as good overlaps, but the mismatch density (explained below)
    is calculated over the first max_overlap bases in the overlapped region
    rather than the entire overlap. Default: 70bp, or calculated from the
    specified read length, fragment length, and fragment length standard
    deviation.

-x, --max-mismatch-density=NUM
    Maximum allowed ratio between the number of mismatched base pairs and
    the overlap length. Two reads will not be combined with a given overlap
    if that overlap results in a mismatched base density higher than this
    value. Note: Any occurrence of an 'N' in either read is ignored and not
    counted towards the mismatches or overlap length. Our experimental
    results suggest that higher values of the maximum mismatch density yield
    larger numbers of correctly merged read pairs but at the expense of
    higher numbers of incorrectly merged read pairs. Default: 0.25.

-p, --phred-offset=OFFSET
    The smallest ASCII value of the characters used to represent quality
    values of bases in FASTQ files. It should be set to either 33, which
    corresponds to the later Illumina platforms and Sanger platforms, or 64,
    which corresponds to the earlier Illumina platforms. Default: 33.

-r, --read-len=LEN
-f, --fragment-len=LEN
-s, --fragment-len-stddev=LEN
    Average read length, fragment length, and fragment standard deviation.
    These are convenience parameters only, as they are only used for
    calculating the maximum overlap (--max-overlap) parameter. The maximum
    overlap is calculated as the overlap of average-length reads from an
    average-size fragment plus 2.5 times the fragment length standard
    deviation. The default values are -r 100, -f 180, and -s 18, so this
    works out to a maximum overlap of 65 bp. If --max-overlap is specified,
    then the specified value overrides the calculated value.

    If you do not know the standard deviation of the fragment library, you
    can probably assume that the standard deviation is 10% of the average
    fragment length.

--cap-mismatch-quals
    Cap quality scores assigned at mismatch locations to 2. This was the
    default behavior in FLASH v1.2.7 and earlier. Later versions will
    instead calculate such scores as the absolute value of the difference in
    quality scores, but at least 2. Essentially, the new behavior prevents a
    low quality base call that is likely a sequencing error from
    significantly bringing down the quality of a high quality, likely
    correct base call.

--interleaved-input
    Instead of requiring files MATES_1.FASTQ and MATES_2.FASTQ, allow a
    single file MATES.FASTQ that has the paired-end reads interleaved.
    Specify "-" to read from standard input.

--interleaved-output
    Write the uncombined pairs in interleaved FASTQ format.

-I, --interleaved
    Equivalent to specifying both --interleaved-input and
    --interleaved-output.

-Ti, --tab-delimited-input
    Assume the input is in tab-delimited format rather than FASTQ, in the
    format described below in '--tab-delimited-output'. In this mode you
    should provide a single input file, each line of which must contain
    either a read pair (5 fields) or a single read (3 fields). FLASH will
    try to combine the read pairs. Single reads will be written to the
    output file as-is if also using --tab-delimited-output; otherwise they
    will be ignored. Note that you may specify "-" as the input file to read
    the tab-delimited data from standard input.

-To, --tab-delimited-output
    Write output in tab-delimited format (not FASTQ). Each line will contain
    either a combined pair in the format 'tag <tab> seq <tab> qual' or an
    uncombined pair in the format
    'tag <tab> seq_1 <tab> qual_1 <tab> seq_2 <tab> qual_2'.

-o, --output-prefix=PREFIX
    Prefix of output files. Default: "out".

-d, --output-directory=DIR
    Path to directory for output files. Default: current working directory.

-c, --to-stdout
    Write the combined reads to standard output. In this mode, with FASTQ
    output (the default) the uncombined reads are discarded. With
    tab-delimited output, uncombined reads are included in the tab-delimited
    data written to standard output. In both cases, histogram files are not
    written, and informational messages are sent to standard error rather
    than to standard output.

--suffix=SUFFIX, --output-suffix=SUFFIX
    Use SUFFIX as the suffix of the output files after ".fastq". A dot
    before the suffix is assumed, unless an empty suffix is provided.
    Default: nothing; or 'gz' if -z is specified; or PROG if
    --compress-prog=PROG is specified.

-t, --threads=NTHREADS
    Set the number of worker threads. This is in addition to the I/O
    threads. Default: number of processors. Note: if you need FLASH's output
    to appear deterministically or in the same order as the original reads,
    you must specify -t 1 (--threads=1).

-q, --quiet
    Do not print informational messages.

-h, --help
    Display this help and exit.

-v, --version
    Display version.
    ]]></help>
</tool>