Mercurial > repos > aaronpetkau > flash
changeset 2:6889442b27dc draft default tip
Uploaded
author | aaronpetkau |
---|---|
date | Sat, 04 Jul 2015 08:58:21 -0400 |
parents | a444685f161c |
children | |
files | FLASH.sh FLASH.xml README tool_dependencies.xml |
diffstat | 4 files changed, 494 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/bin/bash
# FLASH.sh - wrapper that runs `flash` and moves its fixed-name output files
# to the destination paths supplied by the caller.
#
# Usage:
#   FLASH.sh MERGED NOT_COMBINED_1 NOT_COMBINED_2 INTER_NOT_COMBINED \
#            READS_AND_PAIRS LOG_FILE [flash arguments...]
#
# The first six arguments are destination paths; every remaining argument is
# passed to `flash` unchanged. The output names below are hard-coded with the
# prefix "out", so the caller is expected to pass `-o out` (or rely on flash's
# default prefix) among the flash arguments.
#
# Exit status: 2 on a usage error, otherwise the exit status of `flash`.

if [ "$#" -lt 6 ]; then
    echo "usage: FLASH.sh MERGED NOT_COMBINED_1 NOT_COMBINED_2 INTER_NOT_COMBINED READS_AND_PAIRS LOG_FILE [flash arguments...]" >&2
    exit 2
fi

# Grab the output destinations, then drop them so "$@" holds only flash options.
merged_reads=$1
not_combined_1=$2
not_combined_2=$3
inter_not_combined=$4
reads_and_pairs=$5
log_file=$6
shift 6

# Move a flash output file to its destination, but only if flash produced it.
# $1 = file name written by flash, $2 = destination path.
move_if_present() {
    if [ -f "$1" ]; then
        mv -- "$1" "$2"
    fi
}

# flash logs its informational messages to standard output; capture them.
# "$@" is quoted so arguments containing whitespace are passed through intact.
flash "$@" > "$log_file"
flash_status=$?

# NOTE(review): delay kept from the original ("sleep because phil says so");
# its purpose is not documented - confirm whether it is still needed.
sleep 5

move_if_present out.notCombined_2.fastq  "$not_combined_2"
move_if_present out.notCombined_1.fastq  "$not_combined_1"
move_if_present out.notCombined.fastq    "$inter_not_combined"
move_if_present out.readsAndPairs.tab    "$reads_and_pairs"
move_if_present out.extendedFrags.fastq  "$merged_reads"

# Report flash's own result instead of unconditionally claiming success.
exit "$flash_status"
<?xml version="1.0" encoding="UTF-8"?>
<!--
    FLASH.xml: Galaxy tool definition for FLASH.

    The command runs the FLASH.sh wrapper script. Its first six arguments are
    the output dataset paths (merged reads, not-combined read 1, not-combined
    read 2, interleaved not-combined, tab-delimited reads and pairs, log file);
    everything after them is passed on to the flash executable. The "-o out"
    prefix must stay in sync with the out.* file names expected by FLASH.sh.
-->
<tool id="FLASH" name="FLASH" version="1.3.0">
    <description>merge paired-end reads from fragments that are shorter than twice the length of reads</description>
    <command interpreter="bash">
        ## Dataset paths are quoted so that paths containing whitespace stay single arguments.
        ## The thread count follows the slots allocated to the job and falls back to 4.
        FLASH.sh "$extendedFrags" "$notCombined1" "$notCombined2" "$interNotCombined" "$readsAndPairs" "$log_file" -o out -t \${GALAXY_SLOTS:-4}
        ## NOTE(review): the truthiness tests below may also skip an explicit value of 0
        ## (for example a mismatch density of 0) - confirm how Galaxy evaluates such values.
        #if $min_overlap
            -m $min_overlap
        #end if
        #if $max_overlap
            -M $max_overlap
        #else
            -M 250
        #end if
        #if $outputs.output_type == "Interleaved_fastq"
            --interleaved-output
        #else if $outputs.output_type == "tab"
            -To
        #end if
        #if $options.options_select == "advanced"
            #if $options.max_mismatch_density
                -x $options.max_mismatch_density
            #end if
            #if $options.phred_offset
                -p $options.phred_offset
            #end if
            #if $options.read_length
                -r $options.read_length
            #end if
            #if $options.fragment_length
                -f $options.fragment_length
            #end if
            #if $options.fragment_stdev
                -s $options.fragment_stdev
            #end if
            #if $options.cap_mismatch_quals
                $options.cap_mismatch_quals
            #end if
            #if $options.quiet
                $options.quiet
            #end if
        #end if

        #if $input_type.sPaired == "paired":
            "$input_type.pInput1" "$input_type.pInput2"
        #elif $input_type.sPaired == "collections":
            "$input_type.fastq_collection.forward" "$input_type.fastq_collection.reverse"
        #end if
    </command>
    <inputs>
        <conditional name="input_type">
            <param name="sPaired" type="select" label="Single Pair or Collection">
                <option value="collections">Paired-end Collections</option>
                <option value="paired">Paired-end</option>
            </param>
            <when value="paired">
                <param name="pInput1" type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" label="Forward FASTQ file" help="Must have ASCII encoded quality scores"/>
                <param name="pInput2" type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" label="Reverse FASTQ file" help="File format must match the Forward FASTQ file"/>
            </when>
            <when value="collections">
                <!-- NOTE(review): format="txt" is broader than the FASTQ formats accepted by the
                     paired branch above; confirm whether that is intentional. -->
                <param name="fastq_collection" type="data_collection" label="Paired-end Fastq collection" help="" optional="false" format="txt" collection_type="paired"/>
            </when>
        </conditional>

        <param name="min_overlap" type="integer" label="Minimum overlap" optional="true"/>
        <param name="max_overlap" type="integer" label="Maximum overlap" value="250" optional="true"/>
        <conditional name="outputs">
            <param name="output_type" type="select" label="Output type">
                <option value="Non-interleaved_fastq">Non-interleaved fastq</option>
                <option value="Interleaved_fastq">Interleaved fastq</option>
                <option value="tab">Tab-delimited</option>
            </param>
            <!-- No extra parameters for any output type; the cases are listed explicitly. -->
            <when value="Non-interleaved_fastq"/>
            <when value="Interleaved_fastq"/>
            <when value="tab"/>
        </conditional>
        <conditional name="options">
            <param name="options_select" type="select" label="Options Type">
                <option value="basic">Basic</option>
                <option value="advanced">Advanced</option>
            </param>
            <when value="basic"/>
            <when value="advanced">
                <param name="max_mismatch_density" type="float" label="Maximum mismatch density" optional="true"/>
                <param name="phred_offset" type="select" label="Phred-offset" optional="true">
                    <option value="33">33</option>
                    <option value="64">64</option>
                </param>
                <param name="read_length" type="integer" label="Average read length" optional="true"/>
                <param name="fragment_length" type="integer" label="Fragment length" optional="true"/>
                <param name="fragment_stdev" type="integer" label="Fragment length standard deviation" optional="true"/>
                <param name="cap_mismatch_quals" type="boolean" label="Cap mismatch quality scores" truevalue="--cap-mismatch-quals" falsevalue="" optional="true"/>
                <!-- Phil says the compression options aren't needed:
                <param name="compress" type="boolean" label="Compress output files with gzip" optional="true"/>
                <param name="compress_prog" type="text" label="Compression program" optional="true"/>
                <param name="compress_prog_args" type="text" label="Compression program arguments" optional="true"/>
                -->
                <param name="quiet" type="boolean" label="Do not print informational messages" truevalue="-q" falsevalue="" optional="true"/>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data format="fastqsanger" name="extendedFrags" label="Merged reads">
            <filter>outputs['output_type'] != "tab"</filter>
        </data>
        <data format="fastqsanger" name="notCombined1" label="Read 1 of mate pairs not merged">
            <filter>outputs['output_type'] == "Non-interleaved_fastq"</filter>
        </data>
        <data format="fastqsanger" name="notCombined2" label="Read 2 of mate pairs not merged">
            <filter>outputs['output_type'] == "Non-interleaved_fastq"</filter>
        </data>
        <data format="fastqsanger" name="interNotCombined" label="Interleaved non-combined pairs">
            <filter>outputs['output_type'] == "Interleaved_fastq"</filter>
        </data>
        <data format="tabular" name="readsAndPairs" label="Merged and non-merged pairs">
            <filter>outputs['output_type'] == "tab"</filter>
        </data>
        <data format="txt" name="log_file" label="Log file"/>
        <!-- <data format="txt" name="numericHistogram" label="Numeric histogram of merged read lengths"/>
        <data format="txt" name="visualHistogram" label="Visual histogram of merged read lengths"/> -->
    </outputs>
    <requirements>
        <requirement type="package" version="1.2.9">FLASH</requirement>
    </requirements>
    <help>
----------------------------------------------------------------------------
                                 DESCRIPTION
----------------------------------------------------------------------------

FLASH (Fast Length Adjustment of SHort reads) is an accurate and fast tool
to merge paired-end reads that were generated from DNA fragments whose
lengths are shorter than twice the length of reads. Merged read pairs result
in unpaired longer reads, which are generally more desired in genome
assembly and genome analysis processes.

Briefly, the FLASH algorithm considers all possible overlaps at or above a
minimum length between the reads in a pair and chooses the overlap that
results in the lowest mismatch density (proportion of mismatched bases in
the overlapped region). Ties between multiple overlaps are broken by
considering quality scores at mismatch sites. When building the merged
sequence, FLASH computes a consensus sequence in the overlapped region.
More details can be found in the original publication
(http://bioinformatics.oxfordjournals.org/content/27/21/2957.full).

Limitations of FLASH include:
   - FLASH cannot merge paired-end reads that do not overlap.
   - FLASH cannot merge read pairs that have an outward orientation, either
     due to being "jumping" reads or due to excessive trimming.
   - FLASH is not designed for data that has a significant amount of indel
     errors (such as Sanger sequencing data). It is best suited for Illumina
     data.

----------------------------------------------------------------------------
                               MANDATORY INPUT
----------------------------------------------------------------------------

The most common input to FLASH is two FASTQ files containing read 1 and read 2
of each mate pair, respectively, in the same order.

Alternatively, you may provide one FASTQ file, which may be standard input,
containing paired-end reads in either interleaved FASTQ (see the
--interleaved-input option) or tab-delimited (see the --tab-delimited-input
option) format. In all cases, gzip compressed input is autodetected. Also,
in all cases, the PHRED offset is, by default, assumed to be 33; use the
--phred-offset option to change it.

----------------------------------------------------------------------------
                                   OUTPUT
----------------------------------------------------------------------------

The default output of FLASH consists of the following files:

   - out.extendedFrags.fastq      The merged reads.
   - out.notCombined_1.fastq      Read 1 of mate pairs that were not merged.
   - out.notCombined_2.fastq      Read 2 of mate pairs that were not merged.
   - out.hist                     Numeric histogram of merged read lengths.
   - out.histogram                Visual histogram of merged read lengths.

FLASH also logs informational messages to standard output. These can be
redirected to a file, as in the following example:

   $ flash reads_1.fq reads_2.fq | tee flash.log

In addition, FLASH supports several features affecting the output:

   - Writing the merged reads directly to standard output (--to-stdout)
   - Writing gzip compressed output files (-z) or using an external
     compression program (--compress-prog)
   - Writing the uncombined read pairs in interleaved FASTQ format
     (--interleaved-output)
   - Writing all output reads to a single file in tab-delimited format
     (--tab-delimited-output)

----------------------------------------------------------------------------
                                   OPTIONS
----------------------------------------------------------------------------

  -m, --min-overlap=NUM   The minimum required overlap length between two
                          reads to provide a confident overlap. Default:
                          10bp.

  -M, --max-overlap=NUM   Maximum overlap length expected in approximately
                          90% of read pairs. It is by default set to 70bp,
                          which works well for 100bp reads generated from a
                          180bp library, assuming a normal distribution of
                          fragment lengths. Overlaps longer than the maximum
                          overlap parameter are still considered as good
                          overlaps, but the mismatch density (explained below)
                          is calculated over the first max_overlap bases in
                          the overlapped region rather than the entire
                          overlap. Default: 70bp, or calculated from the
                          specified read length, fragment length, and fragment
                          length standard deviation.

  -x, --max-mismatch-density=NUM
                          Maximum allowed ratio between the number of
                          mismatched base pairs and the overlap length.
                          Two reads will not be combined with a given overlap
                          if that overlap results in a mismatched base density
                          higher than this value. Note: Any occurrence of an
                          'N' in either read is ignored and not counted
                          towards the mismatches or overlap length. Our
                          experimental results suggest that higher values of
                          the maximum mismatch density yield larger
                          numbers of correctly merged read pairs but at
                          the expense of higher numbers of incorrectly
                          merged read pairs. Default: 0.25.

  -p, --phred-offset=OFFSET
                          The smallest ASCII value of the characters used to
                          represent quality values of bases in FASTQ files.
                          It should be set to either 33, which corresponds
                          to the later Illumina platforms and Sanger
                          platforms, or 64, which corresponds to the
                          earlier Illumina platforms. Default: 33.

  -r, --read-len=LEN

  -f, --fragment-len=LEN

  -s, --fragment-len-stddev=LEN
                          Average read length, fragment length, and fragment
                          standard deviation. These are convenience parameters
                          only, as they are only used for calculating the
                          maximum overlap (--max-overlap) parameter.
                          The maximum overlap is calculated as the overlap of
                          average-length reads from an average-size fragment
                          plus 2.5 times the fragment length standard
                          deviation. The default values are -r 100, -f 180,
                          and -s 18, so this works out to a maximum overlap of
                          65 bp. If --max-overlap is specified, then the
                          specified value overrides the calculated value.

                          If you do not know the standard deviation of the
                          fragment library, you can probably assume that the
                          standard deviation is 10% of the average fragment
                          length.

  --cap-mismatch-quals    Cap quality scores assigned at mismatch locations
                          to 2. This was the default behavior in FLASH v1.2.7
                          and earlier. Later versions will instead calculate
                          such scores as the
                          absolute value of the difference in quality scores,
                          but at least 2. Essentially, the new behavior
                          prevents a low quality base call that is likely a
                          sequencing error from significantly bringing down
                          the quality of a high quality, likely correct base
                          call.

  --interleaved-input     Instead of requiring files MATES_1.FASTQ and
                          MATES_2.FASTQ, allow a single file MATES.FASTQ that
                          has the paired-end reads interleaved. Specify "-"
                          to read from standard input.

  --interleaved-output    Write the uncombined pairs in interleaved FASTQ
                          format.

  -I, --interleaved       Equivalent to specifying both --interleaved-input
                          and --interleaved-output.

  -Ti, --tab-delimited-input
                          Assume the input is in tab-delimited format
                          rather than FASTQ, in the format described below in
                          '--tab-delimited-output'. In this mode you should
                          provide a single input file, each line of which must
                          contain either a read pair (5 fields) or a single
                          read (3 fields). FLASH will try to combine the read
                          pairs. Single reads will be written to the output
                          file as-is if also using --tab-delimited-output;
                          otherwise they will be ignored. Note that you may
                          specify "-" as the input file to read the
                          tab-delimited data from standard input.

  -To, --tab-delimited-output
                          Write output in tab-delimited format (not FASTQ).
                          Each line will contain either a combined pair in the
                          format 'tag &lt;tab&gt; seq &lt;tab&gt; qual' or an uncombined
                          pair in the format 'tag &lt;tab&gt; seq_1 &lt;tab&gt; qual_1
                          &lt;tab&gt; seq_2 &lt;tab&gt; qual_2'.

  -o, --output-prefix=PREFIX
                          Prefix of output files. Default: "out".

  -d, --output-directory=DIR
                          Path to directory for output files. Default:
                          current working directory.

  -c, --to-stdout
                          Write the combined reads to standard output. In
                          this mode, with FASTQ output (the default) the
                          uncombined reads are discarded. With tab-delimited
                          output, uncombined reads are included in the
                          tab-delimited data written to standard output.
                          In both cases, histogram files are not written,
                          and informational messages are sent to standard
                          error rather than to standard output.

  --suffix=SUFFIX, --output-suffix=SUFFIX
                          Use SUFFIX as the suffix of the output files
                          after ".fastq". A dot before the suffix is assumed,
                          unless an empty suffix is provided. Default:
                          nothing; or 'gz' if -z is specified; or PROG if
                          --compress-prog=PROG is specified.

  -t, --threads=NTHREADS  Set the number of worker threads. This is in
                          addition to the I/O threads. Default: number of
                          processors. Note: if you need FLASH's output to
                          appear deterministically or in the same order as
                          the original reads, you must specify -t 1
                          (--threads=1).

  -q, --quiet             Do not print informational messages.

  -h, --help              Display this help and exit.

  -v, --version           Display version.
    </help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Sat Jul 04 08:58:21 2015 -0400 @@ -0,0 +1,106 @@ +Tool wrapper by Brian Yeo +brian.yeo@phac.aspc.gc.ca + + INTRODUCTION + +FLASH (Fast Length Adjustment of SHort reads) is an accurate and fast tool +to merge paired-end reads that were generated from DNA fragments whose +lengths are shorter than twice the length of reads. Merged read pairs result +in unpaired longer reads, which are generally more desired in genome +assembly and genome analysis processes. + +Briefly, the FLASH algorithm considers all possible overlaps at or above a +minimum length between the reads in a pair and chooses the overlap that +results in the lowest mismatch density (proportion of mismatched bases in +the overlapped region). Ties between multiple overlaps are broken by +considering quality scores at mismatch sites. When building the merged +sequence, FLASH computes a consensus sequence in the overlapped region. +More details can be found in the original publication +(http://bioinformatics.oxfordjournals.org/content/27/21/2957.full). + +Limitations of FLASH include: + - FLASH cannot merge paired-end reads that do not overlap. + - FLASH cannot merge read pairs that have an outward orientation, either + due to being "jumping" reads or due to excessive trimming. + - FLASH is not designed for data that has a significant amount of indel + errors (such as Sanger sequencing data). It is best suited for Illumina + data. + + INSTALLATION + +On UNIX-compatible systems, including GNU/Linux and Mac OS X, you must compile +FLASH from source. The only dependency, other than functions that are expected +to be available in the C library, is the zlib data compression library. To +install FLASH, download the tarball, untar it, and compile the code using the +provided Makefile: + + $ tar xzf FLASH-1.2.9.tar.gz + $ cd FLASH-1.2.9 + $ make + +The executable file that is produced is named 'flash'. 
To run it from the +command line you must copy it to a location on your $PATH variable, or else run +it with a path including a directory, such as "./flash". + +FLASH also runs on Windows, and you can compile it on Windows using MinGW. +However, for convenience you may instead download a standalone Windows binary +from the SourceForge page (https://sourceforge.net/projects/flashpage/). + + USAGE + +Please compile FLASH and run `flash --help' to see command-line usage +information and information about input/output files. + + MULTITHREADING + +By default, FLASH uses multiple threads. There are "combiner" threads that do +the actual read combining, as well as up to 5 threads that are used for I/O (up +to 2 readers, up to 3 writers). The default number of combiner threads is the +number of processors; however, it can be adjusted with the -t option (long +option: --threads). + +When multiple combiner threads are used, the order of the combined and +uncombined reads in the output files will be nondeterministic. If you need to +enforce that the output reads appear in the same order as the input, you must +specify --threads=1. + + PERFORMANCE + +Since the FLASH algorithm considers each read pair independently, FLASH will, by +default, process read pairs in parallel. FLASH v1.2.9 and later also make use +of vector instructions available on modern x86 CPUs. Consequently, FLASH works +quite fast, even with low-cost computing resources. As an example, we ran FLASH +v1.2.9 on a laptop with a dual-core 2.3 GHz AMD x86_64 processor and it +processed one million 101-bp read pairs in 11.6 seconds with the default +parameters. Less than 2 MB of memory was used. Actual timing results will +vary, but they will depend primarily on the number of CPUs available, the speed +of each CPU, and on the I/O speed of reading the input files and writing the +output files. FLASH is designed to be scalable to dozens of processors, +although its speed may be limited by I/O in such cases. 
+ + ACCURACY + +With reads' error rate of 1% or less, FLASH processes over 99% of read pairs +correctly. With error rate of 2%, FLASH processes over 98% of read pairs +correctly when default parameters are used. With more aggressive parameters +(i.e., -x 0.35), FLASH processes over 90% of read pairs correctly even when the +error rate is 5%. + + PUBLICATION + +Title: FLASH: fast length adjustment of short reads to improve genome assemblies +Authors: Tanja Magoč and Steven L. Salzberg +URL: http://bioinformatics.oxfordjournals.org/content/27/21/2957.full + + LICENSE + +FLASH is released under the GNU General Public License Version 3 or later (see +COPYING). + + COMMENTS/QUESTIONS/REQUESTS + +Send an e-mail to flash.comment@gmail.com + +Other versions are available from the SourceForge page: + +https://sourceforge.net/projects/flashpage/
<?xml version="1.0" encoding="UTF-8"?>
<!--
    tool_dependencies.xml: Galaxy Tool Shed recipe that builds FLASH 1.2.9 from
    the SourceForge tarball and exposes the resulting "flash" executable on PATH.
-->
<tool_dependency>
    <package name="FLASH" version="1.2.9">
        <install version="1.0">
            <actions>
                <!-- Fetch the source tarball over HTTPS. -->
                <action type="download_by_url" target_filename="FLASH-1.2.9.tar.gz">https://sourceforge.net/projects/flashpage/files/FLASH-1.2.9.tar.gz/download</action>
                <!-- Build with the Makefile shipped in the tarball. -->
                <action type="shell_command">make</action>
                <!-- Copy the build tree, including the flash binary, into the install directory. -->
                <action type="shell_command">cp -r * $INSTALL_DIR</action>
                <!-- Executable for everyone, writable only by the owner (was world-writable 777). -->
                <action type="chmod"><file mode="755">$INSTALL_DIR/flash</file></action>
                <!-- Make "flash" resolvable by tools that require this package. -->
                <action type="set_environment">
                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR</environment_variable>
                </action>
            </actions>
        </install>
    </package>
</tool_dependency>