# HG changeset patch
# User aaronpetkau
# Date 1436014701 14400
# Node ID 6889442b27dcc691e48a52fbb3986afde59abd41
# Parent a444685f161ca877fc127adfbf63c5d7f2858e5d
Uploaded
diff -r a444685f161c -r 6889442b27dc FLASH.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/FLASH.sh Sat Jul 04 08:58:21 2015 -0400
@@ -0,0 +1,41 @@
+#!/bin/bash
+# Galaxy wrapper for flash: runs it and moves its out.* files to the given paths.
+# Usage: FLASH.sh MERGED NOT_COMBINED_1 NOT_COMBINED_2 INTERLEAVED TAB LOG [flash arguments]
+
+#grab output files, aborting with a message if any of the six paths is missing
+merged_reads=${1:?missing path for merged reads}
+not_combined_1=${2:?missing path for uncombined read 1}
+not_combined_2=${3:?missing path for uncombined read 2}
+inter_not_combined=${4:?missing path for interleaved uncombined reads}
+reads_and_pairs=${5:?missing path for tab-delimited output}
+log_file=${6:?missing path for log file}
+shift 6
+
+#quote the arguments and the log path so values with spaces or glob characters stay intact
+flash "$@" > "$log_file"
+flash_status=$?
+sleep 5 # pause kept from the original wrapper; its rationale is not documented
+
+if [ -f out.notCombined_2.fastq ];
+then
+  mv -- out.notCombined_2.fastq "$not_combined_2"
+fi
+if [ -f out.notCombined_1.fastq ];
+then
+  mv -- out.notCombined_1.fastq "$not_combined_1"
+fi
+if [ -f out.notCombined.fastq ];
+then
+  mv -- out.notCombined.fastq "$inter_not_combined"
+fi
+if [ -f out.readsAndPairs.tab ];
+then
+  mv -- out.readsAndPairs.tab "$reads_and_pairs"
+fi
+if [ -f out.extendedFrags.fastq ];
+then
+  mv -- out.extendedFrags.fastq "$merged_reads"
+fi
+
+#exit with the status flash returned rather than always reporting success
+exit "$flash_status"
diff -r a444685f161c -r 6889442b27dc FLASH.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/FLASH.xml Sat Jul 04 08:58:21 2015 -0400
@@ -0,0 +1,331 @@
+
+ merge paired-end reads from fragments that are shorter than twice the length of reads
+
+ FLASH.sh $extendedFrags $notCombined1 $notCombined2 $interNotCombined $readsAndPairs $log_file -o out -t 4
+ #if $min_overlap
+ -m $min_overlap
+ #end if
+ #if $max_overlap
+ -M $max_overlap
+ #else
+ -M 250
+ #end if
+ #if $outputs.output_type == "Interleaved_fastq"
+ --interleaved-output
+ #else if $outputs.output_type == "tab"
+ -To
+ #end if
+ #if $options.options_select == "advanced"
+ #if $options.max_mismatch_density
+ -x $options.max_mismatch_density
+ #end if
+ #if $options.phred_offset
+ -p $options.phred_offset
+ #end if
+ #if $options.read_length
+ -r $options.read_length
+ #end if
+ #if $options.fragment_length
+ -f $options.fragment_length
+ #end if
+ #if $options.fragment_stdev
+ -s $options.fragment_stdev
+ #end if
+ #if $options.cap_mismatch_quals
+ $options.cap_mismatch_quals
+ #end if
+ #if $options.quiet
+ $options.quiet
+ #end if
+ #end if
+
+ #if $input_type.sPaired == "paired":
+ $input_type.pInput1 $input_type.pInput2
+ #elif $input_type.sPaired == "collections":
+ $input_type.fastq_collection.forward $input_type.fastq_collection.reverse
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ outputs['output_type'] != "tab"
+
+
+ outputs['output_type'] == "Non-interleaved_fastq"
+
+
+ outputs['output_type'] == "Non-interleaved_fastq"
+
+
+ outputs['output_type'] == "Interleaved_fastq"
+
+
+ outputs['output_type'] == "tab"
+
+
+
+
+
+ FLASH
+
+
+----------------------------------------------------------------------------
+ DESCRIPTION
+----------------------------------------------------------------------------
+
+FLASH (Fast Length Adjustment of SHort reads) is an accurate and fast tool
+to merge paired-end reads that were generated from DNA fragments whose
+lengths are shorter than twice the length of reads. Merged read pairs result
+in unpaired longer reads, which are generally more desired in genome
+assembly and genome analysis processes.
+
+Briefly, the FLASH algorithm considers all possible overlaps at or above a
+minimum length between the reads in a pair and chooses the overlap that
+results in the lowest mismatch density (proportion of mismatched bases in
+the overlapped region). Ties between multiple overlaps are broken by
+considering quality scores at mismatch sites. When building the merged
+sequence, FLASH computes a consensus sequence in the overlapped region.
+More details can be found in the original publication
+(http://bioinformatics.oxfordjournals.org/content/27/21/2957.full).
+
+Limitations of FLASH include:
+ - FLASH cannot merge paired-end reads that do not overlap.
+ - FLASH cannot merge read pairs that have an outward orientation, either
+ due to being "jumping" reads or due to excessive trimming.
+ - FLASH is not designed for data that has a significant amount of indel
+ errors (such as Sanger sequencing data). It is best suited for Illumina
+ data.
+
+----------------------------------------------------------------------------
+ MANDATORY INPUT
+----------------------------------------------------------------------------
+
+The most common input to FLASH is two FASTQ files containing read 1 and read 2
+of each mate pair, respectively, in the same order.
+
+Alternatively, you may provide one FASTQ file, which may be standard input,
+containing paired-end reads in either interleaved FASTQ (see the
+--interleaved-input option) or tab-delimited (see the --tab-delimited-input
+option) format. In all cases, gzip compressed input is autodetected. Also,
+in all cases, the PHRED offset is, by default, assumed to be 33; use the
+--phred-offset option to change it.
+
+----------------------------------------------------------------------------
+ OUTPUT
+----------------------------------------------------------------------------
+
+The default output of FLASH consists of the following files:
+
+ - out.extendedFrags.fastq The merged reads.
+ - out.notCombined_1.fastq Read 1 of mate pairs that were not merged.
+ - out.notCombined_2.fastq Read 2 of mate pairs that were not merged.
+ - out.hist Numeric histogram of merged read lengths.
+ - out.histogram Visual histogram of merged read lengths.
+
+FLASH also logs informational messages to standard output. These can be
+redirected to a file, as in the following example:
+
+ $ flash reads_1.fq reads_2.fq | tee flash.log
+
+In addition, FLASH supports several features affecting the output:
+
+ - Writing the merged reads directly to standard output (--to-stdout)
+ - Writing gzip compressed output files (-z) or using an external
+ compression program (--compress-prog)
+ - Writing the uncombined read pairs in interleaved FASTQ format
+ (--interleaved-output)
+ - Writing all output reads to a single file in tab-delimited format
+ (--tab-delimited-output)
+
+----------------------------------------------------------------------------
+ OPTIONS
+----------------------------------------------------------------------------
+
+ -m, --min-overlap=NUM The minimum required overlap length between two
+ reads to provide a confident overlap. Default:
+ 10bp.
+
+ -M, --max-overlap=NUM Maximum overlap length expected in approximately
+ 90% of read pairs. It is by default set to 70bp,
+ which works well for 100bp reads generated from a
+ 180bp library, assuming a normal distribution of
+ fragment lengths. Overlaps longer than the maximum
+ overlap parameter are still considered as good
+ overlaps, but the mismatch density (explained below)
+ is calculated over the first max_overlap bases in
+ the overlapped region rather than the entire
+ overlap. Default: 70bp, or calculated from the
+ specified read length, fragment length, and fragment
+ length standard deviation.
+
+ -x, --max-mismatch-density=NUM
+ Maximum allowed ratio between the number of
+ mismatched base pairs and the overlap length.
+ Two reads will not be combined with a given overlap
+ if that overlap results in a mismatched base density
+ higher than this value. Note: Any occurrence of an
+ 'N' in either read is ignored and not counted
+ towards the mismatches or overlap length. Our
+ experimental results suggest that higher values of
+ the maximum mismatch density yield larger
+ numbers of correctly merged read pairs but at
+ the expense of higher numbers of incorrectly
+ merged read pairs. Default: 0.25.
+
+ -p, --phred-offset=OFFSET
+ The smallest ASCII value of the characters used to
+ represent quality values of bases in FASTQ files.
+ It should be set to either 33, which corresponds
+ to the later Illumina platforms and Sanger
+ platforms, or 64, which corresponds to the
+ earlier Illumina platforms. Default: 33.
+
+ -r, --read-len=LEN
+
+ -f, --fragment-len=LEN
+
+ -s, --fragment-len-stddev=LEN
+ Average read length, fragment length, and fragment
+ standard deviation. These are convenience parameters
+ only, as they are only used for calculating the
+ maximum overlap (--max-overlap) parameter.
+ The maximum overlap is calculated as the overlap of
+ average-length reads from an average-size fragment
+ plus 2.5 times the fragment length standard
+ deviation. The default values are -r 100, -f 180,
+ and -s 18, so this works out to a maximum overlap of
+ 65 bp. If --max-overlap is specified, then the
+ specified value overrides the calculated value.
+
+ If you do not know the standard deviation of the
+ fragment library, you can probably assume that the
+ standard deviation is 10% of the average fragment
+ length.
+
+ --cap-mismatch-quals Cap quality scores assigned at mismatch locations
+ to 2. This was the default behavior in FLASH v1.2.7
+ and earlier. Later versions will instead calculate
+ such scores as the
+ absolute value of the difference in quality scores,
+ but at least 2. Essentially, the new behavior
+ prevents a low quality base call that is likely a
+ sequencing error from significantly bringing down
+ the quality of a high quality, likely correct base
+ call.
+
+ --interleaved-input Instead of requiring files MATES_1.FASTQ and
+ MATES_2.FASTQ, allow a single file MATES.FASTQ that
+ has the paired-end reads interleaved. Specify "-"
+ to read from standard input.
+
+ --interleaved-output Write the uncombined pairs in interleaved FASTQ
+ format.
+
+ -I, --interleaved Equivalent to specifying both --interleaved-input
+ and --interleaved-output.
+
+ -Ti, --tab-delimited-input
+ Assume the input is in tab-delimited format
+ rather than FASTQ, in the format described below in
+ '--tab-delimited-output'. In this mode you should
+ provide a single input file, each line of which must
+ contain either a read pair (5 fields) or a single
+ read (3 fields). FLASH will try to combine the read
+ pairs. Single reads will be written to the output
+ file as-is if also using --tab-delimited-output;
+ otherwise they will be ignored. Note that you may
+ specify "-" as the input file to read the
+ tab-delimited data from standard input.
+
+ -To, --tab-delimited-output
+ Write output in tab-delimited format (not FASTQ).
+ Each line will contain either a combined pair in the
+ format 'tag <tab> seq <tab> qual' or an uncombined
+ pair in the format 'tag <tab> seq_1 <tab> qual_1
+ <tab> seq_2 <tab> qual_2'.
+
+ -o, --output-prefix=PREFIX
+ Prefix of output files. Default: "out".
+
+ -d, --output-directory=DIR
+ Path to directory for output files. Default:
+ current working directory.
+
+ -c, --to-stdout
+ Write the combined reads to standard output. In
+ this mode, with FASTQ output (the default) the
+ uncombined reads are discarded. With tab-delimited
+ output, uncombined reads are included in the
+ tab-delimited data written to standard output.
+ In both cases, histogram files are not written,
+ and informational messages are sent to standard
+ error rather than to standard output.
+
+ --suffix=SUFFIX, --output-suffix=SUFFIX
+ Use SUFFIX as the suffix of the output files
+ after ".fastq". A dot before the suffix is assumed,
+ unless an empty suffix is provided. Default:
+ nothing; or 'gz' if -z is specified; or PROG if
+ --compress-prog=PROG is specified.
+
+ -t, --threads=NTHREADS Set the number of worker threads. This is in
+ addition to the I/O threads. Default: number of
+ processors. Note: if you need FLASH's output to
+ appear deterministically or in the same order as
+ the original reads, you must specify -t 1
+ (--threads=1).
+
+ -q, --quiet Do not print informational messages.
+
+ -h, --help Display this help and exit.
+
+ -v, --version Display version.
+
+
diff -r a444685f161c -r 6889442b27dc README
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README Sat Jul 04 08:58:21 2015 -0400
@@ -0,0 +1,106 @@
+Tool wrapper by Brian Yeo
+brian.yeo@phac.aspc.gc.ca
+
+ INTRODUCTION
+
+FLASH (Fast Length Adjustment of SHort reads) is an accurate and fast tool
+to merge paired-end reads that were generated from DNA fragments whose
+lengths are shorter than twice the length of reads. Merged read pairs result
+in unpaired longer reads, which are generally more desired in genome
+assembly and genome analysis processes.
+
+Briefly, the FLASH algorithm considers all possible overlaps at or above a
+minimum length between the reads in a pair and chooses the overlap that
+results in the lowest mismatch density (proportion of mismatched bases in
+the overlapped region). Ties between multiple overlaps are broken by
+considering quality scores at mismatch sites. When building the merged
+sequence, FLASH computes a consensus sequence in the overlapped region.
+More details can be found in the original publication
+(http://bioinformatics.oxfordjournals.org/content/27/21/2957.full).
+
+Limitations of FLASH include:
+ - FLASH cannot merge paired-end reads that do not overlap.
+ - FLASH cannot merge read pairs that have an outward orientation, either
+ due to being "jumping" reads or due to excessive trimming.
+ - FLASH is not designed for data that has a significant amount of indel
+ errors (such as Sanger sequencing data). It is best suited for Illumina
+ data.
+
+ INSTALLATION
+
+On UNIX-compatible systems, including GNU/Linux and Mac OS X, you must compile
+FLASH from source. The only dependency, other than functions that are expected
+to be available in the C library, is the zlib data compression library. To
+install FLASH, download the tarball, untar it, and compile the code using the
+provided Makefile:
+
+ $ tar xzf FLASH-1.2.9.tar.gz
+ $ cd FLASH-1.2.9
+ $ make
+
+The executable file that is produced is named 'flash'. To run it from the
+command line you must copy it to a location on your $PATH variable, or else run
+it with a path including a directory, such as "./flash".
+
+FLASH also runs on Windows, and you can compile it on Windows using MinGW.
+However, for convenience you may instead download a standalone Windows binary
+from the SourceForge page (https://sourceforge.net/projects/flashpage/).
+
+ USAGE
+
+Please compile FLASH and run `flash --help' to see command-line usage
+information and information about input/output files.
+
+ MULTITHREADING
+
+By default, FLASH uses multiple threads. There are "combiner" threads that do
+the actual read combining, as well as up to 5 threads that are used for I/O (up
+to 2 readers, up to 3 writers). The default number of combiner threads is the
+number of processors; however, it can be adjusted with the -t option (long
+option: --threads).
+
+When multiple combiner threads are used, the order of the combined and
+uncombined reads in the output files will be nondeterministic. If you need to
+enforce that the output reads appear in the same order as the input, you must
+specify --threads=1.
+
+ PERFORMANCE
+
+Since the FLASH algorithm considers each read pair independently, FLASH will, by
+default, process read pairs in parallel. FLASH v1.2.9 and later also make use
+of vector instructions available on modern x86 CPUs. Consequently, FLASH works
+quite fast, even with low-cost computing resources. As an example, we ran FLASH
+v1.2.9 on a laptop with a dual-core 2.3 GHz AMD x86_64 processor and it
+processed one million 101-bp read pairs in 11.6 seconds with the default
+parameters. Less than 2 MB of memory was used. Actual timing results will
+vary, but they will depend primarily on the number of CPUs available, the speed
+of each CPU, and on the I/O speed of reading the input files and writing the
+output files. FLASH is designed to be scalable to dozens of processors,
+although its speed may be limited by I/O in such cases.
+
+ ACCURACY
+
+When the read error rate is 1% or less, FLASH processes over 99% of read pairs
+correctly. With an error rate of 2%, FLASH processes over 98% of read pairs
+correctly when default parameters are used. With more aggressive parameters
+(i.e., -x 0.35), FLASH processes over 90% of read pairs correctly even when the
+error rate is 5%.
+
+ PUBLICATION
+
+Title: FLASH: fast length adjustment of short reads to improve genome assemblies
+Authors: Tanja Magoč and Steven L. Salzberg
+URL: http://bioinformatics.oxfordjournals.org/content/27/21/2957.full
+
+ LICENSE
+
+FLASH is released under the GNU General Public License Version 3 or later (see
+COPYING).
+
+ COMMENTS/QUESTIONS/REQUESTS
+
+Send an e-mail to flash.comment@gmail.com
+
+Other versions are available from the SourceForge page:
+
+https://sourceforge.net/projects/flashpage/
diff -r a444685f161c -r 6889442b27dc tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Sat Jul 04 08:58:21 2015 -0400
@@ -0,0 +1,16 @@
+
+
+
+
+
+ http://sourceforge.net/projects/flashpage/files/FLASH-1.2.9.tar.gz/download
+ make
+ cp -r * $INSTALL_DIR
+ $INSTALL_DIR/flash
+
+ $INSTALL_DIR
+
+
+
+
+