Mercurial > repos > xuebing > sharplabtool
view tools/sr_mapping/bfast_wrapper.xml @ 1:cdcb0ce84a1b
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:45:15 -0500 |
parents | 9071e359b9a3 |
children |
line wrap: on
line source
<tool id="bfast_wrapper" name="Map with BFAST" version="0.1.3">
  <!--
    Galaxy wrapper around bfast_wrapper.py. It maps a Sanger FASTQ dataset
    (nucleotide or color space) against either a pre-built BFAST index set or
    a FASTA reference taken from the history, and writes the result as SAM.
  -->
  <description></description>
  <!--
    The command below is a Cheetah template and is line-oriented: a '##'
    comment runs to the end of its line, and each #if / #elif / #else /
    #end if directive is written on a line of its own. Keep one option or
    directive per line; do not re-flow this text onto a single line.
  -->
  <command interpreter="python">bfast_wrapper.py
    --numThreads="4" ##HACK: hardcode numThreads for now, should come from a location file
    --fastq="$input1"
    #if $input1.extension.startswith( "fastqcs" ):
        ##if extension starts with fastqcs, then we have a color space file
        --space="1" ##color space
    #else
        --space="0"
    #end if
    --output="$output"
    $suppressHeader

    #if $refGenomeSource.refGenomeSource_type == "history":
      ##build indexes on the fly
      --buildIndex
      --ref="${refGenomeSource.ownFile}"
      ##one "mask:hash_width" pair per custom_index repeat, joined with commas
      --indexMask="${",".join( [ "%s:%s" % ( str( custom_index.get( 'mask' ) ).strip(), str( custom_index.get( 'hash_width' ) ).strip() ) for custom_index in $refGenomeSource.custom_index ] )}"
      ${refGenomeSource.indexing_repeatmasker}
      #if $refGenomeSource.indexing_option.indexing_option_selector == "contig_offset":
        --indexContigOptions="${refGenomeSource.indexing_option.start_contig},${refGenomeSource.indexing_option.start_pos},${refGenomeSource.indexing_option.end_contig},${refGenomeSource.indexing_option.end_pos}"
      #elif $refGenomeSource.indexing_option.indexing_option_selector == "exons_file":
        --indexExonsFileName="${refGenomeSource.indexing_option.exons_file}"
      #end if
    #else:
      ##use precomputed indexes
      --ref="${ refGenomeSource.indices.fields.path }"
    #end if

    #if $params.source_select == "full":
      --offsets="$params.offsets"
      --keySize="$params.keySize"
      --maxKeyMatches="$params.maxKeyMatches"
      --maxNumMatches="$params.maxNumMatches"
      --whichStrand="$params.whichStrand"

      #if str( $params.scoringMatrixFileName ) != 'None':
        --scoringMatrixFileName="$params.scoringMatrixFileName"
      #end if
      ${params.ungapped}
      ${params.unconstrained}
      --offset="${params.offset}"
      --avgMismatchQuality="${params.avgMismatchQuality}"

      --algorithm="${params.localalign_params.algorithm}"
      ${params.unpaired}
      ${params.reverseStrand}
      #if $params.localalign_params.algorithm == "3":
        ${params.localalign_params.pairedEndInfer}
        ${params.localalign_params.randomBest}
      #end if
    #end if
  </command>
  <inputs>
    <param name="input1" type="data" format="fastqsanger,fastqcssanger" label="FASTQ file" help="Must have Sanger-scaled quality values with ASCII offset 33"/>
    <!-- Reference selection: a row of the bfast_indexes data table, or a FASTA from the history that is indexed on the fly. -->
    <conditional name="refGenomeSource">
      <param name="refGenomeSource_type" type="select" label="Will you select a reference genome from your history or use a built-in index?">
        <option value="indexed">Use a built-in index</option>
        <option value="history">Use one from the history</option>
      </param>
      <when value="indexed">
        <param name="indices" type="select" label="Select a reference genome index set">
          <options from_data_table="bfast_indexes">
            <!-- Column 2 is split on commas and matched against the extension of input1. -->
            <filter type="multiple_splitter" column="2" separator=","/>
            <filter type="param_value" column="2" ref="input1" ref_attribute="extension"/>
            <filter type="sort_by" column="3"/>
            <validator type="no_options" message="No indexes are available for the selected input dataset"/>
          </options>
        </param>
      </when>
      <when value="history">
        <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />
        <repeat name="custom_index" title="Custom indice" min="1" >
          <param name="mask" type="text" value="" label="Specify the mask" size="20">
            <!-- TODO: add a validator here (integer check, or a regex accepting only 0s and 1s). -->
          </param>
          <param name="hash_width" type="integer" value="" label="Hash Width" />
        </repeat>
        <param name="indexing_repeatmasker" type="boolean" truevalue="--indexRepeatMasker" falsevalue="" checked="False" label="Do not index lower case sequences" help="Such as those created by RepeatMasker"/>
        <conditional name="indexing_option">
          <param name="indexing_option_selector" type="select" label="BFAST indexing settings to use" help="For most indexing needs use default settings. If you want full control use the other options.">
            <option value="default">Default</option>
            <option value="contig_offset">Contig Offset</option>
            <option value="exons_file">Exons file</option>
          </param>
          <when value="default">
            <!-- nothing here -->
          </when>
          <when value="contig_offset">
            <param name="start_contig" type="integer" value="-1" label="Start Contig" help="Specifies the first contig to include when building indexes. (advanced users only)" />
            <param name="start_pos" type="integer" value="-1" label="Start Position" help="Specifies the first position in the first contig to include when building indexes. (advanced users only)" />
            <param name="end_contig" type="integer" value="-1" label="End Contig" help="Specifies the last contig to include when building indexes. (advanced users only)" />
            <param name="end_pos" type="integer" value="-1" label="End Position" help="Specifies the last position in the last contig to include when building indexes. (advanced users only)" />
          </when>
          <when value="exons_file">
            <param name="exons_file" type="data" format="tabular" label="Select an exons file from history" help="See BFAST manual for file format requirements. (advanced users only)"/>
          </when>
        </conditional>
      </when>
    </conditional>
    <!-- Matching / alignment / post-processing options; only passed on the command line when "full" is selected. -->
    <conditional name="params">
      <param name="source_select" type="select" label="BFAST matching settings to use" help="For most mapping needs use Commonly Used settings. If you want full control use Full Parameter List">
        <option value="pre_set">Commonly Used</option>
        <option value="full">Full Parameter List</option>
      </param>
      <when value="pre_set">
        <!-- nothing here -->
      </when>
      <when value="full">
        <param name="offsets" type="text" value="" label="The offsets for 'bfast match'" help="Set if not all offsets from the 5' end of the read are to be examined (advanced users only)" />
        <param name="keySize" type="integer" value="-1" label="Truncate key size in 'match'" help="Set this to reduce the effective key size of all indexes in 'bfast match' (advanced users only)" />
        <param name="maxKeyMatches" type="integer" value="8" label="The maximum number of matches to allow before a key is ignored" help="Lower values will result in more unique regions being examined, while larger values will allow include repetitive regions" />
        <param name="maxNumMatches" type="integer" value="384" label="The maximum number of matches to allow before a read is discarded" help="Larger values will allow more hits to be examined" />
        <param name="whichStrand" type="select" label="The strands to consider" help="Both strands, forward strand only, or reverse strand only">
          <option value="0">Both strands</option>
          <option value="1">Forward strand only</option>
          <option value="2">Reverse strand only</option>
        </param>

        <param name="scoringMatrixFileName" type="data" format="text" optional="True" label="Scoring Matrix file used to score the alignments" help="See BFAST manual for file format requirements. (advanced users only)"/>
        <param name="ungapped" type="boolean" truevalue="--ungapped" falsevalue="" checked="no" label="Perform ungapped local alignment" help="Performing ungapped local alignment will not consider indels while providing a significant speed increase" />
        <param name="unconstrained" type="boolean" truevalue="--unconstrained" falsevalue="" checked="no" label="Perform unconstrained local alignment" help="Performing unconstrained local alignment will not use mask constraints at the cost of speed" />
        <param name="offset" type="integer" value="20" label="The number of bases before and after each hit to consider in local alignment" help="Larger values will allow for larger insertions and deletions to be detected at the cost of speed" />
        <param name="avgMismatchQuality" type="integer" value="10" label="The average mismatch quality" help="This can be used as a scaling factor for mapping quality (advanced users only)" />

        <conditional name="localalign_params">
          <param name="algorithm" type="select" label="The post processing algorithm" help="This determines how reads with multiple candidate alignments are returned. Unique alignments will return an alignment if the read has only one candidate alignment. Uniquely best scoring alignments will return one alignment for a read if that alignment has a better alignment score than the rest of the candidate alignments. All best scoring alignments will return all alignments that have the best alignment score for a read.">
            <option value="0" selected="True">No filtering</option>
            <option value="1">All alignments that pass filtering</option>
            <option value="2">Unique alignments</option>
            <option value="3">Uniquely best scoring alignments</option>
            <option value="4">All best scoring alignments</option>
          </param>
          <when value="0">
            <!-- nothing here -->
          </when>
          <when value="1">
            <!-- nothing here -->
          </when>
          <when value="2">
            <!-- nothing here -->
          </when>
          <when value="4">
            <!-- nothing here -->
          </when>
          <when value="3">
            <!-- These two flags are only emitted by the command template when algorithm == "3". -->
            <param name="pairedEndInfer" type="boolean" truevalue="--pairedEndInfer" falsevalue="" checked="no" label="pairedEndInfer" help="break ties when one end of a paired end read by estimating the insert size distribution" />
            <param name="randomBest" type="boolean" truevalue="--randomBest" falsevalue="" checked="no" label="Random alignments" help="output a random best scoring alignment (advanced users only)" />
          </when>
        </conditional>
        <param name="unpaired" type="boolean" truevalue="--unpaired" falsevalue="" checked="no" label="Disallow pairing" help="do not choose alignments based on pairing" />
        <param name="reverseStrand" type="boolean" truevalue="--reverseStrand" falsevalue="" checked="no" label="Reverse paired ends" help="paired end reads are given on reverse strands" />
      </when>
    </conditional>
    <param name="suppressHeader" type="boolean" truevalue="--suppressHeader" falsevalue="" checked="False" label="Suppress the header in the output SAM file" help="BFAST produces SAM with several lines of header information" />
  </inputs>
  <outputs>
    <data format="sam" name="output" label="${tool.name} on ${on_string}: mapped reads">
      <!--
        Set the output dbkey: for a built-in index it is read from column 1 of
        the bfast_indexes row whose column 0 matches the selected index; for a
        history reference it is copied from that dataset's dbkey.
      -->
      <actions>
        <conditional name="refGenomeSource.refGenomeSource_type">
          <when value="indexed">
            <action type="metadata" name="dbkey">
              <option type="from_data_table" column="1" name="bfast_indexes">
                <filter type="param_value" ref="refGenomeSource.indices" column="0" />
              </option>
            </action>
          </when>
          <when value="history">
            <action type="metadata" name="dbkey">
              <option type="from_param" name="refGenomeSource.ownFile" param_attribute="dbkey" />
            </action>
          </when>
        </conditional>
      </actions>
    </data>
  </outputs>
  <help>
**What it does**

BFAST facilitates the fast and accurate mapping of short reads to reference sequences. Some advantages of BFAST include:

* Speed: enables billions of short reads to be mapped quickly.
* Accuracy: A priori probabilities for mapping reads with defined set of variants
* An easy way to measurably tune accuracy at the expense of speed.

Specifically, BFAST was designed to facilitate whole-genome resequencing, where mapping billions of short reads with variants is of utmost importance.

BFAST supports both Illumina and ABI SOLiD data, as well as any other Next-Generation Sequencing Technology (454, Helicos), with particular emphasis on sensitivity towards errors, SNPs and especially indels. Other algorithms take short-cuts by ignoring errors, certain types of variants (indels), and even require further alignment, all to be the "fastest" (but still not complete). BFAST is able to be tuned to find variants regardless of the error-rate, polymorphism rate, or other factors.

------

Please cite the website "http://bfast.sourceforge.net" as well as the accompanying
papers:

Homer N, Merriman B, Nelson SF.
BFAST: An alignment tool for large scale genome resequencing.
PMID: 19907642
PLoS ONE. 2009 4(11): e7767.
http://dx.doi.org/10.1371/journal.pone.0007767

Homer N, Merriman B, Nelson SF.
Local alignment of two-base encoded DNA sequence.
BMC Bioinformatics. 2009 Jun 9;10(1):175.
PMID: 19508732
http://dx.doi.org/10.1186/1471-2105-10-175

------

**Know what you are doing**

.. class:: warningmark

There is no such thing (yet) as an automated gearshift in short read mapping. It is all like stick-shift driving in San Francisco. In other words = running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy.

.. __: http://bfast.sourceforge.net/

------

**Input formats**

BFAST accepts files in Sanger FASTQ format. Use the FASTQ Groomer to prepare your files.

------

**Outputs**

The output is in SAM format, and has the following columns::

    Column  Description
  --------  --------------------------------------------------------
  1  QNAME  Query (pair) NAME
  2  FLAG   bitwise FLAG
  3  RNAME  Reference sequence NAME
  4  POS    1-based leftmost POSition/coordinate of clipped sequence
  5  MAPQ   MAPping Quality (Phred-scaled)
  6  CIGAR  extended CIGAR string
  7  MRNM   Mate Reference sequence NaMe ('=' if same as RNAME)
  8  MPOS   1-based Mate POSition
  9  ISIZE  Inferred insert SIZE
  10 SEQ    query SEQuence on the same strand as the reference
  11 QUAL   query QUALity (ASCII-33 gives the Phred base quality)
  12 OPT    variable OPTional fields in the format TAG:VTYPE:VALUE

The flags are as follows::

    Flag  Description
  ------  -------------------------------------
  0x0001  the read is paired in sequencing
  0x0002  the read is mapped in a proper pair
  0x0004  the query sequence itself is unmapped
  0x0008  the mate is unmapped
  0x0010  strand of the query (1 for reverse)
  0x0020  strand of the mate
  0x0040  the read is the first read in a pair
  0x0080  the read is the second read in a pair
  0x0100  the alignment is not primary

It looks like this (scroll sideways to see the entire example)::

  QNAME  FLAG  RNAME  POS  MAPQ  CIGAR  MRNM  MPOS  ISIZE  SEQ  QUAL  OPT
  HWI-EAS91_1_30788AAXX:1:1:1761:343  4  *  0  0  *  *  0  0  AAAAAAANNAAAAAAAAAAAAAAAAAAAAAAAAAAACNNANNGAGTNGNNNNNNNGCTTCCCACAGNNCTGG  hhhhhhh;;hhhhhhhhhhh^hOhhhhghhhfhhhgh;;h;;hhhh;h;;;;;;;hhhhhhghhhh;;Phhh
  HWI-EAS91_1_30788AAXX:1:1:1578:331  4  *  0  0  *  *  0  0  GTATAGANNAATAAGAAAAAAAAAAATGAAGACTTTCNNANNTCTGNANNNNNNNTCTTTTTTCAGNNGTAG  hhhhhhh;;hhhhhhhhhhhhhhhhhhhhhhhhhhhh;;h;;hhhh;h;;;;;;;hhhhhhhhhhh;;hhVh

-------

**BFAST settings**

All of the options have a default value. You can change any of them. Most of the options in BFAST have been implemented here.

------

**BFAST parameter list**

This is an exhaustive list of BFAST options:

For **match**::

  -o  STRING   Specifies the offset [Use all]
  -l           Specifies to load all main or secondary indexes into memory
  -A  INT      0: NT space 1: Color space [0]
  -k  INT      Specifies to truncate all indexes to have the given key size (must be greater than the hash width) [Not Using]
  -K  INT      Specifies the maximum number of matches to allow before a key is ignored [8]
  -M  INT      Specifies the maximum total number of matches to consider before the read is discarded [384]
  -w  INT      0: consider both strands 1: forward strand only 2: reverse strand only [0]
  -n  INT      Specifies the number of threads to use [1]
  -t           Specifies to output timing information

For **localalign**::

  -x  FILE     Specifies the file name storing the scoring matrix
  -u           Do ungapped local alignment (the default is gapped).
  -U           Do not use mask constraints from the match step
  -A  INT      0: NT space 1: Color space [0]
  -o  INT      Specifies the number of bases before and after the match to include in the reference genome
  -M  INT      Specifies the maximum total number of matches to consider before the read is discarded [384]
  -q  INT      Specifies the average mismatch quality
  -n  INT      Specifies the number of threads to use [1]
  -t           Specifies to output timing information

For **postprocess**::

  -a  INT      Specifies the algorithm to choose the alignment for each end of the read:
                 0: No filtering will occur.
                 1: All alignments that pass the filters will be output
                 2: Only consider reads that have been aligned uniquely
                 3: Choose uniquely the alignment with the best score
                 4: Choose all alignments with the best score
  -A  INT      0: NT space 1: Color space [0]
  -U           Specifies that pairing should not be performed
  -R           Specifies that paired reads are on opposite strands
  -q  INT      Specifies the average mismatch quality
  -x  FILE     Specifies the file name storing the scoring matrix
  -z           Specifies to output a random best scoring alignment (with -a 3)
  -r  FILE     Specifies to add the RG in the specified file to the SAM header and updates the RG tag (and LB/PU tags if present) in the reads (SAM only)
  -n  INT      Specifies the number of threads to use [1]
  -t           Specifies to output timing information

  </help>
  <requirements>
    <requirement type="package">bfast</requirement>
  </requirements>
  <tests>
    <!-- nucleotide-space reads, index built on the fly from a history FASTA, header kept -->
    <test>
      <param name="input1" ftype="fastqsanger" value="random_phiX_1.fastqsanger" />
      <param name="refGenomeSource_type" value="history" />
      <param name="ownFile" ftype="fasta" value="phiX.fasta" />
      <param name="mask" value="111111111111111111" />
      <param name="hash_width" value="14" />
      <param name="source_select" value="pre_set" />
      <param name="indexing_repeatmasker" value="False" />
      <param name="indexing_option_selector" value="default" />
      <param name="suppressHeader" value="" />
      <output name="output" ftype="sam" file="bfast_out1.sam" />
    </test>
    <test>
      <param name="input1" ftype="fastqsanger" value="random_phiX_1.fastqsanger"/>
      <param name="refGenomeSource_type" value="history" />
      <param name="ownFile" ftype="fasta" value="phiX.fasta" />
      <param name="mask" value="111111111111111111" />
      <param name="hash_width" value="14" />
      <param name="source_select" value="pre_set" />
      <param name="indexing_repeatmasker" value="False" />
      <param name="indexing_option_selector" value="default" />
      <param name="suppressHeader" value="--suppressHeader" />
      <output name="output" ftype="sam" file="bfast_out1.sam" lines_diff="3" /><!-- 3 headers exist in compare file, but headers are suppressed -->
    </test>
    <!-- color-space reads -->
    <test>
      <param name="input1" ftype="fastqcssanger" value="random_phiX_1.fastqcssanger" />
      <param name="refGenomeSource_type" value="history" />
      <param name="ownFile" ftype="fasta" value="phiX.fasta" />
      <param name="mask" value="111111111111111111" />
      <param name="hash_width" value="14" />
      <param name="source_select" value="pre_set" />
      <param name="indexing_repeatmasker" value="False" />
      <param name="indexing_option_selector" value="default" />
      <param name="suppressHeader" value="" />
      <output name="output" ftype="sam" file="bfast_out2.sam" />
    </test>
    <!-- test of pre-indexed data now -->
    <test>
      <param name="input1" ftype="fastqsanger" value="random_phiX_1.fastqsanger" />
      <param name="refGenomeSource_type" value="indexed" />
      <param name="indices" value="phiX_nt_50" />
      <param name="source_select" value="pre_set" />
      <param name="suppressHeader" value="" />
      <output name="output" ftype="sam" file="bfast_out3.sam" lines_diff="2" /><!-- MD:Z:11T38 instead of MD:Z:50 on one line-->
    </test>
  </tests>
</tool>