comparison tools/sr_mapping/bfast_wrapper.xml @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 <tool id="bfast_wrapper" name="Map with BFAST" version="0.1.3">
2 <description></description>
3 <command interpreter="python">bfast_wrapper.py
4 --numThreads="4" ##HACK: hardcode numThreads for now, should come from a location file
5 --fastq="$input1"
6 #if $input1.extension.startswith( "fastqcs" ):
7 ##if the extension starts with fastqcs, then we have a color space file
8 --space="1" ##color space
9 #else
10 --space="0"
11 #end if
12 --output="$output"
13 $suppressHeader
14
15 #if $refGenomeSource.refGenomeSource_type == "history":
16 ##build indexes on the fly from the reference selected in the history; each custom index is passed as mask:hash_width
17 --buildIndex
18 --ref="${refGenomeSource.ownFile}"
19 --indexMask="${",".join( [ "%s:%s" % ( str( custom_index.get( 'mask' ) ).strip(), str( custom_index.get( 'hash_width' ) ).strip() ) for custom_index in $refGenomeSource.custom_index ] )}"
20 ${refGenomeSource.indexing_repeatmasker}
21 #if $refGenomeSource.indexing_option.indexing_option_selector == "contig_offset":
22 --indexContigOptions="${refGenomeSource.indexing_option.start_contig},${refGenomeSource.indexing_option.start_pos},${refGenomeSource.indexing_option.end_contig},${refGenomeSource.indexing_option.end_pos}"
23 #elif $refGenomeSource.indexing_option.indexing_option_selector == "exons_file":
24 --indexExonsFileName="${refGenomeSource.indexing_option.exons_file}"
25 #end if
26 #else:
27 ##use precomputed indexes; the path field of the selected index entry is passed as the reference
28 --ref="${ refGenomeSource.indices.fields.path }"
29 #end if
30
31 #if $params.source_select == "full":
32 --offsets="$params.offsets"
33 --keySize="$params.keySize"
34 --maxKeyMatches="$params.maxKeyMatches"
35 --maxNumMatches="$params.maxNumMatches"
36 --whichStrand="$params.whichStrand"
37
38 #if str( $params.scoringMatrixFileName ) != 'None':
39 --scoringMatrixFileName="$params.scoringMatrixFileName"
40 #end if
41 ${params.ungapped}
42 ${params.unconstrained}
43 --offset="${params.offset}"
44 --avgMismatchQuality="${params.avgMismatchQuality}"
45
46 --algorithm="${params.localalign_params.algorithm}"
47 ${params.unpaired}
48 ${params.reverseStrand}
49 #if $params.localalign_params.algorithm == "3":
50 ${params.localalign_params.pairedEndInfer}
51 ${params.localalign_params.randomBest}
52 #end if
53 #end if
54 </command>
55 <inputs>
56 <param name="input1" type="data" format="fastqsanger,fastqcssanger" label="FASTQ file" help="Must have Sanger-scaled quality values with ASCII offset 33"/>
57 <conditional name="refGenomeSource">
58 <param name="refGenomeSource_type" type="select" label="Will you select a reference genome from your history or use a built-in index?">
59 <option value="indexed">Use a built-in index</option>
60 <option value="history">Use one from the history</option>
61 </param>
62 <when value="indexed">
63 <param name="indices" type="select" label="Select a reference genome index set">
64 <options from_data_table="bfast_indexes">
65 <filter type="multiple_splitter" column="2" separator=","/>
66 <filter type="param_value" column="2" ref="input1" ref_attribute="extension"/>
67 <filter type="sort_by" column="3"/>
68 <validator type="no_options" message="No indexes are available for the selected input dataset"/>
69 </options>
70 </param>
71 </when>
72 <when value="history">
73 <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />
74 <repeat name="custom_index" title="Custom index" min="1" >
75 <param name="mask" type="text" value="" label="Specify the mask" size="20">
76 <validator type="regex" message="The mask may only contain the characters 0 and 1">^[01]+$</validator><!-- the mask is passed to indexMask as mask:hash_width; reject anything that is not a string of 0s and 1s before the job runs -->
77 </param>
78 <param name="hash_width" type="integer" value="" label="Hash Width" />
79 </repeat>
80 <param name="indexing_repeatmasker" type="boolean" truevalue="--indexRepeatMasker" falsevalue="" checked="False" label="Do not index lower case sequences" help="Such as those created by RepeatMasker"/>
81 <conditional name="indexing_option">
82 <param name="indexing_option_selector" type="select" label="BFAST indexing settings to use" help="For most indexing needs use default settings. If you want full control use the other options.">
83 <option value="default">Default</option>
84 <option value="contig_offset">Contig Offset</option>
85 <option value="exons_file">Exons file</option>
86 </param>
87 <when value="default">
88 <!-- nothing here -->
89 </when>
90 <when value="contig_offset">
91 <param name="start_contig" type="integer" value="-1" label="Start Contig" help="Specifies the first contig to include when building indexes. (advanced users only)" />
92 <param name="start_pos" type="integer" value="-1" label="Start Position" help="Specifies the first position in the first contig to include when building indexes. (advanced users only)" />
93 <param name="end_contig" type="integer" value="-1" label="End Contig" help="Specifies the last contig to include when building indexes. (advanced users only)" />
94 <param name="end_pos" type="integer" value="-1" label="End Position" help="Specifies the last position in the last contig to include when building indexes. (advanced users only)" />
95 </when>
96 <when value="exons_file">
97 <param name="exons_file" type="data" format="tabular" label="Select an exons file from history" help="See BFAST manual for file format requirements. (advanced users only)"/>
98 </when>
99 </conditional>
100 </when>
101 </conditional>
102 <conditional name="params">
103 <param name="source_select" type="select" label="BFAST matching settings to use" help="For most mapping needs use Commonly Used settings. If you want full control use Full Parameter List">
104 <option value="pre_set">Commonly Used</option>
105 <option value="full">Full Parameter List</option>
106 </param>
107 <when value="pre_set">
108 <!-- nothing here -->
109 </when>
110 <when value="full">
111 <param name="offsets" type="text" value="" label="The offsets for 'bfast match'" help="Set if not all offsets from the 5' end of the read are to be examined (advanced users only)" />
112 <param name="keySize" type="integer" value="-1" label="Truncate key size in 'match'" help="Set this to reduce the effective key size of all indexes in 'bfast match' (advanced users only)" />
113 <param name="maxKeyMatches" type="integer" value="8" label="The maximum number of matches to allow before a key is ignored" help="Lower values will result in more unique regions being examined, while larger values will allow repetitive regions to be included" />
114 <param name="maxNumMatches" type="integer" value="384" label="The maximum number of matches to allow before a read is discarded" help="Larger values will allow more hits to be examined" />
115 <param name="whichStrand" type="select" label="The strands to consider" help="Both strands, forward strand only, or reverse strand only">
116 <option value="0">Both strands</option>
117 <option value="1">Forward strand only</option>
118 <option value="2">Reverse strand only</option>
119 </param>
120
121 <param name="scoringMatrixFileName" type="data" format="txt" optional="True" label="Scoring Matrix file used to score the alignments" help="See BFAST manual for file format requirements. (advanced users only)"/><!-- "txt" is the extension of Galaxy's plain-text datatype; "text" is not a registered extension -->
122 <param name="ungapped" type="boolean" truevalue="--ungapped" falsevalue="" checked="no" label="Perform ungapped local alignment" help="Performing ungapped local alignment will not consider indels while providing a significant speed increase" />
123 <param name="unconstrained" type="boolean" truevalue="--unconstrained" falsevalue="" checked="no" label="Perform unconstrained local alignment" help="Performing unconstrained local alignment will not use mask constraints at the cost of speed" />
124 <param name="offset" type="integer" value="20" label="The number of bases before and after each hit to consider in local alignment" help="Larger values will allow for larger insertions and deletions to be detected at the cost of speed" />
125 <param name="avgMismatchQuality" type="integer" value="10" label="The average mismatch quality" help="This can be used as a scaling factor for mapping quality (advanced users only)" />
126
127 <conditional name="localalign_params">
128 <param name="algorithm" type="select" label="The post processing algorithm" help="This determines how reads with multiple candidate alignments are returned. Unique alignments will return an alignment if the read has only one candidate alignment. Uniquely best scoring alignments will return one alignment for a read if that alignment has a better alignment score than the rest of the candidate alignments. All best scoring alignments will return all alignments that have the best alignment score for a read.">
129 <option value="0" selected="True">No filtering</option>
130 <option value="1">All alignments that pass filtering</option>
131 <option value="2">Unique alignments</option>
132 <option value="3">Uniquely best scoring alignments</option>
133 <option value="4">All best scoring alignments</option>
134 </param>
135 <when value="0">
136 <!-- nothing here -->
137 </when>
138 <when value="1">
139 <!-- nothing here -->
140 </when>
141 <when value="2">
142 <!-- nothing here -->
143 </when>
144 <when value="4">
145 <!-- nothing here -->
146 </when>
147 <when value="3">
148 <param name="pairedEndInfer" type="boolean" truevalue="--pairedEndInfer" falsevalue="" checked="no" label="pairedEndInfer" help="Break ties for one end of a paired-end read by estimating the insert size distribution" />
149 <param name="randomBest" type="boolean" truevalue="--randomBest" falsevalue="" checked="no" label="Random alignments" help="output a random best scoring alignment (advanced users only)" />
150 </when>
151 </conditional>
152 <param name="unpaired" type="boolean" truevalue="--unpaired" falsevalue="" checked="no" label="Disallow pairing" help="do not choose alignments based on pairing" />
153 <param name="reverseStrand" type="boolean" truevalue="--reverseStrand" falsevalue="" checked="no" label="Reverse paired ends" help="paired end reads are given on reverse strands" />
154
155 </when>
156 </conditional>
157 <param name="suppressHeader" type="boolean" truevalue="--suppressHeader" falsevalue="" checked="False" label="Suppress the header in the output SAM file" help="BFAST produces SAM with several lines of header information" />
158 </inputs>
159 <outputs>
160 <data format="sam" name="output" label="${tool.name} on ${on_string}: mapped reads">
161 <actions><!-- set the output's dbkey metadata according to where the reference came from -->
162 <conditional name="refGenomeSource.refGenomeSource_type">
163 <when value="indexed"><!-- built-in index: take column 1 of the bfast_indexes row whose column 0 matches the selected index -->
164 <action type="metadata" name="dbkey">
165 <option type="from_data_table" column="1" name="bfast_indexes">
166 <filter type="param_value" ref="refGenomeSource.indices" column="0" />
167 </option>
168 </action>
169 </when>
170 <when value="history"><!-- history reference: copy the dbkey attribute of the selected reference dataset -->
171 <action type="metadata" name="dbkey">
172 <option type="from_param" name="refGenomeSource.ownFile" param_attribute="dbkey" />
173 </action>
174 </when>
175 </conditional>
176 </actions>
177 </data>
178 </outputs>
179 <help>
180 **What it does**
181
182 BFAST facilitates the fast and accurate mapping of short reads to reference sequences. Some advantages of BFAST include:
183 * Speed: enables billions of short reads to be mapped quickly.
184 * Accuracy: A priori probabilities for mapping reads with a defined set of variants
185 * An easy way to measurably tune accuracy at the expense of speed.
186 Specifically, BFAST was designed to facilitate whole-genome resequencing, where mapping billions of short reads with variants is of utmost importance.
187
188 BFAST supports both Illumina and ABI SOLiD data, as well as any other Next-Generation Sequencing Technology (454, Helicos), with particular emphasis on sensitivity towards errors, SNPs and especially indels. Other algorithms take short-cuts by ignoring errors, certain types of variants (indels), and even require further alignment, all to be the "fastest" (but still not complete). BFAST is able to be tuned to find variants regardless of the error-rate, polymorphism rate, or other factors.
189
190 ------
191
192 Please cite the website "http://bfast.sourceforge.net" as well as the accompanying
193 papers:
194
195 Homer N, Merriman B, Nelson SF.
196 BFAST: An alignment tool for large scale genome resequencing.
197 PMID: 19907642
198 PLoS ONE. 2009 4(11): e7767.
199 http://dx.doi.org/10.1371/journal.pone.0007767
200
201 Homer N, Merriman B, Nelson SF.
202 Local alignment of two-base encoded DNA sequence.
203 BMC Bioinformatics. 2009 Jun 9;10(1):175.
204 PMID: 19508732
205 http://dx.doi.org/10.1186/1471-2105-10-175
206
207 ------
208
209 **Know what you are doing**
210
211 .. class:: warningmark
212
213 There is no such thing (yet) as an automated gearshift in short read mapping. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy.
214
215 .. __: http://bfast.sourceforge.net/
216
217 ------
218
219 **Input formats**
220
221 BFAST accepts files in Sanger FASTQ format. Use the FASTQ Groomer to prepare your files.
222
223 ------
224
225 **Outputs**
226
227 The output is in SAM format, and has the following columns::
228
229 Column Description
230 -------- --------------------------------------------------------
231 1 QNAME Query (pair) NAME
232 2 FLAG bitwise FLAG
233 3 RNAME Reference sequence NAME
234 4 POS 1-based leftmost POSition/coordinate of clipped sequence
235 5 MAPQ MAPping Quality (Phred-scaled)
236 6 CIGAR extended CIGAR string
237 7 MRNM Mate Reference sequence NaMe ('=' if same as RNAME)
238 8 MPOS 1-based Mate POSition
239 9 ISIZE Inferred insert SIZE
240 10 SEQ query SEQuence on the same strand as the reference
241 11 QUAL query QUALity (ASCII-33 gives the Phred base quality)
242 12 OPT variable OPTional fields in the format TAG:VTYPE:VALUE
243
244 The flags are as follows::
245
246 Flag Description
247 ------ -------------------------------------
248 0x0001 the read is paired in sequencing
249 0x0002 the read is mapped in a proper pair
250 0x0004 the query sequence itself is unmapped
251 0x0008 the mate is unmapped
252 0x0010 strand of the query (1 for reverse)
253 0x0020 strand of the mate
254 0x0040 the read is the first read in a pair
255 0x0080 the read is the second read in a pair
256 0x0100 the alignment is not primary
257
258 It looks like this (scroll sideways to see the entire example)::
259
260 QNAME FLAG RNAME POS MAPQ CIGAR MRNM MPOS ISIZE SEQ QUAL OPT
261 HWI-EAS91_1_30788AAXX:1:1:1761:343 4 * 0 0 * * 0 0 AAAAAAANNAAAAAAAAAAAAAAAAAAAAAAAAAAACNNANNGAGTNGNNNNNNNGCTTCCCACAGNNCTGG hhhhhhh;;hhhhhhhhhhh^hOhhhhghhhfhhhgh;;h;;hhhh;h;;;;;;;hhhhhhghhhh;;Phhh
262 HWI-EAS91_1_30788AAXX:1:1:1578:331 4 * 0 0 * * 0 0 GTATAGANNAATAAGAAAAAAAAAAATGAAGACTTTCNNANNTCTGNANNNNNNNTCTTTTTTCAGNNGTAG hhhhhhh;;hhhhhhhhhhhhhhhhhhhhhhhhhhhh;;h;;hhhh;h;;;;;;;hhhhhhhhhhh;;hhVh
263
264 -------
265
266 **BFAST settings**
267
268 All of the options have a default value. You can change any of them. Most of the options in BFAST have been implemented here.
269
270 ------
271
272 **BFAST parameter list**
273
274 This is an exhaustive list of BFAST options:
275
276 For **match**::
277
278 -o STRING Specifies the offset [Use all]
279 -l Specifies to load all main or secondary indexes into memory
280 -A INT 0: NT space 1: Color space [0]
281 -k INT Specifies to truncate all indexes to have the given key size
282 (must be greater than the hash width) [Not Using]
283 -K INT Specifies the maximum number of matches to allow before a key
284 is ignored [8]
285 -M INT Specifies the maximum total number of matches to consider
286 before the read is discarded [384]
287 -w INT 0: consider both strands 1: forward strand only 2: reverse
288 strand only [0]
289 -n INT Specifies the number of threads to use [1]
290 -t Specifies to output timing information
291
292 For **localalign**::
293
294 -x FILE Specifies the file name storing the scoring matrix
295 -u Do ungapped local alignment (the default is gapped).
296 -U Do not use mask constraints from the match step
297 -A INT 0: NT space 1: Color space [0]
298 -o INT Specifies the number of bases before and after the match to
299 include in the reference genome
300 -M INT Specifies the maximum total number of matches to consider
301 before the read is discarded [384]
302 -q INT Specifies the average mismatch quality
303 -n INT Specifies the number of threads to use [1]
304 -t Specifies to output timing information
305
306 For **postprocess**::
307
308 -a INT Specifies the algorithm to choose the alignment for each end of the read:
309
310 0: No filtering will occur.
311 1: All alignments that pass the filters will be output
312 2: Only consider reads that have been aligned uniquely
313 3: Choose uniquely the alignment with the best score
314 4: Choose all alignments with the best score
315
316 -A INT 0: NT space 1: Color space [0]
317 -U Specifies that pairing should not be performed
318 -R Specifies that paired reads are on opposite strands
319 -q INT Specifies the average mismatch quality
320 -x FILE Specifies the file name storing the scoring matrix
321 -z Specifies to output a random best scoring alignment (with -a 3)
322 -r FILE Specifies to add the RG in the specified file to the SAM
323 header and updates the RG tag (and LB/PU tags if present) in
324 the reads (SAM only)
325 -n INT Specifies the number of threads to use [1]
326 -t Specifies to output timing information
327
328 </help>
329 <requirements>
330 <requirement type="package">bfast</requirement>
331 </requirements>
332 <tests>
333 <test><!-- fastqsanger reads, reference taken from the history with one custom index, suppressHeader left empty -->
334 <param name="input1" ftype="fastqsanger" value="random_phiX_1.fastqsanger" />
335 <param name="refGenomeSource_type" value="history" />
336 <param name="ownFile" ftype="fasta" value="phiX.fasta" />
337 <param name="mask" value="111111111111111111" />
338 <param name="hash_width" value="14" />
339 <param name="source_select" value="pre_set" />
340 <param name="indexing_repeatmasker" value="False" />
341 <param name="indexing_option_selector" value="default" />
342 <param name="suppressHeader" value="" />
343 <output name="output" ftype="sam" file="bfast_out1.sam" />
344 </test>
345 <test><!-- same inputs as the first test, but with suppressHeader switched on -->
346 <param name="input1" ftype="fastqsanger" value="random_phiX_1.fastqsanger"/>
347 <param name="refGenomeSource_type" value="history" />
348 <param name="ownFile" ftype="fasta" value="phiX.fasta" />
349 <param name="mask" value="111111111111111111" />
350 <param name="hash_width" value="14" />
351 <param name="source_select" value="pre_set" />
352 <param name="indexing_repeatmasker" value="False" />
353 <param name="indexing_option_selector" value="default" />
354 <param name="suppressHeader" value="--suppressHeader" />
355 <output name="output" ftype="sam" file="bfast_out1.sam" lines_diff="3" /><!-- 3 headers exist in compare file, but headers are suppressed -->
356 </test>
357 <test><!-- fastqcssanger input, which makes the command template select color space -->
358 <param name="input1" ftype="fastqcssanger" value="random_phiX_1.fastqcssanger" />
359 <param name="refGenomeSource_type" value="history" />
360 <param name="ownFile" ftype="fasta" value="phiX.fasta" />
361 <param name="mask" value="111111111111111111" />
362 <param name="hash_width" value="14" />
363 <param name="source_select" value="pre_set" />
364 <param name="indexing_repeatmasker" value="False" />
365 <param name="indexing_option_selector" value="default" />
366 <param name="suppressHeader" value="" />
367 <output name="output" ftype="sam" file="bfast_out2.sam" />
368 </test>
369 <!-- test of pre-indexed data now -->
370 <test><!-- uses the built-in index entry phiX_nt_50 instead of a reference from the history -->
371 <param name="input1" ftype="fastqsanger" value="random_phiX_1.fastqsanger" />
372 <param name="refGenomeSource_type" value="indexed" />
373 <param name="indices" value="phiX_nt_50" />
374 <param name="source_select" value="pre_set" />
375 <param name="suppressHeader" value="" />
376 <output name="output" ftype="sam" file="bfast_out3.sam" lines_diff="2" /><!-- MD:Z:11T38 instead of MD:Z:50 on one line-->
377 </test>
378 </tests>
379 </tool>