comparison unicycler.xml @ 0:e9c1cdb9f9dc draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/unicycler commit 97c19413f9d7e85c013b82ffafa73632c79c8f0b
author iuc
date Thu, 06 Jul 2017 17:07:43 -0400
parents
children f13d0498a199
comparison
equal deleted inserted replaced
-1:000000000000 0:e9c1cdb9f9dc
1 <tool id="unicycler" name="Create assemblies with Unicycler" version="0.2.0">
2 <requirements> <!-- Declares the single package requirement: unicycler 0.3.0b. NOTE(review): the tool's own version attribute is 0.2.0; confirm the two are meant to differ. -->
3 <requirement type="package" version="0.3.0b">unicycler</requirement>
4 </requirements>
5 <command detect_errors="exit_code"><![CDATA[
6
7 ## Preparing files
8
9 #if str( $paired_unpaired.fastq_input_selector ) == "paired":
10 ## Two separate datasets: link each mate under a name whose extension matches its datatype.
11 #if $paired_unpaired.fastq_input1.is_of_type('fastqsanger'):
12 ln -s '${paired_unpaired.fastq_input1}' fq1.fastq &&
13 #elif $paired_unpaired.fastq_input1.is_of_type('fastqsanger.gz'):
14 ln -s '${paired_unpaired.fastq_input1}' fq1.fastq.gz &&
15 #end if
16
17 #if $paired_unpaired.fastq_input2.is_of_type('fastqsanger'):
18 ln -s '${paired_unpaired.fastq_input2}' fq2.fastq &&
19 #elif $paired_unpaired.fastq_input2.is_of_type('fastqsanger.gz'):
20 ln -s '${paired_unpaired.fastq_input2}' fq2.fastq.gz &&
21 #end if
22
23 #elif str( $paired_unpaired.fastq_input_selector ) == "paired_collection":
24 ## Paired collection: both mates come from fastq_input1 (.forward / .reverse); this branch has no fastq_input2.
25 #if $paired_unpaired.fastq_input1.forward.is_of_type('fastqsanger'):
26 ln -s '${paired_unpaired.fastq_input1.forward}' fq1.fastq &&
27 #elif $paired_unpaired.fastq_input1.forward.is_of_type('fastqsanger.gz'):
28 ln -s '${paired_unpaired.fastq_input1.forward}' fq1.fastq.gz &&
29 #end if
30
31 #if $paired_unpaired.fastq_input1.reverse.is_of_type('fastqsanger'):
32 ln -s '${paired_unpaired.fastq_input1.reverse}' fq2.fastq &&
33 #elif $paired_unpaired.fastq_input1.reverse.is_of_type('fastqsanger.gz'):
34 ln -s '${paired_unpaired.fastq_input1.reverse}' fq2.fastq.gz &&
35 #end if
36
37 #elif str( $paired_unpaired.fastq_input_selector ) == "single":
38 ## Unpaired reads: a single dataset linked as fq.fastq or fq.fastq.gz.
39 #if $paired_unpaired.fastq_input1.is_of_type('fastqsanger'):
40 ln -s '${paired_unpaired.fastq_input1}' fq.fastq &&
41 #elif $paired_unpaired.fastq_input1.is_of_type('fastqsanger.gz'):
42 ln -s '${paired_unpaired.fastq_input1}' fq.fastq.gz &&
43 #end if
44
45 #end if
46
47 ## Capture the output of `pilon --jar_dir` in the shell variable 'pilon';
48 ## it is handed to unicycler through the pilon_path argument of the main command.
49 pilon=`pilon --jar_dir` &&
50 ## Optional long reads: link them under a name matching their datatype and keep that name in 'lr' for the -l argument.
51 #if $long_reads:
52 #if $long_reads.is_of_type('fastqsanger'):
53 #set lr = "lr.fastq"
54 ln -s '${long_reads}' lr.fastq &&
55 #elif $long_reads.is_of_type('fastqsanger.gz'):
56 #set lr = "lr.fastq.gz"
57 ln -s '${long_reads}' lr.fastq.gz &&
58 #elif $long_reads.is_of_type('fasta'):
59 #set lr = "lr.fasta"
60 ln -s '${long_reads}' lr.fasta &&
61 #end if
62 #end if
63
64 ## Running Unicycler
65
66 unicycler -t "\${GALAXY_SLOTS:-4}"
67 ## Results are written to the current directory; the tool outputs are collected from there.
68 -o ./
69 --verbosity 3
70 --pilon_path \$pilon
71
72 ## A paired collection has no fastq_input2; its mates are fastq_input1.forward / .reverse.
73 #if str( $paired_unpaired.fastq_input_selector ) == "paired":
74 #set fwd = $paired_unpaired.fastq_input1
75 #set rev = $paired_unpaired.fastq_input2
76 #elif str( $paired_unpaired.fastq_input_selector ) == "paired_collection":
77 #set fwd = $paired_unpaired.fastq_input1.forward
78 #set rev = $paired_unpaired.fastq_input1.reverse
79 #end if
80 #if str( $paired_unpaired.fastq_input_selector ) != "single":
81 #if $fwd.is_of_type('fastqsanger'):
82 -1 fq1.fastq
83 #elif $fwd.is_of_type('fastqsanger.gz'):
84 -1 fq1.fastq.gz
85 #end if
86 #if $rev.is_of_type('fastqsanger'):
87 -2 fq2.fastq
88 #elif $rev.is_of_type('fastqsanger.gz'):
89 -2 fq2.fastq.gz
90 #end if
91 #else:
92 #if $paired_unpaired.fastq_input1.is_of_type('fastqsanger'):
93 -s fq.fastq
94 #elif $paired_unpaired.fastq_input1.is_of_type('fastqsanger.gz'):
95 -s fq.fastq.gz
96 #end if
97 #end if
98 #if $long_reads:
99 -l $lr
100 #end if
101
102 ## General Unicycler Options section
103 ## ----------------------------------------------------------
104
105 --mode '${uc_opt.mode}'
106
107 #if $uc_opt.min_fasta_length:
108 --min_fasta_length $uc_opt.min_fasta_length
109 #end if
110 ## lin_seq is the parameter declared with argument --expected_linear_seqs; pass the full flag name.
111 #if $uc_opt.lin_seq:
112 --expected_linear_seqs $uc_opt.lin_seq
113 #end if
114 ## Boolean parameters expand to their flag text when checked and to an empty string otherwise.
115 $uc_opt.no_correct
116 $uc_opt.no_rotate
117 $uc_opt.no_pilon
118 ## SPAdes Options section
119 ## ----------------------------------------------------------
120
121 #if $spades.min_kmer_frac:
122 --min_kmer_frac $spades.min_kmer_frac
123 #end if
124
125 #if $spades.max_kmer_frac:
126 --max_kmer_frac $spades.max_kmer_frac
127 #end if
128
129 #if $spades.kmer_count:
130 --kmer_count $spades.kmer_count
131 #end if
132
133 ## Rotation Options section
134 ## ----------------------------------------------------------
135 ## start_genes is declared directly in the 'rotation' section, so it is read as rotation.start_genes.
136 #if $rotation.start_genes:
137 --start_genes '${rotation.start_genes}'
138 #end if
139
140 #if $rotation.start_gene_id:
141 --start_gene_id $rotation.start_gene_id
142 #end if
143
144 #if $rotation.start_gene_cov:
145 --start_gene_cov $rotation.start_gene_cov
146 #end if
147
148 ## Pilon Options section
149 ## ----------------------------------------------------------
150
151 #if $pilon.min_polish_size:
152 --min_polish_size $pilon.min_polish_size
153 #end if
154
155 ## Graph Cleaning Options section
156 ## ----------------------------------------------------------
157
158 #if $graph_clean.min_component_size:
159 --min_component_size $graph_clean.min_component_size
160 #end if
161 #if $graph_clean.min_dead_end_size:
162 --min_dead_end_size $graph_clean.min_dead_end_size
163 #end if
164
165 ## Long Read Alignment Options
166 ## ----------------------------------------------------------
167 ## Each flag below is emitted only when its parameter has a value.
168
169 #if $lr_align.contamination_fasta:
170 --contamination '${lr_align.contamination_fasta}'
171 #end if
172
173 #if $lr_align.scores:
174 --scores '${lr_align.scores}'
175 #end if
176
177 #if $lr_align.low_score:
178 --low_score $lr_align.low_score
179 #end if
180
181
182 ]]></command>
183
184 <inputs>
185 <conditional name="paired_unpaired"> <!-- Selects the short-read layout. Every branch names its input fastq_input1; only the "paired" branch also defines fastq_input2, and in the "paired_collection" branch fastq_input1 is a paired collection rather than a single dataset. -->
186 <param name="fastq_input_selector" type="select" label="Paired or Single end data?" help="Select between paired and single end data">
187 <option selected="True" value="paired">Paired</option>
188 <option value="paired_collection">Paired Collection</option>
189 <option value="single">Single</option>
190 </param>
191 <when value="paired">
192 <param name="fastq_input1" argument="-1" type="data" format="fastqsanger,fastqsanger.gz" label="Select first set of reads" help="Specify dataset with forward reads"/>
193 <param name="fastq_input2" argument="-2" type="data" format="fastqsanger,fastqsanger.gz" label="Select second set of reads" help="Specify dataset with reverse reads"/>
194 </when>
195 <when value="paired_collection">
196 <param name="fastq_input1" format="fastqsanger,fastqsanger.gz" type="data_collection" collection_type="paired" label="Select a paired collection" />
197 </when>
198 <when value="single">
199 <param name="fastq_input1" argument="-s" type="data" format="fastqsanger,fastqsanger.gz" label="Select unpaired reads" help="Specify dataset with unpaired reads"/>
200 </when>
201 </conditional> <!-- The long-read input below is optional and additionally accepts FASTA. -->
202 <param name="long_reads" argument="--long" optional="True" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select long reads. If there are no long reads, leave this empty"/>
203
204 <section name="uc_opt" expanded="True" title="Unicycler options"> <!-- General options; the command template reads them as uc_opt.NAME. The three boolean parameters carry their command-line flag as truevalue and an empty string as falsevalue. -->
205 <param argument="--mode" type="select" label="Select Bridging mode">
206 <option value="conservative">Conservative (smaller contigs, lower misassembly)</option>
207 <option value="normal" selected="True">Normal (moderate contig size and misassembly rate)</option>
208 <option value="bold">Bold (longest contigs, higher misassembly rate)</option>
209 </param>
210 <param argument="--min_fasta_length" optional="True" type="integer" value="" label="Exclude contigs from the FASTA file which are shorter than this length (bp)" help="default = 1"/>
211 <param argument="--no_correct" optional="True" type="boolean" checked="False" truevalue="--no_correct" falsevalue="" label="Skip SPAdes error correction step" help="This option turns off SPAdes error correction. Generally it is highly recommended to use correction."/>
212 <param argument="--no_rotate" optional="True" type="boolean" checked="False" truevalue="--no_rotate" falsevalue="" label="Do not rotate completed replicons to start at a standard gene." help="Unicycler uses TBLASTN to search for dnaA or repA alleles in each completed replicon. If one is found, the sequence is rotated and/or flipped so that it begins with that gene encoded on the forward strand. This provides consistently oriented assemblies and reduces the risk that a gene will be split across the start and end of the sequence."/>
213 <param argument="--no_pilon" optional="True" type="boolean" checked="False" truevalue="--no_pilon" falsevalue="" label="Do not use Pilon to polish the final assembly." help="Unicycler uses Pilon tool for polishing final assembly."/>
214 <param name="lin_seq" argument="--expected_linear_seqs" optional="True" type="integer" value="" label="The expected number of linear (i.e. non-circular) sequences in the assembly" help="default = 0"/>
215 </section>
216
217 <section name="spades" expanded="False" title="SPAdes options" help="Unicycler uses SPAdes to construct assembly graphs. You can modify some of the SPAdes settings here. Use this ONLY if you know what you are doing!"> <!-- Both k-mer fractions are floats constrained to the range 0 to 1. -->
218 <param argument="--min_kmer_frac" optional="True" type="float" min="0" max="1" value="" label="Lowest k-mer size for SPAdes assembly, expressed as a fraction of the read length" help="default = 0.2"/>
219 <param argument="--max_kmer_frac" optional="True" type="float" min="0" max="1" value="" label="Highest k-mer size for SPAdes assembly, expressed as a fraction of the read length" help="default = 0.95"/>
220 <param argument="--kmer_count" optional="True" type="integer" value="" label="Number of k-mer steps to use in SPAdes assembly" help="default = 10"/>
221 </section>
222
223 <section name="rotation" expanded="False" title="Rotation options" help="These options control the rotation of completed circular sequence near the end of the Unicycler pipeline. Use this ONLY if you know what you are doing!"> <!-- start_gene_id and start_gene_cov are integers limited to 0 to 100. -->
224 <param argument="--start_genes" optional="True" type="data" format="fasta" label="FASTA file of genes for start point of rotated replicons" />
225 <param argument="--start_gene_id" optional="True" type="integer" min="0" max="100" value="" label="The minimum required BLAST percent identity for a start gene search" help="default = 90"/>
226 <param argument="--start_gene_cov" optional="True" type="integer" min="0" max="100" value="" label="The minimum required BLAST percent coverage for a start gene search" help="default = 95"/>
227 </section>
228
229 <section name="pilon" title="Pilon options" expanded="False"> <!-- NOTE(review): min_polish_size is the only optional integer in this file without a value attribute; confirm that is intentional. -->
230 <param argument="--min_polish_size" optional="True" type="integer" min="0" label="Contigs shorter than this value (bp) will not be polished using Pilon" help="default = 1000"/>
231 </section>
232
233 <section name="graph_clean" expanded="False" title="Graph cleaning options" help="These options control the removal of small leftover sequences after bridging is complete."> <!-- Two optional integer size thresholds. -->
234 <param argument="--min_component_size" optional="True" type="integer" value="" label="Unbridged graph components smaller than this size will be removed from the final graph" help="default = 1000"/>
235 <param argument="--min_dead_end_size" optional="True" type="integer" value="" label="Graph dead ends smaller than this size will be removed from the final graph" help="default = 1000"/>
236 </section>
237
238 <section name="lr_align" expanded="False" title="Long read alignment parameters" help="These options control the alignment of long reads to the assembly graph."> <!-- NOTE(review): "phiXm" in the contamination label looks like a typo (phiX?); confirm before changing the user-facing text. -->
239 <param name="contamination_fasta" argument="--contamination" optional="True" type="data" format="fasta" label="FASTA file of known contamination in long reads, e.g. lambda, phiXm or puc18 spike-ins." />
240 <param argument="--scores" optional="True" type="text" value="" label="Comma-delimited string of alignment scores: match, mismatch, gap open, gap extend" help="default = 3,-6,-5,-2"/>
241 <param argument="--low_score" optional="True" type="integer" value="" label="Score threshold - alignments below this are considered poor" help="default = set automatically"/>
242 </section>
243 </inputs>
244
245 <outputs> <!-- Both datasets are collected from fixed file names in the job working directory (assembly.gfa and assembly.fasta). NOTE(review): the output name "assembly_grapth" looks like a misspelling of "assembly_graph"; it is the output's identifier, so confirm nothing depends on it before renaming. -->
246 <data format="txt" name="assembly_grapth" from_work_dir="assembly.gfa" label="${tool.name} on ${on_string}: Final Assembly Graph" />
247 <data format="fasta" name="assembly" from_work_dir="assembly.fasta" label="${tool.name} on ${on_string}: Final Assembly"/>
248 </outputs>
249
250 <tests>
251 <test> <!-- Short reads only, paired layout; passes if the assembled FASTA contains "length=5386". NOTE(review): the inputs are .gz files declared with ftype fastqsanger; confirm whether fastqsanger.gz was intended. -->
252 <param name="fastq_input_selector" value="paired" />
253 <param name="fastq_input1" value="phix_f.fq.gz" ftype="fastqsanger" />
254 <param name="fastq_input2" value="phix_r.fq.gz" ftype="fastqsanger" />
255 <param name="mode" value="normal" />
256 <param name="no_correct" value="true" />
257 <param name="no_rotate" value="false" />
258 <param name="no_pilon" value="false" />
259 <output ftype="fasta" name="assembly">
260 <assert_contents>
261 <has_text text="length=5386" />
262 </assert_contents>
263 </output>
264 </test>
265 <!--
266 Following test corresponds to the command:
267
268 unicycler -t "${GALAXY_SLOTS:-8}" -o ./ - -verbose 3 - -pilon_path `pilon - -jar_dir` \
269 -1 test-data/phix_f.fq.gz -2 test-data/phix_r.fq.gz -l test-data/onp.fa \
270 - -mode 'normal' - -no_correct
271
272 This command causes a segfault with the current version of unicycler on bioconda for Linux
273 during the minimap step (which seems to be compiled C code). A gist of the log can be found
274 at: https://gist.github.com/jmchilton/b411b695170c1daea6589f5d76e326cb.
275 -->
276 <test> <!-- Hybrid case: the same short reads plus long reads from onp.fa, with the same assertion on the assembled FASTA. -->
277 <param name="fastq_input_selector" value="paired" />
278 <param name="fastq_input1" value="phix_f.fq.gz" ftype="fastqsanger" />
279 <param name="fastq_input2" value="phix_r.fq.gz" ftype="fastqsanger" />
280 <param name="long_reads" value="onp.fa" ftype="fasta" />
281 <param name="mode" value="normal" />
282 <param name="no_correct" value="true" />
283 <param name="no_rotate" value="false" />
284 <param name="no_pilon" value="false" />
285 <output ftype="fasta" name="assembly">
286 <assert_contents>
287 <has_text text="length=5386" />
288 </assert_contents>
289 </output>
290 </test>
291 </tests>
292 <help><![CDATA[
293
294 **Unicycler**
295
296 Unicycler is a hybrid assembly pipeline for bacterial genomes. It uses both Illumina reads and long reads (PacBio or Nanopore) to produce complete and accurate assemblies. It is written by `Ryan Wick`_ at the University of Melbourne's Centre for Systems Genomics. Much of the description below is lifted from Unicycler's `github page`_.
297
298 .. _`Ryan Wick`: https://github.com/rrwick
299 .. _`github page`: https://github.com/rrwick/Unicycler
300
301 -----
302
303 **Input data**
304
305 Unicycler accepts short (Illumina) reads in FASTQ format. Galaxy places the additional requirement of having these in FASTQ format with `Sanger encoding`_ of quality scores. Long reads (from Oxford Nanopore or PacBio) can be in either FASTQ or FASTA format.
306
307 .. _`Sanger encoding`: https://en.wikipedia.org/wiki/FASTQ_format#Quality
308
309 The input options are::
310
311 -1 SHORT1, --short1 SHORT1
312 FASTQ file of short reads (first reads in each pair)
313 -2 SHORT2, --short2 SHORT2
314 FASTQ file of short reads (second reads in each pair)
315 -s SHORT_UNPAIRED, --short_unpaired SHORT_UNPAIRED
316 FASTQ file of unpaired short reads
317 -l LONG, --long LONG
318 FASTQ or FASTA file of long reads, if all reads are available at start.
319
320 -----
321
322 **Bridging mode**
323
324 Unicycler can be run in three modes: conservative, normal (the default) and bold, set with the --mode option. Conservative mode is least likely to produce a complete assembly but has a very low risk of misassembly. Bold mode is most likely to produce a complete assembly but carries greater risk of misassembly. Normal mode is intermediate regarding both completeness and misassembly risk. See `description of modes`_ for more information.
325
326 .. _`description of modes`: https://github.com/rrwick/Unicycler#conservative-normal-and-bold
327
328 The available modes are::
329
330 --mode {conservative,normal,bold}
331 Bridging mode (default: normal)
332 conservative = smaller contigs, lowest misassembly rate
333 normal = moderate contig size and misassembly rate
334 bold = longest contigs, higher misassembly rate
335
336 ----
337
338 **Skip SPAdes error correction step**
339
340 Sequencing data contains a substantial number of sequencing errors that manifest themselves as deviations (bulges and non-connected components) within the assembly graph. One of the ways to improve the graph even before constructing it is to minimize the number of sequencing errors by performing error correction. SPAdes, which is used by Unicycler for error correction and assembly, uses `BayesHammer`_ to correct the reads. Here is a brief summary of what it does:
341
342 1. SPAdes (or rather BayesHammer) counts *k*-mers in reads and computes *k*-mer statistics that take into account base quality values.
343 2. A `Hamming graph`_ is constructed in which *k*-mers are nodes. In this graph, edges connect nodes (*k*-mers) if they differ from each other by a number of nucleotides up to a certain threshold (the `Hamming distance`_). The graph is central to the error correction algorithm.
344 3. At this step, Bayesian subclustering is performed on the graph produced in the previous step. For each *k*-mer we now know the center of its subcluster.
345 4. Solid *k*-mers are derived from cluster centers and are assumed to be *error free*.
346 5. Solid *k*-mers are mapped back to the reads and used to correct them.
347
348 This step takes considerable time, so if one needs to quickly evaluate assemblies this step can be skipped. However, this is not recommended if one is trying to produce a final high quality assembly.
349
350 .. _`BayesHammer`: https://goo.gl/1iGkMe
351 .. _`Hamming graph`: https://en.wikipedia.org/wiki/Hamming_graph
352 .. _`Hamming distance`: https://en.wikipedia.org/wiki/Hamming_distance
353
354 The following option turns error correction on and off::
355
356 --no_correct
357 Skip SPAdes error correction step
358 (default: conduct SPAdes error correction)
359
360 -----
361
362 **Do not rotate completed replicons to start at a standard gene**
363
364 Unicycler uses TBLASTN to search for dnaA or repA alleles in each completed replicon. If one is found, the sequence is rotated and/or flipped so that it begins with that gene encoded on the forward strand. This provides consistently oriented assemblies and reduces the risk that a gene will be split across the start and end of the sequence.
365
366 The following option turns rotation on and off::
367
368 --no_rotate
369 Do not rotate completed replicons
370 to start at a standard gene
371 (default: completed replicons are rotated)
372
373 **Do not use Pilon to polish the final assembly**
374
375 `Pilon`_ is a tool for improving overall quality of draft assemblies and finding variation among strains. Unicycler uses it for assembly *polishing*.
376
377 The following option turns the Pilon part of the Unicycler pipeline on and off::
378
379 --no_pilon
380 Do not use Pilon to polish the
381 final assembly (default: Pilon is used)
382
383 .. _`Pilon`: https://github.com/broadinstitute/pilon/wiki
384
385 ------
386
387 **Expected number of linear sequences**
388
389 If you expect your sample to contain linear (non circular) sequences, set this option::
390
391 --expected_linear_seqs EXPECTED_LINEAR_SEQS
392 The expected number of linear (i.e. non-circular)
393 sequences in the underlying sequence
394
395 ----
396
397 **SPAdes options**
398
399 This section provides control of SPAdes options::
400
401 --min_kmer_frac MIN_KMER_FRAC
402 Lowest k-mer size for SPAdes assembly,
403 expressed as a fraction of the read length
404 (default: 0.2)
405 --max_kmer_frac MAX_KMER_FRAC
406 Highest k-mer size for SPAdes assembly,
407 expressed as a fraction of the read length
408 (default: 0.95)
409 --kmer_count KMER_COUNT
410 Number of k-mer steps to use in
411 SPAdes assembly (default: 10)
412
413 ----
414
415 **Rotation options**
416
417 Unicycler attempts to rotate circular assemblies to make sure that they begin at a consistent starting gene. The following parameters control assembly rotation::
418
419 --start_genes START_GENES
420 FASTA file of genes for start point
421 of rotated replicons
422 (default: start_genes.fasta)
423 --start_gene_id START_GENE_ID
424 The minimum required BLAST percent identity
425 for a start gene search
426 (default: 90.0)
427 --start_gene_cov START_GENE_COV
428 The minimum required BLAST percent coverage
429 for a start gene search
430 (default: 95.0)
431
432 -----
433
434 **Graph cleaning options**
435
436 These options control the removal of small leftover sequences after bridging is complete::
437
438 --min_component_size MIN_COMPONENT_SIZE
439 Unbridged graph components smaller
440 than this size (bp) will be removed
441 from the final graph (default: 1000)
442 --min_dead_end_size MIN_DEAD_END_SIZE
443 Graph dead ends smaller than this size (bp)
444 will be removed from the final graph
445 (default: 1000)
446
447 -----
448
449 **Long read alignment options**
450
451 These options control the alignment of long reads to the assembly graph::
452
453 --contamination CONTAMINATION
454 FASTA file of known contamination in long reads
455 --scores SCORES
456 Comma-delimited string of alignment scores:
457 match, mismatch, gap open, gap extend
458 (default: 3,-6,-5,-2)
459 --low_score LOW_SCORE
460 Score threshold - alignments below this
461 are considered poor
462 (default: set threshold automatically)
463
464 -----
465
466 **Outputs**
467
468 Galaxy's wrapper for Unicycler produces two outputs:
469
470 * final assembly in FASTA format
471 * final assembly graph in GFA format
472
473 While most will likely be interested in the FASTA dataset, the graph dataset is also quite useful and can be visualized using tools such as `Bandage`_.
474
475
476 .. _`Bandage`: https://github.com/rrwick/Bandage
477
478
479 ]]></help>
480 <citations>
481 <citation type="doi">10.1101/096412</citation>
482 </citations>
483 </tool>