Mercurial > repos > bgruening > flye
comparison flye.xml @ 11:291923e6f276 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flye commit acf41fab409bef4882d5d12cbf991452b408076e
author | bgruening |
---|---|
date | Mon, 18 Mar 2024 12:44:09 +0000 |
parents | cb8dfd28c16f |
children | 3e4f8642c77e |
comparison
equal
deleted
inserted
replaced
10:cb8dfd28c16f | 11:291923e6f276 |
---|---|
1 <tool id="flye" name="Flye" version="@TOOL_VERSION@+galaxy@SUFFIX_VERSION@" profile="20.01"> | 1 <tool id="flye" name="Flye" version="@TOOL_VERSION@+galaxy@SUFFIX_VERSION@" profile="20.01"> |
2 <description>de novo assembler for single molecule sequencing reads</description> | 2 <description>de novo assembler for single molecule sequencing reads</description> |
3 <macros> | 3 <macros> |
4 <import>macros.xml</import> | 4 <import>macros.xml</import> |
5 </macros> | 5 </macros> |
6 <expand macro="edam_ontology"/> | |
7 <expand macro="xrefs"/> | |
6 <expand macro="requirements" /> | 8 <expand macro="requirements" /> |
7 <expand macro="edam_ontology"/> | |
8 <version_command>flye --version</version_command> | 9 <version_command>flye --version</version_command> |
9 <command detect_errors="exit_code"><![CDATA[ | 10 <command detect_errors="exit_code"><![CDATA[ |
10 #for $counter, $input in enumerate($inputs): | 11 #for $counter, $input in enumerate($inputs): |
11 #if $input.is_of_type('fastqsanger', 'fastq'): | 12 #if $input.is_of_type('fastqsanger', 'fastq'): |
12 #set $ext = 'fastq' | 13 #set $ext = 'fastq' |
15 #elif $input.is_of_type('fasta.gz'): | 16 #elif $input.is_of_type('fasta.gz'): |
16 #set $ext = 'fasta.gz' | 17 #set $ext = 'fasta.gz' |
17 #elif $input.is_of_type('fasta'): | 18 #elif $input.is_of_type('fasta'): |
18 #set $ext = 'fasta' | 19 #set $ext = 'fasta' |
19 #end if | 20 #end if |
20 ln -s '$input' ./input_${counter}.${ext} && | 21 ln -sf '$input' ./input_${counter}.${ext} && |
21 #end for | 22 #end for |
22 flye | 23 flye |
23 $mode_conditional.mode | 24 $mode_conditional.mode |
24 #for $counter, $input in enumerate($inputs): | 25 #for $counter, $input in enumerate($inputs): |
25 ./input_${counter}.$ext | 26 ./input_${counter}.$ext |
226 <has_size value="1248" delta="100"/> | 227 <has_size value="1248" delta="100"/> |
227 </assert_contents> | 228 </assert_contents> |
228 </output> | 229 </output> |
229 <output name="assembly_gfa" ftype="txt"> | 230 <output name="assembly_gfa" ftype="txt"> |
230 <assert_contents> | 231 <assert_contents> |
231 <has_size value="420252" delta="100"/> | 232 <has_size value="419414" delta="100"/> |
232 </assert_contents> | 233 </assert_contents> |
233 </output> | 234 </output> |
234 <output name="consensus" ftype="fasta"> | 235 <output name="consensus" ftype="fasta"> |
235 <assert_contents> | 236 <assert_contents> |
236 <has_size value="427129" delta="100"/> | 237 <has_size value="426277" delta="100"/> |
237 </assert_contents> | 238 </assert_contents> |
238 </output> | 239 </output> |
239 </test> | 240 </test> |
240 <!--Test 06: hifi error option--> | 241 <!--Test 06: hifi error option--> |
241 <test expect_num_outputs="4"> | 242 <test expect_num_outputs="4"> |
250 <has_size value="286" delta="100"/> | 251 <has_size value="286" delta="100"/> |
251 </assert_contents> | 252 </assert_contents> |
252 </output> | 253 </output> |
253 <output name="assembly_graph" ftype="graph_dot"> | 254 <output name="assembly_graph" ftype="graph_dot"> |
254 <assert_contents> | 255 <assert_contents> |
255 <has_size value="1273" delta="100"/> | 256 <has_size value="1500" delta="100"/> |
256 </assert_contents> | 257 </assert_contents> |
257 </output> | 258 </output> |
258 <output name="assembly_gfa" ftype="txt"> | 259 <output name="assembly_gfa" ftype="txt"> |
259 <assert_contents> | 260 <assert_contents> |
260 <has_size value="420252" delta="100"/> | 261 <has_size value="418422" delta="100"/> |
261 </assert_contents> | 262 </assert_contents> |
262 </output> | 263 </output> |
263 <output name="consensus" ftype="fasta"> | 264 <output name="consensus" ftype="fasta"> |
264 <assert_contents> | 265 <assert_contents> |
265 <has_size value="427129" delta="100"/> | 266 <has_size value="425147" delta="200"/> |
266 </assert_contents> | 267 </assert_contents> |
267 </output> | 268 </output> |
268 </test> | 269 </test> |
269 <!--Test 07: keep haplotypes--> | 270 <!--Test 07: keep haplotypes--> |
270 <test expect_num_outputs="4"> | 271 <test expect_num_outputs="4"> |
285 <has_size value="1273" delta="100"/> | 286 <has_size value="1273" delta="100"/> |
286 </assert_contents> | 287 </assert_contents> |
287 </output> | 288 </output> |
288 <output name="assembly_gfa" ftype="txt"> | 289 <output name="assembly_gfa" ftype="txt"> |
289 <assert_contents> | 290 <assert_contents> |
290 <has_size value="420252" delta="100"/> | 291 <has_size value="418511" delta="100"/> |
291 </assert_contents> | 292 </assert_contents> |
292 </output> | 293 </output> |
293 <output name="consensus" ftype="fasta"> | 294 <output name="consensus" ftype="fasta"> |
294 <assert_contents> | 295 <assert_contents> |
295 <has_size value="427129" delta="100"/> | 296 <has_size value="425267" delta="100"/> |
296 </assert_contents> | 297 </assert_contents> |
297 </output> | 298 </output> |
298 </test> | 299 </test> |
299 <!--Test 08: scaffolding mode--> | 300 <!--Test 08: scaffolding mode--> |
300 <test expect_num_outputs="4"> | 301 <test expect_num_outputs="4"> |
301 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/> | 302 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/> |
302 <param name="mode" value="--nano-hq"/> | 303 <param name="mode" value="--nano-hq"/> |
303 <param name="min_overlap" value="1000"/> | 304 <param name="min_overlap" value="1000"/> |
304 <param name="scaffolding" value="true"/> | 305 <param name="scaffold" value="true"/> |
305 <output name="assembly_info" ftype="tabular"> | 306 <output name="assembly_info" ftype="tabular"> |
306 <assert_contents> | 307 <assert_contents> |
307 <has_size value="286" delta="100"/> | 308 <has_size value="286" delta="100"/> |
308 </assert_contents> | 309 </assert_contents> |
309 </output> | 310 </output> |
312 <has_size value="1248" delta="100"/> | 313 <has_size value="1248" delta="100"/> |
313 </assert_contents> | 314 </assert_contents> |
314 </output> | 315 </output> |
315 <output name="assembly_gfa" ftype="txt"> | 316 <output name="assembly_gfa" ftype="txt"> |
316 <assert_contents> | 317 <assert_contents> |
317 <has_size value="420252" delta="100"/> | 318 <has_size value="419414" delta="1000"/> |
318 </assert_contents> | 319 </assert_contents> |
319 </output> | 320 </output> |
320 <output name="consensus" ftype="fasta"> | 321 <output name="consensus" ftype="fasta"> |
321 <assert_contents> | 322 <assert_contents> |
322 <has_size value="427129" delta="100"/> | 323 <has_size value="426277" delta="1000"/> |
323 </assert_contents> | 324 </assert_contents> |
324 </output> | 325 </output> |
325 </test> | 326 </test> |
326 <!--Test 09: test no-alt-contigs parameter--> | 327 <!--Test 09: test no-alt-contigs parameter--> |
327 <test expect_num_outputs="4"> | 328 <test expect_num_outputs="4"> |
351 </output> | 352 </output> |
352 </test> | 353 </test> |
353 </tests> | 354 </tests> |
354 <help><![CDATA[ | 355 <help><![CDATA[ |
355 | 356 |
356 .. class:: infomark | |
357 | |
358 **Purpose** | 357 **Purpose** |
359 | 358 |
360 Flye is a de novo assembler for single molecule sequencing reads, such as those produced by PacBio and Oxford Nanopore Technologies. | 359 Flye is a de novo assembler for single molecule sequencing reads, such as those produced by PacBio and Oxford Nanopore Technologies. |
361 It is designed for a wide range of datasets, from small bacterial projects to large mammalian-scale assemblies. The package represents | 360 It is designed for a wide range of datasets, from small bacterial projects to large mammalian-scale assemblies. The package represents |
362 a complete pipeline: it takes raw PacBio/ONT reads as input and outputs polished contigs. Flye also has a special mode for metagenome | 361 a complete pipeline: it takes raw PacBio/ONT reads as input and outputs polished contigs. Flye also has a special mode for metagenome |
363 assembly. | 362 assembly. |
364 | 363 |
365 ---- | 364 ---- |
366 | 365 |
367 .. class:: infomark | |
368 | |
369 **Quick usage** | 366 **Quick usage** |
370 | 367 |
371 Input reads can be in FASTA or FASTQ format, uncompressed or compressed with gz. Currently, PacBio (raw, corrected, HiFi) and ONT reads | 368 Input reads can be in FASTA or FASTQ format, uncompressed or compressed with gz. Currently, PacBio (raw, corrected, HiFi) and ONT reads |
372 (raw, corrected) are supported. Expected error rates are <30% for raw, <3% for corrected, and <1% for HiFi. Note that Flye was primarily | 369 (raw, corrected) are supported. Expected error rates are <30% for raw, <3% for corrected, and <1% for HiFi. Note that Flye was primarily |
373 developed to run on raw reads. You may specify multiple files with reads (separated by spaces). Mixing different read types is not yet supported. The *--meta* o | 370 developed to run on raw reads. You may specify multiple files with reads (separated by spaces). Mixing different read types is not yet supported. The *--meta* o |
378 To reduce memory consumption for large genome assemblies, you can use a subset of the longest reads for initial disjointig assembly by | 375 To reduce memory consumption for large genome assemblies, you can use a subset of the longest reads for initial disjointig assembly by |
379 specifying *--asm-coverage* and *--genome-size* options. Typically, 40x coverage is enough to produce good disjointigs. | 376 specifying *--asm-coverage* and *--genome-size* options. Typically, 40x coverage is enough to produce good disjointigs. |
380 | 377 |
381 ---- | 378 ---- |
382 | 379 |
383 .. class:: infomark | |
384 | |
385 **Outputs** | 380 **Outputs** |
386 | 381 |
387 The main output files are: | 382 The main output files are: |
388 | 383 |
389 :: | 384 * Final assembly: contains contigs and possibly scaffolds (see below). |
390 | 385 * Final repeat graph: note that the edge sequences might be different (shorter) than contig sequences, because contigs might include multiple graph edges. |
391 - Final assembly: contains contigs and possibly scaffolds (see below). | 386 * Extra information about contigs (such as length or coverage). |
392 - Final repeat graph: note that the edge sequences might be different (shorter) than contig sequences, because contigs might include multiple graph edges. | |
393 - Extra information about contigs (such as length or coverage). | |
394 | 387 |
395 Each contig is formed by a single unique graph edge. If possible, unique contigs are extended with the sequence from flanking unresolved repeats on the graph. Thus, | 388 Each contig is formed by a single unique graph edge. If possible, unique contigs are extended with the sequence from flanking unresolved repeats on the graph. Thus, |
396 a contig fully contains the corresponding graph edge (with the same id), but might be longer than this edge. This is somewhat similar to the unitig-contig relation in | 389 a contig fully contains the corresponding graph edge (with the same id), but might be longer than this edge. This is somewhat similar to the unitig-contig relation in |
397 OLC assemblers. In the rare case when a repetitive graph edge is not covered by the set of "extended" contigs, it will also be output in the assembly file. | 390 OLC assemblers. In the rare case when a repetitive graph edge is not covered by the set of "extended" contigs, it will also be output in the assembly file. |
398 | 391 |
400 the assembly file (with a scaffold prefix). Since it is hard to give a reliable estimate of the gap size, those gaps are represented with the default 100 Ns. | 393 the assembly file (with a scaffold prefix). Since it is hard to give a reliable estimate of the gap size, those gaps are represented with the default 100 Ns. |
401 The assembly_info.txt file (below) contains additional information about how scaffolds were formed. | 394 The assembly_info.txt file (below) contains additional information about how scaffolds were formed. |
402 | 395 |
403 Extra information about contigs/scaffolds is output into the assembly_info.txt file. It is a tab-delimited table with the columns as follows: | 396 Extra information about contigs/scaffolds is output into the assembly_info.txt file. It is a tab-delimited table with the columns as follows: |
404 | 397 |
405 :: | 398 * Contig/scaffold id |
406 | 399 * Length |
407 - Contig/scaffold id | 400 * Coverage |
408 - Length | 401 * Is circular, (Y)es or (N)o |
409 - Coverage | 402 * Is repetitive, (Y)es or (N)o |
410 - Is circular, (Y)es or (N)o | 403 * Multiplicity (based on coverage) |
411 - Is repetitive, (Y)es or (N)o | 404 * Alternative group |
412 - Multiplicity (based on coverage) | 405 * Graph path (graph path corresponding to this contig/scaffold). |
413 - Alternative group | 406 |
414 - Graph path (graph path corresponding to this contig/scaffold). | 407 Scaffold gaps are marked with `??` symbols, and `*` symbol denotes a terminal graph node. Alternative contigs (representing alternative haplotypes) will have the same alt. |
415 | 408 group ID. Primary contigs are marked by `*`. |
416 Scaffold gaps are marked with ?? symbols, and * symbol denotes a terminal graph node. Alternative contigs (representing alternative haplotypes) will have the same alt. | |
417 group ID. Primary contigs are marked by *. | |
418 | 409 |
419 ---- | 410 ---- |
420 | 411 |
421 .. class:: infomark | |
422 | |
423 **Algorithm Description** | 412 **Algorithm Description** |
424 | 413 |
425 This is a brief description of the Flye algorithm. Please refer to the manuscript for more detailed information. The draft contig extension is organized as follows: | 414 This is a brief description of the Flye algorithm. Please refer to the manuscript for more detailed information. The draft contig extension is organized as follows: |
426 | 415 |
427 :: | 416 * K-mer counting / erroneous k-mer pre-filtering |
428 | 417 * Solid k-mer selection (k-mers with sufficient frequency, which are unlikely to be erroneous) |
429 - K-mer counting / erroneous k-mer pre-filtering | 418 * Contig extension. The algorithm starts from a single read and extends it with a next overlapping read (overlaps are dynamically detected using the selected solid k-mers). |
430 - Solid k-mer selection (k-mers with sufficient frequency, which are unlikely to be erroneous) | |
431 - Contig extension. The algorithm starts from a single read and extends it with a next overlapping read (overlaps are dynamically detected using the selected solid k-mers). | |
432 | 419 |
433 Note that we do not attempt to resolve repeats at this stage, thus the reconstructed contigs might contain misassemblies. Flye then aligns the reads on these draft | 420 Note that we do not attempt to resolve repeats at this stage, thus the reconstructed contigs might contain misassemblies. Flye then aligns the reads on these draft |
434 contigs using minimap2 and calls a consensus. Afterwards, Flye performs repeat analysis as follows: | 421 contigs using minimap2 and calls a consensus. Afterwards, Flye performs repeat analysis as follows: |
435 | 422 |
436 :: | 423 * Repeat graph is constructed from the (possibly misassembled) contigs |
437 | 424 * In this graph all repeats longer than minimum overlap are collapsed |
438 - Repeat graph is constructed from the (possibly misassembled) contigs | 425 * The algorithm resolves repeats using the read information and graph structure |
439 - In this graph all repeats longer than minimum overlap are collapsed | 426 * The unbranching paths in the graph are output as contigs |
440 - The algorithm resolves repeats using the read information and graph structure | |
441 - The unbranching paths in the graph are output as contigs | |
442 | 427 |
443 If enabled, after resolving bridged repeats, the Trestle module attempts to resolve simple unbridged repeats (of multiplicity 2) using the heterogeneities between repeat copies. | 428 If enabled, after resolving bridged repeats, the Trestle module attempts to resolve simple unbridged repeats (of multiplicity 2) using the heterogeneities between repeat copies. |
444 Finally, Flye performs polishing of the resulting assembly to correct the remaining errors: | 429 Finally, Flye performs polishing of the resulting assembly to correct the remaining errors: |
445 | 430 |
446 :: | 431 * Alignment of all reads to the current assembly using minimap2 |
447 | 432 * Partition the alignment into mini-alignments (bubbles) |
448 - Alignment of all reads to the current assembly using minimap2 | 433 * Error correction of each bubble using a maximum likelihood approach |
449 - Partition the alignment into mini-alignments (bubbles) | |
450 - Error correction of each bubble using a maximum likelihood approach | |
451 | |
452 | 434 |
453 The polishing steps could be repeated, which might slightly increase quality for some datasets. | 435 The polishing steps could be repeated, which might slightly increase quality for some datasets. |
454 | 436 |
455 | 437 |
456 ]]></help> | 438 ]]></help> |