Mercurial > repos > bgruening > flye
diff flye.xml @ 9:276f5d8712d5 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flye commit 7c956f5b897dc366b2e5b7e37a2fea4b51a814f3"
author | bgruening |
---|---|
date | Tue, 23 Nov 2021 08:35:32 +0000 |
parents | e27815e82dd4 |
children | cb8dfd28c16f |
line wrap: on
line diff
--- a/flye.xml Wed Jun 30 20:02:51 2021 +0000 +++ b/flye.xml Tue Nov 23 08:35:32 2021 +0000 @@ -20,15 +20,15 @@ ln -s '$input' ./input_${counter}.${ext} && #end for flye - $mode + $mode_conditional.mode #for $counter, $input in enumerate($inputs): ./input_${counter}.$ext #end for -o out_dir -t \${GALAXY_SLOTS:-4} -i $iterations - #if $hifi_error: - --hifi-error $hifi_error + #if $mode_conditional.mode == '--pacbio-hifi' and $mode_conditional.hifi_error: + --hifi-error $mode_conditional.hifi_error #end if #if $min_overlap: -m $min_overlap @@ -37,55 +37,58 @@ --asm-coverage $asm.asm_coverage -g '${asm.genome_size}' #end if - $plasmids $meta - $trestle + $scaffold ]]></command> <inputs> <param name="inputs" type="data" format="fasta,fasta.gz,fastq,fastq.gz,fastqsanger.gz,fastqsanger" multiple="true" label="Input reads" /> - <param name="mode" type="select" label="Mode"> - <option value="--nano-raw">Nanopore raw</option> - <option value="--nano-corr">Nanopore corrected</option> - <option value="--pacbio-hifi">PacBio HiFi</option> - <option value="--pacbio-raw">PacBio raw</option> - <option value="--pacbio-corr">PacBio corrected</option> - <option value="--subassemblies">High-quality contig-like input</option> - </param> - <param argument="--iterations" type="integer" value="0" label="Number of polishing iterations" + <conditional name="mode_conditional"> + <param name="mode" type="select" label="Mode"> + <option value="--nano-raw">Nanopore raw (--nano-raw)</option> + <option value="--nano-corr">Nanopore corrected (--nano-corr)</option> + <option value="--nano-hq">Nanopore HQ (--nano-hq)</option> + <option value="--pacbio-raw">PacBio raw (--pacbio-raw)</option> + <option value="--pacbio-corr">PacBio corrected (--pacbio-corr)</option> + <option value="--pacbio-hifi">PacBio HiFi (--pacbio-hifi)</option> + </param> + <when value="--nano-raw"/> + <when value="--nano-corr"/> + <when value="--nano-hq"/> + <when value="--pacbio-raw"/> + <when value="--pacbio-corr"/> + <when 
value="--pacbio-hifi"> + <param argument="--hifi-error" type="float" min="0" max="1" optional="true" label="Expected HiFi reads error rate" help="Default: 0.01"/> + </when> + </conditional> + <param argument="--iterations" type="integer" value="1" label="Number of polishing iterations" help="Polishing is performed as the final assembly stage. By default, Flye runs one polishing iteration. Additional iterations might correct a small number of extra errors (due to improvements on how reads may align to the corrected assembly). If the - parameter is set to 0, the polishing is not performed."/> - <param argument="--min-overlap" type="integer" optional="true" label="Minimum overlap between reads" + parameter is set to 0, the polishing is not performed"/> + <param argument="--min-overlap" type="integer" min="1000" max="10000" optional="true" label="Minimum overlap between reads" help="This sets a minimum overlap length for two reads to be considered overlapping. By default it is chosen automatically based on the read length distribution (reads N90) and does not require manual setting. Typical value is 3k-5k (and down to 1k for datasets with shorter read length). Intuitively, we want to set this parameter as high as possible, so the repeat graph is less tangled. However, higher values might lead to assembly gaps. In some rare cases it makes sense to manually increase minimum overlap for assemblies of big genomes with long reads and high coverage." 
/> - <param argument="--hifi-error" type="float" min="0" max="1" optional="true" label="Expected HiFi reads error rate" help="Default: 0.01"/> - <param argument="--plasmids" type="boolean" truevalue="--plasmids" falsevalue="" checked="False" label="Rescue short unassembled plasmids" /> <param argument="--keep-haplotypes" type="boolean" truevalue="--keep-haplotypes" falsevalue="" checked="False" label="Keep haplotypes" help="By default, Flye collapses graph structures caused by alternative haplotypes (bubbles, superbubbles, roundabouts) to produce longer consensus contigs. This option retains the alternative paths on the graph, producing less contigouos, but more detailed assembly."/> - <param argument="--trestle" type="boolean" truevalue="--trestle" falsevalue="" - checked="False" label="Enable Trestle" - help="Trestle is an extra module that resolves simple repeats of multipicity 2 that were not bridged by reads. Depending on the datasets, it might - resolve a few extra repeats, which is helpful for small (bacterial genomes). On large genomes, the contiguity improvements are usually minimal, - but the computation might take a lot of time" /> + <param argument="--scaffold" type="boolean" truevalue="--scaffold" falsevalue="" label="Enable scaffolding using graph" + help="Starting from the version 2.9 Flye does not perform scaffolding by default, which guarantees that all assembled sequences do not have any gaps" /> <param argument="--meta" type="boolean" truevalue="--meta" falsevalue="" checked="False" label="Perform metagenomic assembly" help="It is designed for highly non-uniform coverage and is sensitive to underrepresented sequence at low coverage (as low as 2x). 
In some examples of simple metagenomes, we observed that the normal mode assembled more contigious bacterial consensus sequence, while the metagenome mode was slightly more fragmented, but revealed strain mixtures"/> <conditional name="asm"> - <param name="asm_select" type="select" label="Reduced contig assembly coverage"> + <param name="asm_select" type="select" label="Reduced contig assembly coverage" help="Typically, assemblies of large genomes at high coverage require a hundreds of RAM. For high coverage assemblies, + you can reduce memory usage by using only a subset of longest reads for initial contig extension stage (usually, the memory bottleneck)"> <option value="true">Enable reduced coverage for initial disjointing assembly</option> <option value="false" selected="true">Disable reduced coverage for initial disjointing assembly</option> </param> <when value="true"> <param argument="--asm-coverage" type="integer" min="0" value="30" label="Reduced coverage for initial disjointing assembly" - help="Typically, assemblies of large genomes at high coverage require a hundreds of RAM. For high coverage assemblies, - you can reduce memory usage by using only a subset of longest reads for initial contig extension stage (usually, the memory bottleneck). - The parameter --asm-coverage specifies the target coverage of the longest reads. For a typical assembly, 30x is enough to produce good + help="This parameter specifies the target coverage of the longest reads. For a typical assembly, 30x is enough to produce good initial contigs. Regardless of this parameter, all reads will be used at the later pipeline stages."/> <param argument="--genome-size" type="text" optional="true" label="Estimated genome size" help="For example, 5m or 2.6g. No longer required as input. 
However, it must be used in conjunction with --asm-coverage option."> @@ -106,10 +109,11 @@ </data> </outputs> <tests> - <!--Test 01--> + <!--Test 01: pacbio-raw--> <test expect_num_outputs="5"> <param name="inputs" ftype="fastq.gz" value="ecoli_01.fastq.gz,ecoli_02.fastq.gz,ecoli_03.fastq.gz,ecoli_04.fastq.gz,ecoli_05.fastq.gz,ecoli_06.fastq.gz,ecoli_07.fastq.gz"/> <param name="mode" value="--pacbio-raw"/> + <param name="iterations" value="0"/> <param name="generate_log" value="true"/> <output name="assembly_info" file="result1_assembly_info.txt" ftype="tabular" compare="sim_size"/> <output name="assembly_graph" file="result1_assembly_graph.dot" ftype="graph_dot" compare="sim_size"/> @@ -117,10 +121,11 @@ <output name="consensus" file="result1_assembly.fasta" ftype="fasta" compare="sim_size"/> <output name="flye_log" file="result1.log" ftype="txt" compare="sim_size"/> </test> - <!--Test 02--> + <!--Test 02: nano raw--> <test expect_num_outputs="4"> <param name="inputs" ftype="fasta.gz" value="nanopore.fasta.gz"/> <param name="mode" value="--nano-raw"/> + <param name="iterations" value="0"/> <output name="assembly_info" ftype="tabular"> <assert_contents> <has_size value="95" delta="100"/> @@ -142,14 +147,15 @@ </assert_contents> </output> </test> - <!--Test 03--> + <!--Test 03: reduce coverage--> <test expect_num_outputs="4"> <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/> - <param name="mode" value="--pacbio-hifi"/> - <param name="iterations" value="1"/> + <conditional name="mode_conditional"> + <param name="mode" value="--nano-raw"/> + </conditional> <conditional name="asm"> <param name="asm_select" value="true" /> - <param name="asm" value="100"/> + <param name="asm" value="30"/> <param name="genome_size" value="3980000"/> </conditional> <output 
name="assembly_info" ftype="tabular"> @@ -159,27 +165,27 @@ </output> <output name="assembly_graph" ftype="graph_dot"> <assert_contents> - <has_size value="2135" delta="100"/> + <has_size value="1840" delta="100"/> </assert_contents> </output> <output name="assembly_gfa" ftype="txt"> <assert_contents> - <has_size value="114351" delta="100"/> + <has_size value="420752" delta="100"/> </assert_contents> </output> <output name="consensus" ftype="fasta"> <assert_contents> - <has_size value="116191" delta="100"/> + <has_size value="427580" delta="100"/> </assert_contents> </output> </test> - <!--Test 04--> + <!--Test 04: metagenomic mode--> <test expect_num_outputs="4"> <param name="inputs" ftype="fastq.gz" value="ecoli_01.fastq.gz,ecoli_02.fastq.gz,ecoli_03.fastq.gz,ecoli_04.fastq.gz,ecoli_05.fastq.gz,ecoli_06.fastq.gz,ecoli_07.fastq.gz"/> - <param name="mode" value="--pacbio-raw"/> - <param name="iterations" value="1"/> + <conditional name="mode_conditional"> + <param name="mode" value="--pacbio-raw"/> + </conditional> <param name="meta" value="true"/> - <param name="plasmids" value="true"/> <output name="assembly_info" ftype="tabular"> <assert_contents> <has_size value="95" delta="100"/> @@ -192,20 +198,22 @@ </output> <output name="assembly_gfa" ftype="txt"> <assert_contents> - <has_size value="418051" delta="100"/> + <has_size value="418729" delta="100"/> </assert_contents> </output> <output name="consensus" ftype="fasta"> <assert_contents> - <has_size value="425000" delta="100"/> + <has_size value="425667" delta="100"/> </assert_contents> </output> </test> - <!--Test 05--> + <!--Test 05: nanopore HQ mode--> <test expect_num_outputs="4"> <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/> - <param name="mode" value="--pacbio-hifi"/> - <param name="iterations" 
value="1"/> + <conditional name="mode_conditional"> + <param name="mode" value="--nano-hq"/> + </conditional> + <param name="min_overlap" value="1000"/> <output name="assembly_info" ftype="tabular"> <assert_contents> <has_size value="286" delta="100"/> @@ -213,26 +221,28 @@ </output> <output name="assembly_graph" ftype="graph_dot"> <assert_contents> - <has_size value="2135" delta="100"/> + <has_size value="1248" delta="100"/> </assert_contents> </output> <output name="assembly_gfa" ftype="txt"> <assert_contents> - <has_size value="114351" delta="100"/> + <has_size value="420252" delta="100"/> </assert_contents> </output> <output name="consensus" ftype="fasta"> <assert_contents> - <has_size value="116191" delta="100"/> + <has_size value="427129" delta="100"/> </assert_contents> </output> </test> - <!--Test 06--> + <!--Test 06: hifi error option--> <test expect_num_outputs="4"> <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/> - <param name="mode" value="--pacbio-hifi"/> - <param name="iterations" value="1"/> - <param name="hifi-error" value="0.02"/> + <conditional name="mode_conditional"> + <param name="mode" value="--pacbio-hifi"/> + <param name="hifi_error" value="0.21"/> + </conditional> + <param name="min_overlap" value="1000"/> <output name="assembly_info" ftype="tabular"> <assert_contents> <has_size value="286" delta="100"/> @@ -240,25 +250,28 @@ </output> <output name="assembly_graph" ftype="graph_dot"> <assert_contents> - <has_size value="2135" delta="100"/> + <has_size value="1273" delta="100"/> </assert_contents> </output> <output name="assembly_gfa" ftype="txt"> <assert_contents> - <has_size value="114351" delta="100"/> + <has_size value="420252" delta="100"/> </assert_contents> </output> <output name="consensus" ftype="fasta"> <assert_contents> - 
<has_size value="116191" delta="100"/> + <has_size value="427129" delta="100"/> </assert_contents> </output> </test> - <!--Test 07--> + <!--Test 07: keep haplotypes--> <test expect_num_outputs="4"> <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/> - <param name="mode" value="--pacbio-hifi"/> - <param name="iterations" value="1"/> + <conditional name="mode_conditional"> + <param name="mode" value="--pacbio-corr"/> + <param name="hifi_error" value="0.21"/> + </conditional> + <param name="min_overlap" value="1000"/> <param name="keep-haplotypes" value="true"/> <output name="assembly_info" ftype="tabular"> <assert_contents> @@ -267,17 +280,44 @@ </output> <output name="assembly_graph" ftype="graph_dot"> <assert_contents> - <has_size value="2135" delta="100"/> + <has_size value="1273" delta="100"/> </assert_contents> </output> <output name="assembly_gfa" ftype="txt"> <assert_contents> - <has_size value="114351" delta="100"/> + <has_size value="420252" delta="100"/> </assert_contents> </output> <output name="consensus" ftype="fasta"> <assert_contents> - <has_size value="116191" delta="100"/> + <has_size value="427129" delta="100"/> + </assert_contents> + </output> + </test> + <!--Test 08: scaffolding mode--> + <test expect_num_outputs="4"> + <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/> + <param name="mode" value="--nano-hq"/> + <param name="min_overlap" value="1000"/> + <param name="scaffolding" value="true"/> + <output name="assembly_info" ftype="tabular"> + <assert_contents> + <has_size value="286" delta="100"/> + </assert_contents> + </output> + <output 
name="assembly_graph" ftype="graph_dot"> + <assert_contents> + <has_size value="1248" delta="100"/> + </assert_contents> + </output> + <output name="assembly_gfa" ftype="txt"> + <assert_contents> + <has_size value="420252" delta="100"/> + </assert_contents> + </output> + <output name="consensus" ftype="fasta"> + <assert_contents> + <has_size value="427129" delta="100"/> </assert_contents> </output> </test> @@ -301,8 +341,7 @@ Input reads can be in FASTA or FASTQ format, uncompressed or compressed with gz. Currently, PacBio (raw, corrected, HiFi) and ONT reads (raw, corrected) are supported. Expected error rates are <30% for raw, <3% for corrected, and <1% for HiFi. Note that Flye was primarily -developed to run on raw reads. Additionally, the *--subassemblies* option performs a consensus assembly of multiple sets of high-quality -contigs. You may specify multiple files with reads (separated by spaces). Mixing different read types is not yet supported. The *--meta* o +developed to run on raw reads. You may specify multiple files with reads (separated by spaces). Mixing different read types is not yet supported. The *--meta* option enables the mode for metagenome/uneven coverage assembly. Genome size estimate is no longer a required option. You need to provide an estimate if using *--asm-coverage* option.