comparison flye.xml @ 8:e27815e82dd4 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flye commit 7bec5df9cb30dd196ae99565d77547e12d05fa48"
author bgruening
date Wed, 30 Jun 2021 20:02:51 +0000
parents 8d4f03b5fe9d
children 276f5d8712d5
comparison
equal deleted inserted replaced
7:8d4f03b5fe9d 8:e27815e82dd4
1 <tool id="flye" name="Flye assembly" version="2.8.2+galaxy0"> 1 <tool id="flye" name="Flye" version="@TOOL_VERSION@+galaxy@SUFFIX_VERSION@" profile="20.01">
2 <description>of long and error-prone reads</description> 2 <description>de novo assembler for single molecule sequencing reads</description>
3 <macros> 3 <macros>
4 <import>macros.xml</import> 4 <import>macros.xml</import>
5 </macros> 5 </macros>
6 <expand macro="requirements" /> 6 <expand macro="requirements" />
7 <expand macro="edam_ontology"/>
7 <version_command>flye --version</version_command> 8 <version_command>flye --version</version_command>
8 <command detect_errors="exit_code"> 9 <command detect_errors="exit_code"><![CDATA[
9 <![CDATA[ 10 #for $counter, $input in enumerate($inputs):
10 11 #if $input.is_of_type('fastqsanger', 'fastq'):
11 #for $counter, $input in enumerate($inputs): 12 #set $ext = 'fastq'
12 13 #elif $input.is_of_type('fastqsanger.gz', 'fastq.gz'):
13 #if $input.is_of_type('fastqsanger', 'fastq'): 14 #set $ext = 'fastq.gz'
14 #set $ext = 'fastq' 15 #elif $input.is_of_type('fasta.gz'):
15 #elif $input.is_of_type('fastqsanger.gz', 'fastq.gz'): 16 #set $ext = 'fasta.gz'
16 #set $ext = 'fastq.gz' 17 #elif $input.is_of_type('fasta'):
17 #elif $input.is_of_type('fasta.gz'): 18 #set $ext = 'fasta'
18 #set $ext = 'fasta.gz' 19 #end if
19 #elif $input.is_of_type('fasta'): 20 ln -s '$input' ./input_${counter}.${ext} &&
20 #set $ext = 'fasta' 21 #end for
22 flye
23 $mode
24 #for $counter, $input in enumerate($inputs):
25 ./input_${counter}.$ext
26 #end for
27 -o out_dir
28 -t \${GALAXY_SLOTS:-4}
29 -i $iterations
30 #if $hifi_error:
31 --hifi-error $hifi_error
21 #end if 32 #end if
22 ln -s '$input' ./input_${counter}.${ext} && 33 #if $min_overlap:
23 #end for 34 -m $min_overlap
24 35 #end if
25 flye 36 #if $asm.asm_select == 'true':
26 $mode 37 --asm-coverage $asm.asm_coverage
27 #for $counter, $input in enumerate($inputs): 38 -g '${asm.genome_size}'
28 ./input_${counter}.$ext 39 #end if
29 #end for 40 $plasmids
30 41 $meta
31 -o out_dir 42 $trestle
32 -t \${GALAXY_SLOTS:-4}
33 -i $i
34 #if $m:
35 -m '$m'
36 #end if
37 #if str($asm.asm_select) == "true":
38 --asm-coverage '$asm.asm'
39 -g '$asm.g'
40 #end if
41 ${plasmids}
42 ${meta}
43 ${no_trestle}
44 2>&1
45 ]]></command> 43 ]]></command>
46 <inputs> 44 <inputs>
47 <param name="inputs" type="data" format="fasta,fasta.gz,fastq,fastq.gz,fastqsanger.gz,fastqsanger" multiple="true" label="Input reads" /> 45 <param name="inputs" type="data" format="fasta,fasta.gz,fastq,fastq.gz,fastqsanger.gz,fastqsanger" multiple="true" label="Input reads" />
48 <param name="mode" type="select" label="Mode"> 46 <param name="mode" type="select" label="Mode">
49 <option value="--nano-raw">Nanopore raw</option> 47 <option value="--nano-raw">Nanopore raw</option>
50 <option value="--nano-corr">Nanopore corrected</option> 48 <option value="--nano-corr">Nanopore corrected</option>
49 <option value="--pacbio-hifi">PacBio HiFi</option>
51 <option value="--pacbio-raw">PacBio raw</option> 50 <option value="--pacbio-raw">PacBio raw</option>
52 <option value="--pacbio-corr">PacBio corrected</option> 51 <option value="--pacbio-corr">PacBio corrected</option>
53 <option value="--subassemblies">high-quality contig-like input</option> 52 <option value="--subassemblies">High-quality contig-like input</option>
54 </param> 53 </param>
55 <param argument="-i" type="integer" value="1" label="number of polishing iterations" /> 54 <param argument="--iterations" type="integer" value="0" label="Number of polishing iterations"
56 <param argument="-m" type="integer" optional="true" label="minimum overlap between reads (default: auto)" /> 55 help="Polishing is performed as the final assembly stage. By default, Flye runs one polishing iteration. Additional iterations
57 56 might correct a small number of extra errors (due to improvements on how reads may align to the corrected assembly). If the
57 parameter is set to 0, the polishing is not performed."/>
58 <param argument="--min-overlap" type="integer" optional="true" label="Minimum overlap between reads"
59 help="This sets a minimum overlap length for two reads to be considered overlapping. By default it is chosen
60 automatically based on the read length distribution (reads N90) and does not require manual setting. Typical
61 value is 3k-5k (and down to 1k for datasets with shorter read length). Intuitively, we want to set this
62 parameter as high as possible, so the repeat graph is less tangled. However, higher values might lead to assembly gaps.
63 In some rare cases it makes sense to manually increase minimum overlap for assemblies of big genomes with long reads and high coverage." />
64 <param argument="--hifi-error" type="float" min="0" max="1" optional="true" label="Expected HiFi reads error rate" help="Default: 0.01"/>
65 <param argument="--plasmids" type="boolean" truevalue="--plasmids" falsevalue="" checked="False" label="Rescue short unassembled plasmids" />
66 <param argument="--keep-haplotypes" type="boolean" truevalue="--keep-haplotypes" falsevalue="" checked="False" label="Keep haplotypes"
67 help="By default, Flye collapses graph structures caused by alternative haplotypes (bubbles, superbubbles, roundabouts) to produce longer
68 consensus contigs. This option retains the alternative paths on the graph, producing a less contiguous, but more detailed assembly."/>
69 <param argument="--trestle" type="boolean" truevalue="--trestle" falsevalue=""
70 checked="False" label="Enable Trestle"
71 help="Trestle is an extra module that resolves simple repeats of multiplicity 2 that were not bridged by reads. Depending on the datasets, it might
72 resolve a few extra repeats, which is helpful for small (bacterial) genomes. On large genomes, the contiguity improvements are usually minimal,
73 but the computation might take a lot of time" />
74 <param argument="--meta" type="boolean" truevalue="--meta" falsevalue="" checked="False" label="Perform metagenomic assembly"
75 help="It is designed for highly non-uniform coverage and is sensitive to underrepresented sequence at low coverage (as low as 2x).
76 In some examples of simple metagenomes, we observed that the normal mode assembled more contiguous bacterial
77 consensus sequence, while the metagenome mode was slightly more fragmented, but revealed strain mixtures"/>
58 <conditional name="asm"> 78 <conditional name="asm">
59 <param name="asm_select" type="select" label="description" help=""> 79 <param name="asm_select" type="select" label="Reduced contig assembly coverage">
60 <option value="true">Enable reduced coverage for initial disjointing assembly</option> 80 <option value="true">Enable reduced coverage for initial disjointing assembly</option>
61 <option value="false" selected="true">Disable reduced coverage for initial disjointing assembly</option> 81 <option value="false" selected="true">Disable reduced coverage for initial disjointing assembly</option>
62 </param> 82 </param>
63 <when value="true"> 83 <when value="true">
64 <param name="asm" argument="--asm-coverage" type="integer" optional="true" label="reduced coverage for initial disjointing assembly" /> 84 <param argument="--asm-coverage" type="integer" min="0" value="30"
65 <param argument="-g" type="text" label="estimated genome size (for example, 5m or 2.6g)"> 85 label="Reduced coverage for initial disjointing assembly"
86 help="Typically, assemblies of large genomes at high coverage require hundreds of GB of RAM. For high coverage assemblies,
87 you can reduce memory usage by using only a subset of longest reads for initial contig extension stage (usually, the memory bottleneck).
88 The parameter --asm-coverage specifies the target coverage of the longest reads. For a typical assembly, 30x is enough to produce good
89 initial contigs. Regardless of this parameter, all reads will be used at the later pipeline stages."/>
90 <param argument="--genome-size" type="text" optional="true" label="Estimated genome size"
91 help="For example, 5m or 2.6g. No longer required as input. However, it must be used in conjunction with --asm-coverage option.">
66 <validator type="regex" message="Genome size must be a float or integer, optionally followed by the a unit prefix (kmg)">^([0-9]*[.])?[0-9]+[kmg]?$</validator> 92 <validator type="regex" message="Genome size must be a float or integer, optionally followed by the a unit prefix (kmg)">^([0-9]*[.])?[0-9]+[kmg]?$</validator>
67 </param> 93 </param>
68 </when> 94 </when>
69 <when value="false" /> 95 <when value="false" />
70 </conditional> 96 </conditional>
71 97 <param name="generate_log" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Generate a log file"/>
72 <param argument="--plasmids" type="boolean" truevalue="--plasmids" falsevalue="" checked="False" label="rescue short unassembled plasmids" />
73 <param argument="--meta" type="boolean" truevalue="--meta" falsevalue="" checked="False" label="perform metagenomic assembly" />
74 <param name="no_trestle" argument="--no-trestle" type="boolean" truevalue="--no-trestle" falsevalue="" checked="False" label="skip trestle stage" />
75 </inputs> 98 </inputs>
76 <outputs> 99 <outputs>
77 <data name="consensus" format="fasta" from_work_dir="out_dir/assembly.fasta" label="${tool.name} on ${on_string} (consensus)"/> 100 <data name="consensus" format="fasta" from_work_dir="out_dir/assembly.fasta" label="${tool.name} on ${on_string}: consensus"/>
78 <data name="assembly_graph" format="graph_dot" from_work_dir="out_dir/assembly_graph.gv" label="${tool.name} on ${on_string} (assembly_graph)"/> 101 <data name="assembly_graph" format="graph_dot" from_work_dir="out_dir/assembly_graph.gv" label="${tool.name} on ${on_string}: assembly graph"/>
79 <data name="assembly_gfa" format="txt" from_work_dir="out_dir/assembly_graph.gfa" label="${tool.name} on ${on_string} (Graphical Fragment Assembly)"/> 102 <data name="assembly_gfa" format="txt" from_work_dir="out_dir/assembly_graph.gfa" label="${tool.name} on ${on_string}: graphical fragment assembly"/>
80 <data name="assembly_info" format="tabular" from_work_dir="out_dir/assembly_info.txt" label="${tool.name} on ${on_string} (assembly_info)"/> 103 <data name="assembly_info" format="tabular" from_work_dir="out_dir/assembly_info.txt" label="${tool.name} on ${on_string}: assembly info"/>
81 <data name="flye_log" format="txt" from_work_dir="out_dir/flye.log" label="${tool.name} on ${on_string} (log)"/> 104 <data name="flye_log" format="txt" from_work_dir="out_dir/flye.log" label="${tool.name} on ${on_string}: log">
105 <filter>generate_log</filter>
106 </data>
82 </outputs> 107 </outputs>
83 <tests> 108 <tests>
84 <test> 109 <!--Test 01-->
85 <param name="inputs" ftype="fasta" value="nanopore.fasta"/> 110 <test expect_num_outputs="5">
111 <param name="inputs" ftype="fastq.gz" value="ecoli_01.fastq.gz,ecoli_02.fastq.gz,ecoli_03.fastq.gz,ecoli_04.fastq.gz,ecoli_05.fastq.gz,ecoli_06.fastq.gz,ecoli_07.fastq.gz"/>
86 <param name="mode" value="--pacbio-raw"/> 112 <param name="mode" value="--pacbio-raw"/>
113 <param name="generate_log" value="true"/>
87 <output name="assembly_info" file="result1_assembly_info.txt" ftype="tabular" compare="sim_size"/> 114 <output name="assembly_info" file="result1_assembly_info.txt" ftype="tabular" compare="sim_size"/>
88 <output name="assembly_graph" file="result1_assembly_graph.dot" ftype="graph_dot" compare="sim_size"/> 115 <output name="assembly_graph" file="result1_assembly_graph.dot" ftype="graph_dot" compare="sim_size"/>
89 <output name="assembly_gfa" file="result1_assembly_graph.gfa" ftype="txt" compare="sim_size"/> 116 <output name="assembly_gfa" file="result1_assembly_graph.gfa" ftype="txt" compare="sim_size"/>
90 <output name="consensus" file="result1_assembly.fasta" ftype="fasta" compare="sim_size"/> 117 <output name="consensus" file="result1_assembly.fasta" ftype="fasta" compare="sim_size"/>
91 </test> 118 <output name="flye_log" file="result1.log" ftype="txt" compare="sim_size"/>
92 <test> 119 </test>
93 <param name="inputs" ftype="fasta" value="nanopore.fasta"/> 120 <!--Test 02-->
121 <test expect_num_outputs="4">
122 <param name="inputs" ftype="fasta.gz" value="nanopore.fasta.gz"/>
94 <param name="mode" value="--nano-raw"/> 123 <param name="mode" value="--nano-raw"/>
95 <output name="assembly_info" file="result2_assembly_info.txt" ftype="tabular" compare="sim_size"/> 124 <output name="assembly_info" ftype="tabular">
96 <output name="assembly_graph" file="result2_assembly_graph.dot" ftype="graph_dot" compare="sim_size"/> 125 <assert_contents>
97 <output name="assembly_gfa" file="result2_assembly_graph.gfa" ftype="txt" compare="sim_size"/> 126 <has_size value="95" delta="100"/>
98 <output name="consensus" file="result2_assembly.fasta" ftype="fasta" compare="sim_size"/> 127 </assert_contents>
99 </test> 128 </output>
100 <test> 129 <output name="assembly_graph" ftype="graph_dot">
101 <param name="inputs" ftype="fasta" value="nanopore.fasta"/> 130 <assert_contents>
102 <param name="mode" value="--nano-corr"/> 131 <has_size value="803" delta="100"/>
103 <param name="i" value="2"/> 132 </assert_contents>
133 </output>
134 <output name="assembly_gfa" ftype="txt">
135 <assert_contents>
136 <has_size value="35047" delta="100"/>
137 </assert_contents>
138 </output>
139 <output name="consensus" ftype="fasta">
140 <assert_contents>
141 <has_size value="35573" delta="100"/>
142 </assert_contents>
143 </output>
144 </test>
145 <!--Test 03-->
146 <test expect_num_outputs="4">
147 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/>
148 <param name="mode" value="--pacbio-hifi"/>
149 <param name="iterations" value="1"/>
104 <conditional name="asm"> 150 <conditional name="asm">
105 <param name="asm_select" value="true" /> 151 <param name="asm_select" value="true" />
106 <param name="asm" value="40"/> 152 <param name="asm" value="100"/>
107 <param name="g" value="10000"/> 153 <param name="genome_size" value="3980000"/>
108 </conditional> 154 </conditional>
109 <output name="assembly_info" file="result3_assembly_info.txt" ftype="tabular" compare="sim_size"/> 155 <output name="assembly_info" ftype="tabular">
110 <output name="assembly_graph" file="result3_assembly_graph.dot" ftype="graph_dot" compare="sim_size"/> 156 <assert_contents>
111 <output name="assembly_gfa" file="result3_assembly_graph.gfa" ftype="txt" compare="sim_size"/> 157 <has_size value="286" delta="100"/>
112 <output name="consensus" file="result3_assembly.fasta" ftype="fasta" compare="sim_size"/> 158 </assert_contents>
113 </test> 159 </output>
114 <test> 160 <output name="assembly_graph" ftype="graph_dot">
115 <param name="inputs" ftype="fasta" value="nanopore.fasta"/> 161 <assert_contents>
162 <has_size value="2135" delta="100"/>
163 </assert_contents>
164 </output>
165 <output name="assembly_gfa" ftype="txt">
166 <assert_contents>
167 <has_size value="114351" delta="100"/>
168 </assert_contents>
169 </output>
170 <output name="consensus" ftype="fasta">
171 <assert_contents>
172 <has_size value="116191" delta="100"/>
173 </assert_contents>
174 </output>
175 </test>
176 <!--Test 04-->
177 <test expect_num_outputs="4">
178 <param name="inputs" ftype="fastq.gz" value="ecoli_01.fastq.gz,ecoli_02.fastq.gz,ecoli_03.fastq.gz,ecoli_04.fastq.gz,ecoli_05.fastq.gz,ecoli_06.fastq.gz,ecoli_07.fastq.gz"/>
116 <param name="mode" value="--pacbio-raw"/> 179 <param name="mode" value="--pacbio-raw"/>
117 <param name="i" value="1"/> 180 <param name="iterations" value="1"/>
118 <param name="meta" value="true"/> 181 <param name="meta" value="true"/>
119 <param name="plasmids" value="true"/> 182 <param name="plasmids" value="true"/>
120 <param name="no-trestle" value="true"/> 183 <output name="assembly_info" ftype="tabular">
121 <output name="assembly_info" file="result4_assembly_info.txt" ftype="tabular" compare="sim_size"/> 184 <assert_contents>
122 <output name="assembly_graph" file="result4_assembly_graph.dot" ftype="graph_dot" compare="sim_size"/> 185 <has_size value="95" delta="100"/>
123 <output name="assembly_gfa" file="result4_assembly_graph.gfa" ftype="txt" compare="sim_size"/> 186 </assert_contents>
124 <output name="consensus" file="result4_assembly.fasta" ftype="fasta" compare="sim_size"/> 187 </output>
188 <output name="assembly_graph" ftype="graph_dot">
189 <assert_contents>
190 <has_size value="367" delta="100"/>
191 </assert_contents>
192 </output>
193 <output name="assembly_gfa" ftype="txt">
194 <assert_contents>
195 <has_size value="418051" delta="100"/>
196 </assert_contents>
197 </output>
198 <output name="consensus" ftype="fasta">
199 <assert_contents>
200 <has_size value="425000" delta="100"/>
201 </assert_contents>
202 </output>
203 </test>
204 <!--Test 05-->
205 <test expect_num_outputs="4">
206 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/>
207 <param name="mode" value="--pacbio-hifi"/>
208 <param name="iterations" value="1"/>
209 <output name="assembly_info" ftype="tabular">
210 <assert_contents>
211 <has_size value="286" delta="100"/>
212 </assert_contents>
213 </output>
214 <output name="assembly_graph" ftype="graph_dot">
215 <assert_contents>
216 <has_size value="2135" delta="100"/>
217 </assert_contents>
218 </output>
219 <output name="assembly_gfa" ftype="txt">
220 <assert_contents>
221 <has_size value="114351" delta="100"/>
222 </assert_contents>
223 </output>
224 <output name="consensus" ftype="fasta">
225 <assert_contents>
226 <has_size value="116191" delta="100"/>
227 </assert_contents>
228 </output>
229 </test>
230 <!--Test 06-->
231 <test expect_num_outputs="4">
232 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/>
233 <param name="mode" value="--pacbio-hifi"/>
234 <param name="iterations" value="1"/>
235 <param name="hifi-error" value="0.02"/>
236 <output name="assembly_info" ftype="tabular">
237 <assert_contents>
238 <has_size value="286" delta="100"/>
239 </assert_contents>
240 </output>
241 <output name="assembly_graph" ftype="graph_dot">
242 <assert_contents>
243 <has_size value="2135" delta="100"/>
244 </assert_contents>
245 </output>
246 <output name="assembly_gfa" ftype="txt">
247 <assert_contents>
248 <has_size value="114351" delta="100"/>
249 </assert_contents>
250 </output>
251 <output name="consensus" ftype="fasta">
252 <assert_contents>
253 <has_size value="116191" delta="100"/>
254 </assert_contents>
255 </output>
256 </test>
257 <!--Test 07-->
258 <test expect_num_outputs="4">
259 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/>
260 <param name="mode" value="--pacbio-hifi"/>
261 <param name="iterations" value="1"/>
262 <param name="keep-haplotypes" value="true"/>
263 <output name="assembly_info" ftype="tabular">
264 <assert_contents>
265 <has_size value="286" delta="100"/>
266 </assert_contents>
267 </output>
268 <output name="assembly_graph" ftype="graph_dot">
269 <assert_contents>
270 <has_size value="2135" delta="100"/>
271 </assert_contents>
272 </output>
273 <output name="assembly_gfa" ftype="txt">
274 <assert_contents>
275 <has_size value="114351" delta="100"/>
276 </assert_contents>
277 </output>
278 <output name="consensus" ftype="fasta">
279 <assert_contents>
280 <has_size value="116191" delta="100"/>
281 </assert_contents>
282 </output>
125 </test> 283 </test>
126 </tests> 284 </tests>
127 <help><![CDATA[ 285 <help><![CDATA[
128 286
129 Input reads could be in FASTA or FASTQ format, uncompressed 287 .. class:: infomark
130 or compressed with gz. Currenlty, raw and corrected reads 288
131 from PacBio and ONT are supported. The expected error rates are 289 **Purpose**
132 <30% for raw and <2% for corrected reads. Additionally, 290
133 --subassemblies option performs a consensus assembly of multiple 291 Flye is a de novo assembler for single molecule sequencing reads, such as those produced by PacBio and Oxford Nanopore Technologies.
134 sets of high-quality contigs. You may specify multiple 292 It is designed for a wide range of datasets, from small bacterial projects to large mammalian-scale assemblies. The package represents
135 files with reads (separated by spaces). Mixing different read 293 a complete pipeline: it takes raw PacBio/ONT reads as input and outputs polished contigs. Flye also has a special mode for metagenome
136 types is not yet supported. 294 assembly.
137 295
138 You must provide an estimate of the genome size as input, 296 ----
139 which is used for solid k-mers selection. The estimate could 297
140 be rough (e.g. withing 0.5x-2x range) and does not affect 298 .. class:: infomark
141 the other assembly stages. Standard size modificators are 299
142 supported (e.g. 5m or 2.6g). 300 **Quick usage**
143 301
144 ]]></help> 302 Input reads can be in FASTA or FASTQ format, uncompressed or compressed with gz. Currently, PacBio (raw, corrected, HiFi) and ONT reads
303 (raw, corrected) are supported. Expected error rates are <30% for raw, <3% for corrected, and <1% for HiFi. Note that Flye was primarily
304 developed to run on raw reads. Additionally, the *--subassemblies* option performs a consensus assembly of multiple sets of high-quality
305 contigs. You may specify multiple files with reads (separated by spaces). Mixing different read types is not yet supported. The *--meta*
306 option enables the mode for metagenome/uneven coverage assembly.
307
308 Genome size estimate is no longer a required option. You need to provide an estimate if using *--asm-coverage* option.
309
310 To reduce memory consumption for large genome assemblies, you can use a subset of the longest reads for initial disjointig assembly by
311 specifying *--asm-coverage* and *--genome-size* options. Typically, 40x coverage is enough to produce good disjointigs.
312
313 ----
314
315 .. class:: infomark
316
317 **Outputs**
318
319 The main output files are:
320
321 ::
322
323 - Final assembly: contains contigs and possibly scaffolds (see below).
324 - Final repeat graph: note that the edge sequences might be different (shorter) than contig sequences, because contigs might include multiple graph edges.
325 - Extra information about contigs (such as length or coverage).
326
327 Each contig is formed by a single unique graph edge. If possible, unique contigs are extended with the sequence from flanking unresolved repeats on the graph. Thus,
328 a contig fully contains the corresponding graph edge (with the same id), but might be longer than this edge. This is somewhat similar to the unitig-contig relation in
329 OLC assemblers. In a rare case when a repetitive graph edge is not covered by the set of "extended" contigs, it will also be output in the assembly file.
330
331 Sometimes it is possible to further order contigs into scaffolds based on the repeat graph structure. These ordered contigs will be output as a part of scaffold in
332 the assembly file (with a scaffold prefix). Since it is hard to give a reliable estimate of the gap size, those gaps are represented with the default 100 Ns.
333 The assembly_info.txt file (below) contains additional information about how scaffolds were formed.
334
335 Extra information about contigs/scaffolds is output into the assembly_info.txt file. It is a tab-delimited table with the columns as follows:
336
337 ::
338
339 - Contig/scaffold id
340 - Length
341 - Coverage
342 - Is circular, (Y)es or (N)o
343 - Is repetitive, (Y)es or (N)o
344 - Multiplicity (based on coverage)
345 - Alternative group
346 - Graph path (graph path corresponding to this contig/scaffold).
347
348 Scaffold gaps are marked with ?? symbols, and * symbol denotes a terminal graph node. Alternative contigs (representing alternative haplotypes) will have the same alt.
349 group ID. Primary contigs are marked by *.
350
351 ----
352
353 .. class:: infomark
354
355 **Algorithm Description**
356
357 This is a brief description of the Flye algorithm. Please refer to the manuscript for more detailed information. The draft contig extension is organized as follows:
358
359 ::
360
361 - K-mer counting / erroneous k-mer pre-filtering
362 - Solid k-mer selection (k-mers with sufficient frequency, which are unlikely to be erroneous)
363 - Contig extension. The algorithm starts from a single read and extends it with a next overlapping read (overlaps are dynamically detected using the selected solid k-mers).
364
365 Note that we do not attempt to resolve repeats at this stage, thus the reconstructed contigs might contain misassemblies. Flye then aligns the reads on these draft
366 contigs using minimap2 and calls a consensus. Afterwards, Flye performs repeat analysis as follows:
367
368 ::
369
370 - Repeat graph is constructed from the (possibly misassembled) contigs
371 - In this graph all repeats longer than minimum overlap are collapsed
372 - The algorithm resolves repeats using the read information and graph structure
373 - The unbranching paths in the graph are output as contigs
374
375 If enabled, after resolving bridged repeats, Trestle module attempts to resolve simple unbridged repeats (of multiplicity 2) using the heterogeneities between repeat copies.
376 Finally, Flye performs polishing of the resulting assembly to correct the remaining errors:
377
378 ::
379
380 - Alignment of all reads to the current assembly using minimap2
381 - Partition the alignment into mini-alignments (bubbles)
382 - Error correction of each bubble using a maximum likelihood approach
383
384
385 The polishing steps could be repeated, which might slightly increase quality for some datasets.
386
387
388 ]]></help>
145 <expand macro="citations" /> 389 <expand macro="citations" />
146 </tool> 390 </tool>