Mercurial > repos > bgruening > flye
comparison flye.xml @ 8:e27815e82dd4 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flye commit 7bec5df9cb30dd196ae99565d77547e12d05fa48"
author | bgruening |
---|---|
date | Wed, 30 Jun 2021 20:02:51 +0000 |
parents | 8d4f03b5fe9d |
children | 276f5d8712d5 |
comparison
equal
deleted
inserted
replaced
7:8d4f03b5fe9d | 8:e27815e82dd4 |
---|---|
1 <tool id="flye" name="Flye assembly" version="2.8.2+galaxy0"> | 1 <tool id="flye" name="Flye" version="@TOOL_VERSION@+galaxy@SUFFIX_VERSION@" profile="20.01"> |
2 <description>of long and error-prone reads</description> | 2 <description>de novo assembler for single molecule sequencing reads</description> |
3 <macros> | 3 <macros> |
4 <import>macros.xml</import> | 4 <import>macros.xml</import> |
5 </macros> | 5 </macros> |
6 <expand macro="requirements" /> | 6 <expand macro="requirements" /> |
7 <expand macro="edam_ontology"/> | |
7 <version_command>flye --version</version_command> | 8 <version_command>flye --version</version_command> |
8 <command detect_errors="exit_code"> | 9 <command detect_errors="exit_code"><![CDATA[ |
9 <![CDATA[ | 10 #for $counter, $input in enumerate($inputs): |
10 | 11 #if $input.is_of_type('fastqsanger', 'fastq'): |
11 #for $counter, $input in enumerate($inputs): | 12 #set $ext = 'fastq' |
12 | 13 #elif $input.is_of_type('fastqsanger.gz', 'fastq.gz'): |
13 #if $input.is_of_type('fastqsanger', 'fastq'): | 14 #set $ext = 'fastq.gz' |
14 #set $ext = 'fastq' | 15 #elif $input.is_of_type('fasta.gz'): |
15 #elif $input.is_of_type('fastqsanger.gz', 'fastq.gz'): | 16 #set $ext = 'fasta.gz' |
16 #set $ext = 'fastq.gz' | 17 #elif $input.is_of_type('fasta'): |
17 #elif $input.is_of_type('fasta.gz'): | 18 #set $ext = 'fasta' |
18 #set $ext = 'fasta.gz' | 19 #end if |
19 #elif $input.is_of_type('fasta'): | 20 ln -s '$input' ./input_${counter}.${ext} && |
20 #set $ext = 'fasta' | 21 #end for |
22 flye | |
23 $mode | |
24 #for $counter, $input in enumerate($inputs): | |
25 ./input_${counter}.$ext | |
26 #end for | |
27 -o out_dir | |
28 -t \${GALAXY_SLOTS:-4} | |
29 -i $iterations | |
30 #if $hifi_error: | |
31 --hifi-error $hifi_error | |
21 #end if | 32 #end if |
22 ln -s '$input' ./input_${counter}.${ext} && | 33 #if $min_overlap: |
23 #end for | 34 -m $min_overlap |
24 | 35 #end if |
25 flye | 36 #if $asm.asm_select == 'true': |
26 $mode | 37 --asm-coverage $asm.asm_coverage |
27 #for $counter, $input in enumerate($inputs): | 38 -g '${asm.genome_size}' |
28 ./input_${counter}.$ext | 39 #end if |
29 #end for | 40 $plasmids |
30 | 41 $meta |
31 -o out_dir | 42 $trestle |
32 -t \${GALAXY_SLOTS:-4} | |
33 -i $i | |
34 #if $m: | |
35 -m '$m' | |
36 #end if | |
37 #if str($asm.asm_select) == "true": | |
38 --asm-coverage '$asm.asm' | |
39 -g '$asm.g' | |
40 #end if | |
41 ${plasmids} | |
42 ${meta} | |
43 ${no_trestle} | |
44 2>&1 | |
45 ]]></command> | 43 ]]></command> |
46 <inputs> | 44 <inputs> |
47 <param name="inputs" type="data" format="fasta,fasta.gz,fastq,fastq.gz,fastqsanger.gz,fastqsanger" multiple="true" label="Input reads" /> | 45 <param name="inputs" type="data" format="fasta,fasta.gz,fastq,fastq.gz,fastqsanger.gz,fastqsanger" multiple="true" label="Input reads" /> |
48 <param name="mode" type="select" label="Mode"> | 46 <param name="mode" type="select" label="Mode"> |
49 <option value="--nano-raw">Nanopore raw</option> | 47 <option value="--nano-raw">Nanopore raw</option> |
50 <option value="--nano-corr">Nanopore corrected</option> | 48 <option value="--nano-corr">Nanopore corrected</option> |
49 <option value="--pacbio-hifi">PacBio HiFi</option> | |
51 <option value="--pacbio-raw">PacBio raw</option> | 50 <option value="--pacbio-raw">PacBio raw</option> |
52 <option value="--pacbio-corr">PacBio corrected</option> | 51 <option value="--pacbio-corr">PacBio corrected</option> |
53 <option value="--subassemblies">high-quality contig-like input</option> | 52 <option value="--subassemblies">High-quality contig-like input</option> |
54 </param> | 53 </param> |
55 <param argument="-i" type="integer" value="1" label="number of polishing iterations" /> | 54 <param argument="--iterations" type="integer" value="0" label="Number of polishing iterations" |
56 <param argument="-m" type="integer" optional="true" label="minimum overlap between reads (default: auto)" /> | 55 help="Polishing is performed as the final assembly stage. By default, Flye runs one polishing iteration. Additional iterations |
57 | 56 might correct a small number of extra errors (due to improvements on how reads may align to the corrected assembly). If the |
57 parameter is set to 0, the polishing is not performed."/> | |
58 <param argument="--min-overlap" type="integer" optional="true" label="Minimum overlap between reads" | |
59 help="This sets a minimum overlap length for two reads to be considered overlapping. By default it is chosen | |
60 automatically based on the read length distribution (reads N90) and does not require manual setting. Typical | |
61 value is 3k-5k (and down to 1k for datasets with shorter read length). Intuitively, we want to set this | |
62 parameter as high as possible, so the repeat graph is less tangled. However, higher values might lead to assembly gaps. | |
63 In some rare cases it makes sense to manually increase minimum overlap for assemblies of big genomes with long reads and high coverage." /> | |
64 <param argument="--hifi-error" type="float" min="0" max="1" optional="true" label="Expected HiFi reads error rate" help="Default: 0.01"/> | |
65 <param argument="--plasmids" type="boolean" truevalue="--plasmids" falsevalue="" checked="False" label="Rescue short unassembled plasmids" /> | |
66 <param argument="--keep-haplotypes" type="boolean" truevalue="--keep-haplotypes" falsevalue="" checked="False" label="Keep haplotypes" | |
67 help="By default, Flye collapses graph structures caused by alternative haplotypes (bubbles, superbubbles, roundabouts) to produce longer | |
68 consensus contigs. This option retains the alternative paths on the graph, producing a less contiguous, but more detailed assembly."/> | |
69 <param argument="--trestle" type="boolean" truevalue="--trestle" falsevalue="" | |
70 checked="False" label="Enable Trestle" | |
71 help="Trestle is an extra module that resolves simple repeats of multiplicity 2 that were not bridged by reads. Depending on the datasets, it might | |
72 resolve a few extra repeats, which is helpful for small (bacterial) genomes. On large genomes, the contiguity improvements are usually minimal, | |
73 but the computation might take a lot of time" /> | |
74 <param argument="--meta" type="boolean" truevalue="--meta" falsevalue="" checked="False" label="Perform metagenomic assembly" | |
75 help="It is designed for highly non-uniform coverage and is sensitive to underrepresented sequence at low coverage (as low as 2x). | |
76 In some examples of simple metagenomes, we observed that the normal mode assembled more contiguous bacterial | |
77 consensus sequence, while the metagenome mode was slightly more fragmented, but revealed strain mixtures"/> | |
58 <conditional name="asm"> | 78 <conditional name="asm"> |
59 <param name="asm_select" type="select" label="description" help=""> | 79 <param name="asm_select" type="select" label="Reduced contig assembly coverage"> |
60 <option value="true">Enable reduced coverage for initial disjointing assembly</option> | 80 <option value="true">Enable reduced coverage for initial disjointing assembly</option> |
61 <option value="false" selected="true">Disable reduced coverage for initial disjointing assembly</option> | 81 <option value="false" selected="true">Disable reduced coverage for initial disjointing assembly</option> |
62 </param> | 82 </param> |
63 <when value="true"> | 83 <when value="true"> |
64 <param name="asm" argument="--asm-coverage" type="integer" optional="true" label="reduced coverage for initial disjointing assembly" /> | 84 <param argument="--asm-coverage" type="integer" min="0" value="30" |
65 <param argument="-g" type="text" label="estimated genome size (for example, 5m or 2.6g)"> | 85 label="Reduced coverage for initial disjointing assembly" |
86 help="Typically, assemblies of large genomes at high coverage require hundreds of GB of RAM. For high coverage assemblies, | |
87 you can reduce memory usage by using only a subset of longest reads for initial contig extension stage (usually, the memory bottleneck). | |
88 The parameter --asm-coverage specifies the target coverage of the longest reads. For a typical assembly, 30x is enough to produce good | |
89 initial contigs. Regardless of this parameter, all reads will be used at the later pipeline stages."/> | |
90 <param argument="--genome-size" type="text" optional="true" label="Estimated genome size" | |
91 help="For example, 5m or 2.6g. No longer required as input. However, it must be used in conjunction with --asm-coverage option."> | |
66 <validator type="regex" message="Genome size must be a float or integer, optionally followed by a unit prefix (kmg)">^([0-9]*[.])?[0-9]+[kmg]?$</validator> | 92 <validator type="regex" message="Genome size must be a float or integer, optionally followed by a unit prefix (kmg)">^([0-9]*[.])?[0-9]+[kmg]?$</validator> |
67 </param> | 93 </param> |
68 </when> | 94 </when> |
69 <when value="false" /> | 95 <when value="false" /> |
70 </conditional> | 96 </conditional> |
71 | 97 <param name="generate_log" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Generate a log file"/> |
72 <param argument="--plasmids" type="boolean" truevalue="--plasmids" falsevalue="" checked="False" label="rescue short unassembled plasmids" /> | |
73 <param argument="--meta" type="boolean" truevalue="--meta" falsevalue="" checked="False" label="perform metagenomic assembly" /> | |
74 <param name="no_trestle" argument="--no-trestle" type="boolean" truevalue="--no-trestle" falsevalue="" checked="False" label="skip trestle stage" /> | |
75 </inputs> | 98 </inputs> |
76 <outputs> | 99 <outputs> |
77 <data name="consensus" format="fasta" from_work_dir="out_dir/assembly.fasta" label="${tool.name} on ${on_string} (consensus)"/> | 100 <data name="consensus" format="fasta" from_work_dir="out_dir/assembly.fasta" label="${tool.name} on ${on_string}: consensus"/> |
78 <data name="assembly_graph" format="graph_dot" from_work_dir="out_dir/assembly_graph.gv" label="${tool.name} on ${on_string} (assembly_graph)"/> | 101 <data name="assembly_graph" format="graph_dot" from_work_dir="out_dir/assembly_graph.gv" label="${tool.name} on ${on_string}: assembly graph"/> |
79 <data name="assembly_gfa" format="txt" from_work_dir="out_dir/assembly_graph.gfa" label="${tool.name} on ${on_string} (Graphical Fragment Assembly)"/> | 102 <data name="assembly_gfa" format="txt" from_work_dir="out_dir/assembly_graph.gfa" label="${tool.name} on ${on_string}: graphical fragment assembly"/> |
80 <data name="assembly_info" format="tabular" from_work_dir="out_dir/assembly_info.txt" label="${tool.name} on ${on_string} (assembly_info)"/> | 103 <data name="assembly_info" format="tabular" from_work_dir="out_dir/assembly_info.txt" label="${tool.name} on ${on_string}: assembly info"/> |
81 <data name="flye_log" format="txt" from_work_dir="out_dir/flye.log" label="${tool.name} on ${on_string} (log)"/> | 104 <data name="flye_log" format="txt" from_work_dir="out_dir/flye.log" label="${tool.name} on ${on_string}: log"> |
105 <filter>generate_log</filter> | |
106 </data> | |
82 </outputs> | 107 </outputs> |
83 <tests> | 108 <tests> |
84 <test> | 109 <!--Test 01--> |
85 <param name="inputs" ftype="fasta" value="nanopore.fasta"/> | 110 <test expect_num_outputs="5"> |
111 <param name="inputs" ftype="fastq.gz" value="ecoli_01.fastq.gz,ecoli_02.fastq.gz,ecoli_03.fastq.gz,ecoli_04.fastq.gz,ecoli_05.fastq.gz,ecoli_06.fastq.gz,ecoli_07.fastq.gz"/> | |
86 <param name="mode" value="--pacbio-raw"/> | 112 <param name="mode" value="--pacbio-raw"/> |
113 <param name="generate_log" value="true"/> | |
87 <output name="assembly_info" file="result1_assembly_info.txt" ftype="tabular" compare="sim_size"/> | 114 <output name="assembly_info" file="result1_assembly_info.txt" ftype="tabular" compare="sim_size"/> |
88 <output name="assembly_graph" file="result1_assembly_graph.dot" ftype="graph_dot" compare="sim_size"/> | 115 <output name="assembly_graph" file="result1_assembly_graph.dot" ftype="graph_dot" compare="sim_size"/> |
89 <output name="assembly_gfa" file="result1_assembly_graph.gfa" ftype="txt" compare="sim_size"/> | 116 <output name="assembly_gfa" file="result1_assembly_graph.gfa" ftype="txt" compare="sim_size"/> |
90 <output name="consensus" file="result1_assembly.fasta" ftype="fasta" compare="sim_size"/> | 117 <output name="consensus" file="result1_assembly.fasta" ftype="fasta" compare="sim_size"/> |
91 </test> | 118 <output name="flye_log" file="result1.log" ftype="txt" compare="sim_size"/> |
92 <test> | 119 </test> |
93 <param name="inputs" ftype="fasta" value="nanopore.fasta"/> | 120 <!--Test 02--> |
121 <test expect_num_outputs="4"> | |
122 <param name="inputs" ftype="fasta.gz" value="nanopore.fasta.gz"/> | |
94 <param name="mode" value="--nano-raw"/> | 123 <param name="mode" value="--nano-raw"/> |
95 <output name="assembly_info" file="result2_assembly_info.txt" ftype="tabular" compare="sim_size"/> | 124 <output name="assembly_info" ftype="tabular"> |
96 <output name="assembly_graph" file="result2_assembly_graph.dot" ftype="graph_dot" compare="sim_size"/> | 125 <assert_contents> |
97 <output name="assembly_gfa" file="result2_assembly_graph.gfa" ftype="txt" compare="sim_size"/> | 126 <has_size value="95" delta="100"/> |
98 <output name="consensus" file="result2_assembly.fasta" ftype="fasta" compare="sim_size"/> | 127 </assert_contents> |
99 </test> | 128 </output> |
100 <test> | 129 <output name="assembly_graph" ftype="graph_dot"> |
101 <param name="inputs" ftype="fasta" value="nanopore.fasta"/> | 130 <assert_contents> |
102 <param name="mode" value="--nano-corr"/> | 131 <has_size value="803" delta="100"/> |
103 <param name="i" value="2"/> | 132 </assert_contents> |
133 </output> | |
134 <output name="assembly_gfa" ftype="txt"> | |
135 <assert_contents> | |
136 <has_size value="35047" delta="100"/> | |
137 </assert_contents> | |
138 </output> | |
139 <output name="consensus" ftype="fasta"> | |
140 <assert_contents> | |
141 <has_size value="35573" delta="100"/> | |
142 </assert_contents> | |
143 </output> | |
144 </test> | |
145 <!--Test 03--> | |
146 <test expect_num_outputs="4"> | |
147 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/> | |
148 <param name="mode" value="--pacbio-hifi"/> | |
149 <param name="iterations" value="1"/> | |
104 <conditional name="asm"> | 150 <conditional name="asm"> |
105 <param name="asm_select" value="true" /> | 151 <param name="asm_select" value="true" /> |
106 <param name="asm" value="40"/> | 152 <param name="asm" value="100"/> |
107 <param name="g" value="10000"/> | 153 <param name="genome_size" value="3980000"/> |
108 </conditional> | 154 </conditional> |
109 <output name="assembly_info" file="result3_assembly_info.txt" ftype="tabular" compare="sim_size"/> | 155 <output name="assembly_info" ftype="tabular"> |
110 <output name="assembly_graph" file="result3_assembly_graph.dot" ftype="graph_dot" compare="sim_size"/> | 156 <assert_contents> |
111 <output name="assembly_gfa" file="result3_assembly_graph.gfa" ftype="txt" compare="sim_size"/> | 157 <has_size value="286" delta="100"/> |
112 <output name="consensus" file="result3_assembly.fasta" ftype="fasta" compare="sim_size"/> | 158 </assert_contents> |
113 </test> | 159 </output> |
114 <test> | 160 <output name="assembly_graph" ftype="graph_dot"> |
115 <param name="inputs" ftype="fasta" value="nanopore.fasta"/> | 161 <assert_contents> |
162 <has_size value="2135" delta="100"/> | |
163 </assert_contents> | |
164 </output> | |
165 <output name="assembly_gfa" ftype="txt"> | |
166 <assert_contents> | |
167 <has_size value="114351" delta="100"/> | |
168 </assert_contents> | |
169 </output> | |
170 <output name="consensus" ftype="fasta"> | |
171 <assert_contents> | |
172 <has_size value="116191" delta="100"/> | |
173 </assert_contents> | |
174 </output> | |
175 </test> | |
176 <!--Test 04--> | |
177 <test expect_num_outputs="4"> | |
178 <param name="inputs" ftype="fastq.gz" value="ecoli_01.fastq.gz,ecoli_02.fastq.gz,ecoli_03.fastq.gz,ecoli_04.fastq.gz,ecoli_05.fastq.gz,ecoli_06.fastq.gz,ecoli_07.fastq.gz"/> | |
116 <param name="mode" value="--pacbio-raw"/> | 179 <param name="mode" value="--pacbio-raw"/> |
117 <param name="i" value="1"/> | 180 <param name="iterations" value="1"/> |
118 <param name="meta" value="true"/> | 181 <param name="meta" value="true"/> |
119 <param name="plasmids" value="true"/> | 182 <param name="plasmids" value="true"/> |
120 <param name="no-trestle" value="true"/> | 183 <output name="assembly_info" ftype="tabular"> |
121 <output name="assembly_info" file="result4_assembly_info.txt" ftype="tabular" compare="sim_size"/> | 184 <assert_contents> |
122 <output name="assembly_graph" file="result4_assembly_graph.dot" ftype="graph_dot" compare="sim_size"/> | 185 <has_size value="95" delta="100"/> |
123 <output name="assembly_gfa" file="result4_assembly_graph.gfa" ftype="txt" compare="sim_size"/> | 186 </assert_contents> |
124 <output name="consensus" file="result4_assembly.fasta" ftype="fasta" compare="sim_size"/> | 187 </output> |
188 <output name="assembly_graph" ftype="graph_dot"> | |
189 <assert_contents> | |
190 <has_size value="367" delta="100"/> | |
191 </assert_contents> | |
192 </output> | |
193 <output name="assembly_gfa" ftype="txt"> | |
194 <assert_contents> | |
195 <has_size value="418051" delta="100"/> | |
196 </assert_contents> | |
197 </output> | |
198 <output name="consensus" ftype="fasta"> | |
199 <assert_contents> | |
200 <has_size value="425000" delta="100"/> | |
201 </assert_contents> | |
202 </output> | |
203 </test> | |
204 <!--Test 05--> | |
205 <test expect_num_outputs="4"> | |
206 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/> | |
207 <param name="mode" value="--pacbio-hifi"/> | |
208 <param name="iterations" value="1"/> | |
209 <output name="assembly_info" ftype="tabular"> | |
210 <assert_contents> | |
211 <has_size value="286" delta="100"/> | |
212 </assert_contents> | |
213 </output> | |
214 <output name="assembly_graph" ftype="graph_dot"> | |
215 <assert_contents> | |
216 <has_size value="2135" delta="100"/> | |
217 </assert_contents> | |
218 </output> | |
219 <output name="assembly_gfa" ftype="txt"> | |
220 <assert_contents> | |
221 <has_size value="114351" delta="100"/> | |
222 </assert_contents> | |
223 </output> | |
224 <output name="consensus" ftype="fasta"> | |
225 <assert_contents> | |
226 <has_size value="116191" delta="100"/> | |
227 </assert_contents> | |
228 </output> | |
229 </test> | |
230 <!--Test 06--> | |
231 <test expect_num_outputs="4"> | |
232 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/> | |
233 <param name="mode" value="--pacbio-hifi"/> | |
234 <param name="iterations" value="1"/> | |
235 <param name="hifi-error" value="0.02"/> | |
236 <output name="assembly_info" ftype="tabular"> | |
237 <assert_contents> | |
238 <has_size value="286" delta="100"/> | |
239 </assert_contents> | |
240 </output> | |
241 <output name="assembly_graph" ftype="graph_dot"> | |
242 <assert_contents> | |
243 <has_size value="2135" delta="100"/> | |
244 </assert_contents> | |
245 </output> | |
246 <output name="assembly_gfa" ftype="txt"> | |
247 <assert_contents> | |
248 <has_size value="114351" delta="100"/> | |
249 </assert_contents> | |
250 </output> | |
251 <output name="consensus" ftype="fasta"> | |
252 <assert_contents> | |
253 <has_size value="116191" delta="100"/> | |
254 </assert_contents> | |
255 </output> | |
256 </test> | |
257 <!--Test 07--> | |
258 <test expect_num_outputs="4"> | |
259 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/> | |
260 <param name="mode" value="--pacbio-hifi"/> | |
261 <param name="iterations" value="1"/> | |
262 <param name="keep-haplotypes" value="true"/> | |
263 <output name="assembly_info" ftype="tabular"> | |
264 <assert_contents> | |
265 <has_size value="286" delta="100"/> | |
266 </assert_contents> | |
267 </output> | |
268 <output name="assembly_graph" ftype="graph_dot"> | |
269 <assert_contents> | |
270 <has_size value="2135" delta="100"/> | |
271 </assert_contents> | |
272 </output> | |
273 <output name="assembly_gfa" ftype="txt"> | |
274 <assert_contents> | |
275 <has_size value="114351" delta="100"/> | |
276 </assert_contents> | |
277 </output> | |
278 <output name="consensus" ftype="fasta"> | |
279 <assert_contents> | |
280 <has_size value="116191" delta="100"/> | |
281 </assert_contents> | |
282 </output> | |
125 </test> | 283 </test> |
126 </tests> | 284 </tests> |
127 <help><![CDATA[ | 285 <help><![CDATA[ |
128 | 286 |
129 Input reads could be in FASTA or FASTQ format, uncompressed | 287 .. class:: infomark |
130 or compressed with gz. Currently, raw and corrected reads | 288 |
131 from PacBio and ONT are supported. The expected error rates are | 289 **Purpose** |
132 <30% for raw and <2% for corrected reads. Additionally, | 290 |
133 --subassemblies option performs a consensus assembly of multiple | 291 Flye is a de novo assembler for single molecule sequencing reads, such as those produced by PacBio and Oxford Nanopore Technologies. |
134 sets of high-quality contigs. You may specify multiple | 292 It is designed for a wide range of datasets, from small bacterial projects to large mammalian-scale assemblies. The package represents |
135 files with reads (separated by spaces). Mixing different read | 293 a complete pipeline: it takes raw PacBio/ONT reads as input and outputs polished contigs. Flye also has a special mode for metagenome |
136 types is not yet supported. | 294 assembly. |
137 | 295 |
138 You must provide an estimate of the genome size as input, | 296 ---- |
139 which is used for solid k-mers selection. The estimate could | 297 |
140 be rough (e.g. within 0.5x-2x range) and does not affect | 298 .. class:: infomark |
141 the other assembly stages. Standard size modifiers are | 299 |
142 supported (e.g. 5m or 2.6g). | 300 **Quick usage** |
143 | 301 |
144 ]]></help> | 302 Input reads can be in FASTA or FASTQ format, uncompressed or compressed with gz. Currently, PacBio (raw, corrected, HiFi) and ONT reads |
303 (raw, corrected) are supported. Expected error rates are <30% for raw, <3% for corrected, and <1% for HiFi. Note that Flye was primarily | |
304 developed to run on raw reads. Additionally, the *--subassemblies* option performs a consensus assembly of multiple sets of high-quality | |
305 contigs. You may specify multiple files with reads (separated by spaces). Mixing different read types is not yet supported. The *--meta* | |
306 option enables the mode for metagenome/uneven coverage assembly. | |
307 | |
308 Genome size estimate is no longer a required option. You need to provide an estimate if using *--asm-coverage* option. | |
309 | |
310 To reduce memory consumption for large genome assemblies, you can use a subset of the longest reads for initial disjointig assembly by | |
311 specifying *--asm-coverage* and *--genome-size* options. Typically, 40x coverage is enough to produce good disjointigs. | |
312 | |
313 ---- | |
314 | |
315 .. class:: infomark | |
316 | |
317 **Outputs** | |
318 | |
319 The main output files are: | |
320 | |
321 :: | |
322 | |
323 - Final assembly: contains contigs and possibly scaffolds (see below). | |
324 - Final repeat graph: note that the edge sequences might be different (shorter) than contig sequences, because contigs might include multiple graph edges. | |
325 - Extra information about contigs (such as length or coverage). | |
326 | |
327 Each contig is formed by a single unique graph edge. If possible, unique contigs are extended with the sequence from flanking unresolved repeats on the graph. Thus, | |
328 a contig fully contains the corresponding graph edge (with the same id), but might be longer than this edge. This is somewhat similar to the unitig-contig relation in | |
329 OLC assemblers. In a rare case when a repetitive graph edge is not covered by the set of "extended" contigs, it will be also output in the assembly file. | |
330 | |
331 Sometimes it is possible to further order contigs into scaffolds based on the repeat graph structure. These ordered contigs will be output as a part of scaffold in | |
332 the assembly file (with a scaffold prefix). Since it is hard to give a reliable estimate of the gap size, those gaps are represented with the default 100 Ns. | |
333 The assembly_info.txt file (below) contains additional information about how scaffolds were formed. | |
334 | |
335 Extra information about contigs/scaffolds is output into the assembly_info.txt file. It is a tab-delimited table with the columns as follows: | |
336 | |
337 :: | |
338 | |
339 - Contig/scaffold id | |
340 - Length | |
341 - Coverage | |
342 - Is circular, (Y)es or (N)o | |
343 - Is repetitive, (Y)es or (N)o | |
344 - Multiplicity (based on coverage) | |
345 - Alternative group | |
346 - Graph path (graph path corresponding to this contig/scaffold). | |
347 | |
348 Scaffold gaps are marked with ?? symbols, and * symbol denotes a terminal graph node. Alternative contigs (representing alternative haplotypes) will have the same alt. | |
349 group ID. Primary contigs are marked by *. | |
350 | |
351 ---- | |
352 | |
353 .. class:: infomark | |
354 | |
355 **Algorithm Description** | |
356 | |
357 This is a brief description of the Flye algorithm. Please refer to the manuscript for more detailed information. The draft contig extension is organized as follows: | |
358 | |
359 :: | |
360 | |
361 - K-mer counting / erroneous k-mer pre-filtering | |
362 - Solid k-mer selection (k-mers with sufficient frequency, which are unlikely to be erroneous) | |
363 - Contig extension. The algorithm starts from a single read and extends it with a next overlapping read (overlaps are dynamically detected using the selected solid k-mers). | |
364 | |
365 Note that we do not attempt to resolve repeats at this stage, thus the reconstructed contigs might contain misassemblies. Flye then aligns the reads on these draft | |
366 contigs using minimap2 and calls a consensus. Afterwards, Flye performs repeat analysis as follows: | |
367 | |
368 :: | |
369 | |
370 - Repeat graph is constructed from the (possibly misassembled) contigs | |
371 - In this graph all repeats longer than minimum overlap are collapsed | |
372 - The algorithm resolves repeats using the read information and graph structure | |
373 - The unbranching paths in the graph are output as contigs | |
374 | |
375 If enabled, after resolving bridged repeats, Trestle module attempts to resolve simple unbridged repeats (of multiplicity 2) using the heterogeneities between repeat copies. | |
376 Finally, Flye performs polishing of the resulting assembly to correct the remaining errors: | |
377 | |
378 :: | |
379 | |
380 - Alignment of all reads to the current assembly using minimap2 | |
381 - Partition the alignment into mini-alignments (bubbles) | |
382 - Error correction of each bubble using a maximum likelihood approach | |
383 | |
384 | |
385 The polishing steps could be repeated, which might slightly increase quality for some datasets. | |
386 | |
387 | |
388 ]]></help> | |
145 <expand macro="citations" /> | 389 <expand macro="citations" /> |
146 </tool> | 390 </tool> |