comparison wtdbg.xml @ 2:2668027a533b draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/wtdbg commit 05f8373310ce1728426b89f33b643406e0cba54b"
author bgruening
date Sat, 29 Jan 2022 12:49:28 +0000
parents e100f3f4d80e
children
comparison
equal deleted inserted replaced
1:e100f3f4d80e 2:2668027a533b
1 <tool id="wtdbg" name="WTDBG" version="2.0"> 1 <tool id="wtdbg" name="WTDBG2" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="GPL-3.0-only">
2 <description>De novo assembler AND consensuser for long noisy sequences</description> 2 <description>Fast de novo sequence assembler for long noisy reads</description>
3 <xrefs>
4 <xref type='bio.tools'>wtdbg2</xref>
5 </xrefs>
3 <macros> 6 <macros>
4 <import>macros.xml</import> 7 <import>macros.xml</import>
5 </macros> 8 </macros>
6 <expand macro="requirements" /> 9 <requirements>
7 <version_command>wtdbg2 -help | grep 'Version:'</version_command> 10 <requirement type="package" version="@TOOL_VERSION@">wtdbg</requirement>
11 </requirements>
12 <version_command>wtdbg2 -V</version_command>
8 <command detect_errors="exit_code"><![CDATA[ 13 <command detect_errors="exit_code"><![CDATA[
9 wtdbg2 14 ## helper function to sort fastqs before fastas in input readsets
10 -t \${GALAXY_SLOTS:-4} 15 #def sort_fastq_fasta(files):
11 -i '$i' 16 #set fastqs = [f for f in $files if f.is_of_type('fastq')]
12 -o 'dbg' 17 #set fastas = [f for f in $files if f.is_of_type('fasta')]
13 #if $I: 18 #set out = $fastqs + $fastas
14 -I '$I' 19 #return $out
15 #end if 20 #end def
16 #if $load_alignments: 21
17 --load-alignments '$load_alignments' 22 ## prepare readset inputs (must be fastqs then fastas if multiple files)
18 #end if 23 #set input_reads_sorted = $sort_fastq_fasta($input_reads)
19 24
20 -k $k 25 ## perform assembly
21 -p $p 26 wtdbg2
22 -K $K 27 -t \${GALAXY_SLOTS:-4}
23 -E $E 28 -x '${sequencing_technology}'
24 $F 29 -g '${genome_size}'
25 -S $S 30 @ASM_OPTIONS@
26 -X $X 31
27 -Y $Y 32 #for $readset in $input_reads_sorted:
28 -x $x 33 -i '${readset}'
29 -y $y 34 #end for
30 -l $l 35 -fo out
31 -m $m 36
32 -s $s 37 &&
33 --tidy-reads $tidy_reads 38 wtpoa-cns
34 --edge-min $edge_min 39 -t \${GALAXY_SLOTS:-4}
35 $rescue_low_cov_edges 40 -i out.ctg.lay.gz
36 && 41 -fo out.fa
37 wtdbg-cns 42 @CNS_OPTIONS@
38 -t \${GALAXY_SLOTS:-4}
39 -o dbg.ctg.lay.fa
40 -i dbg.ctg.lay
41 -j $cns.j
42 -k $cns.k
43 -Z $cns.Z
44 -W $cns.W
45 -H $cns.H
46 -L $cns.L
47 -c $cns.c
48 -M $cns.M
49 -X $cns.X
50 -I $cns.I
51 -D $cns.D
52 -E $cns.E
53 -m $cns.m
54 -S $cns.S
55
56 ]]></command> 43 ]]></command>
57 <inputs> 44 <inputs>
58 <param type="data" argument="-i" format="fasta,fasta.gz" label="Long reads sequences file"/> 45 <param name="input_reads" type="data" format="fastq,fasta" multiple='true' label="Select input reads from history" help="Select one or more input fastq or fasta files from your history. To select multiple files, use ctrl + click" />
59 <param type="data" argument="-I" format="fasta,fasta.gz" optional="True" label="Error-free sequences file"/> 46 <param name="sequencing_technology" type="select" label="Sequencing Technology" help="Sequencing technology used to generate reads">
60 <param type="data" argument="--load-alignments" name="load_alignments" format="tabular" optional="True" label="Load pre-computed alignments"/> 47 <option value="ont">Oxford Nanopore (ont)</option>
61 48 <option value="ccs">PacBio CCS (ccs)</option>
62 <param argument="k" type="integer" value="0" min="0" max="25" label="Kmer fsize" /> 49 <option value="rs">PacBio RSII (rs)</option>
63 <param argument="p" type="integer" value="21" min="0" max="25" label="Kmer psize" /> 50 <option value="sq">PacBio Sequel (sq)</option>
64 <param argument="K" type="float" value="1000" min="0" max="65535" label="Filter high frequency kmers" /> 51 </param>
65 <param argument="E" type="integer" value="2" label="Min kmer frequency" /> 52 <param name="genome_size" type="text" value="" label="Genome size" help="Estimated genome size. k/m/g suffix is allowed - eg a 4500000bp ecoli genome can be written as 4.5m. For a human genome, use 3.2g">
66 <param argument="F" type="boolean" truevalue="-F" falsevalue="" checked="False" label="Filter low frequency kmers by a 4G-bytes array" /> 53 <sanitizer invalid_char="">
67 <param argument="S" type="integer" value="4" label="Subsampling kmers, 1/S kmers are indexed" /> 54 <valid initial="string.letters,string.digits">
68 <param argument="X" type="integer" value="4" label="Max number of bin (256bp) in one gap" /> 55 <add value="." />
69 <param argument="Y" type="integer" value="4" label="Max number of bin (256bp) in one deviation" /> 56 </valid>
70 <param argument="x" type="integer" value="-7" label="penalty for BIN gap" /> 57 </sanitizer>
71 <param argument="y" type="integer" value="-21" label="penalty for BIN deviation" /> 58 </param>
72 <param argument="l" type="float" value="2048" min="1" label="Min length of alignment" /> 59
73 <param argument="m" type="float" value="200" label="Min matched" /> 60 <section name="asm" title="Assembly Options" expanded="false">
74 <param argument="s" type="float" value="0.2" label="Max length variation of two aligned fragments" /> 61 <param argument="-X" type="float" value="50" label="Read depth" help="(-X) [float] Choose the best [float] depth from input reads. i.e. if the estimated genome size is 5m, setting this value to 50.0 would select the best 250mb worth of reads." />
75 62 <param argument="-L" type="integer" value="0" label="Min read length" help="(-L) [int] Choose the longest subread and drop reads shorter than [int]" />
76 <param argument="--tidy-reads" name="tidy_reads" type="integer" value="0" label="Filter reads less than tidy-reads" /> 63 <param argument="-k" type="integer" value="0" min="0" max="23" label="Kmer size" help="(-k) [int] Kmer size, 0 &#8804; k &#8804; 23" />
77 <param argument="--edge-min" name="edge_min" type="integer" value="3" label="The minimal depth of a valid edge set to" /> 64 <param argument="-p" type="integer" value="21" label="Homopolymer-compressed kmer size" help="(-p) [int] Homopolymer-compressed kmer size, 0 &#8804; p &#8804; 23" />
78 <param argument="--rescue-low-cov-edges" name="rescue_low_cov_edges" type="boolean" truevalue="--rescue-low-cov-edges" 65 <param argument="-K" type="float" value="1000" label="Max kmer frequency" help="(-K) [float] Filter high frequency kmers where frequency > [float]" />
79 falsevalue="" label="Try to rescue low coverage edges" /> 66 <param argument="-s" type="float" value="0.05" label="Min read similarity" help="(-s) [float] Min similarity between reads to label as related, calculated by kmer matched length / aligned length" />
80 67 <param argument="-e" type="integer" value="3" label="Min edge depth" help="(-e) [int] Min read depth of a valid edge" />
81 <section name="cns" title="Consensus options"> 68 <param name="realign" type="boolean" truevalue="-R" falsevalue="" label="Realignment" help="(-R) Enable realignment mode" />
82 <!-- optional inputs --> 69 <param name="contained_reads" type="boolean" truevalue="-A" falsevalue="" label="Contained reads" help="(-A) Keep contained reads during alignment" />
83 <!-- <param argument="-i" type="data" format="utg.cns" label="Input file(s) *.utg.cns" /> --> 70 </section>
84 71
85 <param argument="-j" type="integer" value="1000" label="Expected length of node" /> 72 <section name="cns" title="Consensus Options" expanded="false">
86 <param argument="-k" type="integer" value="15" label="Kmer size for long reads" /> 73 <param argument="-j" type="integer" value="1500" label="Expected length of node" />
87 <param argument="-Z" type="integer" value="4" label="Z-cutoff, drop the lower" /> 74 <param argument="-M" type="integer" value="2" label="Match score" />
88 <param argument="-W" type="integer" value="48" label="W-cutoff, drop the lagger (position)" /> 75 <param argument="-X" type="integer" value="-5" label="Mismatch score" />
89 <param argument="-H" type="integer" value="1" label="High coverage bonus" /> 76 <param argument="-I" type="integer" value="-2" label="Insertion score" />
90 <param argument="-L" type="integer" value="10" label="High coverage cutoff" /> 77 <param argument="-D" type="integer" value="-4" label="Deletion score" />
91 <param argument="-c" type="select" label="Candidate strategy"> 78 <param argument="-b" type="integer" value="0" label="Tri-base match bonus" />
92 <option value="0" selected="true">best-kmers</option> 79 <param argument="-H" type="integer" value="-3" label="Homopolymer merge score used in dp-call-cns mode" />
93 <option value="1" >median length</option> 80 <param argument="-B" type="text" value="64,1024,0.92" label="POA Bandwidth (Wmin,Wmax,mat_rate)" help="mat_rate = matched_bases/total_bases" />
94 <option value="2" >first (include)</option> 81 <param argument="-W" type="integer" value="200" label="Window size in the middle of the first read for fast align remaining reads. If -W is negative, will disable fast align, but use the abs(-W) as Band align score cutoff" />
95 <option value="3" >first (exclude)</option> 82 <param argument="-w" type="integer" value="100" label="Min size of aligned size in window. Will default to -W * 0.5" />
96 <option value="4" >longest</option> 83 <param argument="-A" type="boolean" truevalue="-A" falsevalue="" label="Abort TriPOA" help="Abort TriPOA when any read cannot be fast aligned, then try POA" />
97 <option value="5" >shortest</option> 84 <param argument="-S" type="select" label="Shuffle mode">
85 <option value="0">don't shuffle reads</option>
86 <option value="1" selected="true">shuffle using shared kmers</option>
87 <option value="2" >shuffle using subsampling</option>
98 </param> 88 </param>
99 89 <param argument="-R" type="integer" value="16" label="Realignment bandwidth" help="set to 0 to disable" />
100 <param argument="-M" type="integer" value="2" label="Match score" /> 90 <param argument="-c" type="select" label="Consensus mode">
101 <param argument="-X" type="integer" value="-7" label="Mismatch score" /> 91 <option value="0" selected="true">run-length</option>
102 <param argument="-I" type="integer" value="-3" label="Insertion score" /> 92 <option value="1">dp-call-cns</option>
103 <param argument="-D" type="integer" value="-4" label="Deletion score" />
104 <param argument="-E" type="integer" value="-2" label="Gap extension score" />
105 <param argument="-m" type="select" label="Correction mode">
106 <option value="1" selected="true">DBG correction</option>
107 <option value="2" >DAG correction</option>
108 </param> 93 </param>
109 <param argument="-S" type="integer" value="1" label="Correct structure before error correction" /> 94 <param argument="-C" type="integer" value="3" label="Min count of bases to call a consensus base" />
95 <param argument="-F" type="float" value="0.5" label="Min frequency of non-gap bases to call a consensus base" />
96 <param argument="-N" type="integer" value="20" label="Max number of reads in PO-MSA" />
110 </section> 97 </section>
111
112 </inputs> 98 </inputs>
113 <outputs> 99 <outputs>
114 <data name="output_alignments" format="fasta" label="${tool.name} alignments" from_work_dir="dbg.alignments" /> 100 <data name='out_assembly' format='fasta' label="${tool.name} on ${on_string}: assembled contigs" from_work_dir="out.fa" />
115 <data name="output_ctglay" format="txt" label="${tool.name} contigs layout" from_work_dir="dbg.ctg.lay" />
116 <data name="output_consensus" format="fasta" label="${tool.name} consensus" from_work_dir="dbg.ctg.lay.fa" />
117 </outputs> 101 </outputs>
118 <tests> 102 <tests>
119 <test> 103 <test expect_num_outputs="1">
120 <param name="i" value="ecoli-reads.fa"/> 104 <param name="input_reads" value="test1.fastq" />
121 <output name="output_alignments" file="result1.alignments"/> 105 <param name="sequencing_technology" value="ont" />
122 <output name="output_ctglay" file="result1.ctg.lay"/> 106 <param name="genome_size" value="60k" />
123 <output name="output_consensus" file="consensus_result1.fa"/> 107 <output name="out_assembly">
108 <assert_contents>
109 <has_text text=">ctg1 " />
110 <has_text text=">ctg2 " />
111 <has_text text=">ctg3 " />
112 <has_size value="70000" delta="10000" />
113 </assert_contents>
114 </output>
124 </test> 115 </test>
125 <test> 116 <test expect_num_outputs="1">
126 <param name="i" value="ecoli-reads.fa"/> 117 <param name="input_reads" value="test1_head.fa,test1_tail.fastq" />
127 <param name="tidy_reads" value="5000"/> 118 <param name="sequencing_technology" value="ont" />
128 <param name="edge_min" value="2"/> 119 <param name="genome_size" value="60k" />
129 <param name="rescue_low_cov_edges" value="True"/> 120 <output name="out_assembly">
130 <output name="output_consensus" file="consensus_result2.fa"/> 121 <assert_contents>
122 <has_text text=">ctg1 " />
123 <has_text text=">ctg2 " />
124 <has_text text=">ctg3 " />
125 <has_size value="70000" delta="10000" />
126 </assert_contents>
127 </output>
131 </test> 128 </test>
132 <test> 129 <test expect_num_outputs="1">
133 <param name="i" value="ecoli-reads.fa"/> 130 <param name="input_reads" value="test1.fastq" />
134 <param name="cns.c" value="1"/> 131 <param name="sequencing_technology" value="ont" />
135 <param name="cns.E" value="-3"/> 132 <param name="genome_size" value="60k" />
136 <param name="cns.j" value="500"/> 133 <section name="asm">
137 <param name="cns.m" value="2"/> 134 <param name='X' value="10.0" />
138 <param name="cns.k" value="5"/> 135 <param name='L' value="2000" />
139 <output name="output_consensus" file="consensus_result3.fa"/> 136 <param name='k' value="15" />
137 <param name='p' value="0" />
138 <param name='K' value="500" />
139 <param name='l' value="1024" />
140 <param name='m' value="200" />
141 <param name='s' value="0.1" />
142 <param name='e' value="5" />
143 </section>
144 <output name="out_assembly">
145 <assert_contents>
146 <has_text text=">ctg1 " />
147 <has_text text=">ctg2 " />
148 <has_text text=">ctg3 " />
149 <has_size value="50000" delta="10000" />
150 </assert_contents>
151 </output>
140 </test> 152 </test>
141
142 </tests> 153 </tests>
143
144 <help><![CDATA[ 154 <help><![CDATA[
155
156 **Wtdbg2**
157
158 |
159
145 **What it does** 160 **What it does**
146 161
147 WTDBG is a de novo assembler for long noisy sequences, based on fuzzy Bruijn graphs (FBG). 162 WTDBG is a de novo assembler for long noisy sequences, based on fuzzy Bruijn graphs (FBG).
163
164 "Wtdbg2 is a de novo sequence assembler for long noisy reads produced by PacBio or Oxford Nanopore Technologies (ONT). It assembles raw reads without error correction and then builds the consensus from intermediate assembly output. Wtdbg2 is able to assemble the human and even the 32Gb Axolotl genome at a speed tens of times faster than CANU and FALCON while producing contigs of comparable base accuracy."
148 165
149 **Alignment** 166 **Alignment**
150 167
151 KBM (Kmer-BIN-Mapping) groups k-mers from each non-overlapped sliding 256 bp fragments in long reads into bins. 168 KBM (Kmer-BIN-Mapping) groups k-mers from each non-overlapped sliding 256 bp fragments in long reads into bins.
152 Bins of which most k-mers are high frequency, are filtered as highly repetitive ones. 169 Bins of which most k-mers are high frequency, are filtered as highly repetitive ones.
160 FBG (Fuzzy Bruijn Graph) is composed of vertices in length of 1024 bp from reads, and edges connecting vertices 177 FBG (Fuzzy Bruijn Graph) is composed of vertices in length of 1024 bp from reads, and edges connecting vertices
161 in their order on read paths. Comparing with DBG, the size of vertices in FBG are much bigger, thus won't be 178 in their order on read paths. Comparing with DBG, the size of vertices in FBG are much bigger, thus won't be
162 sensitive to small repeat. To tolerate high sequencing errors, FBG's vertices are found using gapped 179 sensitive to small repeat. To tolerate high sequencing errors, FBG's vertices are found using gapped
163 sequence alignments from KBM or other aligners, comparing with searching identical k-mers in DBG. 180 sequence alignments from KBM or other aligners, comparing with searching identical k-mers in DBG.
164 181
165 ]]></help> 182 See the github (https://github.com/ruanjue/wtdbg2) and paper (https://doi.org/10.1038/s41592-019-0669-3) for more information.
183
184 |
185
186 **Input**
187
188 One or more fastq or fasta files. Can be in any fastq/fasta format with any valid extension.
189
190 |
191
192 **Output**
193
194 Assembled contigs (the assembled genome).
195 To polish, use external tools such as pilon, racon, medaka, nextpolish etc.
196
197 |
198
199 **Sequencing Technology Preset Information**
200
201 - Oxford Nanopore (ont) (genome size < 1G): -p 0 -k 15 -AS 2 -s 0.05 -L 5000
202 - Oxford Nanopore (ont) (genome size >= 1G): -p 19 -AS 2 -s 0.05 -L 5000
203 - PacBio CCS (ccs): -p 21 -k 0 -AS 4 -K 0.05 -s 0.5
204 - PacBio RSII (rs): -p 21 -S 4 -s 0.05 -L 5000
205 - PacBio Sequel (sq) (genome size < 1G): -p 0 -k 15 -AS 2 -s 0.05 -L 5000
206 - PacBio Sequel (sq) (genome size >= 1G): -p 19 -AS 2 -s 0.05 -L 5000
207
208 |
209 ]]></help>
166 <expand macro="citations" /> 210 <expand macro="citations" />
167 </tool> 211 </tool>