Mercurial > repos > bgruening > wtdbg

diff wtdbg.xml @ 2:2668027a533b draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/wtdbg commit 05f8373310ce1728426b89f33b643406e0cba54b"
author: bgruening
date: Sat, 29 Jan 2022 12:49:28 +0000
parents: e100f3f4d80e
--- a/wtdbg.xml	Thu Sep 27 17:52:13 2018 -0400
+++ b/wtdbg.xml	Sat Jan 29 12:49:28 2022 +0000
@@ -1,167 +1,211 @@
-<tool id="wtdbg" name="WTDBG" version="2.0">
-    <description>De novo assembler AND consensuser for long noisy sequences</description>
-    <macros>
-        <import>macros.xml</import>
-    </macros>
-    <expand macro="requirements" />
-    <version_command>wtdbg2 -help | grep 'Version:'</version_command>
-    <command detect_errors="exit_code"><![CDATA[
-    wtdbg2
-        -t \${GALAXY_SLOTS:-4} 
-        -i '$i'
-        -o 'dbg' 
-        #if $I:
-            -I '$I'
-        #end if
-        #if $load_alignments:
-            --load-alignments '$load_alignments'
-        #end if
-
-        -k $k
-        -p $p
-        -K $K
-        -E $E
-        $F
-        -S $S
-        -X $X
-        -Y $Y
-        -x $x
-        -y $y
-        -l $l
-        -m $m
-        -s $s
-        --tidy-reads $tidy_reads 
-        --edge-min $edge_min 
-        $rescue_low_cov_edges
-    &&    
-    wtdbg-cns 
-        -t \${GALAXY_SLOTS:-4} 
-        -o dbg.ctg.lay.fa
-        -i dbg.ctg.lay
-        -j $cns.j
-        -k $cns.k
-        -Z $cns.Z
-        -W $cns.W
-        -H $cns.H
-        -L $cns.L
-        -c $cns.c
-        -M $cns.M
-        -X $cns.X
-        -I $cns.I
-        -D $cns.D
-        -E $cns.E
-        -m $cns.m
-        -S $cns.S
-
-    ]]></command>
-    <inputs>
-        <param type="data" argument="-i" format="fasta,fasta.gz" label="Long reads sequences file"/>
-        <param type="data" argument="-I" format="fasta,fasta.gz" optional="True" label="Error-free sequences file"/>
-        <param type="data" argument="--load-alignments" name="load_alignments" format="tabular" optional="True" label="Load pre-computed alignments"/>
-
-        <param argument="k" type="integer" value="0" min="0" max="25" label="Kmer fsize" />
-        <param argument="p" type="integer" value="21" min="0" max="25" label="Kmer psize" />
-        <param argument="K" type="float" value="1000" min="0" max="65535" label="Filter high frequency kmers" />
-        <param argument="E" type="integer" value="2" label="Min kmer frequency" />
-        <param argument="F" type="boolean" truevalue="-F" falsevalue="" checked="False" label="Filter low frequency kmers by a 4G-bytes array" />
-        <param argument="S" type="integer" value="4" label="Subsampling kmers, 1/S kmers are indexed" />
-        <param argument="X" type="integer" value="4" label="Max number of bin (256bp) in one gap" />
-        <param argument="Y" type="integer" value="4" label="Max number of bin (256bp) in one deviation" />
-        <param argument="x" type="integer" value="-7" label="penalty for BIN gap" />
-        <param argument="y" type="integer" value="-21" label="penalty for BIN deviation" />
-        <param argument="l" type="float" value="2048" min="1" label="Min length of alignment" />
-        <param argument="m" type="float" value="200" label="Min matched" />
-        <param argument="s" type="float" value="0.2" label="Max length variation of two aligned fragments" />
-
-        <param argument="--tidy-reads" name="tidy_reads" type="integer" value="0" label="Filter reads less than tidy-reads" />
-        <param argument="--edge-min" name="edge_min" type="integer" value="3" label="The minimal depth of a valid edge set to" />
-        <param argument="--rescue-low-cov-edges" name="rescue_low_cov_edges" type="boolean" truevalue="--rescue-low-cov-edges" 
-            falsevalue="" label="Try to rescue low coverage edges" />
-
-        <section name="cns" title="Consensus options">
-            <!-- optional inputs -->
-            <!-- <param argument="-i" type="data" format="utg.cns" label="Input file(s) *.utg.cns" /> -->
-
-            <param argument="-j" type="integer" value="1000" label="Expected length of node" />
-            <param argument="-k" type="integer" value="15" label="Kmer size for long reads" />
-            <param argument="-Z" type="integer" value="4" label="Z-cutoff, drop the lower" />
-            <param argument="-W" type="integer" value="48" label="W-cutoff, drop the lagger (position)" />
-            <param argument="-H" type="integer" value="1" label="High coverage bonus" />
-            <param argument="-L" type="integer" value="10" label="High coverage cutoff" />
-            <param argument="-c" type="select" label="Candidate strategy">
-                <option value="0" selected="true">best-kmers</option>
-                <option value="1" >median length</option>
-                <option value="2" >first (include)</option>
-                <option value="3" >first (exclude)</option>
-                <option value="4" >longest</option>
-                <option value="5" >shortest</option>
-            </param>
-
-            <param argument="-M" type="integer" value="2" label="Match score" />
-            <param argument="-X" type="integer" value="-7" label="Mismatch score" />
-            <param argument="-I" type="integer" value="-3" label="Insertion score" />
-            <param argument="-D" type="integer" value="-4" label="Deletion score" />
-            <param argument="-E" type="integer" value="-2" label="Gap extension score" />
-            <param argument="-m" type="select" label="Correction mode">
-                <option value="1" selected="true">DBG correction</option>
-                <option value="2" >DAG correction</option>
-            </param>
-            <param argument="-S" type="integer" value="1" label="Correct structure before error correction" />
-        </section>
-
-    </inputs>
-    <outputs>
-        <data name="output_alignments" format="fasta" label="${tool.name}  alignments" from_work_dir="dbg.alignments" />
-        <data name="output_ctglay" format="txt" label="${tool.name}  contigs layout" from_work_dir="dbg.ctg.lay" />
-        <data name="output_consensus" format="fasta" label="${tool.name} consensus" from_work_dir="dbg.ctg.lay.fa" />
-    </outputs>
-    <tests>
-        <test>
-            <param name="i" value="ecoli-reads.fa"/>
-            <output name="output_alignments" file="result1.alignments"/>
-            <output name="output_ctglay" file="result1.ctg.lay"/>
-            <output name="output_consensus" file="consensus_result1.fa"/>
-        </test>
-        <test>
-            <param name="i" value="ecoli-reads.fa"/>
-            <param name="tidy_reads" value="5000"/>
-            <param name="edge_min" value="2"/>
-            <param name="rescue_low_cov_edges" value="True"/>
-            <output name="output_consensus" file="consensus_result2.fa"/>
-        </test>
-        <test>
-            <param name="i" value="ecoli-reads.fa"/>
-            <param name="cns.c" value="1"/>
-            <param name="cns.E" value="-3"/>
-            <param name="cns.j" value="500"/>
-            <param name="cns.m" value="2"/>
-            <param name="cns.k" value="5"/>
-            <output name="output_consensus" file="consensus_result3.fa"/>
-        </test>
-    
-    </tests>
-    
-    <help><![CDATA[
-**What it does**
-
-WTDBG is a de novo assembler for long noisy sequences, based on fuzzy Bruijn graphs (FBG).
-
-**Alignment**
-
-KBM (Kmer-BIN-Mapping) groups k-mers from each non-overlapped sliding 256 bp fragments in long reads into bins.
-Bins of which most k-mers are high frequency, are filtered as highly repetitive ones.
-Then, KBM searches synteny of matched bin pairs in sequences in a dynamic programming way.
-A matched bin pair in two sequences is defined as two bins different by original but share a set of k-mers.
-The result of alignments in KBM have the same features of traditional sequence alignment, excepting the unit of
-KBM alignments is 256 bp bin instead of single base.
-
-**Assembly**
-
-FBG (Fuzzy Bruijn Graph) is composed of vertices in length of 1024 bp from reads, and edges connecting vertices
-in their order on read paths. Comparing with DBG, the size of vertices in FBG are much bigger, thus won't be
-sensitive to small repeat. To tolerate high sequencing errors, FBG's vertices are found using gapped
-sequence alignments from KBM or other aligners, comparing with searching identical k-mers in DBG.
-
-  ]]></help>
-    <expand macro="citations" />
-</tool>
+<tool id="wtdbg" name="WTDBG2" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="GPL-3.0-only">
+    <description>Fast de novo sequence assembler for long noisy reads</description>
+    <xrefs>
+        <xref type='bio.tools'>wtdbg2</xref>
+    </xrefs>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">wtdbg</requirement>
+    </requirements>
+    <version_command>wtdbg2 -V</version_command>
+    <command detect_errors="exit_code"><![CDATA[
+        ## helper function to sort fastqs before fastas in input readsets
+        #def sort_fastq_fasta(files):
+            #set fastqs = [f for f in $files if f.is_of_type('fastq')]
+            #set fastas = [f for f in $files if f.is_of_type('fasta')]
+            #set out = $fastqs + $fastas
+            #return $out
+        #end def
+
+        ## prepare readset inputs (must be fastqs then fastas if multiple files)
+        #set input_reads_sorted = $sort_fastq_fasta($input_reads)
+        
+        ## perform assembly
+        wtdbg2 
+        -t \${GALAXY_SLOTS:-4}
+        -x '${sequencing_technology}'
+        -g '${genome_size}'
+        @ASM_OPTIONS@
+
+        #for $readset in $input_reads_sorted:
+            -i '${readset}'
+        #end for
+        -fo out 
+        
+        &&
+        wtpoa-cns
+        -t \${GALAXY_SLOTS:-4}
+        -i out.ctg.lay.gz
+        -fo out.fa
+        @CNS_OPTIONS@
+    ]]></command>
+    <inputs>
+        <param name="input_reads" type="data" format="fastq,fasta" multiple='true' label="Select input reads from history" help="Select one or more input fastq or fasta files from your history. To select multiple files, use ctrl + click" />
+        <param name="sequencing_technology" type="select" label="Sequencing Technology" help="Sequencing technology used to generate reads">
+            <option value="ont">Oxford Nanopore (ont)</option>
+            <option value="ccs">PacBio CCS (ccs)</option>
+            <option value="rs">PacBio RSII (rs)</option>
+            <option value="sq">PacBio Sequel (sq)</option>
+        </param>
+        <param name="genome_size" type="text" value="" label="Genome size" help="Estimated genome size. k/m/g suffix is allowed - eg a 4500000bp ecoli genome can be written as 4.5m. For a human genome, use 3.2g">
+            <sanitizer invalid_char="">
+                <valid initial="string.letters,string.digits">
+                    <add value="." />
+                </valid>
+            </sanitizer>
+        </param>
+
+        <section name="asm" title="Assembly Options" expanded="false">
+            <param argument="-X" type="float" value="50" label="Read depth" help="(-X) [float] Choose the best [float] depth from input reads. ie if the estimated genome size is 5m, setting this value to 50.0 would select the best 2.5mb worth of reads." />
+            <param argument="-L" type="integer" value="0" label="Min read length" help="(-L) [int] Choose the longest subread and drop reads shorter than [int]" />
+            <param argument="-k" type="integer" value="0" min="0" max="23" label="Kmer size" help="(-k) [int] Kmer size, 0 &#8804; k &#8804; 23" />
+            <param argument="-p" type="integer" value="21" label="Homopolymer-compressed kmer size" help="(-p) [int] Homopolymer-compressed kmer size, 0 &#8804; p &#8804; 23" />
+            <param argument="-K" type="float" value="1000" label="Max kmer frequency" help="(-K) [float] Filter high frequency kmers where frequency > [float]" />
+            <param argument="-s" type="float" value="0.05" label="Min read similarity" help="(-s) [float] Min similarity between reads to label as related, calculated by kmer matched length / aligned length" />
+            <param argument="-e" type="integer" value="3" label="Min edge depth" help="(-e) [int] Min read depth of a valid edge" />
+            <param name="realign" type="boolean" truevalue="-R" falsevalue="" label="Realignment" help="(-R) Enable realignment mode" />
+            <param name="contained_reads" type="boolean" truevalue="-A" falsevalue="" label="Contained reads" help="(-A) Keep contained reads during alignment" />
+        </section>
+
+        <section name="cns" title="Consensus Options" expanded="false">
+            <param argument="-j" type="integer" value="1500" label="Expected length of node" />           
+            <param argument="-M" type="integer" value="2" label="Match score" />
+            <param argument="-X" type="integer" value="-5" label="Mismatch score" />
+            <param argument="-I" type="integer" value="-2" label="Insertion score" />
+            <param argument="-D" type="integer" value="-4" label="Deletion score" />
+            <param argument="-b" type="integer" value="0" label="Tri-base match bonus" />
+            <param argument="-H" type="integer" value="-3" label="Homopolymer merge score used in dp-call-cns mode" />
+            <param argument="-B" type="text" value="64,1024,0.92" label="POA Bandwidth (Wmin,Wmax,mat_rate)" help="mat_rate = matched_bases/total_bases" />
+            <param argument="-W" type="integer" value="200" label="Window size in the middle of the first read for fast align remaining reads. If -W is negative, will disable fast align, but use the abs(-W) as Band align score cutoff" />
+            <param argument="-w" type="integer" value="100" label="Min size of aligned size in window. Will default to -W * 0.5" />
+            <param argument="-A" type="boolean" truevalue="-A" falsevalue="" label="Abort TriPOA" help="Abort TriPOA when any read cannot be fast aligned, then try POA" />
+            <param argument="-S" type="select" label="Shuffle mode">
+                <option value="0">don't shuffle reads</option>
+                <option value="1" selected="true">shuffle using shared kmers</option>
+                <option value="2" >shuffle using subsampling</option>
+            </param>
+            <param argument="-R" type="integer" value="16" label="Realignment bandwidth" help="set to 0 to disable" />
+            <param argument="-c" type="select" label="Consensus mode">
+                <option value="0" selected="true">run-length</option>
+                <option value="1">dp-call-cns</option>
+            </param>
+            <param argument="-C" type="integer" value="3" label="Min count of bases to call a consensus base" />
+            <param argument="-F" type="float" value="0.5" label="Min frequency of non-gap bases to call a consensus base" />
+            <param argument="-N" type="integer" value="20" label="Max number of reads in PO-MSA" />
+        </section>
+    </inputs>
+    <outputs>
+        <data name='out_assembly' format='fasta' label="${tool.name} on ${on_string}: assembled contigs" from_work_dir="out.fa" />
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1">
+            <param name="input_reads" value="test1.fastq" />
+            <param name="sequencing_technology" value="ont" />
+            <param name="genome_size" value="60k" />
+            <output name="out_assembly">
+                <assert_contents>
+                    <has_text text=">ctg1 " />
+                    <has_text text=">ctg2 " />
+                    <has_text text=">ctg3 " />
+                    <has_size value="70000" delta="10000" />
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="1">
+            <param name="input_reads" value="test1_head.fa,test1_tail.fastq" />
+            <param name="sequencing_technology" value="ont" />
+            <param name="genome_size" value="60k" />
+            <output name="out_assembly">
+                <assert_contents>
+                    <has_text text=">ctg1 " />
+                    <has_text text=">ctg2 " />
+                    <has_text text=">ctg3 " />
+                    <has_size value="70000" delta="10000" />
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="1">
+            <param name="input_reads" value="test1.fastq" />
+            <param name="sequencing_technology" value="ont" />
+            <param name="genome_size" value="60k" />
+            <section name="asm">
+                <param name='X' value="10.0" />
+                <param name='L' value="2000" />
+                <param name='k' value="15" />
+                <param name='p' value="0" />
+                <param name='K' value="500" />
+                <param name='l' value="1024" />
+                <param name='m' value="200" />
+                <param name='s' value="0.1" />
+                <param name='e' value="5" />
+            </section>
+            <output name="out_assembly">
+                <assert_contents>
+                    <has_text text=">ctg1 " />
+                    <has_text text=">ctg2 " />
+                    <has_text text=">ctg3 " />
+                    <has_size value="50000" delta="10000" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+**Wtdbg2**
+
+|
+
+**What it does**
+
+WTDBG is a de novo assembler for long noisy sequences, based on fuzzy Bruijn graphs (FBG).
+
+"Wtdbg2 is a de novo sequence assembler for long noisy reads produced by PacBio or Oxford Nanopore Technologies (ONT). It assembles raw reads without error correction and then builds the consensus from intermediate assembly output. Wtdbg2 is able to assemble the human and even the 32Gb Axolotl genome at a speed tens of times faster than CANU and FALCON while producing contigs of comparable base accuracy.
+
+**Alignment**
+
+KBM (Kmer-BIN-Mapping) groups k-mers from each non-overlapped sliding 256 bp fragments in long reads into bins.
+Bins of which most k-mers are high frequency, are filtered as highly repetitive ones.
+Then, KBM searches synteny of matched bin pairs in sequences in a dynamic programming way.
+A matched bin pair in two sequences is defined as two bins different by original but share a set of k-mers.
+The result of alignments in KBM have the same features of traditional sequence alignment, excepting the unit of
+KBM alignments is 256 bp bin instead of single base.
+
+**Assembly**
+
+FBG (Fuzzy Bruijn Graph) is composed of vertices in length of 1024 bp from reads, and edges connecting vertices
+in their order on read paths. Comparing with DBG, the size of vertices in FBG are much bigger, thus won't be
+sensitive to small repeat. To tolerate high sequencing errors, FBG's vertices are found using gapped
+sequence alignments from KBM or other aligners, comparing with searching identical k-mers in DBG.
+
+See the github (https://github.com/ruanjue/wtdbg2) and paper (https://doi.org/10.1038/s41592-019-0669-3) for more information. 
+
+|
+
+**Input**
+
+One or more fastq or fasta files. Can be in any fastq/fasta format with any valid exension. 
+
+|
+
+**Output**
+
+Assembled contigs (the assembled genome).
+To polish, use external tools such as pilon, racon, medaka, nextpolish etc.
+
+|
+
+**Sequencing Technology Preset Information**
+
+- Oxford Nanopore (ont) (genome size < 1G):     -p 0 -k 15 -AS 2 -s 0.05 -L 5000
+- Oxford Nanopore (ont) (genome size >= 1G):    -p 19 -AS 2 -s 0.05 -L 5000
+- PacBio CCS (ccs):                             -p 21 -k 0 -AS 4 -K 0.05 -s 0.5
+- PacBio RSII (rs):                             -p 21 -S 4 -s 0.05 -L 5000
+- PacBio Sequel (sq) (genome size < 1G):        -p 0 -k 15 -AS 2 -s 0.05 -L 5000
+- PacBio Sequel (sq) (genome size >= 1G):       -p 19 -AS 2 -s 0.05 -L 5000
+
+|
+    ]]></help>
+    <expand macro="citations" />
+</tool>
\ No newline at end of file
author	bgruening
date	Sat, 29 Jan 2022 12:49:28 +0000
parents	e100f3f4d80e
children