Mercurial > repos > iuc > purge_dups

diff purge_dups.xml @ 4:a315c25dc813 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/purge_dups commit d2ef7bd6598695a681446eaf9c5b8c142e8a0199"
author: iuc
date: Tue, 12 Oct 2021 19:07:05 +0000
parents: 76d4cbefff85
--- a/purge_dups.xml	Mon Jun 14 18:01:05 2021 +0000
+++ b/purge_dups.xml	Tue Oct 12 19:07:05 2021 +0000
@@ -1,71 +1,10 @@
 <tool id="purge_dups" name="Purge overlaps" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01">
     <description>and haplotigs in an assembly based on read depth (purge_dups)</description>
     <macros>
-        <token name="@TOOL_VERSION@">1.2.5</token>
-        <token name="@VERSION_SUFFIX@">3</token>
-        <xml name="trimmers">
-            <section name="section_hist" title="Histogram plot options" >
-                <!--<param name="cutoffs_his" type="data" optional="true" format="txt" label="Read depth cutoffs file" />-->
-                <param argument="--ymin" type="integer" optional="true" min="0" label="Specify a minimum for the Y axis"/>
-                <param argument="--ymax" type="integer" optional="true" label="Specify a maximum for the Y axis"/>
-                <param argument="--xmin" type="integer" optional="true" min="0" label="Specify a minimum for the X axis"/>
-                <param argument="--xmax" type="integer" optional="true" label="Specify a maximum for the X axis"/>
-                <param argument="--title" type="text" value="Read depth histogram plot" label="Histogram title"/>
-            </section>
-        </xml>
-        <token name="@HIST_PLOT@"><![CDATA[
-            python '$__tool_directory__/hist_plot.py' 
-            --cutoffs cutoffs.tsv
-            #if $function_select.section_hist.ymin
-                --ymin $function_select.section_hist.ymin
-            #end if
-            #if $function_select.section_hist.ymax
-                --ymax $function_select.section_hist.ymax
-            #end if
-            #if $function_select.section_hist.xmin
-                --xmin $function_select.section_hist.xmin
-            #end if
-            #if $function_select.section_hist.xmax
-                --xmax $function_select.section_hist.xmax
-            #end if
-            #if $function_select.section_hist.title
-                --title '${function_select.section_hist.title}'
-            #end if
-            depth.stat hist.png
-        ]]></token>
-        <token name="@CALCUTS@"><![CDATA[
-            calcuts
-            #if $function_select.section_calcuts.min_depth:
-                -f $function_select.section_calcuts.min_depth
-            #end if
-            #if $function_select.section_calcuts.low_depth:
-                -l $function_select.section_calcuts.low_depth
-            #end if
-            #if $function_select.section_calcuts.transition:
-                -m $function_select.section_calcuts.transition
-            #end if
-            #if $function_select.section_calcuts.upper_depth:
-                -u $function_select.section_calcuts.upper_depth
-            #end if
-            $function_select.section_calcuts.ploidy
-        ]]></token>
-        <xml name="calcuts">
-            <section name="section_calcuts" title="Calcuts options">
-                <param name="min_depth" type="float" label="Minimum depth count fraction to maximum depth coun" min="0" max="1" argument="-f" optional="true" help="Default = 0.1"/>
-                <param name="low_depth" label="Lower bound for read depth" type="integer" argument="-l" optional="true"/>
-                <param name="transition" label="Transition between haploid and diploid" type="integer" argument="-m" optional="true"/>
-                <param name="upper_depth" label="Upper bound for read depth" type="integer" argument="-u" optional="true"/>
-                <param name="ploidy" argument="-d" type="select" label="Ploidy">
-                    <option value="-d 0" selected="true">Diploid [0]</option>
-                    <option value="-d 1">Haploid [1]</option>
-                </param>
-            </section>
-        </xml>
+        <import>macros.xml</import>
     </macros>
-    <requirements>
-        <requirement type="package" version="@TOOL_VERSION@">purge_dups</requirement>
-        <requirement type="package" version="3.4.2">matplotlib-base</requirement>
-    </requirements>
+    <expand macro="xrefs"/>
+    <expand macro="requirements"/>
     <command detect_errors="exit_code"><![CDATA[
         #if $function_select.functions == 'purge_dups':
             #for $i, $file in enumerate($function_select.input):
@@ -82,42 +21,23 @@
             #if $function_select.cutoffs:
                 -T '${function_select.cutoffs}'
             #end if
-            #if $function_select.min_bad:
-                -f $function_select.min_bad
-            #end if
-            #if $function_select.min_align:
-                -a $function_select.min_align
-            #end if
-            #if $function_select.min_match:
-                -b $function_select.min_match
-            #end if
-            #if $function_select.min_chain:
-                -m $function_select.min_chain
-            #end if
-            #if $function_select.max_gap:
-                -M $function_select.max_gap
-            #end if
+            -f $function_select.min_bad
+            -a $function_select.min_align
+            -b $function_select.min_match
+            -m $function_select.min_chain
+            -M $function_select.max_gap
             #if $function_select.double_chain.chaining_rounds == 'two':
                 -2
-                #if $function_select.double_chain.max_gap_2:
-                    -G $function_select.double_chain.max_gap_2
-                #end if
+                -G $function_select.double_chain.max_gap_2
             #end if
-            #if $function_select.min_chain_score:
-                -l $function_select.min_chain_score
-            #end if
-            #if $function_select.max_extend:
-                -E $function_select.max_extend
-            #end if
+            -l $function_select.min_chain_score
+            -E $function_select.max_extend
             #for $i, $file in enumerate($function_select.input):
                 '${i}.gz'
             #end for
             > dups.bed 2> purge_dups.log
         #else if $function_select.functions == 'split_fa':
             split_fa
-            #if $function_select.split:
-                -n $function_select.split
-            #end if
             '${function_select.input}' > split.fasta
         #else if $function_select.functions == 'pbcstat':
             #for $i, $file in enumerate($function_select.input):
@@ -128,20 +48,13 @@
                 #end if
             #end for
             pbcstat
-            #if $function_select.max_cov:
-                -M $function_select.max_cov
-            #end if
-            #if $function_select.min_map_ratio:
-                -f $function_select.min_map_ratio
+            -M $function_select.pbcstat_options.max_cov
+            -f $function_select.pbcstat_options.min_map_ratio
+            #if $function_select.pbcstat_options.min_map_qual:
+                -q $function_select.pbcstat_options.min_map_qual
             #end if
-            #if $function_select.min_map_qual:
-                -q $function_select.min_map_qual
-            #end if
-            #if $function_select.flank:
-                -l $function_select.flank
-            #end if
-            $function_select.primary_alignments
- 
+            -l $function_select.pbcstat_options.flank
+            $function_select.pbcstat_options.primary_alignments
             #for $i, $file in enumerate($function_select.input):
                 '${i}.gz'
             #end for
@@ -151,15 +64,11 @@
 
         #else if $function_select.functions == 'ngscstat':
             ngscstat
-            #if $function_select.min_align_qual:
-                -q $function_select.min_align_qual
-            #end if
+            -q $function_select.ngscstat_options.min_align_qual
     ##        #if $function_select.max_depth:
     ##            -M $function_select.max_depth
     ##        #end if
-            #if $function_select.max_insert:
-                -L $function_select.max_insert
-            #end if
+            -L $function_select.ngscstat_options.max_insert
             '${function_select.input}'
             && mv TX.stat depth.stat
             && @CALCUTS@ depth.stat > cutoffs.tsv 2>calcuts.log
@@ -170,148 +79,203 @@
 
         #else if $function_select.functions == 'get_seqs':
             get_seqs
-            $function_select.coverage
-            $function_select.haplotigs
-            $function_select.end_trim
-            $function_select.split
-            #if $function_select.length:
-                -l $function_select.length
-            #end if
-            #if $function_select.min_ratio:
-                -m $function_select.min_ratio
-            #end if
-            #if $function_select.min_gap:
-                -g $function_select.min_gap
-            #end if
+            $function_select.advanced_options.coverage
+            $function_select.advanced_options.haplotigs
+            $function_select.advanced_options.end_trim
+            $function_select.advanced_options.split
+            -l $function_select.advanced_options.length
+            -m $function_select.advanced_options.min_ratio
+            -g $function_select.advanced_options.min_gap
             '${function_select.bed_input}' '${function_select.fasta_input}'
         #end if
     ]]></command>
     <inputs>
         <conditional name="function_select">
-            <param type="select" name="functions" label="Select the purge_dups function">
+            <param type="select" name="functions" label="Function mode">
+                <option value="pbcstat">Calculate coverage cutoff, base-level read depth and create read depth histogram for PacBio data (calcuts+pbcstat)</option>
+                <option value="ngscstat">Calculate coverage cutoff, base-level read depth and create read depth histogram for Illumina data (calcuts+ngscstat)</option>
+                <option value="calcuts">Calculate coverage cutoffs (calcuts)</option>
+                <option value="split_fa">Split assembly FASTA files by 'N's (split_fa)</option>
                 <option value="purge_dups">Purge haplotigs and overlaps for an assembly (purge_dups)</option>
-                <option value="split_fa">Split FASTA file by 'N's (split_fa)</option>
-                <option value="pbcstat">Calculate coverage cutoff and create read depth histogram and base-levelread depth for PacBio data (calcuts+pbcstats)</option>
-                <option value="ngscstat">Calculate coverage cutoff and create read depth histogram and base-level read detph for Illumina data (calcuts+ngscstat)</option>
-                <option value="calcuts">calculate coverage cutoffs (calcuts)</option>
-                <option value="get_seqs">Obtain seqeuences after purging (get_seqs)</option>
+                <option value="get_seqs">Obtain sequences after purging (get_seqs)</option>
             </param>
             <when value="purge_dups">
                 <param name="input" type="data" format="paf,paf.gz" multiple="true" label="PAF input file"/>
-                <param name="coverage" type="data" format="tabular" optional="true" argument="-c" label="Base-level coverage file" />
-                <param name="cutoffs" type="data" format="tabular" label ="Cutoffs file" optional="true" argument="-T"/>
-                <param name="min_bad" type="float" min="0" max="1" argument="-f" optional="true" label="Minimum fraction of haploid/diploid/bad/repetitive bases in a sequence" help="Default = 0.8"/>
-                <param name="min_align" type="integer" label="Minimum alignment score" argument="-a" optional="true"/>
-                <param name="min_match" type="integer" label="Minimum max match score" argument="-b" optional="true"/>
-                <param name="min_chain" label="Minimum matching bases for chaining" type="integer" argument="-m" optional="true"/>
-                <param name="max_gap" label="Maximum gap size for chaining" type="integer" argument="-M" optional="true"/>
+                <param argument="-c" name="coverage" type="data" 
+                    format="tabular" optional="true" label="Base-level coverage file" 
+                    help="This file is generated with purge_dups by using the 'Calculate coverage cutoff, base-level 
+                        read depth and create read depth histogram' option"/>
+                <param argument="-T" name="cutoffs" type="data" 
+                    format="tabular" optional="true" label ="Cutoffs file" 
+                    help="This file is generated with purge_dups by using the 'Calculate coverage cutoff, base-level 
+                        read depth and create read depth histogram' option"/>
+                <param argument="-f" name="min_bad" type="float" 
+                    min="0" max="1" value="0.8" label="Minimum fraction of haploid/diploid/bad/repetitive bases in a sequence" 
+                    help="This parameter is set for a suspect haplotigs. If 80% (default value) of a scaffold is high coverage 
+                    (defined by the sixth column of the cutoff file), it's a repetitive contig. If 80% is low coverage (defined 
+                    in the first column of the cutoff file), it's a junk contig. If 80%  is above diploid coverage(defined in 
+                    the fourth column of the cutoff file), it's a diploid, otherwise it's a suspect haplotig"/>
+                <param argument="-a" name="min_align" type="integer" 
+                    min="0" value="70" label="Minimum alignment score" 
+                    help="If its alignment score is larger than this parameter and max match score larger than 
+                    the 'mininum max match score' (-b), it is marked as a repeat; if the alignment score is larger than this parameter 
+                    and max match score no larger than the 'mininum max match score', it is marked as a haplotig. 
+                    Otherwise it is left as a candidate primary contig. If after purging, the complete genes reported by BUSCO are 
+                    too low, you can try to increase this parameter"/>
+                <param argument="-b" name="min_match" type="integer" 
+                    min="0" value="200" label="Minimum max match score" 
+                    help="If its alignment score is larger than the 'minimum align score' (-a) and max match score larger than 
+                    this parameter, it is marked as a repeat; if the alignment score is larger than the 'minimum 
+                        align'  and max match score no larger than this parameter, it is marked as a haplotig. 
+                        Otherwise it is left as a candidate primary contig."/>
+                <param argument="-m" name="min_chain" type="integer" 
+                    min="0" value="500" label="Minimum matching bases for chaining" 
+                    help="In the first round, it will asset chains consistent alignments within this parameter value"/>
+                <param argument="-M" name="max_gap" type="integer" 
+                    min="0" value="20000" label="Maximum gap size for chaining"
+                    help="In the first round, it will asset chains consistent alignments within this parameter value"/>
                 <conditional name="double_chain">
                     <param type="select" name="chaining_rounds" label="Rounds of chaining">
                         <option value="one">1 round</option>
                         <option value="two">2 rounds</option>
                     </param>
                     <when value="two">
-                        <param name="max_gap_2" argument="-G" optional="true" label="Maximum gap size for second round of chaining" type="integer"/>
+                        <param argument="-G" name="max_gap_2" type="integer" 
+                            min="0" value="50000" label="Maximum gap size for second round of chaining" 
+                            help="In the second round, it will asset chains consistent alignments within this parameter value"/>
                     </when>
                     <when value="one"/>
                 </conditional>
-                <param name="min_chain_score" argument="-l" optional="true" label="Minimum chaining score for a match" type="integer" />
-                <param name="max_extend" argument="-E" optional="true" label="Maximum extension for contig ends" type="integer" />
+                <param argument="-l" name="min_chain_score" type="integer" 
+                    min="0" value="10000" label="Minimum chaining score for a match"
+                    help="This parameter control the overlap size. You should decrease its value to allow more overlaps"/>
+                <param argument="-E" name="max_extend" type="integer" 
+                    min="0" value="15000" label="Maximum extension for contig ends"
+                    help="If the chained alignment is within this value to the contig ends, it will extended to the ends"/>
+                <param name="log_file" type="boolean" truevalue="true" falsevalue="false" label="Generate log file"/>
             </when>
             <when value="split_fa">
-                <param name="input" type="data" format="fasta" label="Base-level coverage file"/>
-                <param name="split" type="boolean" truevalue="-n" falsevalue="" checked="false" label="Base-level coverage file" />
+                <param name="input" type="data" format="fasta,fasta.gz" label="Assembly FASTA file" help="The sequence will be cleaved in those position in which the nucleotides is an 'N' or an 'n'."/>
+                <!-- This option disables the cleaving process, and yield the original sequence
+                <param argument="-n" type="boolean" truevalue="-n" falsevalue="" checked="false" label="Block split by N" help="Enable this option if you do not want break your scaffols into contigs."/>
+                -->
             </when>
             <when value="pbcstat">
                 <param name="input" type="data" format="paf,paf.gz" multiple="true" label="PAF input file"/>
-                <param name="max_cov" type="integer" label="Maximum coverage" argument="-M" optional="true"/>
-                <param name="min_map_ratio" argument="-f" type="float" min="0" max="1" value="0" label="Minimum mapping length ratio"/>
-                <param name="min_map_qual" type="integer"  argument="-q" optional="true" label="Minimum mapping quality"/>
-                <param name="flank" type="integer" argument="-l" optional="true" label="Flanking space" />
-                <param name="primary_alignments" argument="-p" type="boolean" truevalue="-p" falsevalue="" checked="true" label="Use only primary alignments" />
+                <section name="pbcstat_options" title="PBCSTAT options" expanded="true">
+                    <param argument="-M" name="max_cov" type="integer" min="0" value="500" label="Maximum coverage"/>
+                    <param argument="-f" name="min_map_ratio"  type="float" min="0" max="1" value="0" label="Minimum mapping length ratio"/>
+                    <param argument="-q" name="min_map_qual" type="integer"  optional="true" label="Minimum mapping quality"/>
+                    <param argument="-l" name="flank"  type="integer" min="0" value="0" label="Flanking space" />
+                    <param argument="-p" name="primary_alignments" type="boolean" truevalue="-p" falsevalue="" checked="true" label="Use only primary alignments" />
+                </section>
                 <expand macro="calcuts" />
                 <expand macro="trimmers"/>
+                <expand macro="output_macro">
+                    <option value="pbcstat_coverage" selected="true">PBCSTAT base coverage</option>
+                    <option value="pbcstat_wig">PBCSTAT base coverage (WIG)</option>
+                    <option value="depth_stats">PBCSTAT depths</option>
+                </expand>
             </when>
             <when value="ngscstat">
                 <param name="input" type="data" format="bam" label="BAM input file"/>
-                <param name="min_align_qual" type="integer"  argument="-q" optional="true" label="Minimum alignment quality" />
-                <!-- Param exists in help text, but isn't actually part of the code. Maybe in the next release? -->
-                <!-- <param name="max_depth" type="integer" label="Maximum read depth" argument="-M" optional="true"/> -->
-                <param name="max_insert" type="integer"  argument="-L" optional="true" label="Maximum insert size"/>
+                <section name="ngscstat_options" title="NGSCSTAT options" expanded="true">
+                    <param argument="-q" name="min_align_qual" type="integer" min="0" value="30" label="Minimum alignment quality" />
+                    <!-- Param exists in help text, but isn't actually part of the code. Maybe in the next release? -->
+                    <!-- <param name="max_depth" type="integer" label="Maximum read depth" argument="-M" optional="true"/> -->
+                    <param argument="-L" name="max_insert" type="integer" min="0" value="1000" label="Maximum insert size"/>
+                </section>
                 <expand macro="calcuts" />
                 <expand macro="trimmers"/>
+                <expand macro="output_macro">
+                    <option value="ngscstat_coverage" selected="true">NGSCSTAT base coverage</option>
+                </expand>
             </when>
-
+            
             <when value="calcuts">
-                <param name="input" type="data" format="tabular" label="STAT input file"/>
+                <param name="input" type="data" format="tabular" label="Depths input file"/>
                 <expand macro="calcuts" />
             </when>
-
             <when value="get_seqs">
-                <param name="fasta_input" type="data" format="fasta" label="Fasta input file"/>
-                <param name="bed_input" type="data" format="bed" label="Bed input file"/>
-                <param name="coverage" type="boolean" argument="-c" truevalue="-c" falsevalue="" checked="false" label="Keep high coverage contigs in the primary contig set"/>
-                <param name="haplotigs" type="boolean" argument="-a" truevalue="-a" falsevalue="" checked="false" label="Do not add prefix to haplotigs"/>
-                <param name="length" type="integer" argument="-l" optional="true" label="Minimum primary contig length" help="Default: 1000"/>
-                <param name="min_ratio" type="float" min="0" max="1" argument="-m" optional="true" label="Minimum ratio of remaining primary contig length to the original contig length"/>
-                <param name="end_trim" type="boolean" argument="-e" truevalue="-e" falsevalue="" checked="true" label="Trim end sequences" help="Only remove sequences at end of halplotigs If you also want to remove the duplications in the middle, set to false, however that may delete false positive duplications."/>
-                <param name="split" type="boolean" argument="-s" truevalue="-s" falsevalue="" checked="false" label="Split contigs"/>
-                <param name="min_gap" type="integer" argument="-g" optional="true" help="default=10k" label="Minimum gap size between duplications" />
+                <param name="fasta_input" type="data" format="fasta" label="Assembly FASTA file"/>
+                <param name="bed_input" type="data" format="bed" label="BED input file" help="Generated by the 'purge_dups' function mode."/>
+                <section name="advanced_options" title="Advanced options">
+                    <param argument="-c" name="coverage" type="boolean" 
+                        truevalue="-c" falsevalue="" checked="false" label="Keep high coverage contigs in the primary contig set"/>
+                    <param argument="-a" name="haplotigs" type="boolean"  truevalue="-a" falsevalue="" checked="false" label="Do not add prefix to haplotigs"/>
+                    <param argument="-l" name="length" type="integer" min="0" value="10000" label="Minimum primary contig length" />
+                    <param argument="-m" name="min_ratio" type="float" 
+                        min="0" max="1" value="0.05" label="Minimum ratio of remaining primary contig length to the original contig length"/>
+                    <param argument="-e" name="end_trim" type="boolean"  
+                        truevalue="-e" falsevalue="" checked="true" label="Trim end sequences" 
+                        help="Only remove sequences at end of halplotigs. If you also want to remove the duplications in the middle, 
+                            set to false, however that may delete false positive duplications."/>
+                    <param argument="-s" name="split" type="boolean"  truevalue="-s" falsevalue="" checked="false" label="Split contigs"/>
+                    <param argument="-g" name="min_gap" type="integer" min="0" value="10000" label="Minimum gap size between duplications" />
+                </section>
             </when>
         </conditional>
     </inputs>
     <outputs>
         <!-- Get Seqs -->
-        <data name="get_seqs_hap" format="fasta" from_work_dir="hap.fa" label="${tool.name} on ${on_string}: get seqs haplotype fasta" >
+        <data name="get_seqs_hap" format="fasta" from_work_dir="hap.fa" label="${tool.name} on ${on_string}: get_seqs haplotype" >
             <filter>function_select['functions'] == 'get_seqs'</filter>
         </data>
-        <data name="get_seqs_purged" format="fasta" from_work_dir="purged.fa" label="${tool.name} on ${on_string}: get seqs purged fasta">
+        <data name="get_seqs_purged" format="fasta" from_work_dir="purged.fa" label="${tool.name} on ${on_string}: get_seqs purged sequences">
             <filter>function_select['functions'] == 'get_seqs'</filter>
         </data>
         <!-- Split FA -->
-        <data name="split_fasta" format="fasta" from_work_dir="split.fasta" label="${tool.name} on ${on_string}: split fasta">
+        <data name="split_fasta" format="fasta" from_work_dir="split.fasta" label="${tool.name} on ${on_string}: split FASTA">
             <filter>function_select['functions'] == 'split_fa'</filter>
         </data>
         <!-- Ngscstat -->
-        <data name="ngscstat_cov" format="tabular" from_work_dir="TX.base.cov" label="${tool.name} on ${on_string}: ngscstat base coverage file">
-            <filter>function_select['functions'] == 'ngscstat'</filter>
+        <data name="ngscstat_cov" format="tabular" from_work_dir="TX.base.cov" label="${tool.name} on ${on_string}: NGSCSTAT base coverage">
+            <filter>function_select['functions'] == 'ngscstat'</filter> 
+            <filter>'ngscstat_coverage' in function_select['output_options']</filter>
         </data>
-        <data name="stat_file" format="tabular" from_work_dir="depth.stat"  label="${tool.name} on ${on_string}: stat file">
+        <data name="stat_file" format="tabular" from_work_dir="depth.stat"  label="${tool.name} on ${on_string}: depths">
             <filter>function_select['functions'] == 'ngscstat' or function_select['functions'] == 'pbcstat'</filter>
+            <filter>'depth_stats' in function_select['output_options']</filter>
         </data>
         <!-- Pbcstat -->
-        <data name="pbcstat_cov" format="tabular" from_work_dir="PB.base.cov"  label="${tool.name} on ${on_string}: pbcstat base coverage file">
+        <data name="pbcstat_cov" format="tabular" from_work_dir="PB.base.cov"  label="${tool.name} on ${on_string}: PBCSTAT base coverage">
             <filter>function_select['functions'] == 'pbcstat'</filter>
+            <filter>'pbcstat_coverage' in function_select['output_options']</filter>
         </data>
-        <data name="pbcstat_wig" format="wig" from_work_dir="PB.cov.wig" label="${tool.name} on ${on_string}: pbcstat base wig file">
+        <data name="pbcstat_wig" format="wig" from_work_dir="PB.cov.wig" label="${tool.name} on ${on_string}: PBCSTAT base coverage (WIG)">
             <filter>function_select['functions'] == 'pbcstat'</filter>
+            <filter>'pbcstat_wig' in function_select['output_options']</filter>
         </data>
 
         <data name="hist" format="png" from_work_dir="hist.png" label="${tool.name} on ${on_string}: histogram plot">
             <filter>function_select['functions'] == 'pbcstat' or function_select['functions'] == 'ngscstat'</filter>
+            <filter>'histogram' in function_select['output_options']</filter>
         </data>
 
         <!-- Calcuts -->
-        <data name="calcuts_log" format="txt" from_work_dir="calcuts.log" label="${tool.name} on ${on_string}: calcuts log file">
+        <data name="calcuts_log" format="txt" from_work_dir="calcuts.log" label="${tool.name} on ${on_string}: calcuts log">
             <filter>function_select['functions'] in ('pbcstat', 'ngscstat', 'calcuts')</filter>
+            <filter>'calcuts_log' in function_select['output_options']</filter>
         </data>
-        <data name="calcuts_tab" format="tabular" from_work_dir="cutoffs.tsv" label="${tool.name} on ${on_string}: calcuts cutoff file">
+        <data name="calcuts_cutoff" format="tabular" from_work_dir="cutoffs.tsv" label="${tool.name} on ${on_string}: calcuts cutoff">
             <filter>function_select['functions'] in ('pbcstat', 'ngscstat', 'calcuts')</filter>
+            <filter>'calcuts_cutoff' in function_select['output_options']</filter>
         </data>
         <!-- Purge dups -->
-        <data name="purge_dups_log" format="txt" from_work_dir="purge_dups.log" label="${tool.name} on ${on_string}: purge_dups log file">
+        <data name="purge_dups_log" format="txt" from_work_dir="purge_dups.log" label="${tool.name} on ${on_string}: purge_dups log">
             <filter>function_select['functions'] == 'purge_dups'</filter>
+            <filter>function_select['log_file']</filter>
         </data>
-        <data name="purge_dups_bed" format="bed" from_work_dir="dups.bed" label="${tool.name} on ${on_string}: purge_dups bed file">
+        <data name="purge_dups_bed" format="bed" from_work_dir="dups.bed" label="${tool.name} on ${on_string}: purge_dups BED">
             <filter>function_select['functions'] == 'purge_dups'</filter>
         </data>
     </outputs>
     <tests>
-        <!-- Purge dups -->
-        <test expect_num_outputs="2">
+        <!-- Test 1 Purge dups -->
+        <test expect_num_outputs="1">
             <conditional name="function_select">
                 <param name="functions" value="purge_dups"/>
-                <param name="input" value="test.paf"/>
+                <param name="input" value="assembly_test.paf"/>
                 <param name="coverage" value="test.cov" ftype="tabular"/>
                 <param name="cutoffs" value="cutoffs.tsv" ftype="tabular"/>
                 <param name="min_bad" value="0.01"/>
@@ -326,13 +290,13 @@
                 <param name="min_chain_score" value="1"/>
                 <param name="max_extend" value="100"/>
             </conditional>
-            <output name="purge_dups_bed" value="purge_dups_out.bed"/>
+            <output name="purge_dups_bed" value="purge_dups_01.bed" ftype="bed"/>
         </test>
-        <!-- Purge dups gzip -->
+        <!-- Test 2 Purge dups gzip -->
         <test expect_num_outputs="2">
             <conditional name="function_select">
                 <param name="functions" value="purge_dups"/>
-                <param name="input" value="test.paf.gz" ftype="paf.gz"/>
+                <param name="input" value="assembly_test.paf.gz" ftype="paf.gz"/>
                 <param name="coverage" value="test.cov" ftype="tabular"/>
                 <param name="cutoffs" value="cutoffs.tsv" ftype="tabular"/>
                 <param name="min_bad" value="0.01"/>
@@ -346,36 +310,68 @@
                 </conditional>
                 <param name="min_chain_score" value="1"/>
                 <param name="max_extend" value="100"/>
+                <param name="log_file" value="true"/>
             </conditional>
-            <output name="purge_dups_bed" value="purge_dups_out.bed"/>
+            <output name="purge_dups_bed" value="purge_dups_02.bed" ftype="bed"/>
+            <output name="purge_dups_log" value="purge_dups_log_02.txt" ftype="txt"/>
+
         </test>
-        <!-- Purge dups multiple input -->
-        <test expect_num_outputs="2">
+        <!-- Test 3 Purge dups multiple input -->
+        <test expect_num_outputs="1">
             <conditional name="function_select">
                 <param name="functions" value="purge_dups"/>
-                <param name="input" value="test.paf,test2.paf.gz"/>
+                <param name="input" value="assembly_test.paf,test2.paf.gz"/>
             </conditional>
-            <output name="purge_dups_bed" value="purge_dups_out_2.bed"/>
+            <output name="purge_dups_bed" value="purge_dups_03.bed" ftype="bed"/>
         </test>
-        <!-- Split fa -->
+        <!-- Test 4 Split fa -->
         <test expect_num_outputs="1">
             <conditional name="function_select">
                 <param name="functions" value="split_fa"/>
-                <param name="input" value="test.fasta"/>
-                <param name="split" value="-n"/>
+                <param name="input" value="assembly_test.fasta"/>
             </conditional>
-            <output name="split_fasta" value="split_out.fasta"/>
+            <output name="split_fasta" value="split_04.fasta" ftype="fasta"/>
         </test>
-        <!-- pbcstat -->
+        <!-- Test 5 pbcstat -->
         <test expect_num_outputs="6">
             <conditional name="function_select">
                 <param name="functions" value="pbcstat"/>
-                <param name="input" value="test.paf"/>
-                <param name="max_cov" value="1000"/>
-                <param name="min_map_ratio" value="0.01"/>
-                <param name="min_map_qual" value="1"/>
-                <param name="flank" value="1"/>
-                <param name="primary_alignments" value="-p"/>
+                <param name="input" value="assembly_test.paf"/>
+                <section name="pbcstat_options">
+                    <param name="max_cov" value="1000"/>
+                    <param name="min_map_ratio" value="0.01"/>
+                    <param name="min_map_qual" value="1"/>
+                    <param name="flank" value="1"/>
+                    <param name="primary_alignments" value="-p"/>
+                </section>
+                <section name="section_calcuts">
+                    <param name="min_depth" value="0.01"/>
+                    <param name="low_depth" value="1"/>
+                    <param name="transition" value="1"/>
+                    <param name="upper_depth" value="100"/>
+                    <param name="ploidy" value="-d 0"/>
+                </section>
+                <param name="output_options" value="pbcstat_coverage,pbcstat_wig,depth_stats,histogram,calcuts_cutoff,calcuts_log"/>
+            </conditional>
+            <output name="calcuts_cutoff" value="calcuts_cutoff_05.tabular" ftype="tabular"/>
+            <output name="calcuts_log" value="calcuts_log_05.txt" ftype="txt"/>
+            <output name="pbcstat_cov" value="pbcstat_cov_05.tabular" ftype="tabular"/>
+            <output name="pbcstat_wig" value="pbcstat_cov_05.wig" ftype="wig"/>
+            <output name="stat_file" value="pbcstats_05.tabular" ftype="tabular"/>
+            <output name="hist" value="hist_05.png" ftype="png" compare="sim_size"/>
+        </test>
+        <!-- Test 6 pbcstat gzip -->
+        <test expect_num_outputs="2">
+            <conditional name="function_select">
+                <param name="functions" value="pbcstat"/>
+                <param name="input" value="assembly_test.paf.gz" ftype="paf.gz"/>
+                <section name="pbcstat_options">
+                    <param name="max_cov" value="1000"/>
+                    <param name="min_map_ratio" value="0.01"/>
+                    <param name="min_map_qual" value="1"/>
+                    <param name="flank" value="1"/>
+                    <param name="primary_alignments" value="-p"/>
+                </section>
                 <section name="section_calcuts">
                     <param name="min_depth" value="0.01"/>
                     <param name="low_depth" value="1"/>
@@ -384,22 +380,15 @@
                     <param name="ploidy" value="-d 0"/>
                 </section>
             </conditional>
-            <output name="calcuts_tab" value="calcuts_out.tsv"/>
-            <output name="pbcstat_cov" value="out.cov"/>
-            <output name="pbcstat_wig" value="out.wig"/>
-            <output name="stat_file" value="pbcstats.tabular"/>
-            <output name="hist" value="hist.png" ftype="png" compare="sim_size"/>
+            <param name="output_options" value="pbcstat_coverage,calcuts_cutoff"/>
+            <output name="calcuts_cutoff" value="calcuts_cutoff_06.tabular" ftype="tabular"/>
+            <output name="pbcstat_cov" value="pbcstat_cov_06.tabular" ftype="tabular"/>
         </test>
-        <!-- pbcstat gzip -->
-        <test expect_num_outputs="6">
+        <!-- Test 7 Pbcstat multiple input -->
+        <test expect_num_outputs="2">
             <conditional name="function_select">
                 <param name="functions" value="pbcstat"/>
-                <param name="input" value="test.paf.gz" ftype="paf.gz"/>
-                <param name="max_cov" value="1000"/>
-                <param name="min_map_ratio" value="0.01"/>
-                <param name="min_map_qual" value="1"/>
-                <param name="flank" value="1"/>
-                <param name="primary_alignments" value="-p"/>
+                <param name="input" value="assembly_test.paf,test2.paf.gz"/>
                 <section name="section_calcuts">
                     <param name="min_depth" value="0.01"/>
                     <param name="low_depth" value="1"/>
@@ -408,15 +397,19 @@
                     <param name="ploidy" value="-d 0"/>
                 </section>
             </conditional>
-            <output name="calcuts_tab" value="calcuts_out.tsv"/>
-            <output name="pbcstat_cov" value="out.cov"/>
-            <output name="pbcstat_wig" value="out.wig"/>
+            <param name="output_options" value="pbcstat_coverage,calcuts_cutoff"/>
+            <output name="calcuts_cutoff" value="calcuts_cutoff_07.tabular" ftype="tabular"/>
+            <output name="pbcstat_cov" value="pbcstat_cov_07.tabular" ftype="tabular"/>
         </test>
-        <!-- Pbcstat multiple input -->
-        <test expect_num_outputs="6">
+        <!-- Test 8 ngscstat -->
+        <test expect_num_outputs="2">
             <conditional name="function_select">
-                <param name="functions" value="pbcstat"/>
-                <param name="input" value="test.paf,test2.paf.gz"/>
+                <param name="functions" value="ngscstat"/>
+                <param name="input" value="test.bam"/>
+                <section name="ngscstat_options">
+                    <param name="min_align_qual" value="10"/>
+                    <param name="max_insert" value="100"/>
+                </section>
                 <section name="section_calcuts">
                     <param name="min_depth" value="0.01"/>
                     <param name="low_depth" value="1"/>
@@ -425,32 +418,11 @@
                     <param name="ploidy" value="-d 0"/>
                 </section>
             </conditional>
-            <output name="calcuts_tab" value="calcuts_out.tsv"/>
-            <output name="pbcstat_cov" value="out2.cov"/>
-            <output name="stat_file" value="pbcstats2.tabular"/>
-            <output name="pbcstat_wig" value="out2.wig"/>
+            <param name="output_options" value="ngscstat_coverage,calcuts_cutoff"/>
+            <output name="calcuts_cutoff" value="calcuts_cutoff_08.tabular" ftype="tabular"/>
+            <output name="ngscstat_cov" value="ngsc_cov_08.tabular" ftype="tabular"/>
         </test>
-        <!-- ngscstat -->
-        <test expect_num_outputs="5">
-            <conditional name="function_select">
-                <param name="functions" value="ngscstat"/>
-                <param name="input" value="test.bam"/>
-                <param name="min_align_qual" value="10"/>
-                <param name="max_insert" value="100"/>
-                <section name="section_calcuts">
-                    <param name="min_depth" value="0.01"/>
-                    <param name="low_depth" value="1"/>
-                    <param name="transition" value="1"/>
-                    <param name="upper_depth" value="100"/>
-                    <param name="ploidy" value="-d 0"/>
-                </section>
-            </conditional>
-            <output name="calcuts_tab" value="calcuts_out.tsv"/>
-            <output name="ngscstat_cov" value="ngsc_out.cov"/>
-            <output name="stat_file" value="tx_stats.tabular"/> 
-            <output name="hist" value="hist.png" ftype="png" compare="sim_size"/>
-        </test>
-        <!-- Calcuts -->
+        <!-- Test 9 Calcuts -->
         <test expect_num_outputs="2">
             <conditional name="function_select">
                 <param name="functions" value="calcuts"/>
@@ -463,34 +435,40 @@
                     <param name="ploidy" value="-d 0"/>
                 </section>
             </conditional>
-            <output name="calcuts_tab" value="calcuts_out.tsv"/>
+            <output name="calcuts_cutoff" value="calcuts_cutoff_09.tabular" ftype="tabular"/>
+            <output name="calcuts_log" value="calcuts_log_09.txt" ftype="txt"/>
         </test>
-        <!-- Get seqs -->
+        <!-- Test 10 Get seqs -->
         <test expect_num_outputs="2">
             <conditional name="function_select">
                 <param name="functions" value="get_seqs"/>
                 <param name="fasta_input" value="split_out.fasta"/>
                 <param name="bed_input" value="dups.bed"/>
-                <param name="coverage" value="-c"/>
-                <param name="length" value="10"/>
-                <param name="haplotigs" value="-a"/>
-                <param name="min_ratio" value=".01"/>
-                <param name="end_trim" value="-e"/>
-                <param name="split" value="-s"/>
-                <param name="min_gap" value="100000"/>
+                <section name="advanced_options">
+                    <param name="coverage" value="-c"/>
+                    <param name="length" value="10"/>
+                    <param name="haplotigs" value="-a"/>
+                    <param name="min_ratio" value=".01"/>
+                    <param name="end_trim" value="-e"/>
+                    <param name="split" value="-s"/>
+                    <param name="min_gap" value="100000"/>
+                </section>
             </conditional>
-            <output name="get_seqs_purged" value="purged_out.fa"/>
+            <output name="get_seqs_purged" value="get_seqs_purged_10.fa" ftype="fasta"/>
+            <output name="get_seqs_hap" value="get_seqs_hap_10.fa" ftype="fasta"/>
         </test>
-        <!-- pbcstat histogram options-->
-        <test expect_num_outputs="6">
+        <!-- Test 11 pbcstat histogram options-->
+        <test expect_num_outputs="1">
             <conditional name="function_select">
                 <param name="functions" value="pbcstat"/>
-                <param name="input" value="test.paf"/>
-                <param name="max_cov" value="1000"/>
-                <param name="min_map_ratio" value="0.01"/>
-                <param name="min_map_qual" value="1"/>
-                <param name="flank" value="1"/>
-                <param name="primary_alignments" value="-p"/>
+                <param name="input" value="assembly_test.paf"/>
+                <section name="pbcstat_options">
+                    <param name="max_cov" value="1000"/>
+                    <param name="min_map_ratio" value="0.01"/>
+                    <param name="min_map_qual" value="1"/>
+                    <param name="flank" value="1"/>
+                    <param name="primary_alignments" value="-p"/>
+                </section>
                 <section name="section_calcuts">
                     <param name="min_depth" value="0.01"/>
                     <param name="low_depth" value="1"/>
@@ -504,22 +482,56 @@
                     <param name="cutoffs_his" value="calcuts_out.tsv"/>
                 </section>
             </conditional>
-            <output name="calcuts_tab" value="calcuts_out.tsv"/>
-            <output name="pbcstat_cov" value="out_hist_options.cov"/>
-            <output name="pbcstat_wig" value="out_hist_options.wig"/>
-            <output name="stat_file" value="pbcstats_hist_options.tabular"/>
-            <output name="hist" value="hist_options.png" ftype="png" compare="sim_size"/>
+            <param name="output_options" value="histogram"/>
+            <output name="hist" value="hist_11.png" ftype="png" compare="sim_size"/>
         </test>
     </tests>
     <help><![CDATA[
-        .. class:: infomark
-        
-        **What it does**
+.. class:: infomark
+
+**Purpose**
+
+The purge_dups tools are designed to remove haplotigs and contig overlaps in a de novo assembly based on read depth. 
+purge_dups can significantly improve genome assemblies by removing overlaps and haplotigs caused by sequence divergence 
+in heterozygous regions. This both removes false duplications in primary draft assemblies while retaining completeness and sequence
+integrity, and can improve scaffolding.
+
+----
+
+.. class:: infomark
+
+**Pipeline Guide**
+
+Given a primary assembly, and an alternative assembly (optional, if you have one), follow the steps shown below to build your 
+own purge_dups pipeline, steps with same number can be run simultaneously. Among all the steps, although step 5 is optional, 
+we highly recommend our users to do so, because assemblers may produce overrepresented sequences. In such a case, the final 
+step 5 can be applied to remove those seqeuences.
 
-        The purge_dups tools are designed to remove haplotigs and contig overlaps in a de novo assembly based on read depth.
+    - Step 1: Calculate the coverage cutoffs and base coverages.
+    - Step 2: Split an assembly with the **split_fasfa** function and do a self-self alignment by using minimap2.
+    - Step 3: Purge haplotigs and overlaps with the **purge_dups** function.
+    - Step 4: Get purged primary and haplotig sequences from the draft assembly with the **get_seqs** function.
+    - Step 5: Merge hap.fa file, generated in the previous step, and the alternate assembly, and redo the above steps to get a decent haplotig set.
+
+----
+
+.. class:: infomark
+
+**Limitations**
+
+- Read depth cutoffs calculation: the coverage cutoffs can be larger for a low heterozygosity species, which causes the purged assembly size smaller than expected. In such a case, please use script/hist_plot.py to make the histogram plot and set coverage cutoffs manually.
+- Repeats: purge_dups has a limited ability to process repeats.
+
+----
+
+.. class:: infomark
+
+**Purged assembly validation**
+
+There are many ways to validate the purged assembly. One way is to make a coverage plot for it, the 2nd way is to run `BUSCO <https://busco.ezlab.org/>`_. A thid option is to use `Merqury <https://github.com/marbl/merqury>`_
+
+
 
     ]]></help>
-        <citations>
-        <citation type="doi">10.1093/bioinformatics/btaa025</citation>
-    </citations>
+    <expand macro="citations"/>
 </tool>
author	iuc
date	Tue, 12 Oct 2021 19:07:05 +0000
parents	76d4cbefff85
children