<!-- Mercurial > repos > iuc > purge_dups -->

<tool id="purge_dups" name="Purge overlaps" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01">
    <description>and haplotigs in an assembly based on read depth (purge_dups)</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="xrefs"/>
    <expand macro="requirements"/>
    <command detect_errors="exit_code"><![CDATA[
        ## Builds the shell command for the selected function mode. Each mode
        ## branch below runs one binary of the purge_dups suite.

        ## purge_dups and pbcstat both accept several PAF datasets. Each dataset
        ## is made available in the working directory under an index-based
        ## name ending in .gz: plain PAF is gzip-compressed, every other accepted
        ## type (paf.gz) is symlinked unchanged.
        #if str($function_select.functions) in ('purge_dups', 'pbcstat'):
            #for $i, $file in enumerate($function_select.input):
                #if $file.is_of_type('paf'):
                    gzip -c '${file}' > '${i}.gz' &&
                #else
                    ln -s '${file}' '${i}.gz' &&
                #end if
            #end for
        #end if

        #if $function_select.functions == 'purge_dups':
            purge_dups
            ## Both input files are optional datasets; only pass them when set.
            #if $function_select.coverage:
                -c '${function_select.coverage}'
            #end if
            #if $function_select.cutoffs:
                -T '${function_select.cutoffs}'
            #end if
            -f $function_select.min_bad
            -a $function_select.min_align
            -b $function_select.min_match
            -m $function_select.min_chain
            -M $function_select.max_gap
            ## Second chaining round: -2 enables it, -G sets its maximum gap.
            #if $function_select.double_chain.chaining_rounds == 'two':
                -2
                -G $function_select.double_chain.max_gap_2
            #end if
            -l $function_select.min_chain_score
            -E $function_select.max_extend
            #for $i, $file in enumerate($function_select.input):
                '${i}.gz'
            #end for
            ## stderr is always written to purge_dups.log; whether it becomes a
            ## history item is decided by the output filter on log_file.
            > dups.bed 2> purge_dups.log

        #else if $function_select.functions == 'split_fa':
            split_fa
            '${function_select.input}' > split.fasta

        #else if $function_select.functions == 'pbcstat':
            pbcstat
            -M $function_select.pbcstat_options.max_cov
            -f $function_select.pbcstat_options.min_map_ratio
            ## Compare as a string: a truthiness test would drop an explicit 0,
            ## while an unset optional integer renders as an empty string.
            #if str($function_select.pbcstat_options.min_map_qual) != '':
                -q $function_select.pbcstat_options.min_map_qual
            #end if
            -l $function_select.pbcstat_options.flank
            $function_select.pbcstat_options.primary_alignments
            #for $i, $file in enumerate($function_select.input):
                '${i}.gz'
            #end for
            ## Rename the depth statistics so pbcstat and ngscstat share one
            ## output name, then derive cutoffs and the histogram from it.
            ## @CALCUTS@ and @HIST_PLOT@ are tokens, presumably defined in macros.xml.
            && mv PB.stat depth.stat
            && @CALCUTS@ depth.stat > cutoffs.tsv 2>calcuts.log
            && @HIST_PLOT@

        #else if $function_select.functions == 'ngscstat':
            ngscstat
            -q $function_select.ngscstat_options.min_align_qual
            ## -M (maximum read depth) is disabled here together with its param in
            ## the inputs section; the lines are kept for a future release.
            ##        #if $function_select.max_depth:
            ##            -M $function_select.max_depth
            ##        #end if
            -L $function_select.ngscstat_options.max_insert
            '${function_select.input}'
            && mv TX.stat depth.stat
            && @CALCUTS@ depth.stat > cutoffs.tsv 2>calcuts.log
            && @HIST_PLOT@

        #else if $function_select.functions == 'calcuts':
            @CALCUTS@ '${function_select.input}' > cutoffs.tsv 2>calcuts.log

        #else if $function_select.functions == 'get_seqs':
            get_seqs
            ## The four boolean params expand to their flag or to nothing.
            $function_select.advanced_options.coverage
            $function_select.advanced_options.haplotigs
            $function_select.advanced_options.end_trim
            $function_select.advanced_options.split
            -l $function_select.advanced_options.length
            -m $function_select.advanced_options.min_ratio
            -g $function_select.advanced_options.min_gap
            '${function_select.bed_input}' '${function_select.fasta_input}'
        #end if
    ]]></command>
    <inputs>
        <conditional name="function_select">
            <!-- Mode selector: the chosen value picks the matching <when> branch below
                 and the matching branch of the command template. With no option marked
                 as selected, the first option listed is presumably the default. -->
            <param name="functions" type="select" label="Function mode">
                <option value="pbcstat">Calculate coverage cutoff, base-level read depth and create read depth histogram for PacBio data (calcuts+pbcstat)</option>
                <option value="ngscstat">Calculate coverage cutoff, base-level read depth and create read depth histogram for Illumina data (calcuts+ngscstat)</option>
                <option value="calcuts">Calculate coverage cutoffs (calcuts)</option>
                <option value="split_fa">Split assembly FASTA files by 'N's (split_fa)</option>
                <option value="purge_dups">Purge haplotigs and overlaps for an assembly (purge_dups)</option>
                <option value="get_seqs">Obtain sequences after purging (get_seqs)</option>
            </param>
            <when value="purge_dups">
                <!-- Options for the purge_dups binary. Each param is passed with the flag
                     named in its argument attribute; log_file only controls whether the
                     captured stderr is exposed as an output. -->
                <param name="input" type="data" format="paf,paf.gz" multiple="true" label="PAF input file"/>
                <param argument="-c" name="coverage" type="data"
                    format="tabular" optional="true" label="Base-level coverage file"
                    help="This file is generated with purge_dups by using the 'Calculate coverage cutoff, base-level
                        read depth and create read depth histogram' option"/>
                <param argument="-T" name="cutoffs" type="data"
                    format="tabular" optional="true" label="Cutoffs file"
                    help="This file is generated with purge_dups by using the 'Calculate coverage cutoff, base-level
                        read depth and create read depth histogram' option"/>
                <param argument="-f" name="min_bad" type="float"
                    min="0" max="1" value="0.8" label="Minimum fraction of haploid/diploid/bad/repetitive bases in a sequence"
                    help="This parameter is set for suspect haplotigs. If 80% (default value) of a scaffold is high coverage
                    (defined by the sixth column of the cutoff file), it's a repetitive contig. If 80% is low coverage (defined
                    in the first column of the cutoff file), it's a junk contig. If 80% is above diploid coverage (defined in
                    the fourth column of the cutoff file), it's a diploid, otherwise it's a suspect haplotig"/>
                <param argument="-a" name="min_align" type="integer"
                    min="0" value="70" label="Minimum alignment score"
                    help="If its alignment score is larger than this parameter and max match score larger than
                    the 'minimum max match score' (-b), it is marked as a repeat; if the alignment score is larger than this parameter
                    and max match score no larger than the 'minimum max match score', it is marked as a haplotig.
                    Otherwise it is left as a candidate primary contig. If after purging, the complete genes reported by BUSCO are
                    too low, you can try to increase this parameter"/>
                <param argument="-b" name="min_match" type="integer"
                    min="0" value="200" label="Minimum max match score"
                    help="If its alignment score is larger than the 'minimum alignment score' (-a) and max match score larger than
                    this parameter, it is marked as a repeat; if the alignment score is larger than the 'minimum
                        alignment score' and max match score no larger than this parameter, it is marked as a haplotig.
                        Otherwise it is left as a candidate primary contig."/>
                <param argument="-m" name="min_chain" type="integer"
                    min="0" value="500" label="Minimum matching bases for chaining"
                    help="In the first round, consistent alignments within this parameter value are chained"/>
                <param argument="-M" name="max_gap" type="integer"
                    min="0" value="20000" label="Maximum gap size for chaining"
                    help="In the first round, consistent alignments within this parameter value are chained"/>
                <conditional name="double_chain">
                    <param type="select" name="chaining_rounds" label="Rounds of chaining">
                        <option value="one">1 round</option>
                        <option value="two">2 rounds</option>
                    </param>
                    <when value="two">
                        <param argument="-G" name="max_gap_2" type="integer"
                            min="0" value="50000" label="Maximum gap size for second round of chaining"
                            help="In the second round, consistent alignments within this parameter value are chained"/>
                    </when>
                    <when value="one"/>
                </conditional>
                <param argument="-l" name="min_chain_score" type="integer"
                    min="0" value="10000" label="Minimum chaining score for a match"
                    help="This parameter controls the overlap size. You should decrease its value to allow more overlaps"/>
                <param argument="-E" name="max_extend" type="integer"
                    min="0" value="15000" label="Maximum extension for contig ends"
                    help="If the chained alignment is within this value to the contig ends, it will be extended to the ends"/>
                <param name="log_file" type="boolean" truevalue="true" falsevalue="false" label="Generate log file"/>
            </when>
            <when value="split_fa">
                <param name="input"
                    type="data"
                    format="fasta,fasta.gz"
                    label="Assembly FASTA file"
                    help="The sequence will be cleaved in those position in which the nucleotides is an 'N' or an 'n'."/>
                <!-- Disabled on purpose: this option switches the cleaving off and yields the original sequence.
                <param argument="-n" type="boolean" truevalue="-n" falsevalue="" checked="false" label="Block split by N" help="Enable this option if you do not want to break your scaffolds into contigs."/>
                -->
            </when>
            <when value="pbcstat">
                <!-- PacBio branch: pbcstat options, then the calcuts, trimmers and output
                     selection macros (imported from macros.xml). -->
                <param name="input" type="data" format="paf,paf.gz" multiple="true" label="PAF input file"/>
                <section name="pbcstat_options" title="PBCSTAT options" expanded="true">
                    <param argument="-M" name="max_cov" type="integer" min="0" value="500" label="Maximum coverage"/>
                    <param argument="-f" name="min_map_ratio" type="float" min="0" max="1" value="0" label="Minimum mapping length ratio"/>
                    <!-- Optional and without a default: the flag is only emitted when a value is given. -->
                    <param argument="-q" name="min_map_qual" type="integer" optional="true" label="Minimum mapping quality"/>
                    <param argument="-l" name="flank" type="integer" min="0" value="0" label="Flanking space"/>
                    <param argument="-p" name="primary_alignments" type="boolean" truevalue="-p" falsevalue="" checked="true" label="Use only primary alignments"/>
                </section>
                <expand macro="calcuts"/>
                <expand macro="trimmers"/>
                <!-- The options below are added to the output_options selector supplied by the macro. -->
                <expand macro="output_macro">
                    <option value="pbcstat_coverage" selected="true">PBCSTAT base coverage</option>
                    <option value="pbcstat_wig">PBCSTAT base coverage (WIG)</option>
                    <option value="depth_stats">PBCSTAT depths</option>
                </expand>
            </when>
            <when value="ngscstat">
                <!-- Illumina branch: ngscstat options, then the calcuts, trimmers and output
                     selection macros (imported from macros.xml). -->
                <param name="input" type="data" format="bam" label="BAM input file"/>
                <section name="ngscstat_options" title="NGSCSTAT options" expanded="true">
                    <param argument="-q" name="min_align_qual" type="integer" min="0" value="30" label="Minimum alignment quality" />
                    <!-- Param exists in help text, but isn't actually part of the code. Maybe in the next release? -->
                    <!-- <param name="max_depth" type="integer" label="Maximum read depth" argument="-M" optional="true"/> -->
                    <param argument="-L" name="max_insert" type="integer" min="0" value="1000" label="Maximum insert size"/>
                </section>
                <expand macro="calcuts" />
                <expand macro="trimmers"/>
                <!-- depth_stats makes the shared depth.stat output (stat_file) selectable in this
                     mode as well; its output filter already accepts ngscstat.
                     NOTE(review): assumes output_macro does not define depth_stats itself; confirm in macros.xml. -->
                <expand macro="output_macro">
                    <option value="ngscstat_coverage" selected="true">NGSCSTAT base coverage</option>
                    <option value="depth_stats">NGSCSTAT depths</option>
                </expand>
            </when>

            <when value="calcuts">
                <!-- Stand-alone calcuts: takes an existing depth statistics file plus the
                     shared calcuts options from macros.xml. -->
                <param name="input"
                    type="data"
                    format="tabular"
                    label="Depths input file"/>
                <expand macro="calcuts"/>
            </when>
            <when value="get_seqs">
                <!-- get_seqs branch: needs the assembly and the BED file from purge_dups mode.
                     Boolean params expand to their flag when enabled and to nothing otherwise. -->
                <param name="fasta_input" type="data" format="fasta" label="Assembly FASTA file"/>
                <param name="bed_input" type="data" format="bed" label="BED input file" help="Generated by the 'purge_dups' function mode."/>
                <section name="advanced_options" title="Advanced options">
                    <param argument="-c" name="coverage" type="boolean"
                        truevalue="-c" falsevalue="" checked="false" label="Keep high coverage contigs in the primary contig set"/>
                    <param argument="-a" name="haplotigs" type="boolean"
                        truevalue="-a" falsevalue="" checked="false" label="Do not add prefix to haplotigs"/>
                    <param argument="-l" name="length" type="integer" min="0" value="10000" label="Minimum primary contig length"/>
                    <param argument="-m" name="min_ratio" type="float"
                        min="0" max="1" value="0.05" label="Minimum ratio of remaining primary contig length to the original contig length"/>
                    <param argument="-e" name="end_trim" type="boolean"
                        truevalue="-e" falsevalue="" checked="true" label="Trim end sequences"
                        help="Only remove sequences at end of haplotigs. If you also want to remove the duplications in the middle,
                            set to false, however that may delete false positive duplications."/>
                    <param argument="-s" name="split" type="boolean"
                        truevalue="-s" falsevalue="" checked="false" label="Split contigs"/>
                    <param argument="-g" name="min_gap" type="integer" min="0" value="10000" label="Minimum gap size between duplications"/>
                </section>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Every output is picked up from a fixed file name in the working directory.
             The first filter restricts it to the function mode(s) producing that file;
             a second filter, where present, ties it to a user-selected option. -->

        <!-- get_seqs -->
        <data name="get_seqs_hap" format="fasta" from_work_dir="hap.fa"
            label="${tool.name} on ${on_string}: get_seqs haplotype">
            <filter>function_select['functions'] == 'get_seqs'</filter>
        </data>
        <data name="get_seqs_purged" format="fasta" from_work_dir="purged.fa"
            label="${tool.name} on ${on_string}: get_seqs purged sequences">
            <filter>function_select['functions'] == 'get_seqs'</filter>
        </data>

        <!-- split_fa -->
        <data name="split_fasta" format="fasta" from_work_dir="split.fasta"
            label="${tool.name} on ${on_string}: split FASTA">
            <filter>function_select['functions'] == 'split_fa'</filter>
        </data>

        <!-- ngscstat -->
        <data name="ngscstat_cov" format="tabular" from_work_dir="TX.base.cov"
            label="${tool.name} on ${on_string}: NGSCSTAT base coverage">
            <filter>function_select['functions'] == 'ngscstat'</filter>
            <filter>'ngscstat_coverage' in function_select['output_options']</filter>
        </data>

        <!-- Depth statistics, shared by ngscstat and pbcstat (both rename their file to depth.stat) -->
        <data name="stat_file" format="tabular" from_work_dir="depth.stat"
            label="${tool.name} on ${on_string}: depths">
            <filter>function_select['functions'] in ('ngscstat', 'pbcstat')</filter>
            <filter>'depth_stats' in function_select['output_options']</filter>
        </data>

        <!-- pbcstat -->
        <data name="pbcstat_cov" format="tabular" from_work_dir="PB.base.cov"
            label="${tool.name} on ${on_string}: PBCSTAT base coverage">
            <filter>function_select['functions'] == 'pbcstat'</filter>
            <filter>'pbcstat_coverage' in function_select['output_options']</filter>
        </data>
        <data name="pbcstat_wig" format="wig" from_work_dir="PB.cov.wig"
            label="${tool.name} on ${on_string}: PBCSTAT base coverage (WIG)">
            <filter>function_select['functions'] == 'pbcstat'</filter>
            <filter>'pbcstat_wig' in function_select['output_options']</filter>
        </data>

        <!-- Histogram plot, shared by pbcstat and ngscstat -->
        <data name="hist" format="png" from_work_dir="hist.png"
            label="${tool.name} on ${on_string}: histogram plot">
            <filter>function_select['functions'] in ('pbcstat', 'ngscstat')</filter>
            <filter>'histogram' in function_select['output_options']</filter>
        </data>

        <!-- calcuts (also run as part of pbcstat and ngscstat) -->
        <data name="calcuts_log" format="txt" from_work_dir="calcuts.log"
            label="${tool.name} on ${on_string}: calcuts log">
            <filter>function_select['functions'] in ('pbcstat', 'ngscstat', 'calcuts')</filter>
            <filter>'calcuts_log' in function_select['output_options']</filter>
        </data>
        <data name="calcuts_cutoff" format="tabular" from_work_dir="cutoffs.tsv"
            label="${tool.name} on ${on_string}: calcuts cutoff">
            <filter>function_select['functions'] in ('pbcstat', 'ngscstat', 'calcuts')</filter>
            <filter>'calcuts_cutoff' in function_select['output_options']</filter>
        </data>

        <!-- purge_dups: the log is only kept when the log_file option is enabled -->
        <data name="purge_dups_log" format="txt" from_work_dir="purge_dups.log"
            label="${tool.name} on ${on_string}: purge_dups log">
            <filter>function_select['functions'] == 'purge_dups'</filter>
            <filter>function_select['log_file']</filter>
        </data>
        <data name="purge_dups_bed" format="bed" from_work_dir="dups.bed"
            label="${tool.name} on ${on_string}: purge_dups BED">
            <filter>function_select['functions'] == 'purge_dups'</filter>
        </data>
    </outputs>
    <tests>
        <!-- Test 1: purge_dups on an uncompressed PAF file with two chaining rounds.
             log_file is left at its default, so only the BED output is expected. -->
        <test expect_num_outputs="1">
            <conditional name="function_select">
                <param name="functions" value="purge_dups"/>
                <param name="input" value="assembly_test.paf"/>
                <param name="coverage" value="test.cov" ftype="tabular"/>
                <param name="cutoffs" value="cutoffs.tsv" ftype="tabular"/>
                <param name="min_bad" value="0.01"/>
                <param name="min_align" value="10"/>
                <param name="min_match" value="100"/>
                <param name="min_chain" value="1"/>
                <param name="max_gap" value="1000"/>
                <conditional name="double_chain">
                    <param name="chaining_rounds" value="two"/>
                    <param name="max_gap_2" value="1001"/>
                </conditional>
                <param name="min_chain_score" value="1"/>
                <param name="max_extend" value="100"/>
            </conditional>
            <output name="purge_dups_bed" value="purge_dups_01.bed" ftype="bed"/>
        </test>
        <!-- Test 2: same settings as test 1 on a gzip-compressed PAF file, with the
             log output enabled, so BED and log are both expected. -->
        <test expect_num_outputs="2">
            <conditional name="function_select">
                <param name="functions" value="purge_dups"/>
                <param name="input" value="assembly_test.paf.gz" ftype="paf.gz"/>
                <param name="coverage" value="test.cov" ftype="tabular"/>
                <param name="cutoffs" value="cutoffs.tsv" ftype="tabular"/>
                <param name="min_bad" value="0.01"/>
                <param name="min_align" value="10"/>
                <param name="min_match" value="100"/>
                <param name="min_chain" value="1"/>
                <param name="max_gap" value="1000"/>
                <conditional name="double_chain">
                    <param name="chaining_rounds" value="two"/>
                    <param name="max_gap_2" value="1001"/>
                </conditional>
                <param name="min_chain_score" value="1"/>
                <param name="max_extend" value="100"/>
                <param name="log_file" value="true"/>
            </conditional>
            <output name="purge_dups_bed" value="purge_dups_02.bed" ftype="bed"/>
            <output name="purge_dups_log" value="purge_dups_log_02.txt" ftype="txt"/>
        </test>
        <!-- Test 3: purge_dups with two input files (one .paf, one .paf.gz) and all
             other parameters left at their defaults. -->
        <test expect_num_outputs="1">
            <conditional name="function_select">
                <param name="functions" value="purge_dups"/>
                <param name="input" value="assembly_test.paf,test2.paf.gz"/>
            </conditional>
            <output name="purge_dups_bed" value="purge_dups_03.bed" ftype="bed"/>
        </test>
        <!-- Test 4: split_fa on an assembly FASTA file; the split FASTA is the only output. -->
        <test expect_num_outputs="1">
            <conditional name="function_select">
                <param name="functions" value="split_fa"/>
                <param name="input" value="assembly_test.fasta"/>
            </conditional>
            <output name="split_fasta"
                value="split_04.fasta"
                ftype="fasta"/>
        </test>
        <!-- Test 5: pbcstat on an uncompressed PAF file with all six output options
             selected; the histogram is compared by size only. -->
        <test expect_num_outputs="6">
            <conditional name="function_select">
                <param name="functions" value="pbcstat"/>
                <param name="input" value="assembly_test.paf"/>
                <section name="pbcstat_options">
                    <param name="max_cov" value="1000"/>
                    <param name="min_map_ratio" value="0.01"/>
                    <param name="min_map_qual" value="1"/>
                    <param name="flank" value="1"/>
                    <param name="primary_alignments" value="-p"/>
                </section>
                <section name="section_calcuts">
                    <param name="min_depth" value="0.01"/>
                    <param name="low_depth" value="1"/>
                    <param name="transition" value="1"/>
                    <param name="upper_depth" value="100"/>
                    <param name="ploidy" value="-d 0"/>
                </section>
                <param name="output_options"
                    value="pbcstat_coverage,pbcstat_wig,depth_stats,histogram,calcuts_cutoff,calcuts_log"/>
            </conditional>
            <!-- calcuts results -->
            <output name="calcuts_cutoff" value="calcuts_cutoff_05.tabular" ftype="tabular"/>
            <output name="calcuts_log" value="calcuts_log_05.txt" ftype="txt"/>
            <!-- pbcstat results -->
            <output name="pbcstat_cov" value="pbcstat_cov_05.tabular" ftype="tabular"/>
            <output name="pbcstat_wig" value="pbcstat_cov_05.wig" ftype="wig"/>
            <output name="stat_file" value="pbcstats_05.tabular" ftype="tabular"/>
            <output name="hist" value="hist_05.png" ftype="png" compare="sim_size"/>
        </test>
        <!-- Test 6: pbcstat on a gzip-compressed PAF file, with the base coverage and
             calcuts cutoff outputs selected. -->
        <test expect_num_outputs="2">
            <conditional name="function_select">
                <param name="functions" value="pbcstat"/>
                <param name="input" value="assembly_test.paf.gz" ftype="paf.gz"/>
                <section name="pbcstat_options">
                    <param name="max_cov" value="1000"/>
                    <param name="min_map_ratio" value="0.01"/>
                    <param name="min_map_qual" value="1"/>
                    <param name="flank" value="1"/>
                    <param name="primary_alignments" value="-p"/>
                </section>
                <section name="section_calcuts">
                    <param name="min_depth" value="0.01"/>
                    <param name="low_depth" value="1"/>
                    <param name="transition" value="1"/>
                    <param name="upper_depth" value="100"/>
                    <param name="ploidy" value="-d 0"/>
                </section>
                <!-- output_options belongs to the function_select conditional, so it is set here. -->
                <param name="output_options" value="pbcstat_coverage,calcuts_cutoff"/>
            </conditional>
            <output name="calcuts_cutoff" value="calcuts_cutoff_06.tabular" ftype="tabular"/>
            <output name="pbcstat_cov" value="pbcstat_cov_06.tabular" ftype="tabular"/>
        </test>
        <!-- Test 7: pbcstat with two input files (one .paf, one .paf.gz) and default
             pbcstat options, with the base coverage and calcuts cutoff outputs selected. -->
        <test expect_num_outputs="2">
            <conditional name="function_select">
                <param name="functions" value="pbcstat"/>
                <param name="input" value="assembly_test.paf,test2.paf.gz"/>
                <section name="section_calcuts">
                    <param name="min_depth" value="0.01"/>
                    <param name="low_depth" value="1"/>
                    <param name="transition" value="1"/>
                    <param name="upper_depth" value="100"/>
                    <param name="ploidy" value="-d 0"/>
                </section>
                <!-- output_options belongs to the function_select conditional, so it is set here. -->
                <param name="output_options" value="pbcstat_coverage,calcuts_cutoff"/>
            </conditional>
            <output name="calcuts_cutoff" value="calcuts_cutoff_07.tabular" ftype="tabular"/>
            <output name="pbcstat_cov" value="pbcstat_cov_07.tabular" ftype="tabular"/>
        </test>
        <!-- Test 8: ngscstat on a BAM file, with the base coverage and calcuts cutoff
             outputs selected. -->
        <test expect_num_outputs="2">
            <conditional name="function_select">
                <param name="functions" value="ngscstat"/>
                <param name="input" value="test.bam"/>
                <section name="ngscstat_options">
                    <param name="min_align_qual" value="10"/>
                    <param name="max_insert" value="100"/>
                </section>
                <section name="section_calcuts">
                    <param name="min_depth" value="0.01"/>
                    <param name="low_depth" value="1"/>
                    <param name="transition" value="1"/>
                    <param name="upper_depth" value="100"/>
                    <param name="ploidy" value="-d 0"/>
                </section>
                <!-- output_options belongs to the function_select conditional, so it is set here. -->
                <param name="output_options" value="ngscstat_coverage,calcuts_cutoff"/>
            </conditional>
            <output name="calcuts_cutoff" value="calcuts_cutoff_08.tabular" ftype="tabular"/>
            <output name="ngscstat_cov" value="ngsc_cov_08.tabular" ftype="tabular"/>
        </test>
        <!-- Test 9: stand-alone calcuts on a depth statistics file; cutoffs and log are
             both expected.
             NOTE(review): output_options is not set here and the calcuts branch shows no
             output_macro expansion, yet both outputs are filtered on it; presumably the
             filters fall through in this mode. Confirm against macros.xml. -->
        <test expect_num_outputs="2">
            <conditional name="function_select">
                <param name="functions" value="calcuts"/>
                <param name="input" value="test.stat"/>
                <section name="section_calcuts">
                    <param name="min_depth" value="0.01"/>
                    <param name="low_depth" value="1"/>
                    <param name="transition" value="1"/>
                    <param name="upper_depth" value="100"/>
                    <param name="ploidy" value="-d 0"/>
                </section>
            </conditional>
            <output name="calcuts_cutoff"
                value="calcuts_cutoff_09.tabular"
                ftype="tabular"/>
            <output name="calcuts_log"
                value="calcuts_log_09.txt"
                ftype="txt"/>
        </test>
        <!-- Test 10: get_seqs with every boolean flag enabled; the advanced options are
             listed in the same order as they are declared in the inputs section. -->
        <test expect_num_outputs="2">
            <conditional name="function_select">
                <param name="functions" value="get_seqs"/>
                <param name="fasta_input" value="split_out.fasta"/>
                <param name="bed_input" value="dups.bed"/>
                <section name="advanced_options">
                    <param name="coverage" value="-c"/>
                    <param name="haplotigs" value="-a"/>
                    <param name="length" value="10"/>
                    <param name="min_ratio" value=".01"/>
                    <param name="end_trim" value="-e"/>
                    <param name="split" value="-s"/>
                    <param name="min_gap" value="100000"/>
                </section>
            </conditional>
            <output name="get_seqs_purged" value="get_seqs_purged_10.fa" ftype="fasta"/>
            <output name="get_seqs_hap" value="get_seqs_hap_10.fa" ftype="fasta"/>
        </test>
        <!-- Test 11: pbcstat with explicit histogram options; only the histogram output
             is selected and it is compared by size only. -->
        <test expect_num_outputs="1">
            <conditional name="function_select">
                <param name="functions" value="pbcstat"/>
                <param name="input" value="assembly_test.paf"/>
                <section name="pbcstat_options">
                    <param name="max_cov" value="1000"/>
                    <param name="min_map_ratio" value="0.01"/>
                    <param name="min_map_qual" value="1"/>
                    <param name="flank" value="1"/>
                    <param name="primary_alignments" value="-p"/>
                </section>
                <section name="section_calcuts">
                    <param name="min_depth" value="0.01"/>
                    <param name="low_depth" value="1"/>
                    <param name="transition" value="1"/>
                    <param name="upper_depth" value="100"/>
                    <param name="ploidy" value="-d 0"/>
                </section>
                <section name="section_hist">
                    <param name="ymax" value="100"/>
                    <param name="xmax" value="100"/>
                    <param name="cutoffs_his" value="calcuts_out.tsv"/>
                </section>
                <!-- output_options belongs to the function_select conditional, so it is set here. -->
                <param name="output_options" value="histogram"/>
            </conditional>
            <output name="hist" value="hist_11.png" ftype="png" compare="sim_size"/>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**Purpose**

The purge_dups tools are designed to remove haplotigs and contig overlaps in a de novo assembly based on read depth.
purge_dups can significantly improve genome assemblies by removing overlaps and haplotigs caused by sequence divergence
in heterozygous regions. This both removes false duplications in primary draft assemblies while retaining completeness and sequence
integrity, and can improve scaffolding.

----

.. class:: infomark

**Pipeline Guide**

Given a primary assembly and, optionally, an alternative assembly, follow the steps shown below to build your
own purge_dups pipeline; steps with the same number can be run simultaneously. Although step 5 is optional,
we highly recommend running it, because assemblers may produce overrepresented sequences. In such a case,
step 5 can be applied to remove those sequences.

    - Step 1: Calculate the coverage cutoffs and base coverages.
    - Step 2: Split an assembly with the **split_fa** function and do a self-self alignment by using minimap2.
    - Step 3: Purge haplotigs and overlaps with the **purge_dups** function.
    - Step 4: Get purged primary and haplotig sequences from the draft assembly with the **get_seqs** function.
    - Step 5: Merge hap.fa file, generated in the previous step, and the alternate assembly, and redo the above steps to get a decent haplotig set.

----

.. class:: infomark

**Limitations**

- Read depth cutoffs calculation: the coverage cutoffs can be larger for a low heterozygosity species, which makes the purged assembly smaller than expected. In such a case, please use script/hist_plot.py to make the histogram plot and set the coverage cutoffs manually.
- Repeats: purge_dups has a limited ability to process repeats.

----

.. class:: infomark

**Purged assembly validation**

There are many ways to validate the purged assembly. One way is to make a coverage plot for it; a second way is to run `BUSCO <https://busco.ezlab.org/>`_. A third option is to use `Merqury <https://github.com/marbl/merqury>`_.


    ]]></help>
    <expand macro="citations"/>
</tool>
<!--
author	iuc
date	Tue, 12 Oct 2021 19:07:05 +0000
parents	76d4cbefff85
children
-->