Mercurial > repos > iuc > mummer_nucmer

<tool id="mummer_nucmer" name="Nucmer" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <!-- Short description shown next to the tool name. -->
    <description>Align two or more sequences</description>
    <!-- Shared definitions (the @TOKEN@ placeholders and the macros expanded
         below) are pulled in from macros.xml, which is not part of this file. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="bio_tools"/>
    <!-- The gnuplot requirement is passed into the shared "requirements" macro;
         presumably it is needed for the mummerplot step (confirm in macros.xml). -->
    <expand macro="requirements">
        <expand macro="gnuplot_requirement"/>
    </expand>
    <!-- Cheetah command template. Runs nucmer on the reference and query, then
         either (delta output) optionally renders a dotplot with mummerplot, or
         (bam/cram output) re-headers, sorts and converts the SAM file with
         samtools. A non-zero exit code marks the job as failed. -->
    <command detect_errors="exit_code">
        <![CDATA[
        ## Link the input datasets to fixed names. The paths are single-quoted so
        ## that unusual characters in a dataset path cannot break the shell line.
        ln -s '$reference_sequence' reference.fa &&
        ln -s '$query_sequence' query.fa &&
        nucmer
            $anchoring
            ## SAM output is only requested for the bam/cram formats; the delta
            ## branch below relies on the out.delta file instead.
            #if $outform.out_format != "delta":
                --sam-long=outsam.sam
            #end if
            -b '$breaklen'
            -c '$mincluster'
            -D '$diagdiff'
            -d '$diagfactor'
            $noextend
            $direction
            -g '$maxgap'
            -l '$minmatch'
            -L '$minalign'
            $nooptimize
            $nosimplify
            --threads "\${GALAXY_SLOTS:-1}"
            #if $options.advanced == 'enable':
                $options.banded
                $options.large
                $options.genome
                -M '$options.max_chunk'
            #end if
            'reference.fa' 'query.fa'
            #if $outform.out_format == "delta":
                ## The plot switch lives inside the outform conditional, so it
                ## has to be addressed through its full path.
                #if $outform.mumplot.plot == 'yes':
                && mummerplot
                    #if $outform.mumplot.sequences.seq_input == 'yes':
                        ## NOTE(review): -R/-Q receive the top-level datasets; the
                        ## reference_sequence/query_sequence inputs declared under
                        ## outform.mumplot.sequences are not referenced here.
                        ## Confirm that this is intended.
                        -R '$reference_sequence'
                        -Q '$query_sequence'
                        $outform.mumplot.sequences.layout
                    #end if
                    -b '$outform.mumplot.breaklen'
                    $outform.mumplot.color
                    $outform.mumplot.coverage
                    $outform.mumplot.filter
                    $outform.mumplot.fat
                    #if $outform.mumplot.labels.IDs == 'yes':
                        -IdR '$outform.mumplot.labels.ref_id'
                        -IdQ '$outform.mumplot.labels.query_id'
                    #end if
                    -s '$outform.mumplot.size'
                    -terminal png
                    -title '$outform.mumplot.title'
                    $outform.mumplot.snp
                    #if $outform.mumplot.range.custom == 'yes':
                        ## Quoted so the shell never treats [min:max] as a glob.
                        -x '[$outform.mumplot.range.min_x:$outform.mumplot.range.max_x]'
                        -y '[$outform.mumplot.range.min_y:$outform.mumplot.range.max_y]'
                    #end if
                    'out.delta'
                    @MUMMER_GNUPLOT_MANUAL@
                #end if
            #else:
                ## Build a fresh header with samtools dict, then append the nucmer
                ## SAM from its third line onward (its first two lines are dropped).
                && samtools dict reference.fa > outsamhead
                && tail -n +3 outsam.sam >> outsamhead
                && samtools sort  -@ \${GALAXY_SLOTS:-1} -T "\${TMPDIR:-.}" outsamhead |
                #if $outform.out_format == 'bam-long':
                    ## The dollar sign is escaped so Cheetah leaves the expansion
                    ## of GALAXY_SLOTS to the shell.
                    samtools calmd -b --threads \${GALAXY_SLOTS:-1} - reference.fa > outsam
                #else if $outform.out_format == 'cram-long':
                    samtools view -C --reference reference.fa -o outsam -
                #end if
            #end if
        ]]>
    </command>
    <!-- Tool form. The two top-level datasets are aligned by nucmer; "outform"
         selects the output format and, for delta output, the optional plot
         settings; the remaining parameters map one-to-one onto nucmer flags. -->
    <inputs>
        <param name="reference_sequence" type="data" format="fasta" label="Reference Sequence" help="FastA or multi-FastA" />
        <param name="query_sequence" type="data" format="fasta" label="Query Sequence" help="FastA or multi-FastA" />
        <conditional name="outform">
            <param name="out_format" type="select" label="Output format" help="Select delta format if a plot is needed. JBrowse is a good choice to view cram and bam tracks">
                <option value="bam-long">bam format</option>
                <option value="cram-long">cram format</option>
                <option value="delta">Mummer delta format - allows plots</option>
            </param>
            <when value="delta">
                <!-- Plot options are only offered for delta output. -->
                <conditional name="mumplot" >
                    <param name="plot" type="select" label="Create a 2-D dotplot of the input sequences?" >
                        <option value="no">No plot</option>
                        <option value="yes">Plot</option>
                    </param>
                    <when value="yes" >
                        <!-- The shared mumplot_input macro (defined in macros.xml)
                             supplies the plot options; the "sequences" conditional
                             is passed into it as nested content. -->
                        <expand macro="mumplot_input" >
                            <conditional name="sequences" >
                                <param name="seq_input" type="select" label="Plot an ordered set of reference/query sequences?" >
                                    <option value="no">NO</option>
                                    <option value="yes">YES</option>
                                </param>
                                <when value="yes">
                                    <param name="reference_sequence" type="data" format="fasta" label="Reference Sequence" help="(-R)" />
                                    <param name="query_sequence" type="data" format="fasta" multiple="True" label="Query Sequence(s)" help="(-Q)" />
                                    <param argument="--layout" type="boolean" truevalue="--layout" falsevalue="" label="Layout" help="Layout a .delta multiplot in an intelligible fashion." />
                                </when>
                                <when value="no" />
                            </conditional>
                        </expand>
                    </when>
                    <when value="no" />
                </conditional>
            </when>
            <when value="bam-long"/>
            <when value="cram-long"/>
        </conditional>
        <!-- An empty option value leaves the corresponding flag off the command line. -->
        <param name="anchoring" type="select" label="Anchoring" help="Choose a match anchoring strategy">
            <option value="">Use default</option>
            <option value="--mum">Unique matches only (--mum)</option>
            <option value="--maxmatch">All matches (--maxmatch)</option>
        </param>

        <param name="breaklen" type="integer" argument="-b" value="200" label="Break Length"
            help="Set the distance an alignment extension will attempt to extend poor scoring regions before giving up." />
        <param name="mincluster" type="integer" argument="-c" value="65" label="Minimum Cluster Length" help="Sets the minimum length of a cluster of matches." />
        <param name="diagdiff" type="integer" argument="-D" value="5" label="Maximum Diagonal Difference"
            help="Set the maximum diagonal difference between two adjacent anchors in a cluster." />
        <!-- Labelled differently from diagdiff so the absolute limit and the
             fractional factor can be told apart in the form. -->
        <param name="diagfactor" type="float" argument="-d" value="0.12" label="Maximum Diagonal Difference Factor"
            help="Set the maximum diagonal difference between two adjacent anchors in a cluster as a differential fraction of the gap length." />
        <param type="boolean" argument="--noextend" truevalue="--noextend" falsevalue="" label="No Extend" help="Do not perform cluster extension step." />
        <param name="direction" type="select" label="Direction" help="Choose a direction of Query Sequence to Use">
            <option value="">Use forward and reverse sequences</option>
            <option value="-f">Use only forward sequence of query (-f)</option>
            <option value="-r">Use only reverse sequence of query (-r)</option>
        </param>
        <param name="maxgap" type="integer" argument="-g" value="90" label="Maximum Gap Distance" help="Set the maximum gap between two adjacent matches in a cluster." />
        <param name="minmatch" type="integer" argument="-l" value="20" label="Minimum Match Length" help="Set the minimum length of a single exact match." />
        <param name="minalign" type="integer" argument="-L" value="0" label="Minimum Alignment Length" help="Minimum length of an alignment, after clustering and extension." />
        <param type="boolean" argument="--nooptimize" truevalue="--nooptimize" falsevalue="" label="No Alignment Score Optimization"
            help="No alignment score optimization, i.e. if an alignment extension reaches the end of a sequence, it will not backtrack to optimize the alignment score and instead terminate the alignment at the end of the sequence. (--nooptimize)" />
        <param type="boolean" argument="--nosimplify" truevalue="--nosimplify" falsevalue="" label="Don't Simplify Alignments"
            help="Don't simplify alignments by removing shadowed clusters. Use this option when aligning a sequence to itself to look for repeats." />
        <!-- These flags are only added to the command line when "enable" is selected. -->
        <conditional name="options">
            <param name="advanced" type="select" label="Additional options">
                <option value="defaults">Use defaults</option>
                <option value="enable">Select additional options</option>
            </param>
            <when value="enable">
                <param type="boolean" argument="--banded" truevalue="--banded" falsevalue="" label="Banding"
                    help="Enforce absolute banding of dynamic programming matrix based on diagdiff parameter. (--banded)" />
                <param type="boolean" argument="--large" truevalue="--large" falsevalue="" label="Offsets" help="Force the use of large offsets. (--large)" />
                <param name="genome" type="boolean" argument="-G" truevalue="-G" falsevalue="" label="Map genome to genome" help="For long query sequences. (-G)" />
                <param name="max_chunk" type="integer" argument="-M" value="50000" label="Max Chunk" help="Stop adding sequence for a thread if more than MAX already. (-M)" />
            </when>
            <when value="defaults" />
        </conditional>
    </inputs>
    <!-- Output datasets. Each one is collected from a fixed file name in the
         job working directory and is only created when its filter is true. -->
    <outputs>
        <!-- Delta alignment file; produced only for the delta output format. -->
        <data name="delta_output"
              format="tabular"
              from_work_dir="out.delta"
              label="${tool.name} on ${on_string}: delta format">
            <filter>outform["out_format"] == "delta"</filter>
        </data>
        <!-- Alignment file for the non-delta formats: bam by default,
             switched to cram when cram-long is selected. -->
        <data name="sam_output"
              format="bam"
              from_work_dir="outsam"
              label="${tool.name} on ${on_string}">
            <filter>outform["out_format"] != "delta"</filter>
            <change_format>
                <when input="outform.out_format" value="cram-long" format="cram" />
            </change_format>
        </data>
        <!-- Dotplot image; requires delta output with plotting enabled. -->
        <data name="png_output"
              format="png"
              from_work_dir="out.png"
              label="${tool.name} on ${on_string}: plot">
            <filter>outform["out_format"] == "delta" and outform['mumplot']['plot'] == 'yes'</filter>
        </data>
    </outputs>
    <!-- Functional tests: one per output format, all using the same
         human/mouse AQP3 FASTA pair with default advanced options. -->
    <tests>
        <!-- Delta output with plotting enabled: expects the delta file and the PNG plot. -->
        <test expect_num_outputs="2">
            <param name="advanced" value="defaults" />
            <conditional name="outform">
                <param name="out_format" value="delta" />
            </conditional>
            <param name="plot" value="yes" />
            <param name="seq_input" value="yes" />
            <param name="reference_sequence" ftype="fasta" value="human_aqp3.fasta"/>
            <param name="query_sequence" ftype="fasta" value="mouse_aqp3.fasta" />
            <!-- Up to two differing lines are tolerated in the delta file. -->
            <output name="delta_output" ftype="tabular" compare="diff" lines_diff="2" value="nucmer.txt"/>
            <output name="png_output" ftype="png" compare="sim_size" value="plot.png" />
        </test>
        <!-- BAM output: a single alignment dataset, compared by size only. -->
       <test expect_num_outputs="1">
            <param name="advanced" value="defaults" />
            <conditional name="outform">
                <param name="out_format" value="bam-long" />
            </conditional>
            <param name="seq_input" value="yes" />
            <param name="reference_sequence" ftype="fasta" value="human_aqp3.fasta"/>
            <param name="query_sequence" ftype="fasta" value="mouse_aqp3.fasta" />
            <output name="sam_output" ftype="bam" compare="sim_size" value="out.bam" />
        </test>
        <!-- CRAM output: the same dataset slot, with its format switched to cram. -->
        <test expect_num_outputs="1">
            <param name="advanced" value="defaults" />
            <conditional name="outform">
                <param name="out_format" value="cram-long" />
            </conditional>
            <param name="seq_input" value="yes" />
            <param name="reference_sequence" ftype="fasta" value="human_aqp3.fasta"/>
            <param name="query_sequence" ftype="fasta" value="mouse_aqp3.fasta" />
            <output name="sam_output" ftype="cram" compare="sim_size" value="out.cram" />
        </test>
    </tests>
    <help><![CDATA[
        nucmer is for the all-vs-all comparison of nucleotide sequences contained in multi-FastA data files. It is best used for highly similar sequences that may
        have large rearrangements. Common use cases are: comparing two unfinished shotgun sequencing assemblies, mapping an unfinished sequencing assembly
        to a finished genome, and comparing two fairly similar genomes that may have large rearrangements and duplications.

        All output coordinates reference the forward strand of the involved sequence, regardless of the match direction. Also, by default nucmer now uses only
        matches that are unique in the reference sequence; use the Anchoring option to change this behavior.

**Options:**::

    Defaults in parentheses

    nucmer

    --sam-long        The original output format of nucmer, the delta format, contains only the minimum
                      information necessary to quickly recreate the alignment. It contains the name of the
                      matching sequences, the length of the match, number of errors and positions of indels.
                      With --sam-long, it additionally reports the MD string (which specifies the mismatching
                      positions), the sequence and, if applicable, the quality values of the matching sequence.
                      The long format is more expensive to compute and it generates larger output files,
                      but this option allows nucmer4 to match the behavior of other aligners such as Bowtie2 or BWA.

    --mum             Use anchor matches that are unique in both the reference and query (false)

    --maxmatch        Use all anchor matches regardless of their uniqueness (false)

    -b                Set the distance an alignment extension will attempt to extend poor scoring regions
                      before giving up (200)

    -c                Sets the minimum length of a cluster of matches (65)

    -D                Set the maximum diagonal difference between two adjacent anchors in a cluster (5)

    -d                Set the maximum diagonal difference between two adjacent anchors in a cluster as a
                      differential fraction of the gap length (0.12)

    --noextend        Do not perform cluster extension step (false)

    -f                Use only the forward strand of the Query sequences (false)

    -r                Use only the reverse complement of the Query sequences (false)

    -g                Set the maximum gap between two adjacent matches in a cluster (90)

    -l                Set the minimum length of a single exact match (20)

    -L                Minimum length of an alignment, after clustering and extension (0)

    --nooptimize      No alignment score optimization, i.e. if an alignment extension reaches the end of a
                      sequence, it will not backtrack to optimize the alignment score and instead terminate
                      the alignment at the end of the sequence (false)

    --nosimplify      Don't simplify alignments by removing shadowed clusters. Use this option when aligning
                      a sequence to itself to look for repeats (false)

    --banded          Enforce absolute banding of dynamic programming matrix based on diagdiff parameter (false)

    --large           Force the use of large offsets (false)

    -G                Map genome to genome (long query sequences) (false)

    -M                Max chunk. Stop adding sequence for a thread if more than MAX already. (50000)

    mummerplot

    -b             Highlight alignments with breakpoints further than breaklen nucleotides from the nearest
                   sequence end

    -color         Color plot lines with a percent similarity gradient or turn off all plot color (default
                   color by match dir) If the plot is very sparse, edit the .gp script to plot with
                   'linespoints' instead of 'lines'

    -c             Generate a reference coverage plot (default for .tiling)

    --filter       Only display .delta alignments which represent the "best" hit to any particular spot on
                   either sequence, i.e. a one-to-one mapping of reference and query subsequences

    --fat          Layout sequences using fattest alignment only

    -IdR           Plot a particular reference sequence ID on the X-axis

    -IdQ           Plot a particular query sequence ID on the Y-axis

    -s             Set the output size to small, medium or large (--small) (--medium) (--large) (default 'small')

    --SNP          Highlight SNP locations in each alignment

    -title         Specify the gnuplot plot title (default none)

    -x             Set the xrange for the plot '[min:max]'

    -y             Set the yrange for the plot '[min:max]'

    -R             Plot an ordered set of reference sequences from Rfile

    -Q             Plot an ordered set of query sequences from Qfile

    --layout       Layout a .delta multiplot in an intelligible fashion, this option requires the -R -Q options

    ]]></help>
        <expand macro="citation" />
</tool>
author	iuc
date	Mon, 18 Mar 2024 12:41:25 +0000
parents	e18267f90096
children