diff nucmer.xml @ 4:7cd7a55a678d draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/mummer4 commit 026db7297e987c1b7ce7f5dd4f8746d1bd435538
author iuc
date Mon, 18 Mar 2024 12:41:25 +0000
parents e18267f90096
children
line wrap: on
line diff
--- a/nucmer.xml	Sat Nov 27 09:59:16 2021 +0000
+++ b/nucmer.xml	Mon Mar 18 12:41:25 2024 +0000
@@ -11,8 +11,11 @@
         <![CDATA[
         ln -s $reference_sequence reference.fa &&
         ln -s $query_sequence query.fa &&
-        nucmer 
+        nucmer
             $anchoring
+            #if $outform.out_format != "delta":
+                --sam-long=outsam.sam
+            #end if
             -b '$breaklen'
             -c '$mincluster'
             -D '$diagdiff'
@@ -31,111 +34,145 @@
                 $options.genome
                 -M '$options.max_chunk'
             #end if
-            'reference.fa' 'query.fa' 
-            #if $mumplot.plot == 'yes':
+            'reference.fa' 'query.fa'
+            #if $outform.out_format == "delta":
+               #if $mumplot.plot == 'yes' :
                 && mummerplot
-                    #if $mumplot.sequences.seq_input == 'yes':
+                    #if $outform.mumplot.sequences.seq_input == 'yes':
                         -R '$reference_sequence'
                         -Q '$query_sequence'
-                        $mumplot.sequences.layout
+                        $outform.mumplot.sequences.layout
                     #end if
-                    -b '$mumplot.breaklen'
-                    $mumplot.color
-                    $mumplot.coverage
-                    $mumplot.filter
-                    $mumplot.fat
-                    #if $mumplot.labels.IDs == 'yes':
-                        -IdR '$mumplot.labels.ref_id'
-                        -IdQ '$mumplot.labels.query_id'
+                    -b '$outform.mumplot.breaklen'
+                    $outform.mumplot.color
+                    $outform.mumplot.coverage
+                    $outform.mumplot.filter
+                    $outform.mumplot.fat
+                    #if $outform.mumplot.labels.IDs == 'yes':
+                        -IdR '$outform.mumplot.labels.ref_id'
+                        -IdQ '$outform.mumplot.labels.query_id'
                     #end if
-                    -s '$mumplot.size'
+                    -s '$outform.mumplot.size'
                     -terminal png
-                    -title '$mumplot.title'
-                    $mumplot.snp
-                    #if $mumplot.range.custom == 'yes':
-                        -x [$mumplot.range.min_x:$mumplot.range.max_x]
-                        -y [$mumplot.range.min_y:$mumplot.range.max_y]
+                    -title '$outform.mumplot.title'
+                    $outform.mumplot.snp
+                    #if $outform.mumplot.range.custom == 'yes':
+                        -x [$outform.mumplot.range.min_x:$outform.mumplot.range.max_x]
+                        -y [$outform.mumplot.range.min_y:$outform.mumplot.range.max_y]
                     #end if
                     'out.delta'
                     @MUMMER_GNUPLOT_MANUAL@
+                #end if
+            #else:
+                && samtools dict reference.fa > outsamhead
+                && tail -n +3 outsam.sam >> outsamhead
+                && samtools sort  -@ \${GALAXY_SLOTS:-1} -T "\${TMPDIR:-.}" outsamhead |
+                #if $outform.out_format == 'bam-long':
+                    samtools calmd -b --threads {GALAXY_SLOTS:-1} - reference.fa > outsam
+                #else if $outform.out_format == 'cram-long':
+                    samtools view -C --reference reference.fa -o outsam -
+                #end if
             #end if
         ]]>
-    </command> 
+    </command>
     <inputs>
         <param name="reference_sequence" type="data" format="fasta" label="Reference Sequence" help="FastA or multi-FastA" />
         <param name="query_sequence" type="data" format="fasta" label="Query Sequence" help="FastA or multi-FastA" />
+        <conditional name="outform">
+            <param name="out_format" type="select" label="Output format" help="Select delta format if a plot is needed. Jbrowse is a good choice to view cram and bam tracks">
+                <option value="bam-long">bam format</option>
+                <option value="cram-long">cram format</option>
+                <option value="delta">Mummer delta format - allows plots</option>
+            </param>
+            <when value="delta">
+                <conditional name="mumplot" >
+                    <param name="plot" type="select" label="Create a 2-D dotplot of the input sequences?" >
+                        <option value="no">No plot</option>
+                        <option value="yes">Plot</option>
+                    </param>
+                    <when value="yes" >
+                        <expand macro="mumplot_input" >
+                            <conditional name="sequences" >
+                                <param name="seq_input" type="select" label="Plot an ordered set of reference/query sequences?" >
+                                    <option value="no">NO</option>
+                                    <option value="yes">YES</option>
+                                </param>
+                                <when value="yes">
+                                    <param name="reference_sequence" type="data" format="fasta" label="Reference Sequence" help="(-R)" />
+                                    <param name="query_sequence" type="data" format="fasta" multiple="True" label="Query Sequence(s)" help="(-Q)" />
+                                    <param argument="--layout" type="boolean" truevalue="--layout" falsevalue="" label="Layout" help="Layout a .delta multiplot in an intelligible fashion." />
+                                </when>
+                                <when value="no" />
+                            </conditional>
+                        </expand>
+                    </when>
+                    <when value="no" />
+                </conditional>
+            </when>
+            <when value="bam-long"/>
+            <when value="cram-long"/>
+        </conditional>
         <param name="anchoring" type="select" label="Anchoring" help="Choose a match anchoring strategy">
             <option value="">Use default</option>
             <option value="--mum">Unique matches only (--mum)</option>
             <option value="--maxmatch">All matches (--maxmatch)</option>
         </param>
+
         <param name="breaklen" type="integer" argument="-b" value="200" label="Break Length"
-            help="Set the distance an alignment extension will attempt to extend poor scoring regions before giving up. (-b)" />
-        <param name="mincluster" type="integer" argument="-c" value="65" label="Minumum Cluster Length" help="Sets the minimum length of a cluster of matches. (-c)" />
+            help="Set the distance an alignment extension will attempt to extend poor scoring regions before giving up." />
+        <param name="mincluster" type="integer" argument="-c" value="65" label="Minumum Cluster Length" help="Sets the minimum length of a cluster of matches." />
         <param name="diagdiff" type="integer" argument="-D" value="5" label="Maximum Diagonal Difference"
-            help="Set the maximum diagonal difference between two adjacent anchors in a cluster. (-D)" />
+            help="Set the maximum diagonal difference between two adjacent anchors in a cluster." />
         <param name="diagfactor" type="float" argument="-d" value="0.12" label="Maximum Diagonal Difference"
-            help="Set the maximum diagonal difference between two adjacent anchors in a cluster as a differential fraction of the gap length. (-d)" />
-        <param name="noextend" type="boolean" argument="--noextend" truevalue="--noextend" falsevalue="" label="No Extend" help="Do not perform cluster extension step. (--noextend)" />
+            help="Set the maximum diagonal difference between two adjacent anchors in a cluster as a differential fraction of the gap length." />
+        <param type="boolean" argument="--noextend" truevalue="--noextend" falsevalue="" label="No Extend" help="Do not perform cluster extension step." />
         <param name="direction" type="select" label="Direction" help="Choose a direction of Query Sequence to Use">
-            <option value="">Use foward and reverse sequences</option>
+            <option value="">Use forward and reverse sequences</option>
             <option value="-f">Use only forward sequence of query (-f)</option>
-            <option value="-r">Use only reverese sequence of query (-r)</option>
+            <option value="-r">Use only reverse sequence of query (-r)</option>
         </param>
-        <param name="maxgap" type="integer" argument="-g" value="90" label="Maximum Gap Distance" help="Set the maximum gap between two adjacent matches in a cluster. (-g)" />
-        <param name="minmatch" type="integer" argument="-l" value="20" label="Minimum Match Length" help="Set the minimum length of a single exact match. (-l)" />
-        <param name="minalign" type="integer" argument="-L" value="0" label="Minumum Alignment Length" help="Minimum length of an alignment, after clustering and extension. (-L)" />
-        <param name="nooptimize" type="boolean" argument="--nooptimize" truevalue="--nooptimize" falsevalue="" label="No Alignment Score Optimization"
+        <param name="maxgap" type="integer" argument="-g" value="90" label="Maximum Gap Distance" help="Set the maximum gap between two adjacent matches in a cluster." />
+        <param name="minmatch" type="integer" argument="-l" value="20" label="Minimum Match Length" help="Set the minimum length of a single exact match." />
+        <param name="minalign" type="integer" argument="-L" value="0" label="Minumum Alignment Length" help="Minimum length of an alignment, after clustering and extension." />
+        <param type="boolean" argument="--nooptimize" truevalue="--nooptimize" falsevalue="" label="No Alignment Score Optimization"
             help="No alignment score optimization, i.e. if an alignment extension reaches the end of a sequence, it will not backtrack to optimize the alignment score and instead terminate the alignment at the end of the sequence. (--nooptimize)" />
-        <param name="nosimplify" type="boolean" argument="--nosimplify" truevalue="--nosimplify" falsevalue="" label="Don't Simplify Alignments"
-            help="Don't simplify alignments by removing shadowed clusters. Use this option when aligning a sequence to itself to look for repeats. (--nosimplify)" />
+        <param type="boolean" argument="--nosimplify" truevalue="--nosimplify" falsevalue="" label="Don't Simplify Alignments"
+            help="Don't simplify alignments by removing shadowed clusters. Use this option when aligning a sequence to itself to look for repeats." />
         <conditional name="options">
             <param name="advanced" type="select" label="Additional options">
                 <option value="defaults">Use defaults</option>
                 <option value="enable">Select additional options</option>
             </param>
             <when value="enable">
-                <param name="banded" type="boolean" argument="--banded" truevalue="--banded" falsevalue="" label="Banding"
+                <param type="boolean" argument="--banded" truevalue="--banded" falsevalue="" label="Banding"
                     help="Enforce absolute banding of dynamic programming matrix based on diagdiff parameter. (--banded)" />
-                <param name="large" type="boolean" argument="--large" truevalue="--large" falsevalue="" label="Offsets" help="Force the use of large offsets. (--large)" />
+                <param  type="boolean" argument="--large" truevalue="--large" falsevalue="" label="Offsets" help="Force the use of large offsets. (--large)" />
                 <param name="genome" type="boolean" argument="-G" truevalue="-G" falsevalue="" label="Map genome to genome" help="For long query sequences. (-G)" />
                 <param name="max_chunk" type="integer" argument="-M" value="50000" label="Max Chunk" help="Stop adding sequence for a thread if more than MAX already. (-M)" />
             </when>
             <when value="defaults" />
         </conditional>
-        <conditional name="mumplot" >
-            <param name="plot" type="select" label="Do you want to output a 2-D dotplot of the input sequences? (mummerplot)" >
-                <option value="yes">YES</option>
-                <option value="no">NO</option>
-            </param>
-            <when value="yes" >
-                <expand macro="mumplot_input" >
-                    <conditional name="sequences" >
-                        <param name="seq_input" type="select" label="Plot an ordered set of reference/query sequences?" >
-                            <option value="no">NO</option>
-                            <option value="yes">YES</option>
-                        </param>
-                        <when value="yes">
-                            <param name="reference_sequence" type="data" format="fasta" label="Reference Sequence" help="(-R)" />
-                            <param name="query_sequence" type="data" format="fasta" multiple="True" label="Query Sequence(s)" help="(-Q)" />
-                            <param name="layout" type="boolean" argument="--layout" truevalue="--layout" falsevalue="" label="Layout" help="Layout a .delta multiplot in an intelligible fashion. (--layout)" />
-                        </when>
-                        <when value="no" />
-                    </conditional>
-                </expand>
-            </when>
-            <when value="no" />
-        </conditional>
     </inputs>
     <outputs>
-        <data name="delta_output" format="tabular" from_work_dir="out.delta" label="${tool.name} on ${on_string}: alignment" />
+        <data name="delta_output" format="tabular" from_work_dir="out.delta" label="${tool.name} on ${on_string}: delta format">
+           <filter>outform["out_format"] == "delta"</filter>
+        </data>
+        <data name="sam_output" format="bam" from_work_dir="outsam"  label="${tool.name} on ${on_string}">
+           <filter>outform["out_format"] != "delta"</filter>
+           <change_format>
+                <when input="outform.out_format" value="cram-long" format="cram" />
+           </change_format>
+        </data>
         <data name="png_output" format="png" from_work_dir="out.png" label="${tool.name} on ${on_string}: plot" >
-            <filter>mumplot['plot'] == 'yes'</filter>
+            <filter>outform["out_format"] == "delta" and outform['mumplot']['plot'] == 'yes'</filter>
         </data>
     </outputs>
     <tests>
-        <test>
+        <test expect_num_outputs="2">
             <param name="advanced" value="defaults" />
+            <conditional name="outform">
+                <param name="out_format" value="delta" />
+            </conditional>
             <param name="plot" value="yes" />
             <param name="seq_input" value="yes" />
             <param name="reference_sequence" ftype="fasta" value="human_aqp3.fasta"/>
@@ -143,11 +180,34 @@
             <output name="delta_output" ftype="tabular" compare="diff" lines_diff="2" value="nucmer.txt"/>
             <output name="png_output" ftype="png" compare="sim_size" value="plot.png" />
         </test>
+       <test expect_num_outputs="1">
+            <param name="advanced" value="defaults" />
+            <conditional name="outform">
+                <param name="out_format" value="bam-long" />
+            </conditional>
+            <param name="seq_input" value="yes" />
+            <param name="reference_sequence" ftype="fasta" value="human_aqp3.fasta"/>
+            <param name="query_sequence" ftype="fasta" value="mouse_aqp3.fasta" />
+            <output name="sam_output" ftype="bam" compare="sim_size" value="out.bam" />
+        </test>
+        <test expect_num_outputs="1">
+            <param name="advanced" value="defaults" />
+            <conditional name="outform">
+                <param name="out_format" value="cram-long" />
+            </conditional>
+            <param name="seq_input" value="yes" />
+            <param name="reference_sequence" ftype="fasta" value="human_aqp3.fasta"/>
+            <param name="query_sequence" ftype="fasta" value="mouse_aqp3.fasta" />
+            <output name="sam_output" ftype="cram" compare="sim_size" value="out.cram" />
+        </test>
     </tests>
     <help><![CDATA[
-        nucmer is for the all-vs-all comparison of nucleotide sequences contained in multi-FastA data files. It is best used for highly similar sequence that may have large rearrangements. Common use cases are: comparing two unfinished shotgun sequencing assemblies, mapping an unfinished sequencing assembly to a finished genome, and comparing two fairly similar genomes that may have large rearrangements and duplications.
+        nucmer is for the all-vs-all comparison of nucleotide sequences contained in multi-FastA data files. It is best used for highly similar sequence that may
+        have large rearrangements. Common use cases are: comparing two unfinished shotgun sequencing assemblies, mapping an unfinished sequencing assembly
+        to a finished genome, and comparing two fairly similar genomes that may have large rearrangements and duplications.
 
-        All output coordinates reference the forward strand of the involved sequence, regardless of the match direction. Also, nucmer now uses only matches that are unique in the reference sequence by default, use different Anchoring options to change this behavior.
+        All output coordinates reference the forward strand of the involved sequence, regardless of the match direction. Also, nucmer now uses only matches that
+        are unique in the reference sequence by default, use different Anchoring options to change this behavior.
 
 **Options:**::
 
@@ -155,18 +215,24 @@
 
     nucmer
 
+    --sam-long      The original output format of nucmer, the delta format, contains only the minimum information necessary to quickly recreate the alignment.
+    It contains the name of the matching sequences, the length of the match, number of errors and positions of indels.
+    With --sam-long, it additionally reports the MD string (which specifies the mismatching positions), the sequence and, if applicable,
+    the quality values of the matching sequence. The long format is more expensive to compute and it generates larger output files,
+    but this option allows nucmer4 to match the behavior of other aligners such as Bowtie2 or BWA.
+
     --mum             Use anchor matches that are unique in both the reference and query (false)
 
     --maxmatch        Use all anchor matches regardless of their uniqueness (false)
 
-    -b                Set the distance an alignment extension will attempt to extend poor scoring regions 
+    -b                Set the distance an alignment extension will attempt to extend poor scoring regions
                       before giving up (200)
 
     -c                Sets the minimum length of a cluster of matches (65)
 
     -D                Set the maximum diagonal difference between two adjacent anchors in a cluster (5)
 
-    -d                Set the maximum diagonal difference between two adjacent anchors in a cluster as a 
+    -d                Set the maximum diagonal difference between two adjacent anchors in a cluster as a
                       differential fraction of the gap length (0.12)
 
     --noextend        Do not perform cluster extension step (false)
@@ -181,7 +247,7 @@
 
     -L                Minimum length of an alignment, after clustering and extension (0)
 
-    --nooptimize      No alignment score optimization, i.e. if an alignment extension reaches the end of a 
+    --nooptimize      No alignment score optimization, i.e. if an alignment extension reaches the end of a
                       sequence, it will not backtrack to optimize the alignment score and instead terminate
                       the alignment at the end of the sequence (false)
 
@@ -198,16 +264,16 @@
 
     mummerplot
 
-    -b             Highlight alignments with breakpoints further than breaklen nucleotides from the nearest 
+    -b             Highlight alignments with breakpoints further than breaklen nucleotides from the nearest
                    sequence end
 
-    -color         Color plot lines with a percent similarity gradient or turn off all plot color (default 
-                   color by match dir) If the plot is very sparse, edit the .gp script to plot with 
+    -color         Color plot lines with a percent similarity gradient or turn off all plot color (default
+                   color by match dir) If the plot is very sparse, edit the .gp script to plot with
                    'linespoints' instead of 'lines'
 
     -c             Generate a reference coverage plot (default for .tiling)
 
-    --filter       Only display .delta alignments which represent the "best" hit to any particular spot on 
+    --filter       Only display .delta alignments which represent the "best" hit to any particular spot on
                    either sequence, i.e. a one-to-one mapping of reference and query subsequences
 
     --fat          Layout sequences using fattest alignment only