diff MUMmer/mummer_clustering.xml @ 0:61f30d177448 default tip

initial commit on Mummer toolsuite on toolshed
author eric
date Tue, 31 Mar 2015 14:19:49 +0200
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MUMmer/mummer_clustering.xml	Tue Mar 31 14:19:49 2015 +0200
@@ -0,0 +1,238 @@
+<tool id="mummer_clustering" name="MUMmer Clustering" version="0.9.alx" force_history_refresh="True">
+  <description>: order sequence matches in clusters</description>
+  <!-- Builds the gaps/mgaps command line. The match list is supplied on stdin and
+       the program's stdout is redirected into the out_tool dataset. Every parameter
+       is declared inside the "tool" conditional and is therefore read as $tool.NAME;
+       a bare $in_reference would be undefined when the template is filled. -->
+  <command>
+		$tool.cmd
+		#if $tool.cmd=="gaps":
+			'$tool.in_reference'
+			#if $tool.gaps_r=="yes":
+				-r
+			#end if
+		#end if
+		#if $tool.cmd=="mgaps":
+			#if $tool.cmd_C=="yes":
+				-C
+			#end if
+			-d $tool.cmd_d
+			#if $tool.cmd_e=="yes":
+				-e
+			#end if
+			-f $tool.cmd_f
+			-l $tool.cmd_l
+			-s $tool.cmd_s
+		#end if
+		&lt; '$tool.in_match_list'
+		&gt; '$out_tool'
+  </command>
+	<!-- All parameters live in the "tool" conditional. The selected value of "cmd"
+	     is used verbatim as the executable name in the command template and decides
+	     which of the two option sets below is shown.
+	     NOTE(review): format="text" is kept unchanged; Galaxy's stock plain-text
+	     extension is "txt", so confirm that "text" is registered for this suite. -->
+	<inputs>
+	  <conditional name="tool">
+		<param name="cmd" type="select" label="MUMmer clustering algorithm" help="Algorithms are run with default parameters (none). For specific args see help below" >
+			<option value="gaps" selected="true">gaps</option>
+			<option value="mgaps">mgaps</option>
+		</param>
+		<!-- gaps: reference FASTA as argument, optional -r flag, match list on stdin -->
+		<when value="gaps">
+			<param name="in_reference" type="data" format="fasta" label="Reference FastA file" />
+			<param name="gaps_r" type="select" label="Use reversed [-r]" >
+				<option value="no" selected="true">No</option>
+				<option value="yes">Yes</option>
+			</param>
+			<param name="in_match_list" type="data" format="text" label="MUMmer match list" help="See help for more details" />
+		</when>
+		<!-- mgaps: match list on stdin plus the -C/-d/-e/-f/-l/-s clustering options -->
+		<when value="mgaps">
+			<param name="in_match_list" type="data" format="text" label="MUMmer match list" help="See help for more details" />
+			<param name="cmd_C" type="select" label="Check input header labels have reversed keyword [-C]" >
+				<option value="no" selected="true">No</option>
+				<option value="yes">Yes</option>
+			</param>
+			<param name="cmd_d" type="integer" size="5" value="5" label="Max fixed diagonal difference [-d]" />
+			<param name="cmd_e" type="select" label="Use extent of cluster [-e]" >
+				<option value="no" selected="true">No</option>
+				<option value="yes">Yes</option>
+			</param>
+			<param name="cmd_f" type="float" size="5" value="0.05" label="Max fraction separation for diagonal difference [-f]" />
+			<param name="cmd_l" type="integer" size="5" value="200" label="Min cluster length [-l]" />
+			<param name="cmd_s" type="integer" size="5" value="1000" label="Max separation between adjacent matches in cluster [-s]" />
+		</when>
+	  </conditional>
+	</inputs>
+	<!-- Single result dataset; the command template redirects the program's stdout into it. -->
+	<outputs>
+		<data format="text" name="out_tool" label="Clustering output"/>
+	</outputs>
+	<!-- Packages this tool declares as dependencies.
+	     Disabled alternative, kept for reference:
+	     <requirement type="set_environment" version="3.23">MUMMER_PATH</requirement> -->
+	<requirements>
+		<requirement type="package" version="4.6.4">gnuplot</requirement>
+		<requirement type="package" version="3.23">mummer</requirement>
+	</requirements>
+	<!-- Functional tests for the wrapper. -->
+	<tests>
+		<!-- TODO: placeholder only; this test sets no parameters and checks no outputs.
+		     Add an input match list and an expected output file before relying on it. -->
+		<test>
+		</test>
+	</tests>
+	<help>
+- **MUMmer clustering Galaxy tool wrapper:** Alex Bossers, CVI of Wageningen UR, The Netherlands.
+- **MUMmer suite v3.22:** http://mummer.sourceforge.net
+- **MUMmer tutorials:** http://mummer.sourceforge.net/examples/
+If you found these tools/wrappers useful in your research, please acknowledge our work. If you improve 
+or modify the wrappers, please add yourself to the acknowledgement section instead of replacing existing names :)
+**MUMmer Clustering**
+MUMmer's clustering algorithms attempt to order small individual matches into larger match clusters 
+in order to make the output of mummer more intelligible. A dot plot makes it easy to spot alignment 
+regions from a match list, however when examining the data without graphic aids, it is very difficult 
+to draw any reasonable conclusions from the simple flat file list of matches. Clustering the matches 
+together into larger groups of neighboring matches makes this process much easier by ordering the 
+data and removing spurious matches.
+*gaps* is the primary clustering algorithm for run-mummer1, and although classified as a "clustering" 
+step, gaps is more of a sorting routine. It implements the LIS (longest increasing subsequence) algorithm 
+to extract the longest consistent set of matches between two sequences, and generates a single 
+cluster that represents the best "straight-line" arrangement of matches between the sequences. By 
+straight-line, we mean no rearrangements or inversions, just a simple path of agreeing matches 
+between the two sequences. This limits the usability of this program to the alignment of genomes 
+that are very similar and with no large scale mutations. *gaps* is best suited for the comparison of 
+near identical sequences with the goal of finding minor mutations like SNPs and small indels.
+Input can be filtered mummer output. The strange syntax is a result of a legacy issue described in 
+the Known problems (manual) section, and requires the header be stripped from the mummer output. In 
+addition, gaps is only designed to handle a single reference and a single query sequence, thus the 
+preceding mummer run must also follow this constraint. The -r is optional and designates the incoming 
+matches as reverse complement matches which must reference the reverse complement of the sequence, 
+therefore forcing mummer to be run without the -c option.
+Reference: http://mummer.sourceforge.net/manual/#gaps
+ > /home/aphillip/data/GHP.1con  Consistent matches
+      183       17     22    none      -      -
+      238       72    108    none     33     33
+      347      181     92    none      1      1
+      458      292     50    none     19     19
+      705      539     44    none      1      1
+      750      584     38    none      1      1
+      807      641     23     -16      0      4
+ (output continues ...)
+ > Wrap around
+   334398   329917     47    none      -    225
+   334446   329965     62    none      1      1
+   334539   330058     20    none     31     31
+   334560   330079     92    none      1      1
+   334653   330172     77    none      1      1
+   334740   330259     41    none     10     10
+ (output continues ...)
+ > /home/aphillip/data/GHP.1con  Other matches
+  1317231     4891     21    none      -      -
+  1317275     4927     21    none      -      -
+  1317804     5399     25    none    508    451
+   947580     5436     36    none      -      -
+    23406     5518     34    none      -      -
+   333079     6592     32    none      -      -
+ (output continues ...)
+Where the first line is the location of the reference file, and the first three columns are the same 
+as the three column match format described in the mummer section. The final three columns are the 
+overlap between this match and the previous match, the gap between the start of this match and the 
+end of the previous match in the reference, and the gap between the start of this match and the end 
+of the previous match in the query respectively.
+*mgaps* was introduced into the MUMmer pipeline in an effort to better handle large-scale 
+rearrangements and duplications. Unlike gaps, mgaps is a full clustering algorithm that is capable 
+of generating multiple groups of consistently ordered matches. Clustering is controlled by a set of 
+command-line parameters that adjust the minimum cluster size, maximum gap between matches, etc. Only 
+matches that were included in clusters will appear in the output, so by adjusting the command-line 
+parameters it is possible to filter out many of the spurious matches, thus leaving only the larger 
+areas of conservation between the input sequences. The major advantage of mgaps is its ability to 
+identify these "islands" of conservation. This frees the user from the single LIS restraints of the 
+gaps program and allows for the identification of large-scale rearrangements, duplications, gene 
+families and so on.
+Gaps can fail to identify clusters because they were not consistent with the LIS. However, by using 
+mgaps, all regions of conservation can now be identified. The only drawback being the increased 
+complexity of the output, where you once had only one cluster for the whole comparison, you usually 
+now get more. Because of this, it can sometimes be difficult separating the repetitive clusters from 
+"correct" clusters, *making mgaps more suited for global alignments instead of localized error detection*.
+Input can be raw mummer output. *mgaps* is only designed to handle a single reference and one or 
+more query sequences, thus the preceding mummer run must also follow this constraint. Please refer 
+to the run-mummer3 script (see online manual) for an example of how to use this program in an 
+alignment pipeline. Note that in order to cluster reverse complement matches, the reverse complement 
+matches must reference the reverse complement strand of the query sequence, therefore forcing mummer 
+to be run without the -c option. A rewrite of this algorithm to handle multiple reference sequences 
+and a better coordinate system (forward coordinates for reverse complement matches) is doubtful but 
+may eventually appear.
+The -d option can be interpreted as the number of insertions allowed between two matches in the same 
+cluster, while the -f option is a fraction equal to (diagonal difference / match separation) where 
+a higher value will increase the indel tolerance. Minimum cluster length is the sum of the contained 
+matches unless the -e option is used. The best way to get a feel for what each parameter controls 
+is to cluster the same data set numerous times with different values and observe the resulting 
+differences. It can also be helpful to set these parameters to the size of the element you wish to 
+capture, i.e. set the minimum cluster size to say the smallest exon you expect and set the max gap 
+to the smallest intron you expect to obtain clusters that could represent single exons (depending, 
+of course, on the similarity of the two sequences).
+Reference: http://mummer.sourceforge.net/manual/#mgaps
+**Output format**
+Output of *mgaps* shares much in common with the output of mummer and gaps, with a slightly different 
+header formatting than gaps to allow for multiple query sequences and multiple clusters. The output 
+of mgaps run on both forward and reverse complement matches is as follows:
+ > ID41
+ > ID41 Reverse
+  5177399        1    232    none      -      -
+  5177632      234   6794    none      1      1
+  5184433     7035     24    none      7      7
+  5184468     7069     23    none     11     10
+ > ID42
+    10181       43   1521    none      -      -
+ > ID42 Reverse
+  4654536       17     36    none      -      -
+  4654578       57    298    none      6      4
+  4654877      356    226    none      1      1
+ #
+  4655139      845     28    none      -      -
+  4655178      884    694    none     11     11
+  4655873     1579     20    none      1      1
+ #
+  4850044       17   1492    none      -      -
+  4851537     1510    711    none      1      1
+  4852249     2222     42    none      1      1
+ (output continues ...)
+Headers containing the ID for each query sequence are listed after the '>' characters, and a 
+following Reverse keyword identifies the reverse matches for that query sequence. Individual clusters 
+for each sequence are separated by a '#' character, and the six columns are exactly the same as the 
+gaps output (see the gaps section for more details). 
+	</help>