changeset 0:649b9cb20668 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/mcl commit 6fcfaa3d5fefc854ec7398c2848e8db669593b71
author iuc
date Mon, 13 Jun 2022 17:34:32 +0000
parents
children e092787c0a29
files mcl.xml test-data/mcl-simple-pruning.out test-data/mcl-simple.mci test-data/mcl-simple.mci-no-tab.out test-data/mcl-simple.mci.tab test-data/mcl-simple.out test-data/mcl-simple.sif test-data/mcl-simple.tabular
diffstat 8 files changed, 356 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mcl.xml	Mon Jun 13 17:34:32 2022 +0000
@@ -0,0 +1,262 @@
+<tool id="mcl" name="MCL" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
+    <description>Markov Cluster Algorithm for graphs</description>
+    <macros>
+        <token name="@TOOL_VERSION@">14.137</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+    </macros>
+    <xrefs>
+        <xref type="bio.tools">mcl</xref>
+    </xrefs>
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">mcl</requirement>
+    </requirements>
+    <version_command>mcl --version</version_command>
+    <command detect_errors="aggressive">
+        <![CDATA[
+
+mcl '$input'
+	-I '$inflation'
+	$input_type_select.input_type
+	-V all -te \${GALAXY_SLOTS:-1}
+	$sum_loops
+	$discard_loops
+	#if $verbosity
+        #for $v in $verbosity
+                -v $v
+        #end for
+	#end if
+	#if $reweight
+			-c $reweight
+	#end if
+	#if $transform
+        #if $input_type_select.input_type == "--abc" or $input_type_select.input_type == "--sif"
+                -abc-tf '$transform'
+        #else if $input_type_select.input_type == ""
+                -tf '$transform'
+        #end if
+	#end if
+	#if $input_type_select.input_type == "--sif"
+        $input_type_select.expect_values
+	#else if $input_type_select.input_type == "" and $input_type_select.use_tab
+		-use-tab $input_type_select.use_tab
+	#end if
+	#if $pruning_options.cutoff
+        -P $pruning_options.cutoff
+	#end if
+	#if $pruning_options.selection_number
+        -S $pruning_options.selection_number
+	#end if
+	#if $pruning_options.recover_number
+        -R $pruning_options.recover_number
+	#end if
+	#if $pruning_options.recover_percentage
+        -pct $pruning_options.recover_percentage
+	#end if
+	-o '$output'
+
+	]]>
+    </command>
+    <inputs>
+        <conditional name="input_type_select">
+            <param name="input_type" type="select" label="Input type">
+                <option value="--abc" selected="true">Labeled</option>
+                <option value="--sif">SIF</option>
+                <option value="">Matrix</option>
+            </param>
+            <when value="">
+                <param type="data" format="tabular" name="use_tab" argument="-use-tab" label="tabular label input" optional="true" help="use tab file to convert the output to labels"/>
+            </when>
+            <when value="--sif">
+                <param type="boolean" name="expect_values" checked="false" truevalue="--expect-values" falsevalue="" argument="--expect-values" label="expect label:weight format" optional="true" help="accept extended SIF (label:weight fields)" />
+            </when>
+            <when value="--abc" />
+        </conditional>
+        <param type="data" format="txt,tabular,sif" name="input" label="Input" optional="false" />
+        <param name="inflation" argument="-I" type="float" value="2.0" label="Inflation" min="1" max="6" help="This value is the main handle for affecting cluster granularity. It is usually chosen somewhere in the range [1.2-5.0]. -I 5.0 will tend to result in fine-grained clusterings, and -I 1.2 will tend to result in very coarse grained clusterings. Your mileage will vary depending on the characteristics of your data. That is why it is a good idea to test the quality and coherency of your clusterings using clm dist and clm info. This will most likely reveal that certain values of -I are simply not right for your data." />
+        <param name="transform" type="text" area="true" label="Transform input matrix values" help="See https://micans.org/mcl/man/mcxio.html#trans for the transformation syntax" >
+            <sanitizer>
+                <valid initial="default">
+                  <add value="#" />
+                </valid>
+              </sanitizer>
+        </param>
+        <param argument="--discard-loops" type="boolean" checked="true" truevalue="--discard-loops=y" falsevalue="--discard-loops=n" label="Discard loops in input" help="Remove any loops that are present in the input. Bear in mind that loops will still be modified in all cases where the loop weight is not maximal among the list of edge weights for a given node." />
+        <param name="reweight" argument="-c" optional="true" type="float" value="1" label="Reweight loops" help="As the final step of loop computation (i.e. after initialization and shadowing) all loop weights are multiplied by the provided value." />
+        <param type="boolean" checked="false" truevalue="--sum-loops" falsevalue="" argument="--sum-loops" label="Set loops to sum of other arcs weights" />
+        <param name="verbosity" type="select" multiple="true" optional="true" label="Verbosity">
+            <option value="pruning">pruning</option>
+            <option value="explain">explain</option>
+            <option value="cls">cls</option>
+        </param>
+        <section title="Pruning options" name="pruning_options">
+            <param name="cutoff" type="integer" value="4000" label="cutoff" argument="-P" optional="true" help="inverse cutoff pruning value. Entries smaller than cutoff are removed"></param>
+            <param name="selection_number" type="integer" value="500" label="selection number" argument="-S" optional="true" help="pruning selection value. maximum number of entries (if applicable)"></param>
+            <param name="recover_number" type="integer" value="600" label="recover number" argument="-R" optional="true" help="Pruning recover number. Revover the largest discarded entries during pruning, if number of entries less then -R"></param>
+            <param name="recover_percentage" type="integer" value="90" label="recover percentage" argument="-pct" min="0" max="100" optional="true" help="Pruning recover percentage. Revover the largest discarded entries during pruning, if sum of remaining entries is less than -pct/100"></param>
+        </section>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" />
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1">
+            <param name="input" value="mcl-simple.tabular" ftype="tabular" />
+            <conditional name="input_type_select">
+                <param name="input_type" value="--abc" />
+            </conditional>
+            <param name="transform" value="gq(0.7),add(-0.7)" />
+            <param name="inflation" value="3"/>
+            <output name="output" file="mcl-simple.out" ftype="tabular" />
+        </test>
+        <test expect_num_outputs="1">
+            <param name="input" value="mcl-simple.mci" ftype="txt" />
+            <conditional name="input_type_select">
+                <param name="input_type" value="" />
+                <param name="use_tab" value="mcl-simple.mci.tab" ftype="tabular" />
+            </conditional>
+            <param name="transform" value="gq(0.7),add(-0.7)" />
+            <param name="inflation" value="3"/>
+            <output name="output" file="mcl-simple.out" ftype="tabular" />
+        </test>
+        <test expect_num_outputs="1">
+            <param name="input" value="mcl-simple.sif" ftype="sif" />
+            <conditional name="input_type_select">
+                <param name="input_type" value="--sif" />
+                <param name="expect_values" value="true" ftype="tabular" />
+            </conditional>
+            <param name="transform" value="gq(0.7),add(-0.7)" />
+            <param name="inflation" value="3"/>
+            <output name="output" file="mcl-simple.out" ftype="tabular" />
+        </test>
+        <test expect_num_outputs="1">
+            <param name="input" value="mcl-simple.tabular" ftype="tabular" />
+            <conditional name="input_type_select">
+                <param name="input_type" value="--abc" />
+            </conditional>
+            <param name="cutoff" value="1"/>
+            <param name="recover_number" value="1"/>
+            <param name="selection_number" value="1"/>
+            <param name="discard_loops" value="false"/>
+            <param name="reweight" value="0.5"/>
+            <param name="sum_loops" value="true"/>
+            <output name="output" file="mcl-simple-pruning.out" ftype="tabular" />
+        </test>
+        <test>
+            <param name="input" value="mcl-simple.mci" ftype="txt" />
+            <conditional name="input_type_select">
+                <param name="input_type" value="" />
+            </conditional>
+            <output name="output" file="mcl-simple.mci-no-tab.out" ftype="tabular" lines_diff="2"/> 
+        </test>
+    </tests>
+    <help>
+        <![CDATA[ 
+**What it does**
+
+The `Markov Cluster Algorithm`_, aka the MCL algorithm.
+
+The MCL algorithm is short for the Markov Cluster Algorithm, a fast and scalable unsupervised cluster algorithm for graphs (also known as networks) based on simulation of (stochastic) flow in graphs. It has found usage in bioinformatics and other disciplines.
+
+The MCL algorithm finds cluster structure in graphs by a mathematical bootstrapping procedure. The process deterministically computes (the probabilities of) random walks through the graph, and uses two operators transforming one set of probabilities into another. It does so using the language of stochastic matrices (also called Markov matrices) which capture the mathematical concept of random walks on a graph.
+
+The MCL algorithm simulates random walks within a graph by alternation of two operators called expansion and inflation. Expansion coincides with taking the power of a stochastic matrix using the normal matrix product (i.e. matrix squaring). Inflation corresponds with taking the Hadamard power of a matrix (taking powers entrywise), followed by a scaling step, such that the resulting matrix is stochastic again, i.e. the matrix elements (on each column) correspond to probability values. 
+
+The basic interface to the algorithm is very simple - you need only one option (the -I flag) to get to the heart of it. The number of clusters cannot be specified. It is implicitly controlled using the inflation parameter. Inflation affects the granularity or resolution of the clustering outcome, with low values (1.3, 1.4) leading to fewer and larger clusters and high values (5, 6) leading to more and smaller clusters; the default value of 2 is a good starting point. For large graphs you should also be aware of the pruning options for regulating resources.
+
+Network construction and reduction techniques should not be considered as part of a clustering algorithm. Nevertheless particular techniques may benefit particular methods or applications. In mcl many transformations are accessible through the *transform* option. It can be used for edge weight transformations and selection, as well as transformations that act on a graph as a whole. It is for example possible to remove edges with weight below 0.7 by issuing -tf 'gq(0.7)', where the quotes are necessary to prevent the shell from interpreting the parentheses. The option accepts more complicated sequences, such as -tf 'gq(0.7),add(-0.7)'. This causes all remaining edge weights to be shifted to the range [0-0.3], assuming that the input contains correlations. Many more transformations are supported, as documented in mcxio_.
+
+.. _Markov Cluster Algorithm: https://micans.org/mcl/man/mcl.html
+.. _mcxio: https://micans.org/mcl/man/mcxio.html
+
+**Input**
+
+MCL supports a number of different input formats. The recommended wayis to use a labeled input (ABC-format) The input is then a file or stream in which each line encodes an edge in terms of two labels (the 'A' and the 'B') and a numerical value (the 'C'), all separated by white space. MCL also supports SIF format and exposes a native matrix representation, which is useful whenever other programs of the mcl-suite are used in tandem::
+
+	Labeled
+		This simple format expects two or three fields separated by white space on each line. The first and sec-
+		ond fields are interpreted as labels specifying source and destination node respectively. The third fie-
+		ld, if present, specifies the weight of the arc connecting the two nodes.
+	
+	SIF
+		This option tells mcl to expect SIF (Simple Interaction File) format. This format is line based. The fi-
+		rst two fields specify the source node (as a label) and the relationship type. An arbitrary number of f-
+		ields may follow, each containing a label identifying a destination node. The second field is simply ig-
+		nored by mcl. As an extension to the SIF format weights may optionally follow the labels, separated from
+		them with a colon character. It is in this case necessary to use the --expect-values option.
+	
+	  --expect-values(expect label:weight format)
+		accept extended SIF format (label:weight fields)
+	
+	Matrix
+		MCL internal matrix representation. See https://micans.org/mcl/man/mcxio.html#gspec for detailed informa.
+		tion. Use -use-tab to write label output using dictionary file
+
+	  -use-tab <fname> (use mapping to write)
+		-use-tab is only useful when matrix input is used. It will use the tab file to convert the output to lab-
+		els; it does not fail on indices missing from the tab file, but will bind these to generated dummy labels. 
+
+**Options**::
+
+	-I <num> (inflation)
+		Sets the main inflation value to <num>. This value is the main handle for affecting cluster granularity.
+		It is usually chosen somewhere in the range [1.2-5.0]. -I 5.0 will tend to result in fine-grained clust-
+		erings, and -I 1.2 will tend to result in very coarse grained clusterings. Your mileage will vary depen-
+		ding on the characteristics of your data.
+
+	-tf <tf-spec> (transform)
+		transforms the values of the input matrix according to <tf-spec>
+
+	-c <num> (reweight loops)
+		increase loop-weights <num>-fold
+	--sum-loops (set loops to sum of other arcs weights)
+		with the -c <num> option, as the final step of loop computation (i.e. after initialization and shadowing)
+		all loop weights are multiplied by <num>, if supplied.
+		
+	--discard-loops=<y/n> (discard loops in input)
+		By default mcl will remove any loops that are present in the input. Use --discard-loops=n to turn this 
+		off. Bear in mind that loops will still be modified in all cases where the loop weight is not maximal a-
+		mong the list of edge weights for a given node.
+		
+**Pruning options**
+
+After computing a new (column stochastic) matrix vector during expansion (which is matrix multiplication c.q. squaring), the vector is successively exposed to different pruning strategies.The intent of pruning is that many small entries are removed while retaining much of the stochastic mass of the original vector. After pruning, vectors are rescaled to be stochastic again.MCL iterands are theoretically known to be sparse in a weighted sense, and this manoever effectively perturbs the MCL process a little in order to obtain matrices that are genuinely sparse, thus keeping the computation tractable.
+
+mcl proceeds as follows. First, entries that are smaller than cutoff are removed, resulting in a vector with at most 1/cutoff entries. The cutoff can be supplied as the inverse value (1/cutoff) by **-P**.
+
+Second, if the remaining stochastic mass (i.e. the sum of all remaining entries) is less than <pct>/100 (specified by the **-pct** flag) and the number of remaining entries is less than <r> (as specified by the **-R** flag), mcl will try to regain ground by recovering the largest discarded entries. The total number of entries is not allowed to grow larger than <r>. If recovery was not necessary, mcl tries to prune the vector further down to at most s entries (if applicable), as specified by the **-S** flag. If this results in a vector that satisfies the recovery condition then recovery is attempted, exactly as described above. The latter will not occur of course if <r> <= <s>.::
+
+    -P <int> (1/cutoff)
+        (inverted) rigid pruning threshold
+
+    -S <int> (selection number)
+        select down to <int> entries if needed
+
+    -R <int> (recover number)
+        recover to maximally <int> entries if needed
+
+    -pct <pct> (recover percentage)
+        try recovery if mass is less than <pct>
+
+**Output**
+
+The output is then a file where each line is a cluster of tab-separated labels.
+
+
+		]]>
+    </help>
+    <citations>
+        <citation type="doi">10.5281/zenodo.3364789</citation>
+        <citation type="bibtex">
+            <![CDATA[
+			@article{dongen29graph,
+				title={Graph Clustering by Flow Simulation. 2000},
+				author={Dongen, SV},
+				journal={Domplein},
+				volume={29},
+				pages={3512}
+			}
+			]]>
+        </citation>
+        <citation type="doi">10.1093/nar/30.7.1575</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mcl-simple-pruning.out	Mon Jun 13 17:34:32 2022 +0000
@@ -0,0 +1,7 @@
+F	G
+H	J
+A
+B
+C
+D
+E
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mcl-simple.mci	Mon Jun 13 17:34:32 2022 +0000
@@ -0,0 +1,16 @@
+(mclheader
+mcltype matrix
+dimensions 9x9
+)
+(mclmatrix
+begin
+0 1:35 2:10 3:15 4:40 5:20 6:10 8:13 $
+1 0:35 2:35 4:30 5:5 7:5 $
+2 0:10 1:35 3:55 6:15 $
+3 0:15 2:55 4:30 7:5 8:25 $
+4 0:40 1:30 3:30 7:10 $
+5 0:20 1:5 6:55 7:45 8:27 $
+6 0:10 2:15 5:55 7:75 8:30 $
+7 1:5 3:5 4:10 5:45 6:75 8:38 $
+8 0:13 3:25 5:27 6:30 7:38 $
+)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mcl-simple.mci-no-tab.out	Mon Jun 13 17:34:32 2022 +0000
@@ -0,0 +1,10 @@
+# cline: mcl test.mci "-V" "all" "-o" "test_out.mci"
+(mclheader
+mcltype matrix
+dimensions 9x2
+)
+(mclmatrix
+begin
+0 0 1 2 3 4 $
+1 5 6 7 8 $
+)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mcl-simple.mci.tab	Mon Jun 13 17:34:32 2022 +0000
@@ -0,0 +1,9 @@
+0	A
+1	B
+2	C
+3	D
+4	E
+5	F
+6	G
+7	H
+8	J
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mcl-simple.out	Mon Jun 13 17:34:32 2022 +0000
@@ -0,0 +1,3 @@
+F	G	H	J
+A	B	E
+C	D
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mcl-simple.sif	Mon Jun 13 17:34:32 2022 +0000
@@ -0,0 +1,9 @@
+A	x	A:0	B:35	C:10	D:15	E:40
+B	x	A:35	B:0	C:35	D:0	E:30
+C	x	A:10	B:35	C:0	D:55	E:0
+D	x	A:15	B:0	C:55	D:0	E:30
+E	x	A:40	B:30	C:0	D:30	E:0
+F	x	A:20	G:55	H:45	B:5
+G	x	H:75	C:15	J:30	A:10
+H	x	J:38	D:5	E:10	B:5
+J	x	A:13	D:25	F:27
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mcl-simple.tabular	Mon Jun 13 17:34:32 2022 +0000
@@ -0,0 +1,40 @@
+A	A	0
+A	B	35
+A	C	10
+A	D	15
+A	E	40
+B	A	35
+B	B	0
+B	C	35
+B	D	0
+B	E	30
+C	A	10
+C	B	35
+C	C	0
+C	D	55
+C	E	0
+D	A	15
+D	B	0
+D	C	55
+D	D	0
+D	E	30
+E	A	40
+E	B	30
+E	C	0
+E	D	30
+E	E	0
+F	A	20
+F	G	55
+F	H	45
+F	B	5
+G	H	75
+G	C	15
+G	J	30
+G	A	10
+H	J	38
+H	D	5
+H	E	10
+H	B	5
+J	A	13
+J	D	25
+J	F	27
\ No newline at end of file