Mercurial > repos > eschen42 > w4mkmeans

diff w4mkmeans.xml @ 2:c415b7dc6f37 draft default tip
planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 3e916537da6bb37e6f3927d7a11e98e0ab6ef5ec
author: eschen42
date: Mon, 05 Mar 2018 12:40:17 -0500
parents: 02cafb660b72
--- a/w4mkmeans.xml	Wed Aug 09 18:06:55 2017 -0400
+++ b/w4mkmeans.xml	Mon Mar 05 12:40:17 2018 -0500
@@ -1,9 +1,11 @@
-<tool id="w4mkmeans" name="w4mKmeans" version="0.98.3">
-  <description>Calculate K-means for W4M dataMatrix features or samples</description>
+<tool id="w4mkmeans" name="Kmeans for W4m" version="0.98.4">
+  <description>Calculate K-means for W4m dataMatrix features or samples</description>
 
   <requirements>
-    <requirement type="package" version="3.3.2">r-base</requirement>
+    <requirement type="package" version="3.4.1">r-base</requirement>
     <requirement type="package" version="1.1_4">r-batch</requirement>
+    <requirement type="package" version="1.8.0">libssh2</requirement>
+    <requirement type="package" version="1.13.2">krb5</requirement>
   </requirements>
 
   <stdio>
@@ -27,19 +29,18 @@
       slots "\${GALAXY_SLOTS:-1}"
       variableMetadata_out '$variableMetadata_out'
       variable_metadata_path '$variableMetadata_in'
-    ; echo exit code $?
   ]]></command>
 
   <inputs>
-    <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" />
-    <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" />
-    <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable x metadata columns, separator: tab" />
-    <param name="categoricalPrefix" label="prefix for cluster names " type="text" value="k" help="[categorical_prefix] Some tools require non-numeric values to discern categorical data; e.g., enter 'k' here to prepend 'k' to cluster numbers in the output; default 'k'." />
+    <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="Feature (variable) x sample; decimal point: '.'; missing: NA; mode: numerical; separator: tab" />
+    <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="Sample x metadata columns; separator: tab" />
+    <param name="variableMetadata_in" label="Variable (feature) metadata file" type="data" format="tabular" help="Feature (variable) x metadata columns; separator: tab" />
+    <param name="categoricalPrefix" label="Prefix for cluster names " type="text" value="c" help="String prepended to cluster numbers in output; default 'c'; leave blank for no prefix." />
     <param name="ksamples" label="K value(s) for samples" type="text" value = "0" help="[ksamples] Single K or comma-separated Ks for samples, or 0 for none." />
     <param name="kfeatures" label="K value(s) for features" type="text" value = "0" help="[kfeatures] Single K or comma-separated Ks for features (variables), or 0 for none." />
-    <param name="iter_max" label="Max number of iterations" type="text" value = "10" help="[iter_max] The maximum number of iterations allowed; default 10." />
-    <param name="nstart" label="Number of random sets" type="text" value = "1" help="[nstart] How many random sets should be chosen; default 1." />
-    <param name="algorithm" label="Algorithm for clustering" type="select" value = "Hartigan-Wong" help="[algorithm] K-means clustering algorithm, default 'Hartigan-Wong'; alternatives 'Lloyd', 'MacQueen'; 'Forgy' is a synonym for 'Lloyd', see stats::kmeans reference for further info.">
+    <param name="iter_max" label="Maximum number of iterations" type="text" value = "20" help="[iter_max] The maximum number of iterations allowed; default 20." />
+    <param name="nstart" label="Number of random sets of clusters" type="text" value = "20" help="[nstart] How many random sets of clusters should be chosen initially; default 20." />
+    <param name="algorithm" label="Algorithm for clustering" type="select" value = "Hartigan-Wong" help="[algorithm] K-means clustering algorithm, default 'Hartigan-Wong'; &lt;br /&gt;alternatives 'Lloyd', 'MacQueen'; 'Forgy' (synonym for 'Lloyd'); see references.">
       <option value="Forgy">Forgy</option>
       <option value="Hartigan-Wong" selected="True">Hartigan-Wong</option>
       <option value="Lloyd">Lloyd</option>
@@ -48,9 +49,9 @@
   </inputs>
 
   <outputs>
-    <data name="sampleMetadata_out" label="${tool.name}_${sampleMetadata_in.name}" format="tabular" ></data>
-    <data name="variableMetadata_out" label="${tool.name}_${variableMetadata_in.name}" format="tabular" ></data>
-    <data name="scores_out" label="${tool.name}_${dataMatrix_in.name}.kmeans" format="tabular" ></data>
+    <data name="sampleMetadata_out" label="${sampleMetadata_in.name}.kmeans-smpl" format="tabular" ></data>
+    <data name="variableMetadata_out" label="${variableMetadata_in.name}.kmeans-vrbl" format="tabular" ></data>
+    <data name="scores_out" label="${dataMatrix_in.name}.kmeans-score" format="tabular" ></data>
   </outputs>
 
   <tests>
@@ -60,17 +61,17 @@
       <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
       <param name="ksamples" value="3,4"/>
       <param name="kfeatures" value="5,6,7"/>
-      <param name="iter_max" value="10"/>
-      <param name="nstart" value="1"/>
+      <param name="iter_max" value="20"/>
+      <param name="nstart" value="20"/>
       <param name="algorithm" value="Hartigan-Wong"/>
       <output name="scores_out">
         <assert_contents>
           <has_text     text="proportion" />
           <has_text     text="0.87482" />
-          <has_text     text="0.89248" />
-          <has_text     text="0.95355" />
-          <has_text     text="0.95673" />
-          <has_text     text="0.95963" />
+          <has_text     text="0.91765" />
+          <has_text     text="0.95362" />
+          <has_text     text="0.95719" />
+          <has_text     text="0.97966" />
         </assert_contents>
       </output>
     </test>
@@ -79,57 +80,38 @@
   <help>
     <![CDATA[
 
+===========================
+K-means for W4m data matrix
+===========================
+
 **Author** - Arthur Eschenlauer (University of Minnesota, esch0041@umn.edu)
 
----------------------------------------------------------------------------
-
-
-**Source** - The source code for the w4mkmeans tool is available (from the Hegeman lab github repository) at https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper
-
-**R code used** - The R code invoked by this wrapper is the R 'stats::kmeans' package
-
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
-
+**Source Code** - The source code for the w4mkmeans tool is available (from the Hegeman lab github repository) at https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper
 
-**Tool updates**
-
-See the **NEWS** section at the bottom of this page
+**R code used** - The R code invoked by this wrapper is the kmeans function of the R stats package, documented at https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html
 
----------------------------------------------------
-
-===========================
-K-means for W4M data matrix
-===========================
+**Tool updates** - See the **NEWS** section at the bottom of this page
 
 -----------
 Description
 -----------
 
-Calculate K-means for sample-clusters (or feature-clusters, or both) using W4M dataMatrix (i.e., XCMS-preprocessed data files) as input.
+This tool calculates K-means clusters for samples, features, or both using W4m dataMatrix (i.e., XCMS-preprocessed data files) as input and writes the results to new columns in sampleMetadata, variableMetadata, or both.
 
-*Please note that XCMS refers to features as 'variables'.  This documentation does not use either term consistently.*
+- If several, comma-separated K's are supplied, then one column is added for each K.
+- For feature-clustering, each feature is assigned to a cluster such that the feature's response for all samples is closer to the mean of all features for that cluster than to the mean for any other cluster.
+- For sample-clustering, each sample is assigned to a cluster such that the sample's response for all features is closer to the mean of all samples for that cluster than to the mean for any other cluster.
+- Clustering is mutually exclusive, **not** hierarchical.
 
+    - Hierarchical clustering is available through the W4m Heat Map tool, https://github.com/workflow4metabolomics/heatmap
 
 -----------------
 Workflow Position
 -----------------
 
-  - Tool category: Statistical Analysis
-  - Upstream tool category: Preprocessing
-  - Downstream tool categories: Statistical Analysis
-
-
-----------
-Motivation
-----------
-
-This tool clusters samples, features (variables), or both from the W4M dataMatrix and writes the results to new columns in sampleMetadata, variableMetadata, or both, respectively.
-
-  - If several, comma-separated K's are supplied, then one column is added for each K.
-  - This clustering is **not** hierarchical; each member of a cluster is not a member of any other cluster.
-  - For feature-clustering, each feature is assigned to a cluster such that the feature's response for all samples is closer to the mean of all features for that cluster than to the mean for any other cluster.
-  - For sample-clustering, each sample is assigned to a cluster such that the sample's response for all features is closer to the mean of all samples for that cluster than to the mean for any other cluster.
-
+- Tool category: Statistical Analysis
+- Upstream tool category: Preprocessing
+- Downstream tool categories: Statistical Analysis
 
 -----------
 Input files
@@ -152,92 +134,117 @@
 
 **Data matrix** - input-file dataset
 
-  - XCMS variable x sample 'dataMatrix' (tabular separated values) file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row and column names must be identical to the rownames of the sample and feature metadata, respectively (see below)
+- W4m variable (i.e. feature) x sample 'dataMatrix' (tabular separated values) file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row and column names must be identical to the rownames of the feature and sample metadata, respectively (see below)
 
 **Sample metadata** - input-file dataset
 
-  - XCMS sample x metadata 'sampleMetadata' (tabular separated values) file of the numeric and/or character sample metadata, with . as decimal and NA for missing values
+- W4m sample x metadata 'sampleMetadata' (tabular separated values) file of the numeric and/or character sample metadata, with . as decimal and NA for missing values
 
 **Feature metadata** - input-file dataset
 
-  - XCMS variable x metadata 'variableMetadata' (tabular separated values) file of the numeric and/or character feature metadata, with . as decimal and NA for missing values
+- W4m variable (i.e. feature) x metadata 'variableMetadata' (tabular separated values) file of the numeric and/or character feature metadata, with . as decimal and NA for missing values
 
-**kfeatures** - K or K's for features (default = 0)
+**Prefix for cluster names** - character(s) to add as prefix to category number (default = 'c')
+
+- Some tools treat only non-numeric data as categorical; this prefix ensures that cluster data will be treated as categorical; an empty string is permitted here if desired (provided that succeeding tools requiring categorical data accept integers).
 
-  - integer or comma-separated integers ; zero (the default) or less will result in no calculation.
+**K-values for samples** - K or K-range for samples (default = 0)
 
-**ksamples** - K or K-range for samples (default = 0)
+- Integer or comma-separated positive integers; zero (or less) will result in no calculation.
 
-  - integer or comma-separated integers ; zero (the default) or less will result in no calculation.
+**K-values for features** - K or K's for features (default = 0)
 
-**iter_max** - maximum_iterations (default = 10)
+- Integer or comma-separated positive integers; zero (or less) will result in no calculation.
 
-  - maximum number of iterations per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html).
+**Maximum number of iterations** - (default = 20)
 
-**nstart** - how many random sets should be chosen (default = 1)
+- Maximum number of iterations per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html for further info).
 
-  - maximum number of iterations per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html).
+**Number of random sets** - how many random sets should be chosen to start (default = 20)
+
+- Number of random sets of clusters to be chosen to start calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html for further info).
 
-**categorical_prefix** - character(s) to add as prefix to category number (default = 'k')
+**Algorithm** - Algorithm for clustering (default = Hartigan-Wong)
 
-  - some tools treat only non-numeric data as categorical; this prefix ('k' by default) ensures that clusters data will be treated as categorical; an empty string is permitted here if desired (and succeeding tools accept integers as categorical data).
+- K-means clustering algorithm: 'Hartigan-Wong', 'Lloyd', or 'MacQueen'; 'Forgy' is a synonym for 'Lloyd' (see references for further info).
 
 ------------
 Output files
 ------------
 
-**XCMS sampleMetadata** - (tabular separated values) file identical to the Sample metadata file given as an input argument, excepting one column added for each K
+**Sample Metadata** - (tabular separated values) file identical to the Sample metadata file given as an input argument, excepting one column added for each K
 
-  - **k#** - cluster number for clustering samples with K = #
+- **k#** - cluster number for clustering samples with K = #
 
-**XCMS variableMetadata** - (tabular separated values) file identical to the Feature metadata file given as an input argument, excepting one column added for each K
+**Variable Metadata** - (tabular separated values) file identical to the Feature metadata file given as an input argument, excepting one column added for each K
 
-  - **k#** - cluster number for clustering features with K = #
+- **k#** - cluster number for clustering features with K = #
 
 **scores** - (tabular separated values) file with one line for each K.
 
-  - **clusterOn** - what was clustered - either 'sample' or 'feature'
-  - **k** - the chosen K for clustering
-  - **totalSS** - total (*between-treatements* plus total of *within-treatements*) sum of squares
-  - **betweenSS** - *between-treatements* sum of squares
-  - **proportion** - betweenSS / totalSS
+- **clusterOn** - what was clustered - either 'sample' or 'feature'
+- **k** - the chosen K for clustering
+- **totalSS** - total (*between-treatments* plus total of *within-treatments*) sum of squares
+- **betweenSS** - *between-treatments* sum of squares
+- **proportion** - betweenSS / totalSS
 
 ---------------
 Working example
 ---------------
 
+.. class:: infomark
+
 **Input files**
 
-+-------------------+-------------------------------------------------------------------------------------------------------------------+
-| Input File        | Download from URL                                                                                                 |
-+===================+===================================================================================================================+
-| Data matrix       | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_dataMatrix.tsv       |
-+-------------------+-------------------------------------------------------------------------------------------------------------------+
-| Sample metadata   | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_sampleMetadata.tsv   |
-+-------------------+-------------------------------------------------------------------------------------------------------------------+
-| Feature metadata  | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_variableMetadata.tsv |
-+-------------------+-------------------------------------------------------------------------------------------------------------------+
++-------------------------------------------------------------------------------------------------------------------+
+| URL                                                                                                               |
++===================================================================================================================+
+| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_dataMatrix.tsv       |
++-------------------------------------------------------------------------------------------------------------------+
+| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_sampleMetadata.tsv   |
++-------------------------------------------------------------------------------------------------------------------+
+| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_variableMetadata.tsv |
++-------------------------------------------------------------------------------------------------------------------+
+
+.. class:: infomark
 
 **Other input parameters**
 
 +-----------------+---------------+
 | Input Parameter | Value         |
 +=================+===============+
+| prefix          | c             |
++-----------------+---------------+
 | ksamples        | 3,4           |
 +-----------------+---------------+
 | kfeatures       | 5,6,7         |
 +-----------------+---------------+
-| iter_max        | 10            |
+| iter_max        | 20            |
 +-----------------+---------------+
-| nstart          | 1             |
+| nstart          | 20            |
 +-----------------+---------------+
 | algorithm       | Hartigan-Wong |
 +-----------------+---------------+
 
+.. class:: infomark
+
+**Expected output files**
+
++-------------------------------------------------------------------------------------------------------------------+
+| URL                                                                                                               |
++===================================================================================================================+
+| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/output_kmeans-score.tsv    |
++-------------------------------------------------------------------------------------------------------------------+
+| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/output_kmeans-vrbl.tsv     |
++-------------------------------------------------------------------------------------------------------------------+
+| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/output_kmeans-smpl.tsv     |
++-------------------------------------------------------------------------------------------------------------------+
+
 ----
 NEWS
 ----
 
+- February 2018, Version 0.98.4 - Renamed output datasets to append '``.kmeans-smpl``', '``.kmeans-vrbl``', or '``.kmeans-score``'; refactored multi-threading.
 - August 2017, Version 0.98.3 - Add (optional) prefix to category numbers for downstream tools that treat only non-numeric data as categorical.
 - August 2017, Version 0.98.1 - First release
 
@@ -277,28 +284,12 @@
   year = 1965
 }
     ]]></citation>
-    <!-- W4M 3.0 - Guitton et al. 2017-->
+    <!-- W4m 3.0 - Guitton et al. 2017-->
     <citation type="doi">10.1016/j.biocel.2017.07.002</citation>
-    <!-- W4M 2.5 - Giacomini et al. 2014 -->
+    <!-- W4m 2.5 - Giacomini et al. 2014 -->
     <citation type="doi">10.1093/bioinformatics/btu813</citation>
     <!-- Hartigan and Wong algorithm -->
-    <citation type="bibtex"><![CDATA[
-@article{Hartigan79,
-  added-at = {2007-02-27T16:22:09.000+0100},
-  author = {Hartigan, J. and Wong, M.},
-  biburl = {https://www.bibsonomy.org/bibtex/23d8bfc440c5725783876929c022f67ce/pierpaolo.pk81},
-  description = {WSD},
-  interhash = {10d6d33920d9af578a4d0a556dc1477d},
-  intrahash = {3d8bfc440c5725783876929c022f67ce},
-  journal = {Applied Statistics},
-  keywords = {imported},
-  pages = {100-108},
-  timestamp = {2007-02-27T16:22:11.000+0100},
-  title = {Algorithm AS136: A k-means clustering algorithm},
-  volume = 28,
-  year = 1979
-}
-    ]]></citation>
+    <citation type="doi">10.2307/2346830</citation>
     <!-- Lloyd algorithm -->
     <citation type="doi">10.1109/TIT.1982.1056489</citation>
     <!-- MacQueen algorithm -->
author	eschen42
date	Mon, 05 Mar 2018 12:40:17 -0500
parents	02cafb660b72
children