Mercurial > repos > eschen42 > w4mkmeans
view w4mkmeans.xml @ 1:02cafb660b72 draft
planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit f600ce8a783df16e49272341dce0fc6bbc299b0a
author | eschen42 |
---|---|
date | Wed, 09 Aug 2017 18:06:55 -0400 |
parents | 6ccbe18131a6 |
children | c415b7dc6f37 |
line wrap: on
line source
<!--
  Galaxy tool wrapper for w4mkmeans_wrapper.R.
  Reads a W4M dataMatrix plus sample and variable metadata, runs K-means
  clustering on samples and/or features for each requested K, and writes the
  two metadata tables (with added cluster columns) and a scores table.
-->
<tool id="w4mkmeans" name="w4mKmeans" version="0.98.3">
  <description>Calculate K-means for W4M dataMatrix features or samples</description>

  <requirements>
    <requirement type="package" version="3.3.2">r-base</requirement>
    <requirement type="package" version="1.1_4">r-batch</requirement>
  </requirements>

  <!-- Any non-zero exit status from the command is fatal. -->
  <stdio>
    <exit_code range="1:" level="fatal" />
  </stdio>

  <!--
    The wrapper script takes alternating name/value arguments.
    Rscript must be the last command executed so that its exit status is the
    status Galaxy sees; a trailing "echo" would always report success and hide
    failures from the exit_code rule above.
    Paths are single-quoted so that directories containing spaces still work.
  -->
  <command detect_errors="aggressive"><![CDATA[
    Rscript '$__tool_directory__/w4mkmeans_wrapper.R'
      tool_directory '$__tool_directory__'
      algorithm '$algorithm'
      categorical_prefix '$categoricalPrefix'
      data_matrix_path '$dataMatrix_in'
      iter_max '$iter_max'
      kfeatures '$kfeatures'
      ksamples '$ksamples'
      nstart '$nstart'
      sampleMetadata_out '$sampleMetadata_out'
      sample_metadata_path '$sampleMetadata_in'
      scores_out '$scores_out'
      slots "\${GALAXY_SLOTS:-1}"
      variableMetadata_out '$variableMetadata_out'
      variable_metadata_path '$variableMetadata_in'
  ]]></command>

  <inputs>
    <!-- The three W4M input tables. -->
    <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular"
           help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" />
    <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular"
           help="sample x metadata columns, separator: tab" />
    <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular"
           help="variable x metadata columns, separator: tab" />
    <param name="categoricalPrefix" label="prefix for cluster names " type="text" value="k"
           help="[categorical_prefix] Some tools require non-numeric values to discern categorical data; e.g., enter 'k' here to prepend 'k' to cluster numbers in the output; default 'k'." />
    <!-- K lists stay free text because they may hold several comma-separated integers. -->
    <param name="ksamples" label="K value(s) for samples" type="text" value="0"
           help="[ksamples] Single K or comma-separated Ks for samples, or 0 for none." />
    <param name="kfeatures" label="K value(s) for features" type="text" value="0"
           help="[kfeatures] Single K or comma-separated Ks for features (variables), or 0 for none." />
    <!-- Single integers: let Galaxy validate them instead of accepting arbitrary text. -->
    <param name="iter_max" label="Max number of iterations" type="integer" value="10" min="1"
           help="[iter_max] The maximum number of iterations allowed; default 10." />
    <param name="nstart" label="Number of random sets" type="integer" value="1" min="1"
           help="[nstart] How many random sets should be chosen; default 1." />
    <!-- The default choice is the option marked selected. -->
    <param name="algorithm" label="Algorithm for clustering" type="select"
           help="[algorithm] K-means clustering algorithm, default 'Hartigan-Wong'; alternatives 'Lloyd', 'MacQueen'; 'Forgy' is a synonym for 'Lloyd', see stats::kmeans reference for further info.">
      <option value="Forgy">Forgy</option>
      <option value="Hartigan-Wong" selected="true">Hartigan-Wong</option>
      <option value="Lloyd">Lloyd</option>
      <option value="MacQueen">MacQueen</option>
    </param>
  </inputs>

  <outputs>
    <data name="sampleMetadata_out" label="${tool.name}_${sampleMetadata_in.name}" format="tabular" />
    <data name="variableMetadata_out" label="${tool.name}_${variableMetadata_in.name}" format="tabular" />
    <data name="scores_out" label="${tool.name}_${dataMatrix_in.name}.kmeans" format="tabular" />
  </outputs>

  <tests>
    <!-- Cluster samples with K = 3,4 and features with K = 5,6,7; check the scores table. -->
    <test>
      <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
      <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
      <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
      <param name="ksamples" value="3,4"/>
      <param name="kfeatures" value="5,6,7"/>
      <param name="iter_max" value="10"/>
      <param name="nstart" value="1"/>
      <param name="algorithm" value="Hartigan-Wong"/>
      <output name="scores_out">
        <assert_contents>
          <has_text text="proportion" />
          <has_text text="0.87482" />
          <has_text text="0.89248" />
          <has_text text="0.95355" />
          <has_text text="0.95673" />
          <has_text text="0.95963" />
        </assert_contents>
      </output>
    </test>
  </tests>

  <!-- The help body is reStructuredText; it is deliberately not indented. -->
  <help><![CDATA[

**Author** - Arthur Eschenlauer (University of Minnesota, esch0041@umn.edu)

---------------------------------------------------------------------------

**Source** - The source code for the w4mkmeans tool is available (from the Hegeman lab github repository) at https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper

**R code used** - The R code invoked by this wrapper is the 'kmeans' function of the R 'stats' package

----------------------------------------------------------------------------------------------------------------------------------------------------------------------

**Tool updates**

See the **NEWS** section at the bottom of this page

---------------------------------------------------

===========================
K-means for W4M data matrix
===========================

-----------
Description
-----------

Calculate K-means for sample-clusters (or feature-clusters, or both) using W4M dataMatrix (i.e., XCMS-preprocessed data files) as input.

*Please note that XCMS refers to features as 'variables'.  This documentation does not use either term consistently.*

-----------------
Workflow Position
-----------------

- Tool category: Statistical Analysis
- Upstream tool category: Preprocessing
- Downstream tool categories: Statistical Analysis

----------
Motivation
----------

This tool clusters samples, features (variables), or both from the W4M dataMatrix and writes the results to new columns in sampleMetadata, variableMetadata, or both, respectively.

- If several, comma-separated K's are supplied, then one column is added for each K.
- This clustering is **not** hierarchical; each member of a cluster is not a member of any other cluster.
- For feature-clustering, each feature is assigned to a cluster such that the feature's response for all samples is closer to the mean of all features for that cluster than to the mean for any other cluster.
- For sample-clustering, each sample is assigned to a cluster such that the sample's response for all features is closer to the mean of all samples for that cluster than to the mean for any other cluster.

-----------
Input files
-----------

+--------------------------------------------+------------+
| File                                       | Format     |
+============================================+============+
| Data matrix                                | tabular    |
+--------------------------------------------+------------+
| Sample metadata                            | tabular    |
+--------------------------------------------+------------+
| Variable (i.e., feature) metadata          | tabular    |
+--------------------------------------------+------------+

----------
Parameters
----------

**Data matrix** - input-file dataset

- XCMS variable x sample 'dataMatrix' (tabular separated values) file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row and column names must be identical to the rownames of the feature and sample metadata, respectively (see below)

**Sample metadata** - input-file dataset

- XCMS sample x metadata 'sampleMetadata' (tabular separated values) file of the numeric and/or character sample metadata, with . as decimal and NA for missing values

**Feature metadata** - input-file dataset

- XCMS variable x metadata 'variableMetadata' (tabular separated values) file of the numeric and/or character feature metadata, with . as decimal and NA for missing values

**kfeatures** - K or K's for features (default = 0)

- integer or comma-separated integers ; zero (the default) or less will result in no calculation.

**ksamples** - K or K's for samples (default = 0)

- integer or comma-separated integers ; zero (the default) or less will result in no calculation.

**iter_max** - maximum_iterations (default = 10)

- maximum number of iterations per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html).

**nstart** - how many random sets should be chosen (default = 1)

- number of random sets of initial cluster centers to try per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html).

**categorical_prefix** - character(s) to add as prefix to category number (default = 'k')

- some tools treat only non-numeric data as categorical; this prefix ('k' by default) ensures that cluster data will be treated as categorical; an empty string is permitted here if desired (and succeeding tools accept integers as categorical data).

------------
Output files
------------

**XCMS sampleMetadata**

- (tabular separated values) file identical to the Sample metadata file given as an input argument, excepting one column added for each K

  - **k#** - cluster number for clustering samples with K = #

**XCMS variableMetadata**

- (tabular separated values) file identical to the Feature metadata file given as an input argument, excepting one column added for each K

  - **k#** - cluster number for clustering features with K = #

**scores**

- (tabular separated values) file with one line for each K.

  - **clusterOn** - what was clustered - either 'sample' or 'feature'
  - **k** - the chosen K for clustering
  - **totalSS** - total (*between-treatments* plus total of *within-treatments*) sum of squares
  - **betweenSS** - *between-treatments* sum of squares
  - **proportion** - betweenSS / totalSS

---------------
Working example
---------------

**Input files**

+-------------------+-------------------------------------------------------------------------------------------------------------------+
| Input File        | Download from URL                                                                                                 |
+===================+===================================================================================================================+
| Data matrix       | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_dataMatrix.tsv       |
+-------------------+-------------------------------------------------------------------------------------------------------------------+
| Sample metadata   | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_sampleMetadata.tsv   |
+-------------------+-------------------------------------------------------------------------------------------------------------------+
| Feature metadata  | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_variableMetadata.tsv |
+-------------------+-------------------------------------------------------------------------------------------------------------------+

**Other input parameters**

+-----------------+---------------+
| Input Parameter | Value         |
+=================+===============+
| ksamples        | 3,4           |
+-----------------+---------------+
| kfeatures       | 5,6,7         |
+-----------------+---------------+
| iter_max        | 10            |
+-----------------+---------------+
| nstart          | 1             |
+-----------------+---------------+
| algorithm       | Hartigan-Wong |
+-----------------+---------------+

----
NEWS
----

- August 2017, Version 0.98.3 - Add (optional) prefix to category numbers for downstream tools that treat only non-numeric data as categorical.
- August 2017, Version 0.98.1 - First release

---------
Citations
---------

  ]]></help>

  <citations>
    <!-- R stats::kmeans reference -->
    <citation type="bibtex"><![CDATA[
      @incollection{RCoreTeam2017,
        title = {stats::kmeans - K-Means Clustering},
        booktitle = {R: A Language and Environment for Statistical Computing},
        author = {{R Core Team}},
        publisher = {R Foundation for Statistical Computing},
        address = {Vienna, Austria},
        year = {2017},
        url = {https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html},
      }
    ]]></citation>
    <!-- Forgy algorithm -->
    <citation type="bibtex"><![CDATA[
      @article{forgy65,
        added-at = {2006-03-23T12:22:43.000+0100},
        author = {Forgy, E.},
        biburl = {https://www.bibsonomy.org/bibtex/21e31409932ce91df646c4731350e1207/hotho},
        interhash = {c86383cba8cfe00d5e6ef200016aca3f},
        intrahash = {1e31409932ce91df646c4731350e1207},
        journal = {Biometrics},
        keywords = {clustering kmeans},
        number = 3,
        pages = {768-769},
        timestamp = {2006-03-23T12:22:43.000+0100},
        title = {Cluster Analysis of Multivariate Data: Efficiency versus Interpretability of Classification},
        volume = 21,
        year = 1965
      }
    ]]></citation>
    <!-- W4M 3.0 - Guitton et al. 2017-->
    <citation type="doi">10.1016/j.biocel.2017.07.002</citation>
    <!-- W4M 2.5 - Giacomini et al. 2014 -->
    <citation type="doi">10.1093/bioinformatics/btu813</citation>
    <!-- Hartigan and Wong algorithm -->
    <citation type="bibtex"><![CDATA[
      @article{Hartigan79,
        added-at = {2007-02-27T16:22:09.000+0100},
        author = {Hartigan, J. and Wong, M.},
        biburl = {https://www.bibsonomy.org/bibtex/23d8bfc440c5725783876929c022f67ce/pierpaolo.pk81},
        description = {WSD},
        interhash = {10d6d33920d9af578a4d0a556dc1477d},
        intrahash = {3d8bfc440c5725783876929c022f67ce},
        journal = {Applied Statistics},
        keywords = {imported},
        pages = {100-108},
        timestamp = {2007-02-27T16:22:11.000+0100},
        title = {Algorithm AS136: A k-means clustering algorithm},
        volume = 28,
        year = 1979
      }
    ]]></citation>
    <!-- Lloyd algorithm -->
    <citation type="doi">10.1109/TIT.1982.1056489</citation>
    <!-- MacQueen algorithm -->
    <citation type="bibtex"><![CDATA[
      @inproceedings{MacQueen1967,
        added-at = {2011-01-11T13:35:01.000+0100},
        author = {MacQueen, J. B.},
        biburl = {https://www.bibsonomy.org/bibtex/25dcdb8cd9fba78e0e791af619d61d66d/enitsirhc},
        booktitle = {Proc. of the fifth Berkeley Symposium on Mathematical Statistics and Probability},
        editor = {Cam, L. M. Le and Neyman, J.},
        interhash = {8d7d4dfe7d3a06b8c9c3c2bb7aa91e28},
        intrahash = {5dcdb8cd9fba78e0e791af619d61d66d},
        keywords = {kmeans clustering},
        pages = {281-297},
        publisher = {University of California Press},
        timestamp = {2011-01-11T13:35:01.000+0100},
        title = {Some Methods for Classification and Analysis of MultiVariate Observations},
        volume = 1,
        year = 1967
      }
    ]]></citation>
  </citations>
  <!-- vim:et:sw=2:ts=2: -->
</tool>