Mercurial > repos > eschen42 > w4mkmeans
changeset 1:02cafb660b72 draft
planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit f600ce8a783df16e49272341dce0fc6bbc299b0a
author | eschen42 |
---|---|
date | Wed, 09 Aug 2017 18:06:55 -0400 |
parents | 6ccbe18131a6 |
children | c415b7dc6f37 |
files | w4mkmeans.xml w4mkmeans_routines.R w4mkmeans_wrapper.R |
diffstat | 3 files changed, 73 insertions(+), 49 deletions(-) [+] |
line wrap: on
line diff
--- a/w4mkmeans.xml Tue Aug 08 15:30:38 2017 -0400 +++ b/w4mkmeans.xml Wed Aug 09 18:06:55 2017 -0400 @@ -1,5 +1,5 @@ -<tool id="w4mkmeans" name="Kmeans_for_W4M" version="0.98.1"> - <description>Calculate K-means for dataMatrix features or samples</description> +<tool id="w4mkmeans" name="w4mKmeans" version="0.98.3"> + <description>Calculate K-means for W4M dataMatrix features or samples</description> <requirements> <requirement type="package" version="3.3.2">r-base</requirement> @@ -14,18 +14,19 @@ <command detect_errors="aggressive"><![CDATA[ Rscript $__tool_directory__/w4mkmeans_wrapper.R tool_directory $__tool_directory__ + algorithm '$algorithm' + categorical_prefix '$categoricalPrefix' data_matrix_path '$dataMatrix_in' - variable_metadata_path '$variableMetadata_in' - sample_metadata_path '$sampleMetadata_in' - ksamples '$ksamples' - kfeatures '$kfeatures' iter_max '$iter_max' + kfeatures '$kfeatures' + ksamples '$ksamples' nstart '$nstart' - algorithm '$algorithm' + sampleMetadata_out '$sampleMetadata_out' + sample_metadata_path '$sampleMetadata_in' scores_out '$scores_out' - sampleMetadata_out '$sampleMetadata_out' + slots "\${GALAXY_SLOTS:-1}" variableMetadata_out '$variableMetadata_out' - slots "\${GALAXY_SLOTS:-1}" + variable_metadata_path '$variableMetadata_in' ; echo exit code $? 
]]></command> @@ -33,11 +34,12 @@ <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" /> <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" /> <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable x metadata columns, separator: tab" /> + <param name="categoricalPrefix" label="prefix for cluster names " type="text" value="k" help="[categorical_prefix] Some tools require non-numeric values to discern categorical data; e.g., enter 'k' here to prepend 'k' to cluster numbers in the output; default 'k'." /> <param name="ksamples" label="K value(s) for samples" type="text" value = "0" help="[ksamples] Single K or comma-separated Ks for samples, or 0 for none." /> <param name="kfeatures" label="K value(s) for features" type="text" value = "0" help="[kfeatures] Single K or comma-separated Ks for features (variables), or 0 for none." /> <param name="iter_max" label="Max number of iterations" type="text" value = "10" help="[iter_max] The maximum number of iterations allowed; default 10." /> <param name="nstart" label="Number of random sets" type="text" value = "1" help="[nstart] How many random sets should be chosen; default 1." 
/> - <param name="algorithm" label="Algorithm for clustering" type="select" value = "Hartigan-Wong" help="[algorithm] K-means clustering algorithm, default 'Hartigan-Wong'; alternatives 'Lloyd', 'MacQueen'; 'Forgy' is a synonym for 'Lloyd', see references for further info."> + <param name="algorithm" label="Algorithm for clustering" type="select" value = "Hartigan-Wong" help="[algorithm] K-means clustering algorithm, default 'Hartigan-Wong'; alternatives 'Lloyd', 'MacQueen'; 'Forgy' is a synonym for 'Lloyd', see stats::kmeans reference for further info."> <option value="Forgy">Forgy</option> <option value="Hartigan-Wong" selected="True">Hartigan-Wong</option> <option value="Lloyd">Lloyd</option> @@ -52,7 +54,7 @@ </outputs> <tests> - <test> + <test> <param name="dataMatrix_in" value="input_dataMatrix.tsv"/> <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> @@ -176,6 +178,10 @@ - maximum number of iterations per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html). +**categorical_prefix** - character(s) to add as prefix to category number (default = 'k') + + - some tools treat only non-numeric data as categorical; this prefix ('k' by default) ensures that clusters data will be treated as categorical; an empty string is permitted here if desired (and succeeding tools accept integers as categorical data). + ------------ Output files ------------ @@ -232,7 +238,8 @@ NEWS ---- -August 2017, Version 0.98.1 - First release +- August 2017, Version 0.98.3 - Add (optional) prefix to category numbers for downstream tools that treat only non-numeric data as categorical. +- August 2017, Version 0.98.1 - First release --------- Citations
--- a/w4mkmeans_routines.R Tue Aug 08 15:30:38 2017 -0400 +++ b/w4mkmeans_routines.R Wed Aug 09 18:06:55 2017 -0400 @@ -10,23 +10,24 @@ "w4mkmeans: bad input.", "# contract:", " required - caller will provide an environment comprising:", - " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...", - " variableMetadata - the corresponding W4M data.frame having feature metadata", - " sampleMetdata - the corresponding W4M data.frame having sample metadata", - " dataMatrix - the corresponding W4M matrix", - " slots - the number of parallel slots for calculating kmeans", + " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...", + " variableMetadata - the corresponding W4M data.frame having feature metadata", + " sampleMetdata - the corresponding W4M data.frame having sample metadata", + " dataMatrix - the corresponding W4M matrix", + " slots - the number of parallel slots for calculating kmeans", " optional - environment may comprise:", - " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)", - " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)", - " iter.max - the maximum number of iterations when calculating a cluster (default = 10)", - " nstart - how many random sets of centers should be chosen (default = 1)", - " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", + " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)", + " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)", + " iter.max - the maximum number of iterations when calculating a cluster (default = 10)", + " nstart - how many random sets of centers should be chosen (default = 1)", + " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = 
Hartigan-Wong)", + " categorical_prefix - string to prepend to cluster numbers so that they are treated as categorical (default = 'k')", " ", " this routine will return a list comprising:", - " variableMetadata - the input variableMetadata data.frame with updates, if any", - " sampleMetadata - the input sampleMetadata data.frame with updates, if any", - " scores - an array of strings, each representing a line of a tsv having the following header:", - " clusterOn TAB k TAB totalSS TAB betweenSS TAB proportion" + " variableMetadata - the input variableMetadata data.frame with updates, if any", + " sampleMetadata - the input sampleMetadata data.frame with updates, if any", + " scores - an array of strings, each representing a line of a tsv having the following header:", + " clusterOn TAB k TAB totalSS TAB betweenSS TAB proportion" ) ) } @@ -37,11 +38,12 @@ lapply(w4kmeans_usage(),print) } # supply default arguments - if ( ! exists("iter.max" , env) ) env$iter.max <- 10 - if ( ! exists("nstart" , env) ) env$nstart <- 1 - if ( ! exists("algorithm", env) ) env$algorithm <- 'Hartigan-Wong' - if ( ! exists("ksamples" , env) ) env$ksamples <- c() - if ( ! exists("kfeatures", env) ) env$kfeatures <- c() + if ( ! exists("iter.max" , env) ) env$iter.max <- 10 + if ( ! exists("nstart" , env) ) env$nstart <- 1 + if ( ! exists("algorithm" , env) ) env$algorithm <- 'Hartigan-Wong' + if ( ! exists("categorical_prefix", env) ) env$categorical_prefix <- 'k' + if ( ! exists("ksamples" , env) ) env$ksamples <- c() + if ( ! 
exists("kfeatures" , env) ) env$kfeatures <- c() # check mandatory arguments expected <- c( "log_print" @@ -61,9 +63,19 @@ scores <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" ) sampleMetadata <- env$sampleMetadata featureMetadata <- env$variableMetadata - ksamples <- as.numeric(env$ksamples) - kfeatures <- as.numeric(env$kfeatures) slots <- env$slots + positive_ints <- function(a, what) { + i <- as.integer(a) # may introduce NAs by coercion + i <- i[!is.na(i)] # eliminate NAs + i <- i[i > 0] # eliminate non-positive integers + i <- unique(sort(i)) # eliminate redundancy and disorder + if (length(a)!=length(i)) { + failure_action("Some values for '", what, "' were skipped where not unique, not positive, or not convertible to an integer.") + } + return (i) # return results, if any + } + ksamples <- positive_ints(env$ksamples , "ksamples") + kfeatures <- positive_ints(env$kfeatures, "kfeatures") myLapply <- parLapply # uncomment the next line to mimic parLapply, but without parallelization (for testing/experimentation) @@ -113,7 +125,7 @@ for ( i in 1:ksamples_length ) { result <- smpl_result_list[[i]] if (result$success) { - sampleMetadata[sprintf("k%d",ksamples[i])] <- result$value$clusters + sampleMetadata[sprintf("k%d",ksamples[i])] <- sprintf("%s%d", env$categorical_prefix, result$value$clusters) scores <- c(scores, result$value$scores) } } @@ -132,7 +144,7 @@ for ( i in 1:kfeatures_length ) { result <- feat_result_list[[i]] if (result$success) { - featureMetadata[sprintf("k%d",kfeatures[i])] <- result$value$clusters + featureMetadata[sprintf("k%d",kfeatures[i])] <- sprintf("%s%d", env$categorical_prefix, result$value$clusters) scores <- c(scores, result$value$scores) } }
--- a/w4mkmeans_wrapper.R Tue Aug 08 15:30:38 2017 -0400 +++ b/w4mkmeans_wrapper.R Wed Aug 09 18:06:55 2017 -0400 @@ -8,25 +8,26 @@ # - [parallel::clusterApply](https://stat.ethz.ch/R-manual/R-devel/library/parallel/html/clusterApply.html) # invocation: -# Rscript $__tool_directory__/w4mkmeans_wrapper.R \ -# tool_directory $__tool_directory__ -# data_matrix_path '$dataMatrix_in' \ -# variable_metadata_path '$variableMetadata_in' \ -# sample_metadata_path '$sampleMetadata_in' \ -# kfeatures '$kfeatures' \ -# ksamples '$ksamples' \ -# iter_max '$iter_max' \ -# nstart '$nstart' \ -# algorithm '$algorithm' \ -# scores '$scores' \ -# sampleMetadata_out '$sampleMetadata_out' \ -# variableMetadata_out '$variableMetadata_out' \ -# slots "\${GALAXY_SLOTS:-1}" \ +# Rscript w4mkmeans_wrapper.R \ +# algorithm "$algorithm" \ +# categorical_prefix "$categoricalPrefix" \ +# data_matrix_path "$dataMatrix_in" \ +# iter_max "$iter_max" \ +# kfeatures "$kfeatures" \ +# ksamples "$ksamples" \ +# nstart "$nstart" \ +# sampleMetadata_out "$sampleMetadata_out" \ +# sample_metadata_path "$sampleMetadata_in" \ +# scores_out "$scores_out" \ +# slots "${GALAXY_SLOTS:-1}" \ +# variableMetadata_out "$variableMetadata_out" \ +# variable_metadata_path "$variableMetadata_in" # # <inputs> # <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" /> # <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" /> # <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable x metadata columns, separator: tab" /> +# <param name="categoricalPrefix" label="prefix for cluster names " type="text" value="k" help="Some tools require non-numeric values to discern categorical; e.g., enter 'k' here to prepend 'k' to cluster numbers in the output; default 'k'." 
/> # <param name="kfeatures" label="K value(s) for features" type="text" value="0" help="Single or min,max value(s) for K for features (variables), or 0 for none." /> # <param name="ksamples" label="K value(s) for samples" type="text" value="0" help="Single or min,max value(s) for K for samples, or 0 for none." /> # <param name="iter_max" label="Max number of iterations" type="text" value="10" help="The maximum number of iterations allowed; default 10." /> @@ -294,6 +295,10 @@ args_env$slots <- as.numeric( argVc['slots' ]) # string args args_env$algorithm <- as.character( argVc['algorithm']) +args_env$categorical_prefix <- as.character( argVc['categorical_prefix']) + + +# make local 'log_print' function available through 'env' args_env$log_print <- log_print log_print("PARAMETERS (parsed):")