# HG changeset patch # User eschen42 # Date 1502316415 14400 # Node ID 02cafb660b725130aa86e0eba2c1fcbd3fad42a4 # Parent 6ccbe18131a6017857050f5fcb811d0aaacea6f4 planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit f600ce8a783df16e49272341dce0fc6bbc299b0a diff -r 6ccbe18131a6 -r 02cafb660b72 w4mkmeans.xml --- a/w4mkmeans.xml Tue Aug 08 15:30:38 2017 -0400 +++ b/w4mkmeans.xml Wed Aug 09 18:06:55 2017 -0400 @@ -1,5 +1,5 @@ - - Calculate K-means for dataMatrix features or samples + + Calculate K-means for W4M dataMatrix features or samples r-base @@ -14,18 +14,19 @@ @@ -33,11 +34,12 @@ + - + @@ -52,7 +54,7 @@ - + @@ -176,6 +178,10 @@ - maximum number of iterations per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html). +**categorical_prefix** - character(s) to add as prefix to category number (default = 'k') + + - some tools treat only non-numeric data as categorical; this prefix ('k' by default) ensures that clusters data will be treated as categorical; an empty string is permitted here if desired (and succeeding tools accept integers as categorical data). + ------------ Output files ------------ @@ -232,7 +238,8 @@ NEWS ---- -August 2017, Version 0.98.1 - First release +- August 2017, Version 0.98.3 - Add (optional) prefix to category numbers for downstream tools that treat only non-numeric data as categorical. +- August 2017, Version 0.98.1 - First release --------- Citations diff -r 6ccbe18131a6 -r 02cafb660b72 w4mkmeans_routines.R --- a/w4mkmeans_routines.R Tue Aug 08 15:30:38 2017 -0400 +++ b/w4mkmeans_routines.R Wed Aug 09 18:06:55 2017 -0400 @@ -10,23 +10,24 @@ "w4mkmeans: bad input.", "# contract:", " required - caller will provide an environment comprising:", - " log_print - a logging function with the signature function(x, ...) 
expecting strings as x and ...", - " variableMetadata - the corresponding W4M data.frame having feature metadata", - " sampleMetdata - the corresponding W4M data.frame having sample metadata", - " dataMatrix - the corresponding W4M matrix", - " slots - the number of parallel slots for calculating kmeans", + " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...", + " variableMetadata - the corresponding W4M data.frame having feature metadata", + " sampleMetadata - the corresponding W4M data.frame having sample metadata", + " dataMatrix - the corresponding W4M matrix", + " slots - the number of parallel slots for calculating kmeans", " optional - environment may comprise:", - " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)", - " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)", - " iter.max - the maximum number of iterations when calculating a cluster (default = 10)", - " nstart - how many random sets of centers should be chosen (default = 1)", - " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", + " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)", + " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)", + " iter.max - the maximum number of iterations when calculating a cluster (default = 10)", + " nstart - how many random sets of centers should be chosen (default = 1)", + " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", + " categorical_prefix - string to add as prefix to category number (default = 'k')", " ", " this routine will return a list comprising:", - " variableMetadata - the input variableMetadata data.frame with updates, if any", - " sampleMetadata - the input sampleMetadata
data.frame with updates, if any", - " scores - an array of strings, each representing a line of a tsv having the following header:", - " clusterOn TAB k TAB totalSS TAB betweenSS TAB proportion" + " variableMetadata - the input variableMetadata data.frame with updates, if any", + " sampleMetadata - the input sampleMetadata data.frame with updates, if any", + " scores - an array of strings, each representing a line of a tsv having the following header:", + " clusterOn TAB k TAB totalSS TAB betweenSS TAB proportion" ) ) } @@ -37,11 +38,12 @@ lapply(w4kmeans_usage(),print) } # supply default arguments - if ( ! exists("iter.max" , env) ) env$iter.max <- 10 - if ( ! exists("nstart" , env) ) env$nstart <- 1 - if ( ! exists("algorithm", env) ) env$algorithm <- 'Hartigan-Wong' - if ( ! exists("ksamples" , env) ) env$ksamples <- c() - if ( ! exists("kfeatures", env) ) env$kfeatures <- c() + if ( ! exists("iter.max" , env) ) env$iter.max <- 10 + if ( ! exists("nstart" , env) ) env$nstart <- 1 + if ( ! exists("algorithm" , env) ) env$algorithm <- 'Hartigan-Wong' + if ( ! exists("categorical_prefix", env) ) env$categorical_prefix <- 'k' + if ( ! exists("ksamples" , env) ) env$ksamples <- c() + if ( ! 
exists("kfeatures" , env) ) env$kfeatures <- c() # check mandatory arguments expected <- c( "log_print" @@ -61,9 +63,19 @@ scores <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" ) sampleMetadata <- env$sampleMetadata featureMetadata <- env$variableMetadata - ksamples <- as.numeric(env$ksamples) - kfeatures <- as.numeric(env$kfeatures) slots <- env$slots + positive_ints <- function(a, what) { + i <- as.integer(a) # may introduce NAs by coercion + i <- i[!is.na(i)] # eliminate NAs + i <- i[i > 0] # eliminate non-positive integers + i <- unique(sort(i)) # eliminate redundancy and disorder + if (length(a)!=length(i)) { + failure_action("Some values for '", what, "' were skipped where not unique, not positive, or not convertible to an integer.") + } + return (i) # return results, if any + } + ksamples <- positive_ints(env$ksamples , "ksamples") + kfeatures <- positive_ints(env$kfeatures, "kfeatures") myLapply <- parLapply # uncomment the next line to mimic parLapply, but without parallelization (for testing/experimentation) @@ -113,7 +125,7 @@ for ( i in 1:ksamples_length ) { result <- smpl_result_list[[i]] if (result$success) { - sampleMetadata[sprintf("k%d",ksamples[i])] <- result$value$clusters + sampleMetadata[sprintf("k%d",ksamples[i])] <- sprintf("%s%d", env$categorical_prefix, result$value$clusters) scores <- c(scores, result$value$scores) } } @@ -132,7 +144,7 @@ for ( i in 1:kfeatures_length ) { result <- feat_result_list[[i]] if (result$success) { - featureMetadata[sprintf("k%d",kfeatures[i])] <- result$value$clusters + featureMetadata[sprintf("k%d",kfeatures[i])] <- sprintf("%s%d", env$categorical_prefix, result$value$clusters) scores <- c(scores, result$value$scores) } } diff -r 6ccbe18131a6 -r 02cafb660b72 w4mkmeans_wrapper.R --- a/w4mkmeans_wrapper.R Tue Aug 08 15:30:38 2017 -0400 +++ b/w4mkmeans_wrapper.R Wed Aug 09 18:06:55 2017 -0400 @@ -8,25 +8,26 @@ # - 
[parallel::clusterApply](https://stat.ethz.ch/R-manual/R-devel/library/parallel/html/clusterApply.html) # invocation: -# Rscript $__tool_directory__/w4mkmeans_wrapper.R \ -# tool_directory $__tool_directory__ -# data_matrix_path '$dataMatrix_in' \ -# variable_metadata_path '$variableMetadata_in' \ -# sample_metadata_path '$sampleMetadata_in' \ -# kfeatures '$kfeatures' \ -# ksamples '$ksamples' \ -# iter_max '$iter_max' \ -# nstart '$nstart' \ -# algorithm '$algorithm' \ -# scores '$scores' \ -# sampleMetadata_out '$sampleMetadata_out' \ -# variableMetadata_out '$variableMetadata_out' \ -# slots "\${GALAXY_SLOTS:-1}" \ +# Rscript w4mkmeans_wrapper.R \ +# algorithm "$algorithm" \ +# categorical_prefix "$categorical_prefix" \ +# data_matrix_path "$dataMatrix_in" \ +# iter_max "$iter_max" \ +# kfeatures "$kfeatures" \ +# ksamples "$ksamples" \ +# nstart "$nstart" \ +# sampleMetadata_out "$sampleMetadata_out" \ +# sample_metadata_path "$sampleMetadata_in" \ +# scores_out "$scores_out" \ +# slots "${GALAXY_SLOTS:-1}" \ +# variableMetadata_out "$variableMetadata_out" \ +# variable_metadata_path "$variableMetadata_in" # # # # # +# # # # @@ -294,6 +295,10 @@ args_env$slots <- as.numeric( argVc['slots' ]) # string args args_env$algorithm <- as.character( argVc['algorithm']) +args_env$categorical_prefix <- as.character( argVc['categorical_prefix']) + + +# make local 'log_print' function available through 'env' args_env$log_print <- log_print log_print("PARAMETERS (parsed):")