Mercurial > repos > eschen42 > w4mkmeans
diff w4mkmeans_routines.R @ 1:02cafb660b72 draft
planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit f600ce8a783df16e49272341dce0fc6bbc299b0a
author | eschen42 |
---|---|
date | Wed, 09 Aug 2017 18:06:55 -0400 |
parents | 6ccbe18131a6 |
children | c415b7dc6f37 |
line wrap: on
line diff
--- a/w4mkmeans_routines.R Tue Aug 08 15:30:38 2017 -0400 +++ b/w4mkmeans_routines.R Wed Aug 09 18:06:55 2017 -0400 @@ -10,23 +10,24 @@ "w4mkmeans: bad input.", "# contract:", " required - caller will provide an environment comprising:", - " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...", - " variableMetadata - the corresponding W4M data.frame having feature metadata", - " sampleMetdata - the corresponding W4M data.frame having sample metadata", - " dataMatrix - the corresponding W4M matrix", - " slots - the number of parallel slots for calculating kmeans", + " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...", + " variableMetadata - the corresponding W4M data.frame having feature metadata", + " sampleMetdata - the corresponding W4M data.frame having sample metadata", + " dataMatrix - the corresponding W4M matrix", + " slots - the number of parallel slots for calculating kmeans", " optional - environment may comprise:", - " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)", - " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)", - " iter.max - the maximum number of iterations when calculating a cluster (default = 10)", - " nstart - how many random sets of centers should be chosen (default = 1)", - " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", + " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)", + " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)", + " iter.max - the maximum number of iterations when calculating a cluster (default = 10)", + " nstart - how many random sets of centers should be chosen (default = 1)", + " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", + " categorical_prefix - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", " ", " this routine will return a list comprising:", - " variableMetadata - the input variableMetadata data.frame with updates, if any", - " sampleMetadata - the input sampleMetadata data.frame with updates, if any", - " scores - an array of strings, each representing a line of a tsv having the following header:", - " clusterOn TAB k TAB totalSS TAB betweenSS TAB proportion" + " variableMetadata - the input variableMetadata data.frame with updates, if any", + " sampleMetadata - the input sampleMetadata data.frame with updates, if any", + " scores - an array of strings, each representing a line of a tsv having the following header:", + " clusterOn TAB k TAB totalSS TAB betweenSS TAB proportion" ) ) } @@ -37,11 +38,12 @@ lapply(w4kmeans_usage(),print) } # supply default arguments - if ( ! exists("iter.max" , env) ) env$iter.max <- 10 - if ( ! exists("nstart" , env) ) env$nstart <- 1 - if ( ! exists("algorithm", env) ) env$algorithm <- 'Hartigan-Wong' - if ( ! exists("ksamples" , env) ) env$ksamples <- c() - if ( ! exists("kfeatures", env) ) env$kfeatures <- c() + if ( ! exists("iter.max" , env) ) env$iter.max <- 10 + if ( ! exists("nstart" , env) ) env$nstart <- 1 + if ( ! exists("algorithm" , env) ) env$algorithm <- 'Hartigan-Wong' + if ( ! exists("categorical_prefix", env) ) env$categorical_prefix <- 'k' + if ( ! exists("ksamples" , env) ) env$ksamples <- c() + if ( ! exists("kfeatures" , env) ) env$kfeatures <- c() # check mandatory arguments expected <- c( "log_print" @@ -61,9 +63,19 @@ scores <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" ) sampleMetadata <- env$sampleMetadata featureMetadata <- env$variableMetadata - ksamples <- as.numeric(env$ksamples) - kfeatures <- as.numeric(env$kfeatures) slots <- env$slots + positive_ints <- function(a, what) { + i <- as.integer(a) # may introduce NAs by coercion + i <- i[!is.na(i)] # eliminate NAs + i <- i[i > 0] # eliminate non-positive integers + i <- unique(sort(i)) # eliminate redundancy and disorder + if (length(a)!=length(i)) { + failure_action("Some values for '", what, "' were skipped where not unique, not positive, or not convertible to an integer.") + } + return (i) # return results, if any + } + ksamples <- positive_ints(env$ksamples , "ksamples") + kfeatures <- positive_ints(env$kfeatures, "kfeatures") myLapply <- parLapply # uncomment the next line to mimic parLapply, but without parallelization (for testing/experimentation) @@ -113,7 +125,7 @@ for ( i in 1:ksamples_length ) { result <- smpl_result_list[[i]] if (result$success) { - sampleMetadata[sprintf("k%d",ksamples[i])] <- result$value$clusters + sampleMetadata[sprintf("k%d",ksamples[i])] <- sprintf("%s%d", env$categorical_prefix, result$value$clusters) scores <- c(scores, result$value$scores) } } @@ -132,7 +144,7 @@ for ( i in 1:kfeatures_length ) { result <- feat_result_list[[i]] if (result$success) { - featureMetadata[sprintf("k%d",kfeatures[i])] <- result$value$clusters + featureMetadata[sprintf("k%d",kfeatures[i])] <- sprintf("%s%d", env$categorical_prefix, result$value$clusters) scores <- c(scores, result$value$scores) } }