Mercurial > repos > eschen42 > w4mkmeans
view w4mkmeans_routines.R @ 1:02cafb660b72 draft
planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit f600ce8a783df16e49272341dce0fc6bbc299b0a
author | eschen42 |
---|---|
date | Wed, 09 Aug 2017 18:06:55 -0400 |
parents | 6ccbe18131a6 |
children | c415b7dc6f37 |
line wrap: on
line source
##------------------------------------------------------------------------------------------------------ ## these are the batch-independent and file-structure-independent routines to support the w4mkmeans tool ##------------------------------------------------------------------------------------------------------ library(parallel) w4kmeans_usage <- function() { return ( c( "w4mkmeans: bad input.", "# contract:", " required - caller will provide an environment comprising:", " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...", " variableMetadata - the corresponding W4M data.frame having feature metadata", " sampleMetdata - the corresponding W4M data.frame having sample metadata", " dataMatrix - the corresponding W4M matrix", " slots - the number of parallel slots for calculating kmeans", " optional - environment may comprise:", " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)", " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)", " iter.max - the maximum number of iterations when calculating a cluster (default = 10)", " nstart - how many random sets of centers should be chosen (default = 1)", " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", " categorical_prefix - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", " ", " this routine will return a list comprising:", " variableMetadata - the input variableMetadata data.frame with updates, if any", " sampleMetadata - the input sampleMetadata data.frame with updates, if any", " scores - an array of strings, each representing a line of a tsv having the following header:", " clusterOn TAB k TAB totalSS TAB betweenSS TAB proportion" ) ) } w4mkmeans <- function(env) { # abort if 'env' is null or is not an environment if ( is.null(env) || ! is.environment(env) ) { lapply(w4kmeans_usage(),print) } # supply default arguments if ( ! exists("iter.max" , env) ) env$iter.max <- 10 if ( ! exists("nstart" , env) ) env$nstart <- 1 if ( ! exists("algorithm" , env) ) env$algorithm <- 'Hartigan-Wong' if ( ! exists("categorical_prefix", env) ) env$categorical_prefix <- 'k' if ( ! exists("ksamples" , env) ) env$ksamples <- c() if ( ! exists("kfeatures" , env) ) env$kfeatures <- c() # check mandatory arguments expected <- c( "log_print" , "variableMetadata" , "sampleMetadata" , "dataMatrix" , "slots" ) missing_from_env <- setdiff(expected, (ls(env))) if ( length(missing_from_env) > 0 ) { print(paste(c('expected environment members not found: ', as.character(missing_from_env)), collapse = ", ")) lapply(w4kmeans_usage(),print) stop("w4mkmeans: contract has been broken") } # extract parameters from 'env' failure_action <- env$log_print scores <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" ) sampleMetadata <- env$sampleMetadata featureMetadata <- env$variableMetadata slots <- env$slots positive_ints <- function(a, what) { i <- as.integer(a) # may introduce NAs by coercion i <- i[!is.na(i)] # eliminate NAs i <- i[i > 0] # eliminate non-positive integers i <- unique(sort(i)) # eliminate redundancy and disorder if (length(a)!=length(i)) { failure_action("Some values for '", what, "' were skipped where not unique, not positive, or not convertible to an integer.") } return (i) # return results, if any } ksamples <- positive_ints(env$ksamples , "ksamples") kfeatures <- positive_ints(env$kfeatures, "kfeatures") myLapply <- parLapply # uncomment the next line to mimic parLapply, but without parallelization (for testing/experimentation) # myLapply <- function(cl, ...) lapply(...) cl <- NULL if ( identical(myLapply, parLapply) ) { failure_action(sprintf("w4mkmeans: using parallel evaluation with %d slots", slots)) failure_action(names(cl)) cl <- makePSOCKcluster(names = slots) # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster." clusterExport( cl = cl , varlist = c( "tryCatchFunc" , "calc_kmeans_one_dimension_one_k" , "prepare.data.matrix" ) ) final <- function(cl) { # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster." if ( !is.null(cl) ) { failure_action("w4mkmeans: stopping cluster used for parallel evaluation") stopCluster(cl) } } } else { failure_action("w4mkmeans: using sequential evaluation (1 slot)") final <- function(cl) { } } tryCatch( expr = { # These myLapply calls produce lists of lists of results: # - The outer list has no keys and its members are accessed by index # - The inner list has keys "clusters" and "scores" # for each $i in ksamples, append column 'k$i' to data frame sampleMetadata ksamples_length <- length(ksamples) if ( ksamples_length > 0 ) { smpl_result_list <- myLapply( cl = cl , ksamples , calc_kmeans_one_dimension_one_k , env = env , dimension = "samples" ) for ( i in 1:ksamples_length ) { result <- smpl_result_list[[i]] if (result$success) { sampleMetadata[sprintf("k%d",ksamples[i])] <- sprintf("%s%d", env$categorical_prefix, result$value$clusters) scores <- c(scores, result$value$scores) } } } # for each $i in kfeatures, append column 'k$i' to data frame featureMetadata kfeatures_length <- length(kfeatures) if ( kfeatures_length > 0 ) { feat_result_list <- myLapply( cl = cl , kfeatures , calc_kmeans_one_dimension_one_k , env = env , dimension = "features" ) for ( i in 1:kfeatures_length ) { result <- feat_result_list[[i]] if (result$success) { featureMetadata[sprintf("k%d",kfeatures[i])] <- sprintf("%s%d", env$categorical_prefix, result$value$clusters) scores <- c(scores, result$value$scores) } } } return ( list( variableMetadata = featureMetadata , sampleMetadata = sampleMetadata , scores = scores ) ) } , finally = final(cl) ) } # calculate k-means for features or samples # - recall that the dataMatrix has features in rows and samples in columns # return value: # list(clusters = km$cluster, scores = scores) # arguments: # env: # environment having dataMatrix # dimension: # - "samples": produce clusters column to add to the sampleMetadata table # - this is the default case # - "variables": produce clusters column to add to the variableMetadata table # k: # integer, the number of clusters to make calc_kmeans_one_dimension_one_k <- function(k, env, dimension = "samples") { # abort if environment is not as expected if ( is.null(env) || ! is.environment(env) ) { stop("calc_kmeans_one_dimension_one_k - argument 'env' is not an environment") } if ( ! exists("log_print", env) || ! is.function(env$log_print) ) { stop("calc_kmeans_one_dimension_one_k - argument 'env' - environment does not include log_print or it is not a function") } # abort if k is not as expected if ( ! is.numeric(k) ) { stop(sprintf("calc_kmeans_one_dimension_one_k - expected numeric argument 'k' but type is %s", typeof(k))) } k <- as.integer(k) # abort if dimension is not as expected if ( ! is.character(dimension) || ! Reduce( f =`|`, x = sapply(X = c("features","samples"), FUN = `==`, dimension), init = FALSE) ) { stop("calc_kmeans_one_dimension_one_k - argument 'dimension' is neither 'features' nor 'samples'") } dm <- env$dataMatrix iter.max <- env$iter.max nstart <- env$nstart algorithm <- env$algorithm dim_features <- dimension == "features" # tryCatchFunc produces a list # On success of expr(), tryCatchFunc produces # list(success TRUE, value = expr(), msg = "") # On failure of expr(), tryCatchFunc produces # list(success = FALSE, value = NA, msg = "the error message") result_list <- tryCatchFunc( expr = function() { # kmeans clusters the rows; features are the columns of args_env$dataMatrix; samples, the rows # - to calculate sample-clusters, no transposition is needed because samples are rows # - to calculate feature-clusters, transposition is needed so that features will be the rows if ( ! dim_features ) dm <- t(dm) dm <- prepare.data.matrix( x.matrix = dm, data.transformation = function(x) { x } ) # need to set.seed to get reproducible results from kmeans set.seed(4567) # do the k-means clustering km <- kmeans( x = dm, centers = k, iter.max, nstart = nstart, algorithm = algorithm ) scores <- sprintf("%s\t%d\t%0.5e\t%0.5e\t%0.5f" , dimension , k , km$totss , km$betweenss , km$betweenss/km$totss ) list(clusters = km$cluster, scores = scores) }) return ( result_list ) }