comparison w4mkmeans_routines.R @ 2:c415b7dc6f37 draft default tip

planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 3e916537da6bb37e6f3927d7a11e98e0ab6ef5ec
author eschen42
date Mon, 05 Mar 2018 12:40:17 -0500
parents 02cafb660b72
children
comparison
equal deleted inserted replaced
1:02cafb660b72 2:c415b7dc6f37
2 ## these are the batch-independent and file-structure-independent routines to support the w4mkmeans tool 2 ## these are the batch-independent and file-structure-independent routines to support the w4mkmeans tool
3 ##------------------------------------------------------------------------------------------------------ 3 ##------------------------------------------------------------------------------------------------------
4 4
5 library(parallel) 5 library(parallel)
6 6
7 w4kmeans_usage <- function() { 7 w4mkmeans_usage <- function() {
8 return ( 8 return (
9 c( 9 c(
10 "w4mkmeans: bad input.", 10 "w4mkmeans: bad input.",
11 "# contract:", 11 " contract:",
12 " required - caller will provide an environment comprising:", 12 " required - caller will provide an environment comprising:",
13 " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...", 13 " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...",
14 " variableMetadata - the corresponding W4M data.frame having feature metadata", 14 " variableMetadata - the corresponding W4M data.frame having feature metadata",
15 " sampleMetdata - the corresponding W4M data.frame having sample metadata", 15 " sampleMetdata - the corresponding W4M data.frame having sample metadata",
16 " dataMatrix - the corresponding W4M matrix", 16 " dataMatrix - the corresponding W4M matrix",
17 " slots - the number of parallel slots for calculating kmeans", 17 " slots - the number of parallel slots for calculating kmeans",
18 " optional - environment may comprise:", 18 " optional - environment may comprise:",
19 " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)", 19 " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)",
20 " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)", 20 " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)",
21 " iter.max - the maximum number of iterations when calculating a cluster (default = 10)", 21 " iter_max - the maximum number of iterations when calculating a cluster (default = 20)",
22 " nstart - how many random sets of centers should be chosen (default = 1)", 22 " nstart - how many random sets of centers should be chosen (default = 20)",
23 " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", 23 " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)",
24 " categorical_prefix - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", 24 " categorical_prefix - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)",
25 " ", 25 " ",
26 " this routine will return a list comprising:", 26 " this routine will return a list comprising:",
27 " variableMetadata - the input variableMetadata data.frame with updates, if any", 27 " variableMetadata - the input variableMetadata data.frame with updates, if any",
33 } 33 }
34 34
35 w4mkmeans <- function(env) { 35 w4mkmeans <- function(env) {
36 # abort if 'env' is null or is not an environment 36 # abort if 'env' is null or is not an environment
37 if ( is.null(env) || ! is.environment(env) ) { 37 if ( is.null(env) || ! is.environment(env) ) {
38 lapply(w4kmeans_usage(),print) 38 lapply(w4mkmeans_usage(),print)
39 } 39 }
40 # extract parameters from 'env'
41 log_action <- env$log_print
40 # supply default arguments 42 # supply default arguments
41 if ( ! exists("iter.max" , env) ) env$iter.max <- 10 43 if ( ! exists("iter_max" , env) ) env$iter_max <- 20
42 if ( ! exists("nstart" , env) ) env$nstart <- 1 44 if ( ! exists("nstart" , env) ) env$nstart <- 20
43 if ( ! exists("algorithm" , env) ) env$algorithm <- 'Hartigan-Wong' 45 if ( ! exists("algorithm" , env) ) env$algorithm <- 'Hartigan-Wong'
44 if ( ! exists("categorical_prefix", env) ) env$categorical_prefix <- 'k' 46 if ( ! exists("categorical_prefix", env) ) env$categorical_prefix <- 'c'
45 if ( ! exists("ksamples" , env) ) env$ksamples <- c() 47 if ( ! exists("ksamples" , env) ) env$ksamples <- c()
46 if ( ! exists("kfeatures" , env) ) env$kfeatures <- c() 48 if ( ! exists("kfeatures" , env) ) env$kfeatures <- c()
47 # check mandatory arguments 49 # check mandatory arguments
48 expected <- c( 50 expected <- c(
49 "log_print" 51 "log_print"
53 , "slots" 55 , "slots"
54 ) 56 )
55 missing_from_env <- setdiff(expected, (ls(env))) 57 missing_from_env <- setdiff(expected, (ls(env)))
56 if ( length(missing_from_env) > 0 ) { 58 if ( length(missing_from_env) > 0 ) {
57 print(paste(c('expected environment members not found: ', as.character(missing_from_env)), collapse = ", ")) 59 print(paste(c('expected environment members not found: ', as.character(missing_from_env)), collapse = ", "))
58 lapply(w4kmeans_usage(),print) 60 lapply(w4mkmeans_usage(),log_action)
59 stop("w4mkmeans: contract has been broken") 61 stop("w4mkmeans: contract has been broken")
60 } 62 }
61 # extract parameters from 'env' 63 # extract parameters from 'env'
62 failure_action <- env$log_print 64 log_action <- env$log_print
63 scores <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" ) 65 scores <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" )
64 sampleMetadata <- env$sampleMetadata 66 sampleMetadata <- env$sampleMetadata
65 featureMetadata <- env$variableMetadata 67 featureMetadata <- env$variableMetadata
66 slots <- env$slots 68 slots <- env$slots
67 positive_ints <- function(a, what) { 69 positive_ints <- function(a, what) {
68 i <- as.integer(a) # may introduce NAs by coercion 70 i <- as.integer(a) # may introduce NAs by coercion
69 i <- i[!is.na(i)] # eliminate NAs 71 i <- i[!is.na(i)] # eliminate NAs
70 i <- i[i > 0] # eliminate non-positive integers 72 i <- i[i > 0] # eliminate non-positive integers
71 i <- unique(sort(i)) # eliminate redundancy and disorder 73 i <- unique(sort(i)) # eliminate redundancy and disorder
72 if (length(a)!=length(i)) { 74 if (length(a)!=length(i)) {
73 failure_action("Some values for '", what, "' were skipped where not unique, not positive, or not convertible to an integer.") 75 log_action("Some values for '", what, "' were skipped where not unique, not positive, or not convertible to an integer.")
74 } 76 }
75 return (i) # return results, if any 77 return (i) # return results, if any
76 } 78 }
77 ksamples <- positive_ints(env$ksamples , "ksamples") 79 ksamples <- positive_ints(env$ksamples , "ksamples")
78 kfeatures <- positive_ints(env$kfeatures, "kfeatures") 80 kfeatures <- positive_ints(env$kfeatures, "kfeatures")
79 81
82 log_action("w4mkmeans: preparing data matrix")
83 # prepare data matrix (normalize, eliminate zero-variance rows, etc.; no transformation)
84 dm_en <- new.env()
85 dm_en$log <- c()
86 preparation_result <- tryCatchFunc(function(){
87 dm <- prepare.data.matrix(
88 x.matrix = env$dataMatrix
89 , data.transformation = function(x) { x }
90 , en = dm_en
91 )
92 my_log <- dm_en$log
93 for ( i in 1:length(my_log) ) {
94 log_action(my_log[i])
95 }
96 dm
97 })
98 if ( !preparation_result$success ) {
99 postmortem <- paste("prepare.data.matrix failed:", preparation_result$msg)
100 log_action(postmortem)
101 stop(postmortem)
102 }
103
104 env$preparedDataMatrix <- preparation_result$value
105
106 log_action("w4mkmeans: determining evaluation mode")
107
80 myLapply <- parLapply 108 myLapply <- parLapply
81 # uncomment the next line to mimic parLapply, but without parallelization (for testing/experimentation)
82 # myLapply <- function(cl, ...) lapply(...)
83 cl <- NULL 109 cl <- NULL
110 tryCatch(
111 expr = {
112 outfile <- ""
113 # outfile: Where to direct the stdout and stderr connection output from the workers.
114 # - "" indicates no redirection (which may only be useful for workers on the local machine).
115 # - Defaults to ‘/dev/null’ (‘nul:’ on Windows).
116 # - The other possibility is a file path on the worker's host.
117 # - Files will be opened in append mode, as all workers log to the same file.
118 cl <- makePSOCKcluster( names = slots, outfile = outfile )
119 }
120 , error = function(e) {
121 log_action(sprintf("w4kmeans: falling back to serial evaluation because makePSOCKcluster(names = %d) threw an exception", slots))
122 # mimic parLapply, but without parallelization (as a last resort)
123 myLapply <<- function(cl, ...) lapply(...)
124 }
125 )
84 if ( identical(myLapply, parLapply) ) { 126 if ( identical(myLapply, parLapply) ) {
85 failure_action(sprintf("w4mkmeans: using parallel evaluation with %d slots", slots)) 127 log_action(sprintf("w4mkmeans: using parallel evaluation with %d slots", slots))
86 failure_action(names(cl))
87 cl <- makePSOCKcluster(names = slots)
88 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster." 128 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster."
129 # note varlist: names of references must be passed to the cluster so that they can be resolved
89 clusterExport( 130 clusterExport(
90 cl = cl 131 cl = cl
91 , varlist = c( 132 , varlist = c(
92 "tryCatchFunc" 133 "tryCatchFunc" # required by calc_kmeans_one_dimension_one_k
93 , "calc_kmeans_one_dimension_one_k" 134 , "format_error" # required by tryCatchFunc when errors are caught
94 , "prepare.data.matrix" 135 , "iso_date" # required by log_print
136 , "log_print" # required by calc_kmeans_one_dimension_one_k
95 ) 137 )
96 ) 138 )
97 final <- function(cl) { 139 final <- function(cl) {
98 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster." 140 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster."
99 if ( !is.null(cl) ) { 141 if ( !is.null(cl) ) {
100 failure_action("w4mkmeans: stopping cluster used for parallel evaluation") 142 log_action("w4mkmeans: stopping cluster used for parallel evaluation")
101 stopCluster(cl) 143 stopCluster(cl)
102 } 144 }
103 } 145 }
104 } else { 146 } else {
105 failure_action("w4mkmeans: using sequential evaluation (1 slot)") 147 log_action("w4mkmeans: using sequential evaluation (one slot)")
106 final <- function(cl) { } 148 final <- function(cl) { }
107 } 149 }
108 150
109 tryCatch( 151 tryCatch(
110 expr = { 152 expr = {
113 # - The inner list has keys "clusters" and "scores" 155 # - The inner list has keys "clusters" and "scores"
114 156
115 # for each $i in ksamples, append column 'k$i' to data frame sampleMetadata 157 # for each $i in ksamples, append column 'k$i' to data frame sampleMetadata
116 ksamples_length <- length(ksamples) 158 ksamples_length <- length(ksamples)
117 if ( ksamples_length > 0 ) { 159 if ( ksamples_length > 0 ) {
118 smpl_result_list <- myLapply( 160 smpl_result_list <- myLapply(
119 cl = cl 161 cl = cl
120 , ksamples 162 , ksamples
121 , calc_kmeans_one_dimension_one_k 163 , calc_kmeans_one_dimension_one_k
122 , env = env 164 , env = env
123 , dimension = "samples" 165 , dimension = "samples"
132 } 174 }
133 175
134 # for each $i in kfeatures, append column 'k$i' to data frame featureMetadata 176 # for each $i in kfeatures, append column 'k$i' to data frame featureMetadata
135 kfeatures_length <- length(kfeatures) 177 kfeatures_length <- length(kfeatures)
136 if ( kfeatures_length > 0 ) { 178 if ( kfeatures_length > 0 ) {
137 feat_result_list <- myLapply( 179 feat_result_list <- myLapply(
138 cl = cl 180 cl = cl
139 , kfeatures 181 , kfeatures
140 , calc_kmeans_one_dimension_one_k 182 , calc_kmeans_one_dimension_one_k
141 , env = env 183 , env = env
142 , dimension = "features" 184 , dimension = "features"
148 scores <- c(scores, result$value$scores) 190 scores <- c(scores, result$value$scores)
149 } 191 }
150 } 192 }
151 } 193 }
152 194
153 return ( 195 return (
154 list( 196 list(
155 variableMetadata = featureMetadata 197 variableMetadata = featureMetadata
156 , sampleMetadata = sampleMetadata 198 , sampleMetadata = sampleMetadata
157 , scores = scores 199 , scores = scores
158 ) 200 )
159 ) 201 )
160 } 202 }
161 , finally = final(cl) 203 , finally = {
204 final(cl)
205 }
162 ) 206 )
163 } 207 }
164 208
165 # calculate k-means for features or samples 209 # calculate k-means for features or samples
166 # - recall that the dataMatrix has features in rows and samples in columns 210 # - recall that the dataMatrix has features in rows and samples in columns
167 # return value: 211 # return value:
168 # list(clusters = km$cluster, scores = scores) 212 # list(clusters = km$cluster, scores = scores)
169 # arguments: 213 # arguments:
170 # env: 214 # env:
171 # environment having dataMatrix 215 # environment having dataMatrix
172 # dimension: 216 # dimension:
173 # - "samples": produce clusters column to add to the sampleMetadata table 217 # - "samples": produce clusters column to add to the sampleMetadata table
177 # integer, the number of clusters to make 221 # integer, the number of clusters to make
178 calc_kmeans_one_dimension_one_k <- function(k, env, dimension = "samples") { 222 calc_kmeans_one_dimension_one_k <- function(k, env, dimension = "samples") {
179 # abort if environment is not as expected 223 # abort if environment is not as expected
180 if ( is.null(env) || ! is.environment(env) ) { 224 if ( is.null(env) || ! is.environment(env) ) {
181 stop("calc_kmeans_one_dimension_one_k - argument 'env' is not an environment") 225 stop("calc_kmeans_one_dimension_one_k - argument 'env' is not an environment")
182 } 226 }
183 if ( ! exists("log_print", env) || ! is.function(env$log_print) ) { 227 if ( ! exists("log_print", env) || ! is.function(env$log_print) ) {
184 stop("calc_kmeans_one_dimension_one_k - argument 'env' - environment does not include log_print or it is not a function") 228 stop("calc_kmeans_one_dimension_one_k - argument 'env' - environment does not include log_print or it is not a function")
185 } 229 }
230 log_action <- env$log_print
186 # abort if k is not as expected 231 # abort if k is not as expected
187 if ( ! is.numeric(k) ) { 232 if ( ! is.numeric(k) ) {
188 stop(sprintf("calc_kmeans_one_dimension_one_k - expected numeric argument 'k' but type is %s", typeof(k))) 233 stop(sprintf("calc_kmeans_one_dimension_one_k - expected numeric argument 'k' but type is %s", typeof(k)))
189 } 234 }
190 k <- as.integer(k) 235 k <- as.integer(k)
191 # abort if dimension is not as expected 236 # abort if dimension is not as expected
192 if ( ! is.character(dimension) 237 if ( ! is.character(dimension)
193 || ! Reduce( f =`|`, x = sapply(X = c("features","samples"), FUN = `==`, dimension), init = FALSE) ) { 238 || ! Reduce( f =`|`, x = sapply(X = c("features","samples"), FUN = `==`, dimension), init = FALSE) ) {
194 stop("calc_kmeans_one_dimension_one_k - argument 'dimension' is neither 'features' nor 'samples'") 239 stop("calc_kmeans_one_dimension_one_k - argument 'dimension' is neither 'features' nor 'samples'")
195 } 240 }
196 dm <- env$dataMatrix 241 dm <- env$preparedDataMatrix
197 iter.max <- env$iter.max 242 iter_max <- env$iter_max
198 nstart <- env$nstart 243 nstart <- env$nstart
199 algorithm <- env$algorithm 244 algorithm <- env$algorithm
200 dim_features <- dimension == "features" 245 dim_features <- dimension == "features"
246
201 # tryCatchFunc produces a list 247 # tryCatchFunc produces a list
202 # On success of expr(), tryCatchFunc produces 248 # On success of func(), tryCatchFunc produces
203 # list(success TRUE, value = expr(), msg = "") 249 # list(success = TRUE, value = func(), msg = "")
204 # On failure of expr(), tryCatchFunc produces 250 # On failure of func(), tryCatchFunc produces
205 # list(success = FALSE, value = NA, msg = "the error message") 251 # list(success = FALSE, value = NA, msg = "the error message")
206 result_list <- tryCatchFunc( expr = function() { 252 result_list <- tryCatchFunc( func = function() {
207 # kmeans clusters the rows; features are the columns of args_env$dataMatrix; samples, the rows 253 # kmeans clusters the rows; features are the columns of args_env$dataMatrix; samples, the rows
208 # - to calculate sample-clusters, no transposition is needed because samples are rows 254 # - to calculate sample-clusters, no transposition is needed because samples are rows
209 # - to calculate feature-clusters, transposition is needed so that features will be the rows 255 # - to calculate feature-clusters, transposition is needed so that features will be the rows
210 if ( ! dim_features ) dm <- t(dm) 256 if ( ! dim_features ) {
211 dm <- prepare.data.matrix( x.matrix = dm, data.transformation = function(x) { x } ) 257 dm <- t(dm)
258 }
259
212 # need to set.seed to get reproducible results from kmeans 260 # need to set.seed to get reproducible results from kmeans
213 set.seed(4567) 261 set.seed(4567)
262
214 # do the k-means clustering 263 # do the k-means clustering
215 km <- kmeans( x = dm, centers = k, iter.max, nstart = nstart, algorithm = algorithm ) 264 withCallingHandlers(
265 {
266 km <<- kmeans( x = dm, centers = k, iter.max = iter_max, nstart = nstart, algorithm = algorithm )
267 }
268 , warning = function(w) {
269 lw <- list(w)
270 smplwrn <- as.character(w[[1]])
271 log_print(
272 sprintf( "Warning for %s: center = %d, nstart = %d, iter_max = %d: %s"
273 , if (dim_features) "features" else "samples"
274 , k
275 , nstart
276 , iter_max
277 , smplwrn
278 )
279 )
280 }
281 )
282
283 # collect the scores
216 scores <- 284 scores <-
217 sprintf("%s\t%d\t%0.5e\t%0.5e\t%0.5f" 285 sprintf("%s\t%d\t%0.5e\t%0.5e\t%0.5f"
218 , dimension 286 , dimension
219 , k 287 , k
220 , km$totss 288 , km$totss
221 , km$betweenss 289 , km$betweenss
222 , km$betweenss/km$totss 290 , km$betweenss/km$totss
223 ) 291 )
292
293 # return list of results
224 list(clusters = km$cluster, scores = scores) 294 list(clusters = km$cluster, scores = scores)
225 }) 295 })
296
297 # return either
298 # list(success = TRUE, value = func(), msg = "")
299 # or
300 # list(success = FALSE, value = NA, msg = "the error message")
226 return ( result_list ) 301 return ( result_list )
227 } 302 }
228 303
304 # vim: sw=2 ts=2 et :