Mercurial > repos > eschen42 > w4mkmeans
comparison w4mkmeans_routines.R @ 2:c415b7dc6f37 draft default tip
planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 3e916537da6bb37e6f3927d7a11e98e0ab6ef5ec
author | eschen42 |
---|---|
date | Mon, 05 Mar 2018 12:40:17 -0500 |
parents | 02cafb660b72 |
children |
comparison
equal
deleted
inserted
replaced
1:02cafb660b72 | 2:c415b7dc6f37 |
---|---|
2 ## these are the batch-independent and file-structure-independent routines to support the w4mkmeans tool | 2 ## these are the batch-independent and file-structure-independent routines to support the w4mkmeans tool |
3 ##------------------------------------------------------------------------------------------------------ | 3 ##------------------------------------------------------------------------------------------------------ |
4 | 4 |
5 library(parallel) | 5 library(parallel) |
6 | 6 |
7 w4kmeans_usage <- function() { | 7 w4mkmeans_usage <- function() { |
8 return ( | 8 return ( |
9 c( | 9 c( |
10 "w4mkmeans: bad input.", | 10 "w4mkmeans: bad input.", |
11 "# contract:", | 11 " contract:", |
12 " required - caller will provide an environment comprising:", | 12 " required - caller will provide an environment comprising:", |
13 " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...", | 13 " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...", |
14 " variableMetadata - the corresponding W4M data.frame having feature metadata", | 14 " variableMetadata - the corresponding W4M data.frame having feature metadata", |
15 " sampleMetdata - the corresponding W4M data.frame having sample metadata", | 15 " sampleMetdata - the corresponding W4M data.frame having sample metadata", |
16 " dataMatrix - the corresponding W4M matrix", | 16 " dataMatrix - the corresponding W4M matrix", |
17 " slots - the number of parallel slots for calculating kmeans", | 17 " slots - the number of parallel slots for calculating kmeans", |
18 " optional - environment may comprise:", | 18 " optional - environment may comprise:", |
19 " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)", | 19 " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)", |
20 " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)", | 20 " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)", |
21 " iter.max - the maximum number of iterations when calculating a cluster (default = 10)", | 21 " iter_max - the maximum number of iterations when calculating a cluster (default = 20)", |
22 " nstart - how many random sets of centers should be chosen (default = 1)", | 22 " nstart - how many random sets of centers should be chosen (default = 20)", |
23 " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", | 23 " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", |
24 " categorical_prefix - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", | 24 " categorical_prefix - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", |
25 " ", | 25 " ", |
26 " this routine will return a list comprising:", | 26 " this routine will return a list comprising:", |
27 " variableMetadata - the input variableMetadata data.frame with updates, if any", | 27 " variableMetadata - the input variableMetadata data.frame with updates, if any", |
33 } | 33 } |
34 | 34 |
35 w4mkmeans <- function(env) { | 35 w4mkmeans <- function(env) { |
36 # abort if 'env' is null or is not an environment | 36 # abort if 'env' is null or is not an environment |
37 if ( is.null(env) || ! is.environment(env) ) { | 37 if ( is.null(env) || ! is.environment(env) ) { |
38 lapply(w4kmeans_usage(),print) | 38 lapply(w4mkmeans_usage(),print) |
39 } | 39 } |
40 # extract parameters from 'env' | |
41 log_action <- env$log_print | |
40 # supply default arguments | 42 # supply default arguments |
41 if ( ! exists("iter.max" , env) ) env$iter.max <- 10 | 43 if ( ! exists("iter_max" , env) ) env$iter_max <- 20 |
42 if ( ! exists("nstart" , env) ) env$nstart <- 1 | 44 if ( ! exists("nstart" , env) ) env$nstart <- 20 |
43 if ( ! exists("algorithm" , env) ) env$algorithm <- 'Hartigan-Wong' | 45 if ( ! exists("algorithm" , env) ) env$algorithm <- 'Hartigan-Wong' |
44 if ( ! exists("categorical_prefix", env) ) env$categorical_prefix <- 'k' | 46 if ( ! exists("categorical_prefix", env) ) env$categorical_prefix <- 'c' |
45 if ( ! exists("ksamples" , env) ) env$ksamples <- c() | 47 if ( ! exists("ksamples" , env) ) env$ksamples <- c() |
46 if ( ! exists("kfeatures" , env) ) env$kfeatures <- c() | 48 if ( ! exists("kfeatures" , env) ) env$kfeatures <- c() |
47 # check mandatory arguments | 49 # check mandatory arguments |
48 expected <- c( | 50 expected <- c( |
49 "log_print" | 51 "log_print" |
53 , "slots" | 55 , "slots" |
54 ) | 56 ) |
55 missing_from_env <- setdiff(expected, (ls(env))) | 57 missing_from_env <- setdiff(expected, (ls(env))) |
56 if ( length(missing_from_env) > 0 ) { | 58 if ( length(missing_from_env) > 0 ) { |
57 print(paste(c('expected environment members not found: ', as.character(missing_from_env)), collapse = ", ")) | 59 print(paste(c('expected environment members not found: ', as.character(missing_from_env)), collapse = ", ")) |
58 lapply(w4kmeans_usage(),print) | 60 lapply(w4mkmeans_usage(),log_action) |
59 stop("w4mkmeans: contract has been broken") | 61 stop("w4mkmeans: contract has been broken") |
60 } | 62 } |
61 # extract parameters from 'env' | 63 # extract parameters from 'env' |
62 failure_action <- env$log_print | 64 log_action <- env$log_print |
63 scores <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" ) | 65 scores <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" ) |
64 sampleMetadata <- env$sampleMetadata | 66 sampleMetadata <- env$sampleMetadata |
65 featureMetadata <- env$variableMetadata | 67 featureMetadata <- env$variableMetadata |
66 slots <- env$slots | 68 slots <- env$slots |
67 positive_ints <- function(a, what) { | 69 positive_ints <- function(a, what) { |
68 i <- as.integer(a) # may introduce NAs by coercion | 70 i <- as.integer(a) # may introduce NAs by coercion |
69 i <- i[!is.na(i)] # eliminate NAs | 71 i <- i[!is.na(i)] # eliminate NAs |
70 i <- i[i > 0] # eliminate non-positive integers | 72 i <- i[i > 0] # eliminate non-positive integers |
71 i <- unique(sort(i)) # eliminate redundancy and disorder | 73 i <- unique(sort(i)) # eliminate redundancy and disorder |
72 if (length(a)!=length(i)) { | 74 if (length(a)!=length(i)) { |
73 failure_action("Some values for '", what, "' were skipped where not unique, not positive, or not convertible to an integer.") | 75 log_action("Some values for '", what, "' were skipped where not unique, not positive, or not convertible to an integer.") |
74 } | 76 } |
75 return (i) # return results, if any | 77 return (i) # return results, if any |
76 } | 78 } |
77 ksamples <- positive_ints(env$ksamples , "ksamples") | 79 ksamples <- positive_ints(env$ksamples , "ksamples") |
78 kfeatures <- positive_ints(env$kfeatures, "kfeatures") | 80 kfeatures <- positive_ints(env$kfeatures, "kfeatures") |
79 | 81 |
82 log_action("w4mkmeans: preparing data matrix") | |
83 # prepare data matrix (normalize, eliminate zero-variance rows, etc.; no transformation) | |
84 dm_en <- new.env() | |
85 dm_en$log <- c() | |
86 preparation_result <- tryCatchFunc(function(){ | |
87 dm <- prepare.data.matrix( | |
88 x.matrix = env$dataMatrix | |
89 , data.transformation = function(x) { x } | |
90 , en = dm_en | |
91 ) | |
92 my_log <- dm_en$log | |
93 for ( i in 1:length(my_log) ) { | |
94 log_action(my_log[i]) | |
95 } | |
96 dm | |
97 }) | |
98 if ( !preparation_result$success ) { | |
99 postmortem <- paste("prepare.data.matrix failed:", preparation_result$msg) | |
100 log_action(postmortem) | |
101 stop(postmortem) | |
102 } | |
103 | |
104 env$preparedDataMatrix <- preparation_result$value | |
105 | |
106 log_action("w4mkmeans: determining evaluation mode") | |
107 | |
80 myLapply <- parLapply | 108 myLapply <- parLapply |
81 # uncomment the next line to mimic parLapply, but without parallelization (for testing/experimentation) | |
82 # myLapply <- function(cl, ...) lapply(...) | |
83 cl <- NULL | 109 cl <- NULL |
110 tryCatch( | |
111 expr = { | |
112 outfile <- "" | |
113 # outfile: Where to direct the stdout and stderr connection output from the workers. | |
114 # - "" indicates no redirection (which may only be useful for workers on the local machine). | |
115 # - Defaults to ‘/dev/null’ (‘nul:’ on Windows). | |
116 # - The other possibility is a file path on the worker's host. | |
117 # - Files will be opened in append mode, as all workers log to the same file. | |
118 cl <- makePSOCKcluster( names = slots, outfile = outfile ) | |
119 } | |
120 , error = function(e) { | |
121 log_action(sprintf("w4kmeans: falling back to serial evaluation because makePSOCKcluster(names = %d) threw an exception", slots)) | |
122 # mimic parLapply, but without parallelization (as a last resort) | |
123 myLapply <<- function(cl, ...) lapply(...) | |
124 } | |
125 ) | |
84 if ( identical(myLapply, parLapply) ) { | 126 if ( identical(myLapply, parLapply) ) { |
85 failure_action(sprintf("w4mkmeans: using parallel evaluation with %d slots", slots)) | 127 log_action(sprintf("w4mkmeans: using parallel evaluation with %d slots", slots)) |
86 failure_action(names(cl)) | |
87 cl <- makePSOCKcluster(names = slots) | |
88 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster." | 128 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster." |
129 # note varlist: names of references must be passed to the cluster so that they can be resolved | |
89 clusterExport( | 130 clusterExport( |
90 cl = cl | 131 cl = cl |
91 , varlist = c( | 132 , varlist = c( |
92 "tryCatchFunc" | 133 "tryCatchFunc" # required by calc_kmeans_one_dimension_one_k |
93 , "calc_kmeans_one_dimension_one_k" | 134 , "format_error" # required by tryCatchFunc when errors are caught |
94 , "prepare.data.matrix" | 135 , "iso_date" # required by log_print |
136 , "log_print" # required by calc_kmeans_one_dimension_one_k | |
95 ) | 137 ) |
96 ) | 138 ) |
97 final <- function(cl) { | 139 final <- function(cl) { |
98 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster." | 140 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster." |
99 if ( !is.null(cl) ) { | 141 if ( !is.null(cl) ) { |
100 failure_action("w4mkmeans: stopping cluster used for parallel evaluation") | 142 log_action("w4mkmeans: stopping cluster used for parallel evaluation") |
101 stopCluster(cl) | 143 stopCluster(cl) |
102 } | 144 } |
103 } | 145 } |
104 } else { | 146 } else { |
105 failure_action("w4mkmeans: using sequential evaluation (1 slot)") | 147 log_action("w4mkmeans: using sequential evaluation (one slot)") |
106 final <- function(cl) { } | 148 final <- function(cl) { } |
107 } | 149 } |
108 | 150 |
109 tryCatch( | 151 tryCatch( |
110 expr = { | 152 expr = { |
113 # - The inner list has keys "clusters" and "scores" | 155 # - The inner list has keys "clusters" and "scores" |
114 | 156 |
115 # for each $i in ksamples, append column 'k$i' to data frame sampleMetadata | 157 # for each $i in ksamples, append column 'k$i' to data frame sampleMetadata |
116 ksamples_length <- length(ksamples) | 158 ksamples_length <- length(ksamples) |
117 if ( ksamples_length > 0 ) { | 159 if ( ksamples_length > 0 ) { |
118 smpl_result_list <- myLapply( | 160 smpl_result_list <- myLapply( |
119 cl = cl | 161 cl = cl |
120 , ksamples | 162 , ksamples |
121 , calc_kmeans_one_dimension_one_k | 163 , calc_kmeans_one_dimension_one_k |
122 , env = env | 164 , env = env |
123 , dimension = "samples" | 165 , dimension = "samples" |
132 } | 174 } |
133 | 175 |
134 # for each $i in kfeatures, append column 'k$i' to data frame featureMetadata | 176 # for each $i in kfeatures, append column 'k$i' to data frame featureMetadata |
135 kfeatures_length <- length(kfeatures) | 177 kfeatures_length <- length(kfeatures) |
136 if ( kfeatures_length > 0 ) { | 178 if ( kfeatures_length > 0 ) { |
137 feat_result_list <- myLapply( | 179 feat_result_list <- myLapply( |
138 cl = cl | 180 cl = cl |
139 , kfeatures | 181 , kfeatures |
140 , calc_kmeans_one_dimension_one_k | 182 , calc_kmeans_one_dimension_one_k |
141 , env = env | 183 , env = env |
142 , dimension = "features" | 184 , dimension = "features" |
148 scores <- c(scores, result$value$scores) | 190 scores <- c(scores, result$value$scores) |
149 } | 191 } |
150 } | 192 } |
151 } | 193 } |
152 | 194 |
153 return ( | 195 return ( |
154 list( | 196 list( |
155 variableMetadata = featureMetadata | 197 variableMetadata = featureMetadata |
156 , sampleMetadata = sampleMetadata | 198 , sampleMetadata = sampleMetadata |
157 , scores = scores | 199 , scores = scores |
158 ) | 200 ) |
159 ) | 201 ) |
160 } | 202 } |
161 , finally = final(cl) | 203 , finally = { |
204 final(cl) | |
205 } | |
162 ) | 206 ) |
163 } | 207 } |
164 | 208 |
165 # calculate k-means for features or samples | 209 # calculate k-means for features or samples |
166 # - recall that the dataMatrix has features in rows and samples in columns | 210 # - recall that the dataMatrix has features in rows and samples in columns |
167 # return value: | 211 # return value: |
168 # list(clusters = km$cluster, scores = scores) | 212 # list(clusters = km$cluster, scores = scores) |
169 # arguments: | 213 # arguments: |
170 # env: | 214 # env: |
171 # environment having dataMatrix | 215 # environment having dataMatrix |
172 # dimension: | 216 # dimension: |
173 # - "samples": produce clusters column to add to the sampleMetadata table | 217 # - "samples": produce clusters column to add to the sampleMetadata table |
177 # integer, the number of clusters to make | 221 # integer, the number of clusters to make |
178 calc_kmeans_one_dimension_one_k <- function(k, env, dimension = "samples") { | 222 calc_kmeans_one_dimension_one_k <- function(k, env, dimension = "samples") { |
179 # abort if environment is not as expected | 223 # abort if environment is not as expected |
180 if ( is.null(env) || ! is.environment(env) ) { | 224 if ( is.null(env) || ! is.environment(env) ) { |
181 stop("calc_kmeans_one_dimension_one_k - argument 'env' is not an environment") | 225 stop("calc_kmeans_one_dimension_one_k - argument 'env' is not an environment") |
182 } | 226 } |
183 if ( ! exists("log_print", env) || ! is.function(env$log_print) ) { | 227 if ( ! exists("log_print", env) || ! is.function(env$log_print) ) { |
184 stop("calc_kmeans_one_dimension_one_k - argument 'env' - environment does not include log_print or it is not a function") | 228 stop("calc_kmeans_one_dimension_one_k - argument 'env' - environment does not include log_print or it is not a function") |
185 } | 229 } |
230 log_action <- env$log_print | |
186 # abort if k is not as expected | 231 # abort if k is not as expected |
187 if ( ! is.numeric(k) ) { | 232 if ( ! is.numeric(k) ) { |
188 stop(sprintf("calc_kmeans_one_dimension_one_k - expected numeric argument 'k' but type is %s", typeof(k))) | 233 stop(sprintf("calc_kmeans_one_dimension_one_k - expected numeric argument 'k' but type is %s", typeof(k))) |
189 } | 234 } |
190 k <- as.integer(k) | 235 k <- as.integer(k) |
191 # abort if dimension is not as expected | 236 # abort if dimension is not as expected |
192 if ( ! is.character(dimension) | 237 if ( ! is.character(dimension) |
193 || ! Reduce( f =`|`, x = sapply(X = c("features","samples"), FUN = `==`, dimension), init = FALSE) ) { | 238 || ! Reduce( f =`|`, x = sapply(X = c("features","samples"), FUN = `==`, dimension), init = FALSE) ) { |
194 stop("calc_kmeans_one_dimension_one_k - argument 'dimension' is neither 'features' nor 'samples'") | 239 stop("calc_kmeans_one_dimension_one_k - argument 'dimension' is neither 'features' nor 'samples'") |
195 } | 240 } |
196 dm <- env$dataMatrix | 241 dm <- env$preparedDataMatrix |
197 iter.max <- env$iter.max | 242 iter_max <- env$iter_max |
198 nstart <- env$nstart | 243 nstart <- env$nstart |
199 algorithm <- env$algorithm | 244 algorithm <- env$algorithm |
200 dim_features <- dimension == "features" | 245 dim_features <- dimension == "features" |
246 | |
201 # tryCatchFunc produces a list | 247 # tryCatchFunc produces a list |
202 # On success of expr(), tryCatchFunc produces | 248 # On success of func(), tryCatchFunc produces |
203 # list(success TRUE, value = expr(), msg = "") | 249 # list(success = TRUE, value = func(), msg = "") |
204 # On failure of expr(), tryCatchFunc produces | 250 # On failure of func(), tryCatchFunc produces |
205 # list(success = FALSE, value = NA, msg = "the error message") | 251 # list(success = FALSE, value = NA, msg = "the error message") |
206 result_list <- tryCatchFunc( expr = function() { | 252 result_list <- tryCatchFunc( func = function() { |
207 # kmeans clusters the rows; features are the columns of args_env$dataMatrix; samples, the rows | 253 # kmeans clusters the rows; features are the columns of args_env$dataMatrix; samples, the rows |
208 # - to calculate sample-clusters, no transposition is needed because samples are rows | 254 # - to calculate sample-clusters, no transposition is needed because samples are rows |
209 # - to calculate feature-clusters, transposition is needed so that features will be the rows | 255 # - to calculate feature-clusters, transposition is needed so that features will be the rows |
210 if ( ! dim_features ) dm <- t(dm) | 256 if ( ! dim_features ) { |
211 dm <- prepare.data.matrix( x.matrix = dm, data.transformation = function(x) { x } ) | 257 dm <- t(dm) |
258 } | |
259 | |
212 # need to set.seed to get reproducible results from kmeans | 260 # need to set.seed to get reproducible results from kmeans |
213 set.seed(4567) | 261 set.seed(4567) |
262 | |
214 # do the k-means clustering | 263 # do the k-means clustering |
215 km <- kmeans( x = dm, centers = k, iter.max, nstart = nstart, algorithm = algorithm ) | 264 withCallingHandlers( |
265 { | |
266 km <<- kmeans( x = dm, centers = k, iter.max = iter_max, nstart = nstart, algorithm = algorithm ) | |
267 } | |
268 , warning = function(w) { | |
269 lw <- list(w) | |
270 smplwrn <- as.character(w[[1]]) | |
271 log_print( | |
272 sprintf( "Warning for %s: center = %d, nstart = %d, iter_max = %d: %s" | |
273 , if (dim_features) "features" else "samples" | |
274 , k | |
275 , nstart | |
276 , iter_max | |
277 , smplwrn | |
278 ) | |
279 ) | |
280 } | |
281 ) | |
282 | |
283 # collect the scores | |
216 scores <- | 284 scores <- |
217 sprintf("%s\t%d\t%0.5e\t%0.5e\t%0.5f" | 285 sprintf("%s\t%d\t%0.5e\t%0.5e\t%0.5f" |
218 , dimension | 286 , dimension |
219 , k | 287 , k |
220 , km$totss | 288 , km$totss |
221 , km$betweenss | 289 , km$betweenss |
222 , km$betweenss/km$totss | 290 , km$betweenss/km$totss |
223 ) | 291 ) |
292 | |
293 # return list of results | |
224 list(clusters = km$cluster, scores = scores) | 294 list(clusters = km$cluster, scores = scores) |
225 }) | 295 }) |
296 | |
297 # return either | |
298 # list(success = TRUE, value = func(), msg = "") | |
299 # or | |
300 # list(success = FALSE, value = NA, msg = "the error message") | |
226 return ( result_list ) | 301 return ( result_list ) |
227 } | 302 } |
228 | 303 |
304 # vim: sw=2 ts=2 et : |