Mercurial > repos > eschen42 > w4mkmeans
comparison w4mkmeans_routines.R @ 0:6ccbe18131a6 draft
planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 299e5c7fdb0d6eb0773f3660009f6d63c2082a8d
author | eschen42 |
---|---|
date | Tue, 08 Aug 2017 15:30:38 -0400 |
parents | |
children | 02cafb660b72 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6ccbe18131a6 |
---|---|
1 ##------------------------------------------------------------------------------------------------------ | |
2 ## these are the batch-independent and file-structure-independent routines to support the w4mkmeans tool | |
3 ##------------------------------------------------------------------------------------------------------ | |
4 | |
5 library(parallel) | |
6 | |
7 w4kmeans_usage <- function() { | |
8 return ( | |
9 c( | |
10 "w4mkmeans: bad input.", | |
11 "# contract:", | |
12 " required - caller will provide an environment comprising:", | |
13 " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...", | |
14 " variableMetadata - the corresponding W4M data.frame having feature metadata", | |
15 " sampleMetdata - the corresponding W4M data.frame having sample metadata", | |
16 " dataMatrix - the corresponding W4M matrix", | |
17 " slots - the number of parallel slots for calculating kmeans", | |
18 " optional - environment may comprise:", | |
19 " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)", | |
20 " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)", | |
21 " iter.max - the maximum number of iterations when calculating a cluster (default = 10)", | |
22 " nstart - how many random sets of centers should be chosen (default = 1)", | |
23 " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", | |
24 " ", | |
25 " this routine will return a list comprising:", | |
26 " variableMetadata - the input variableMetadata data.frame with updates, if any", | |
27 " sampleMetadata - the input sampleMetadata data.frame with updates, if any", | |
28 " scores - an array of strings, each representing a line of a tsv having the following header:", | |
29 " clusterOn TAB k TAB totalSS TAB betweenSS TAB proportion" | |
30 ) | |
31 ) | |
32 } | |
33 | |
34 w4mkmeans <- function(env) { | |
35 # abort if 'env' is null or is not an environment | |
36 if ( is.null(env) || ! is.environment(env) ) { | |
37 lapply(w4kmeans_usage(),print) | |
38 } | |
39 # supply default arguments | |
40 if ( ! exists("iter.max" , env) ) env$iter.max <- 10 | |
41 if ( ! exists("nstart" , env) ) env$nstart <- 1 | |
42 if ( ! exists("algorithm", env) ) env$algorithm <- 'Hartigan-Wong' | |
43 if ( ! exists("ksamples" , env) ) env$ksamples <- c() | |
44 if ( ! exists("kfeatures", env) ) env$kfeatures <- c() | |
45 # check mandatory arguments | |
46 expected <- c( | |
47 "log_print" | |
48 , "variableMetadata" | |
49 , "sampleMetadata" | |
50 , "dataMatrix" | |
51 , "slots" | |
52 ) | |
53 missing_from_env <- setdiff(expected, (ls(env))) | |
54 if ( length(missing_from_env) > 0 ) { | |
55 print(paste(c('expected environment members not found: ', as.character(missing_from_env)), collapse = ", ")) | |
56 lapply(w4kmeans_usage(),print) | |
57 stop("w4mkmeans: contract has been broken") | |
58 } | |
59 # extract parameters from 'env' | |
60 failure_action <- env$log_print | |
61 scores <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" ) | |
62 sampleMetadata <- env$sampleMetadata | |
63 featureMetadata <- env$variableMetadata | |
64 ksamples <- as.numeric(env$ksamples) | |
65 kfeatures <- as.numeric(env$kfeatures) | |
66 slots <- env$slots | |
67 | |
68 myLapply <- parLapply | |
69 # uncomment the next line to mimic parLapply, but without parallelization (for testing/experimentation) | |
70 # myLapply <- function(cl, ...) lapply(...) | |
71 cl <- NULL | |
72 if ( identical(myLapply, parLapply) ) { | |
73 failure_action(sprintf("w4mkmeans: using parallel evaluation with %d slots", slots)) | |
74 failure_action(names(cl)) | |
75 cl <- makePSOCKcluster(names = slots) | |
76 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster." | |
77 clusterExport( | |
78 cl = cl | |
79 , varlist = c( | |
80 "tryCatchFunc" | |
81 , "calc_kmeans_one_dimension_one_k" | |
82 , "prepare.data.matrix" | |
83 ) | |
84 ) | |
85 final <- function(cl) { | |
86 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster." | |
87 if ( !is.null(cl) ) { | |
88 failure_action("w4mkmeans: stopping cluster used for parallel evaluation") | |
89 stopCluster(cl) | |
90 } | |
91 } | |
92 } else { | |
93 failure_action("w4mkmeans: using sequential evaluation (1 slot)") | |
94 final <- function(cl) { } | |
95 } | |
96 | |
97 tryCatch( | |
98 expr = { | |
99 # These myLapply calls produce lists of lists of results: | |
100 # - The outer list has no keys and its members are accessed by index | |
101 # - The inner list has keys "clusters" and "scores" | |
102 | |
103 # for each $i in ksamples, append column 'k$i' to data frame sampleMetadata | |
104 ksamples_length <- length(ksamples) | |
105 if ( ksamples_length > 0 ) { | |
106 smpl_result_list <- myLapply( | |
107 cl = cl | |
108 , ksamples | |
109 , calc_kmeans_one_dimension_one_k | |
110 , env = env | |
111 , dimension = "samples" | |
112 ) | |
113 for ( i in 1:ksamples_length ) { | |
114 result <- smpl_result_list[[i]] | |
115 if (result$success) { | |
116 sampleMetadata[sprintf("k%d",ksamples[i])] <- result$value$clusters | |
117 scores <- c(scores, result$value$scores) | |
118 } | |
119 } | |
120 } | |
121 | |
122 # for each $i in kfeatures, append column 'k$i' to data frame featureMetadata | |
123 kfeatures_length <- length(kfeatures) | |
124 if ( kfeatures_length > 0 ) { | |
125 feat_result_list <- myLapply( | |
126 cl = cl | |
127 , kfeatures | |
128 , calc_kmeans_one_dimension_one_k | |
129 , env = env | |
130 , dimension = "features" | |
131 ) | |
132 for ( i in 1:kfeatures_length ) { | |
133 result <- feat_result_list[[i]] | |
134 if (result$success) { | |
135 featureMetadata[sprintf("k%d",kfeatures[i])] <- result$value$clusters | |
136 scores <- c(scores, result$value$scores) | |
137 } | |
138 } | |
139 } | |
140 | |
141 return ( | |
142 list( | |
143 variableMetadata = featureMetadata | |
144 , sampleMetadata = sampleMetadata | |
145 , scores = scores | |
146 ) | |
147 ) | |
148 } | |
149 , finally = final(cl) | |
150 ) | |
151 } | |
152 | |
153 # calculate k-means for features or samples | |
154 # - recall that the dataMatrix has features in rows and samples in columns | |
155 # return value: | |
156 # list(clusters = km$cluster, scores = scores) | |
157 # arguments: | |
158 # env: | |
159 # environment having dataMatrix | |
160 # dimension: | |
161 # - "samples": produce clusters column to add to the sampleMetadata table | |
162 # - this is the default case | |
163 # - "variables": produce clusters column to add to the variableMetadata table | |
164 # k: | |
165 # integer, the number of clusters to make | |
166 calc_kmeans_one_dimension_one_k <- function(k, env, dimension = "samples") { | |
167 # abort if environment is not as expected | |
168 if ( is.null(env) || ! is.environment(env) ) { | |
169 stop("calc_kmeans_one_dimension_one_k - argument 'env' is not an environment") | |
170 } | |
171 if ( ! exists("log_print", env) || ! is.function(env$log_print) ) { | |
172 stop("calc_kmeans_one_dimension_one_k - argument 'env' - environment does not include log_print or it is not a function") | |
173 } | |
174 # abort if k is not as expected | |
175 if ( ! is.numeric(k) ) { | |
176 stop(sprintf("calc_kmeans_one_dimension_one_k - expected numeric argument 'k' but type is %s", typeof(k))) | |
177 } | |
178 k <- as.integer(k) | |
179 # abort if dimension is not as expected | |
180 if ( ! is.character(dimension) | |
181 || ! Reduce( f =`|`, x = sapply(X = c("features","samples"), FUN = `==`, dimension), init = FALSE) ) { | |
182 stop("calc_kmeans_one_dimension_one_k - argument 'dimension' is neither 'features' nor 'samples'") | |
183 } | |
184 dm <- env$dataMatrix | |
185 iter.max <- env$iter.max | |
186 nstart <- env$nstart | |
187 algorithm <- env$algorithm | |
188 dim_features <- dimension == "features" | |
189 # tryCatchFunc produces a list | |
190 # On success of expr(), tryCatchFunc produces | |
191 # list(success TRUE, value = expr(), msg = "") | |
192 # On failure of expr(), tryCatchFunc produces | |
193 # list(success = FALSE, value = NA, msg = "the error message") | |
194 result_list <- tryCatchFunc( expr = function() { | |
195 # kmeans clusters the rows; features are the columns of args_env$dataMatrix; samples, the rows | |
196 # - to calculate sample-clusters, no transposition is needed because samples are rows | |
197 # - to calculate feature-clusters, transposition is needed so that features will be the rows | |
198 if ( ! dim_features ) dm <- t(dm) | |
199 dm <- prepare.data.matrix( x.matrix = dm, data.transformation = function(x) { x } ) | |
200 # need to set.seed to get reproducible results from kmeans | |
201 set.seed(4567) | |
202 # do the k-means clustering | |
203 km <- kmeans( x = dm, centers = k, iter.max, nstart = nstart, algorithm = algorithm ) | |
204 scores <- | |
205 sprintf("%s\t%d\t%0.5e\t%0.5e\t%0.5f" | |
206 , dimension | |
207 , k | |
208 , km$totss | |
209 , km$betweenss | |
210 , km$betweenss/km$totss | |
211 ) | |
212 list(clusters = km$cluster, scores = scores) | |
213 }) | |
214 return ( result_list ) | |
215 } | |
216 |