Mercurial > repos > eschen42 > w4mkmeans
comparison w4mkmeans_wrapper.R @ 0:6ccbe18131a6 draft
planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 299e5c7fdb0d6eb0773f3660009f6d63c2082a8d
author | eschen42 |
---|---|
date | Tue, 08 Aug 2017 15:30:38 -0400 |
parents | |
children | 02cafb660b72 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6ccbe18131a6 |
---|---|
1 #!/usr/bin/env Rscript | |
2 | |
3 # references: | |
4 # what this does: | |
5 # - [stats::kmeans](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html) | |
6 # - [stats::p.adjust](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/p.adjust.html) | |
7 # how this does what it does: | |
8 # - [parallel::clusterApply](https://stat.ethz.ch/R-manual/R-devel/library/parallel/html/clusterApply.html) | |
9 | |
10 # invocation: | |
11 # Rscript $__tool_directory__/w4mkmeans_wrapper.R \ | |
12 # tool_directory $__tool_directory__ | |
13 # data_matrix_path '$dataMatrix_in' \ | |
14 # variable_metadata_path '$variableMetadata_in' \ | |
15 # sample_metadata_path '$sampleMetadata_in' \ | |
16 # kfeatures '$kfeatures' \ | |
17 # ksamples '$ksamples' \ | |
18 # iter_max '$iter_max' \ | |
19 # nstart '$nstart' \ | |
20 # algorithm '$algorithm' \ | |
21 # scores '$scores' \ | |
22 # sampleMetadata_out '$sampleMetadata_out' \ | |
23 # variableMetadata_out '$variableMetadata_out' \ | |
24 # slots "\${GALAXY_SLOTS:-1}" \ | |
25 # | |
26 # <inputs> | |
27 # <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" /> | |
28 # <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" /> | |
29 # <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable x metadata columns, separator: tab" /> | |
30 # <param name="kfeatures" label="K value(s) for features" type="text" value="0" help="Single or min,max value(s) for K for features (variables), or 0 for none." /> | |
31 # <param name="ksamples" label="K value(s) for samples" type="text" value="0" help="Single or min,max value(s) for K for samples, or 0 for none." /> | |
32 # <param name="iter_max" label="Max number of iterations" type="text" value="10" help="The maximum number of iterations allowed; default 10." /> | |
33 # <param name="nstart" label="Number of random sets" type="text" value="1" help="How many random sets should be chosen; default 1." /> | |
34 # <param name="algorithm" label="Algorithm for clustering" type="select" value = "Hartigan-Wong" help="K-means clustering algorithm, default 'Hartigan-Wong'; alternatives 'Lloyd', 'MacQueen'; 'Forgy' is a synonym for 'Lloyd', see stats::kmeans reference for further info and references."> | |
35 # <option value="Hartigan-Wong" selected="TRUE">Hartigan-Wong</option> | |
36 # <option value="Lloyd">Lloyd</option> | |
37 # <option value="MacQueen">MacQueen</option> | |
38 # <option value="Forgy">Forgy</option> | |
39 # </param> | |
40 # </inputs> | |
41 # <outputs> | |
42 # <data name="sampleMetadata_out" label="${tool.name}_${sampleMetadata_in.name}" format="tabular" ></data> | |
43 # <data name="variableMetadata_out" label="${tool.name}_${variableMetadata_in.name}" format="tabular" ></data> | |
44 # </outputs> | |
45 | |
46 ##------------------------ | |
47 ## libraries for this file | |
48 ##------------------------ | |
49 | |
50 library(batch) ## for 'parseCommandArgs' | |
51 | |
52 ##------------------- | |
53 ## Pre-initialization | |
54 ##------------------- | |
55 | |
56 argVc <- unlist(parseCommandArgs(evaluate=FALSE)) | |
57 if ( Reduce( `|`, grepl("tool_directory",names(argVc)) ) ) { | |
58 tool_directory <- as.character(argVc["tool_directory"]) | |
59 } else { | |
60 tool_directory <- "." | |
61 } | |
62 r_path <- function(f) paste( tool_directory, f, sep = "/" ) | |
63 | |
64 ##---------------------------------------------------------- | |
65 ## Computation - source general and module-specific routines | |
66 ##---------------------------------------------------------- | |
67 | |
68 log_print <- function(x, ...) { | |
69 cat( | |
70 format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z") | |
71 , " " | |
72 , c(x, ...) | |
73 , "\n" | |
74 , sep="" | |
75 , file=stderr() | |
76 ) | |
77 } | |
78 | |
79 # log_print(sprintf("tool_directory is %s", tool_directory)) | |
80 | |
81 w4m_general_purpose_routines_path <- r_path("w4m_general_purpose_routines.R") | |
82 # log_print(sprintf("w4m_general_purpose_routines_path is %s", w4m_general_purpose_routines_path)) | |
83 if ( ! file.exists(w4m_general_purpose_routines_path) ) { | |
84 log_print("cannot find file w4m_general_purpose_routines.R") | |
85 q(save = "no", status = 1, runLast = TRUE) | |
86 } | |
87 # log_print("sourcing ",w4m_general_purpose_routines_path) | |
88 source(w4m_general_purpose_routines_path) | |
89 if ( ! exists("prepare.data.matrix") ) { | |
90 log_print("'prepare.data.matrix' was not read from file w4m_general_purpose_routines.R") | |
91 q(save = "no", status = 1, runLast = TRUE) | |
92 } | |
93 | |
94 w4mkmeans_routines_path <- r_path("w4mkmeans_routines.R") | |
95 # log_print(sprintf("w4mkmeans_routines_path is %s", w4mkmeans_routines_path)) | |
96 if ( ! file.exists(w4mkmeans_routines_path) ) { | |
97 log_print("cannot find file w4mkmeans_routines.R") | |
98 q(save = "no", status = 1, runLast = TRUE) | |
99 } | |
100 # log_print("sourcing ",w4mkmeans_routines_path) | |
101 source(w4mkmeans_routines_path) | |
102 if ( ! exists("w4mkmeans") ) { | |
103 log_print("'w4mkmeans' was not read from file w4mkmeans_routines.R") | |
104 q(save = "no", status = 1, runLast = TRUE) | |
105 } | |
106 | |
107 ##----------------------------------------- | |
108 ## Computation - W4m data-suppport routines | |
109 ##----------------------------------------- | |
110 | |
111 # read_data_frame - read a w4m data frame from a tsv, with error handling | |
112 # e.g., data_matrix_input_env <- read_data_frame(dataMatrix_in, "data matrix input") | |
113 read_data_frame <- function(file_path, kind_string, failure_action = log_print) { | |
114 my.env <- new.env() | |
115 my.env$success <- FALSE | |
116 my.env$msg <- sprintf("no message reading %s", kind_string) | |
117 tryCatch( | |
118 expr = { | |
119 my.env$data <- utils::read.delim( fill = FALSE, file = file_path ) | |
120 my.env$success <- TRUE | |
121 } | |
122 , error = function(e) { | |
123 my.env$msg <<- sprintf("%s read failed", kind_string) | |
124 } | |
125 ) | |
126 if (!my.env$success) { | |
127 failure_action(my.env$msg) | |
128 } | |
129 return (my.env) | |
130 } | |
131 | |
132 # write_result - write a w4m data frame to a tsv | |
133 write_result <- function(result, file_path, kind_string, failure_action = log_print) { | |
134 my.env <- new.env() | |
135 my.env$success <- FALSE | |
136 my.env$msg <- sprintf("no message writing %s", kind_string) | |
137 tryCatch( | |
138 expr = { | |
139 write.table( | |
140 x = result | |
141 , sep = "\t" | |
142 , file = file_path | |
143 , quote = FALSE | |
144 , row.names = FALSE | |
145 ) | |
146 my.env$success <- TRUE | |
147 } | |
148 , error = function(e) { | |
149 my.env$msg <<- sprintf("%s write failed", kind_string) | |
150 } | |
151 ) | |
152 if (!my.env$success) { | |
153 failure_action(my.env$msg) | |
154 return (my.env) | |
155 } | |
156 return (my.env) | |
157 } | |
158 | |
159 # read the three input files | |
160 read_input_data <- function(env, failure_action = log_print) { | |
161 kind_string <- "none" | |
162 tryCatch( | |
163 expr = { | |
164 # read in the sample metadata | |
165 kind_string <- "sample metadata input" | |
166 smpl_metadata_input_env <- | |
167 read_data_frame( | |
168 file_path = env$sample_metadata_path | |
169 , kind_string = kind_string | |
170 , failure_action = failure_action | |
171 ) | |
172 if (!smpl_metadata_input_env$success) { | |
173 failure_action(smpl_metadata_input_env$msg) | |
174 return ( FALSE ) | |
175 } | |
176 env$sampleMetadata <- smpl_metadata_input_env$data | |
177 | |
178 # read in the variable metadata | |
179 kind_string <- "variable metadata input" | |
180 vrbl_metadata_input_env <- | |
181 read_data_frame( | |
182 file_path = env$variable_metadata_path | |
183 , kind_string = kind_string | |
184 , failure_action = failure_action | |
185 ) | |
186 if (!vrbl_metadata_input_env$success) { | |
187 failure_action(vrbl_metadata_input_env$msg) | |
188 return ( FALSE ) | |
189 } | |
190 env$variableMetadata <- vrbl_metadata_input_env$data | |
191 | |
192 # read in the data matrix | |
193 kind_string <- "data matrix input" | |
194 data_matrix_input_env <- | |
195 read_data_frame( | |
196 file_path = env$data_matrix_path | |
197 , kind_string = kind_string | |
198 , failure_action = failure_action | |
199 ) | |
200 if (!data_matrix_input_env$success) { | |
201 failure_action(data_matrix_input_env$msg) | |
202 return ( FALSE ) | |
203 } | |
204 # data frame for dataMatrix has rownames in first column | |
205 data_matrix_df <- data_matrix_input_env$data | |
206 rownames(data_matrix_df) <- data_matrix_df[,1] | |
207 data_matrix <- data_matrix_df[,2:ncol(data_matrix_df)] | |
208 env$dataMatrix <- as.matrix(data_matrix) | |
209 | |
210 } | |
211 , error = function(e) { | |
212 failure_action( sprintf("read_input_data failed for '%s' - %s", kind_string, format_error(e)) ) | |
213 return ( FALSE ) | |
214 } | |
215 ) | |
216 return ( TRUE ) | |
217 } | |
218 | |
219 | |
220 read_input_failure_action <- function(x, ...) { | |
221 log_print("Failure reading input for '", modNamC, "' Galaxy module call") | |
222 log_print(x, ...) | |
223 } | |
224 | |
225 ##-------------------------- | |
226 ## Computation - Entry Point | |
227 ##-------------------------- | |
228 | |
229 ##---------- | |
230 ## Constants | |
231 ##---------- | |
232 | |
233 modNamC <- "w4mkmeans" ## module name | |
234 | |
235 ## options | |
236 ##-------- | |
237 | |
238 # Set the handler for R error-handling | |
239 options( show.error.messages = F | |
240 , error = function () { | |
241 log_print( "Fatal error in '", modNamC, "': ", geterrmessage() ) | |
242 q( "no", 1, F ) | |
243 } | |
244 , warn = -1 | |
245 ) | |
246 | |
247 # strings as factors? - not by default! | |
248 # save old value | |
249 strAsFacL <- options()$stringsAsFactors | |
250 options(stringsAsFactors = FALSE) | |
251 | |
252 | |
253 ## log file | |
254 ##--------- | |
255 | |
256 log_print("Start of the '", modNamC, "' Galaxy module call") | |
257 | |
258 ## arguments | |
259 ##---------- | |
260 | |
261 args_env <- new.env() | |
262 | |
263 # files | |
264 | |
265 log_print("PARAMETERS (raw):") | |
266 invisible( | |
267 lapply( | |
268 X = 1:length(argVc) | |
269 , FUN = function(i) { | |
270 log_print(sprintf(" - %s: %s", names(argVc)[i], argVc[i])) | |
271 } | |
272 ) | |
273 ) | |
274 | |
275 # write.table(as.matrix(argVc), col.names=F, quote=F, sep='\t') | |
276 | |
277 ## output files | |
278 sampleMetadata_out <- as.character(argVc["sampleMetadata_out"]) | |
279 variableMetadata_out <- as.character(argVc["variableMetadata_out"]) | |
280 scores_out <- as.character(argVc["scores_out"]) | |
281 ## input files | |
282 args_env$data_matrix_path <- as.character(argVc["data_matrix_path"]) | |
283 args_env$variable_metadata_path <- as.character(argVc["variable_metadata_path"]) | |
284 args_env$sample_metadata_path <- as.character(argVc["sample_metadata_path"]) | |
285 | |
286 # other parameters | |
287 | |
288 # multi-string args - split csv: "1,2,3" -> c("1","2","3") | |
289 args_env$kfeatures <- strsplit(x = as.character(argVc['kfeatures']), split = ",", fixed = TRUE)[[1]] | |
290 args_env$ksamples <- strsplit(x = as.character(argVc['ksamples' ]), split = ",", fixed = TRUE)[[1]] | |
291 # numeric args | |
292 args_env$iter_max <- as.numeric( argVc['iter_max' ]) | |
293 args_env$nstart <- as.numeric( argVc['nstart' ]) | |
294 args_env$slots <- as.numeric( argVc['slots' ]) | |
295 # string args | |
296 args_env$algorithm <- as.character( argVc['algorithm']) | |
297 args_env$log_print <- log_print | |
298 | |
299 log_print("PARAMETERS (parsed):") | |
300 for (member in ls(args_env)) { | |
301 value <- get(member, args_env) | |
302 value <- ifelse(length(value) == 1, value, sprintf("c(%s)", paste(value, collapse=", "))) | |
303 | |
304 log_print(sprintf(" - %s: %s", member, ifelse( !is.function(value) , value, "function" ))) | |
305 } | |
306 log_print("") | |
307 | |
308 ##--------------------------------------------------------- | |
309 ## Computation - attempt to read input data | |
310 ##--------------------------------------------------------- | |
311 if ( ! read_input_data(args_env, failure_action = read_input_failure_action) ) { | |
312 result <- -1 | |
313 } else { | |
314 log_print("Input data was read successfully.") | |
315 result <- w4mkmeans(env = args_env) | |
316 log_print("returned from call to w4mkmeans.") | |
317 } | |
318 | |
319 if ( length(result) == 0 ) { | |
320 log_print("no results were produced") | |
321 # exit with status code non-zero to indicate error | |
322 q(save = "no", status = 1, runLast = FALSE) | |
323 } else if ( ! setequal(names(result),c("variableMetadata","sampleMetadata","scores")) ) { | |
324 log_print(sprintf("unexpected result keys %s", names(result))) | |
325 # exit with status code non-zero to indicate error | |
326 q(save = "no", status = 1, runLast = FALSE) | |
327 } else if ( ! write_result(result = result$variableMetadata, file_path = variableMetadata_out, kind_string = "clustered variableMetadata")$success ) { | |
328 log_print("failed to write output file for clustered variableMetadata") | |
329 # exit with status code non-zero to indicate error | |
330 q(save = "no", status = 1, runLast = FALSE) | |
331 } else if ( ! write_result(result = result$sampleMetadata, file_path = sampleMetadata_out, kind_string = "clustered sampleMetadata")$success ) { | |
332 log_print("failed to write output file for clustered sampleMetadata") | |
333 # exit with status code non-zero to indicate error | |
334 q(save = "no", status = 1, runLast = FALSE) | |
335 } else { | |
336 tryCatch( | |
337 expr = { | |
338 fileConn<-file(scores_out) | |
339 writeLines(result$scores, fileConn) | |
340 close(fileConn) | |
341 } | |
342 , error = function(e) { | |
343 log_print(sprintf("failed to write output file for cluster scores - %s", format_error(e))) | |
344 # exit with status code non-zero to indicate error | |
345 q(save = "no", status = 1, runLast = FALSE) | |
346 } | |
347 ) | |
348 } | |
349 | |
350 ##-------- | |
351 ## Closing | |
352 ##-------- | |
353 | |
354 | |
355 if (!file.exists(sampleMetadata_out)) { | |
356 log_print(sprintf("ERROR %s::w4m_kmeans_wrapper - file '%s' was not created", modNamC, sampleMetadata_out)) | |
357 } | |
358 | |
359 if (!file.exists(variableMetadata_out)) { | |
360 log_print(sprintf("ERROR %s::w4m_kmeans_wrapper - file '%s' was not created", modNamC, variableMetadata_out)) | |
361 } | |
362 | |
363 if (!file.exists(scores_out)) { | |
364 log_print(sprintf("ERROR %s::w4m_kmeans_wrapper - file '%s' was not created", modNamC, scores_out)) | |
365 } | |
366 | |
367 log_print("Normal termination of '", modNamC, "' Galaxy module call") | |
368 | |
369 # exit with status code zero | |
370 q(save = "no", status = 0, runLast = FALSE) |