Mercurial > repos > eschen42 > w4mkmeans
view w4mkmeans_wrapper.R @ 2:c415b7dc6f37 draft default tip
planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 3e916537da6bb37e6f3927d7a11e98e0ab6ef5ec
author | eschen42 |
---|---|
date | Mon, 05 Mar 2018 12:40:17 -0500 |
parents | 02cafb660b72 |
children |
line wrap: on
line source
#!/usr/bin/env Rscript # references: # what this does: # - [stats::kmeans](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html) # - [stats::p.adjust](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/p.adjust.html) # how this does what it does: # - [parallel::clusterApply](https://stat.ethz.ch/R-manual/R-devel/library/parallel/html/clusterApply.html) # invocation: # Rscript w4mkmeans_wrapper.R \ # algorithm "$algorithm" \ # categorical_prefix "$categorical_prefix" \ # data_matrix_path "$dataMatrix_in" \ # iter_max "$iter_max" \ # kfeatures "$kfeatures" \ # ksamples "$ksamples" \ # nstart "$nstart" \ # sampleMetadata_out "$sampleMetadata_out" \ # sample_metadata_path "$sampleMetadata_in" \ # scores_out "$scores_out" \ # slots "${GALAXY_SLOTS:-1}" \ # variableMetadata_out "$variableMetadata_out" \ # variable_metadata_path "$variableMetadata_in" # # <inputs> # <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" /> # <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" /> # <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable x metadata columns, separator: tab" /> # <param name="categoricalPrefix" label="prefix for cluster names " type="text" value="k" help="Some tools require non-numeric values to discern categorical; e.g., enter 'k' here to prepend 'k' to cluster numbers in the output; default 'k'." /> # <param name="kfeatures" label="K value(s) for features" type="text" value="0" help="Single or min,max value(s) for K for features (variables), or 0 for none." /> # <param name="ksamples" label="K value(s) for samples" type="text" value="0" help="Single or min,max value(s) for K for samples, or 0 for none." /> # <param name="iter_max" label="Max number of iterations" type="text" value="20" help="The maximum number of iterations allowed; default 20." /> # <param name="nstart" label="Number of random sets" type="text" value="20" help="How many random sets should be chosen; default 20." /> # <param name="algorithm" label="Algorithm for clustering" type="select" value = "Hartigan-Wong" help="K-means clustering algorithm, default 'Hartigan-Wong'; alternatives 'Lloyd', 'MacQueen'; 'Forgy' is a synonym for 'Lloyd', see stats::kmeans reference for further info and references."> # <option value="Hartigan-Wong" selected="TRUE">Hartigan-Wong</option> # <option value="Lloyd">Lloyd</option> # <option value="MacQueen">MacQueen</option> # <option value="Forgy">Forgy</option> # </param> # </inputs> # <outputs> # <data name="sampleMetadata_out" label="${tool.name}_${sampleMetadata_in.name}" format="tabular" ></data> # <data name="variableMetadata_out" label="${tool.name}_${variableMetadata_in.name}" format="tabular" ></data> # </outputs> ##------------------------ ## libraries for this file ##------------------------ library(batch) ## for 'parseCommandArgs' ##------------------- ## Pre-initialization ##------------------- argVc <- unlist(parseCommandArgs(evaluate=FALSE)) if ( Reduce( `|`, grepl("tool_directory",names(argVc)) ) ) { tool_directory <- as.character(argVc["tool_directory"]) } else { tool_directory <- "." } r_path <- function(f) paste( tool_directory, f, sep = "/" ) ##---------------------------------------------------------- ## Computation - source general and module-specific routines ##---------------------------------------------------------- log_print <- function(x, ...) { cat( format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z") , " " , c(x, ...) , "\n" , sep="" , file=stderr() ) } log_cat <- function(x, ...) { cat( c(x, ...) , "\n" , sep="" , file=stderr() ) } # log_print(sprintf("tool_directory is %s", tool_directory)) w4m_general_purpose_routines_path <- r_path("w4m_general_purpose_routines.R") # log_print(sprintf("w4m_general_purpose_routines_path is %s", w4m_general_purpose_routines_path)) if ( ! file.exists(w4m_general_purpose_routines_path) ) { log_print("cannot find file w4m_general_purpose_routines.R") q(save = "no", status = 1, runLast = TRUE) } log_print("sourcing ",w4m_general_purpose_routines_path) source(w4m_general_purpose_routines_path) if ( ! exists("prepare.data.matrix") ) { log_print("'prepare.data.matrix' was not read from file w4m_general_purpose_routines.R") q(save = "no", status = 1, runLast = TRUE) } w4mkmeans_routines_path <- r_path("w4mkmeans_routines.R") # log_print(sprintf("w4mkmeans_routines_path is %s", w4mkmeans_routines_path)) if ( ! file.exists(w4mkmeans_routines_path) ) { log_print("cannot find file w4mkmeans_routines.R") q(save = "no", status = 1, runLast = TRUE) } # log_print("sourcing ",w4mkmeans_routines_path) source(w4mkmeans_routines_path) if ( ! exists("w4mkmeans") ) { log_print("'w4mkmeans' was not read from file w4mkmeans_routines.R") q(save = "no", status = 1, runLast = TRUE) } ##----------------------------------------- ## Computation - W4m data-suppport routines ##----------------------------------------- # read_data_frame - read a w4m data frame from a tsv, with error handling # e.g., data_matrix_input_env <- read_data_frame(dataMatrix_in, "data matrix input") read_data_frame <- function(file_path, kind_string, failure_action = log_print) { my.env <- new.env() my.env$success <- FALSE my.env$msg <- sprintf("no message reading %s", kind_string) tryCatch( expr = { my.env$data <- utils::read.delim( fill = FALSE, file = file_path ) my.env$success <- TRUE } , error = function(e) { my.env$msg <<- sprintf("%s read failed", kind_string) } ) if (!my.env$success) { failure_action(my.env$msg) } return (my.env) } # write_result - write a w4m data frame to a tsv write_result <- function(result, file_path, kind_string, failure_action = log_print) { my.env <- new.env() my.env$success <- FALSE my.env$msg <- sprintf("no message writing %s", kind_string) tryCatch( expr = { write.table( x = result , sep = "\t" , file = file_path , quote = FALSE , row.names = FALSE ) my.env$success <- TRUE } , error = function(e) { my.env$msg <<- sprintf("%s write failed", kind_string) } ) if (!my.env$success) { failure_action(my.env$msg) return (my.env) } return (my.env) } # read the three input files read_input_data <- function(env, failure_action = log_print) { kind_string <- "none" tryCatch( expr = { # read in the sample metadata kind_string <- "sample metadata input" smpl_metadata_input_env <- read_data_frame( file_path = env$sample_metadata_path , kind_string = kind_string , failure_action = failure_action ) if (!smpl_metadata_input_env$success) { failure_action(smpl_metadata_input_env$msg) return ( FALSE ) } env$sampleMetadata <- smpl_metadata_input_env$data # read in the variable metadata kind_string <- "variable metadata input" vrbl_metadata_input_env <- read_data_frame( file_path = env$variable_metadata_path , kind_string = kind_string , failure_action = failure_action ) if (!vrbl_metadata_input_env$success) { failure_action(vrbl_metadata_input_env$msg) return ( FALSE ) } env$variableMetadata <- vrbl_metadata_input_env$data # read in the data matrix kind_string <- "data matrix input" data_matrix_input_env <- read_data_frame( file_path = env$data_matrix_path , kind_string = kind_string , failure_action = failure_action ) if (!data_matrix_input_env$success) { failure_action(data_matrix_input_env$msg) return ( FALSE ) } # data frame for dataMatrix has rownames in first column data_matrix_df <- data_matrix_input_env$data rownames(data_matrix_df) <- data_matrix_df[,1] data_matrix <- data_matrix_df[,2:ncol(data_matrix_df)] env$dataMatrix <- as.matrix(data_matrix) } , error = function(e) { failure_action( sprintf("read_input_data failed for '%s' - %s", kind_string, format_error(e)) ) return ( FALSE ) } ) return ( TRUE ) } read_input_failure_action <- function(x, ...) { log_print("Failure reading input for '", modNamC, "' Galaxy module call") log_print(x, ...) } ##-------------------------- ## Computation - Entry Point ##-------------------------- ##---------- ## Constants ##---------- modNamC <- "w4mkmeans" ## module name ## options ##-------- # Set the handler for R error-handling options( show.error.messages = F , error = function () { log_print( "Fatal error in '", modNamC, "': ", geterrmessage() ) q( "no", 1, F ) } , warn = -1 ) # strings as factors? - not by default! # save old value strAsFacL <- options()$stringsAsFactors options(stringsAsFactors = FALSE) ## log file ##--------- log_print("Start of the '", modNamC, "' Galaxy module call") ## arguments ##---------- args_env <- new.env() # files log_print("PARAMETERS (raw):") invisible( lapply( X = 1:length(argVc) , FUN = function(i) { log_print(sprintf(" - %s: %s", names(argVc)[i], argVc[i])) } ) ) # write.table(as.matrix(argVc), col.names=F, quote=F, sep='\t') ## output files sampleMetadata_out <- as.character(argVc["sampleMetadata_out"]) variableMetadata_out <- as.character(argVc["variableMetadata_out"]) scores_out <- as.character(argVc["scores_out"]) ## input files args_env$data_matrix_path <- as.character(argVc["data_matrix_path"]) args_env$variable_metadata_path <- as.character(argVc["variable_metadata_path"]) args_env$sample_metadata_path <- as.character(argVc["sample_metadata_path"]) # other parameters # multi-string args - split csv: "1,2,3" -> c("1","2","3") args_env$kfeatures <- strsplit(x = as.character(argVc['kfeatures']), split = ",", fixed = TRUE)[[1]] args_env$ksamples <- strsplit(x = as.character(argVc['ksamples' ]), split = ",", fixed = TRUE)[[1]] # numeric args args_env$iter_max <- as.numeric( argVc['iter_max' ]) args_env$nstart <- as.numeric( argVc['nstart' ]) args_env$slots <- as.numeric( argVc['slots' ]) # string args args_env$algorithm <- as.character( argVc['algorithm']) args_env$categorical_prefix <- as.character( argVc['categorical_prefix']) # make local 'log_print' function available through 'env' args_env$log_print <- log_print log_print("PARAMETERS (parsed):") for (member in ls(args_env)) { value <- get(member, args_env) value <- ifelse(length(value) == 1, value, sprintf("c(%s)", paste(value, collapse=", "))) log_print(sprintf(" - %s: %s", member, ifelse( !is.function(value) , value, "function" ))) } log_print("") ##--------------------------------------------------------- ## Computation - attempt to read input data and process ##--------------------------------------------------------- if ( ! read_input_data(args_env, failure_action = read_input_failure_action) ) { result <- -1 } else { log_print("Input data was read.") # attempt to process the data result <- w4mkmeans(env = args_env) log_print("Returned from call to w4mkmeans.") } if ( length(result) == 0 ) { log_print("no results were produced") # exit with status code non-zero to indicate error q(save = "no", status = 1, runLast = FALSE) } else if ( ! setequal(names(result),c("variableMetadata","sampleMetadata","scores")) ) { log_print(sprintf("unexpected result keys %s", names(result))) # exit with status code non-zero to indicate error q(save = "no", status = 1, runLast = FALSE) } else if ( ! write_result(result = result$variableMetadata, file_path = variableMetadata_out, kind_string = "clustered variableMetadata")$success ) { log_print("failed to write output file for clustered variableMetadata") # exit with status code non-zero to indicate error q(save = "no", status = 1, runLast = FALSE) } else if ( ! write_result(result = result$sampleMetadata, file_path = sampleMetadata_out, kind_string = "clustered sampleMetadata")$success ) { log_print("failed to write output file for clustered sampleMetadata") # exit with status code non-zero to indicate error q(save = "no", status = 1, runLast = FALSE) } else { tryCatch( expr = { fileConn<-file(scores_out) writeLines(result$scores, fileConn) close(fileConn) } , error = function(e) { log_print(sprintf("failed to write output file for cluster scores - %s", format_error(e))) # exit with status code non-zero to indicate error q(save = "no", status = 1, runLast = FALSE) } ) } ##-------- ## Closing ##-------- if (!file.exists(sampleMetadata_out)) { log_print(sprintf("ERROR %s::w4m_kmeans_wrapper - file '%s' was not created", modNamC, sampleMetadata_out)) } if (!file.exists(variableMetadata_out)) { log_print(sprintf("ERROR %s::w4m_kmeans_wrapper - file '%s' was not created", modNamC, variableMetadata_out)) } if (!file.exists(scores_out)) { log_print(sprintf("ERROR %s::w4m_kmeans_wrapper - file '%s' was not created", modNamC, scores_out)) } log_print("Normal termination of '", modNamC, "' Galaxy module call") # exit with status code zero q(save = "no", status = 0, runLast = FALSE)