comparison w4mkmeans_routines.R @ 1:02cafb660b72 draft

planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit f600ce8a783df16e49272341dce0fc6bbc299b0a
author eschen42
date Wed, 09 Aug 2017 18:06:55 -0400
parents 6ccbe18131a6
children c415b7dc6f37
comparison
equal deleted inserted replaced
0:6ccbe18131a6 1:02cafb660b72
8 return ( 8 return (
9 c( 9 c(
10 "w4mkmeans: bad input.", 10 "w4mkmeans: bad input.",
11 "# contract:", 11 "# contract:",
12 " required - caller will provide an environment comprising:", 12 " required - caller will provide an environment comprising:",
13 " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...", 13 " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...",
14 " variableMetadata - the corresponding W4M data.frame having feature metadata", 14 " variableMetadata - the corresponding W4M data.frame having feature metadata",
15 " sampleMetdata - the corresponding W4M data.frame having sample metadata", 15 " sampleMetdata - the corresponding W4M data.frame having sample metadata",
16 " dataMatrix - the corresponding W4M matrix", 16 " dataMatrix - the corresponding W4M matrix",
17 " slots - the number of parallel slots for calculating kmeans", 17 " slots - the number of parallel slots for calculating kmeans",
18 " optional - environment may comprise:", 18 " optional - environment may comprise:",
19 " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)", 19 " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)",
20 " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)", 20 " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)",
21 " iter.max - the maximum number of iterations when calculating a cluster (default = 10)", 21 " iter.max - the maximum number of iterations when calculating a cluster (default = 10)",
22 " nstart - how many random sets of centers should be chosen (default = 1)", 22 " nstart - how many random sets of centers should be chosen (default = 1)",
23 " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", 23 " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)",
24 " categorical_prefix - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)",
24 " ", 25 " ",
25 " this routine will return a list comprising:", 26 " this routine will return a list comprising:",
26 " variableMetadata - the input variableMetadata data.frame with updates, if any", 27 " variableMetadata - the input variableMetadata data.frame with updates, if any",
27 " sampleMetadata - the input sampleMetadata data.frame with updates, if any", 28 " sampleMetadata - the input sampleMetadata data.frame with updates, if any",
28 " scores - an array of strings, each representing a line of a tsv having the following header:", 29 " scores - an array of strings, each representing a line of a tsv having the following header:",
29 " clusterOn TAB k TAB totalSS TAB betweenSS TAB proportion" 30 " clusterOn TAB k TAB totalSS TAB betweenSS TAB proportion"
30 ) 31 )
31 ) 32 )
32 } 33 }
33 34
34 w4mkmeans <- function(env) { 35 w4mkmeans <- function(env) {
35 # abort if 'env' is null or is not an environment 36 # abort if 'env' is null or is not an environment
36 if ( is.null(env) || ! is.environment(env) ) { 37 if ( is.null(env) || ! is.environment(env) ) {
37 lapply(w4kmeans_usage(),print) 38 lapply(w4kmeans_usage(),print)
38 } 39 }
39 # supply default arguments 40 # supply default arguments
40 if ( ! exists("iter.max" , env) ) env$iter.max <- 10 41 if ( ! exists("iter.max" , env) ) env$iter.max <- 10
41 if ( ! exists("nstart" , env) ) env$nstart <- 1 42 if ( ! exists("nstart" , env) ) env$nstart <- 1
42 if ( ! exists("algorithm", env) ) env$algorithm <- 'Hartigan-Wong' 43 if ( ! exists("algorithm" , env) ) env$algorithm <- 'Hartigan-Wong'
43 if ( ! exists("ksamples" , env) ) env$ksamples <- c() 44 if ( ! exists("categorical_prefix", env) ) env$categorical_prefix <- 'k'
44 if ( ! exists("kfeatures", env) ) env$kfeatures <- c() 45 if ( ! exists("ksamples" , env) ) env$ksamples <- c()
46 if ( ! exists("kfeatures" , env) ) env$kfeatures <- c()
45 # check mandatory arguments 47 # check mandatory arguments
46 expected <- c( 48 expected <- c(
47 "log_print" 49 "log_print"
48 , "variableMetadata" 50 , "variableMetadata"
49 , "sampleMetadata" 51 , "sampleMetadata"
59 # extract parameters from 'env' 61 # extract parameters from 'env'
60 failure_action <- env$log_print 62 failure_action <- env$log_print
61 scores <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" ) 63 scores <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" )
62 sampleMetadata <- env$sampleMetadata 64 sampleMetadata <- env$sampleMetadata
63 featureMetadata <- env$variableMetadata 65 featureMetadata <- env$variableMetadata
64 ksamples <- as.numeric(env$ksamples)
65 kfeatures <- as.numeric(env$kfeatures)
66 slots <- env$slots 66 slots <- env$slots
67 positive_ints <- function(a, what) {
68 i <- as.integer(a) # may introduce NAs by coercion
69 i <- i[!is.na(i)] # eliminate NAs
70 i <- i[i > 0] # eliminate non-positive integers
71 i <- unique(sort(i)) # eliminate redundancy and disorder
72 if (length(a)!=length(i)) {
73 failure_action("Some values for '", what, "' were skipped where not unique, not positive, or not convertible to an integer.")
74 }
75 return (i) # return results, if any
76 }
77 ksamples <- positive_ints(env$ksamples , "ksamples")
78 kfeatures <- positive_ints(env$kfeatures, "kfeatures")
67 79
68 myLapply <- parLapply 80 myLapply <- parLapply
69 # uncomment the next line to mimic parLapply, but without parallelization (for testing/experimentation) 81 # uncomment the next line to mimic parLapply, but without parallelization (for testing/experimentation)
70 # myLapply <- function(cl, ...) lapply(...) 82 # myLapply <- function(cl, ...) lapply(...)
71 cl <- NULL 83 cl <- NULL
111 , dimension = "samples" 123 , dimension = "samples"
112 ) 124 )
113 for ( i in 1:ksamples_length ) { 125 for ( i in 1:ksamples_length ) {
114 result <- smpl_result_list[[i]] 126 result <- smpl_result_list[[i]]
115 if (result$success) { 127 if (result$success) {
116 sampleMetadata[sprintf("k%d",ksamples[i])] <- result$value$clusters 128 sampleMetadata[sprintf("k%d",ksamples[i])] <- sprintf("%s%d", env$categorical_prefix, result$value$clusters)
117 scores <- c(scores, result$value$scores) 129 scores <- c(scores, result$value$scores)
118 } 130 }
119 } 131 }
120 } 132 }
121 133
130 , dimension = "features" 142 , dimension = "features"
131 ) 143 )
132 for ( i in 1:kfeatures_length ) { 144 for ( i in 1:kfeatures_length ) {
133 result <- feat_result_list[[i]] 145 result <- feat_result_list[[i]]
134 if (result$success) { 146 if (result$success) {
135 featureMetadata[sprintf("k%d",kfeatures[i])] <- result$value$clusters 147 featureMetadata[sprintf("k%d",kfeatures[i])] <- sprintf("%s%d", env$categorical_prefix, result$value$clusters)
136 scores <- c(scores, result$value$scores) 148 scores <- c(scores, result$value$scores)
137 } 149 }
138 } 150 }
139 } 151 }
140 152