Mercurial > repos > eschen42 > w4mkmeans
comparison w4mkmeans_routines.R @ 1:02cafb660b72 draft
planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit f600ce8a783df16e49272341dce0fc6bbc299b0a
author | eschen42 |
---|---|
date | Wed, 09 Aug 2017 18:06:55 -0400 |
parents | 6ccbe18131a6 |
children | c415b7dc6f37 |
comparison
equal
deleted
inserted
replaced
0:6ccbe18131a6 | 1:02cafb660b72 |
---|---|
8 return ( | 8 return ( |
9 c( | 9 c( |
10 "w4mkmeans: bad input.", | 10 "w4mkmeans: bad input.", |
11 "# contract:", | 11 "# contract:", |
12 " required - caller will provide an environment comprising:", | 12 " required - caller will provide an environment comprising:", |
13 " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...", | 13 " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...", |
14 " variableMetadata - the corresponding W4M data.frame having feature metadata", | 14 " variableMetadata - the corresponding W4M data.frame having feature metadata", |
15 " sampleMetdata - the corresponding W4M data.frame having sample metadata", | 15 " sampleMetdata - the corresponding W4M data.frame having sample metadata", |
16 " dataMatrix - the corresponding W4M matrix", | 16 " dataMatrix - the corresponding W4M matrix", |
17 " slots - the number of parallel slots for calculating kmeans", | 17 " slots - the number of parallel slots for calculating kmeans", |
18 " optional - environment may comprise:", | 18 " optional - environment may comprise:", |
19 " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)", | 19 " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)", |
20 " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)", | 20 " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)", |
21 " iter.max - the maximum number of iterations when calculating a cluster (default = 10)", | 21 " iter.max - the maximum number of iterations when calculating a cluster (default = 10)", |
22 " nstart - how many random sets of centers should be chosen (default = 1)", | 22 " nstart - how many random sets of centers should be chosen (default = 1)", |
23 " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", | 23 " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", |
24 " categorical_prefix - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", | |
24 " ", | 25 " ", |
25 " this routine will return a list comprising:", | 26 " this routine will return a list comprising:", |
26 " variableMetadata - the input variableMetadata data.frame with updates, if any", | 27 " variableMetadata - the input variableMetadata data.frame with updates, if any", |
27 " sampleMetadata - the input sampleMetadata data.frame with updates, if any", | 28 " sampleMetadata - the input sampleMetadata data.frame with updates, if any", |
28 " scores - an array of strings, each representing a line of a tsv having the following header:", | 29 " scores - an array of strings, each representing a line of a tsv having the following header:", |
29 " clusterOn TAB k TAB totalSS TAB betweenSS TAB proportion" | 30 " clusterOn TAB k TAB totalSS TAB betweenSS TAB proportion" |
30 ) | 31 ) |
31 ) | 32 ) |
32 } | 33 } |
33 | 34 |
34 w4mkmeans <- function(env) { | 35 w4mkmeans <- function(env) { |
35 # abort if 'env' is null or is not an environment | 36 # abort if 'env' is null or is not an environment |
36 if ( is.null(env) || ! is.environment(env) ) { | 37 if ( is.null(env) || ! is.environment(env) ) { |
37 lapply(w4kmeans_usage(),print) | 38 lapply(w4kmeans_usage(),print) |
38 } | 39 } |
39 # supply default arguments | 40 # supply default arguments |
40 if ( ! exists("iter.max" , env) ) env$iter.max <- 10 | 41 if ( ! exists("iter.max" , env) ) env$iter.max <- 10 |
41 if ( ! exists("nstart" , env) ) env$nstart <- 1 | 42 if ( ! exists("nstart" , env) ) env$nstart <- 1 |
42 if ( ! exists("algorithm", env) ) env$algorithm <- 'Hartigan-Wong' | 43 if ( ! exists("algorithm" , env) ) env$algorithm <- 'Hartigan-Wong' |
43 if ( ! exists("ksamples" , env) ) env$ksamples <- c() | 44 if ( ! exists("categorical_prefix", env) ) env$categorical_prefix <- 'k' |
44 if ( ! exists("kfeatures", env) ) env$kfeatures <- c() | 45 if ( ! exists("ksamples" , env) ) env$ksamples <- c() |
46 if ( ! exists("kfeatures" , env) ) env$kfeatures <- c() | |
45 # check mandatory arguments | 47 # check mandatory arguments |
46 expected <- c( | 48 expected <- c( |
47 "log_print" | 49 "log_print" |
48 , "variableMetadata" | 50 , "variableMetadata" |
49 , "sampleMetadata" | 51 , "sampleMetadata" |
59 # extract parameters from 'env' | 61 # extract parameters from 'env' |
60 failure_action <- env$log_print | 62 failure_action <- env$log_print |
61 scores <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" ) | 63 scores <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" ) |
62 sampleMetadata <- env$sampleMetadata | 64 sampleMetadata <- env$sampleMetadata |
63 featureMetadata <- env$variableMetadata | 65 featureMetadata <- env$variableMetadata |
64 ksamples <- as.numeric(env$ksamples) | |
65 kfeatures <- as.numeric(env$kfeatures) | |
66 slots <- env$slots | 66 slots <- env$slots |
67 positive_ints <- function(a, what) { | |
68 i <- as.integer(a) # may introduce NAs by coercion | |
69 i <- i[!is.na(i)] # eliminate NAs | |
70 i <- i[i > 0] # eliminate non-positive integers | |
71 i <- unique(sort(i)) # eliminate redundancy and disorder | |
72 if (length(a)!=length(i)) { | |
73 failure_action("Some values for '", what, "' were skipped where not unique, not positive, or not convertible to an integer.") | |
74 } | |
75 return (i) # return results, if any | |
76 } | |
77 ksamples <- positive_ints(env$ksamples , "ksamples") | |
78 kfeatures <- positive_ints(env$kfeatures, "kfeatures") | |
67 | 79 |
68 myLapply <- parLapply | 80 myLapply <- parLapply |
69 # uncomment the next line to mimic parLapply, but without parallelization (for testing/experimentation) | 81 # uncomment the next line to mimic parLapply, but without parallelization (for testing/experimentation) |
70 # myLapply <- function(cl, ...) lapply(...) | 82 # myLapply <- function(cl, ...) lapply(...) |
71 cl <- NULL | 83 cl <- NULL |
111 , dimension = "samples" | 123 , dimension = "samples" |
112 ) | 124 ) |
113 for ( i in 1:ksamples_length ) { | 125 for ( i in 1:ksamples_length ) { |
114 result <- smpl_result_list[[i]] | 126 result <- smpl_result_list[[i]] |
115 if (result$success) { | 127 if (result$success) { |
116 sampleMetadata[sprintf("k%d",ksamples[i])] <- result$value$clusters | 128 sampleMetadata[sprintf("k%d",ksamples[i])] <- sprintf("%s%d", env$categorical_prefix, result$value$clusters) |
117 scores <- c(scores, result$value$scores) | 129 scores <- c(scores, result$value$scores) |
118 } | 130 } |
119 } | 131 } |
120 } | 132 } |
121 | 133 |
130 , dimension = "features" | 142 , dimension = "features" |
131 ) | 143 ) |
132 for ( i in 1:kfeatures_length ) { | 144 for ( i in 1:kfeatures_length ) { |
133 result <- feat_result_list[[i]] | 145 result <- feat_result_list[[i]] |
134 if (result$success) { | 146 if (result$success) { |
135 featureMetadata[sprintf("k%d",kfeatures[i])] <- result$value$clusters | 147 featureMetadata[sprintf("k%d",kfeatures[i])] <- sprintf("%s%d", env$categorical_prefix, result$value$clusters) |
136 scores <- c(scores, result$value$scores) | 148 scores <- c(scores, result$value$scores) |
137 } | 149 } |
138 } | 150 } |
139 } | 151 } |
140 | 152 |