Mercurial > repos > eschen42 > w4mkmeans

diff w4mkmeans_routines.R @ 1:02cafb660b72 draft
planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit f600ce8a783df16e49272341dce0fc6bbc299b0a
author: eschen42
date: Wed, 09 Aug 2017 18:06:55 -0400
parents: 6ccbe18131a6
children: c415b7dc6f37
--- a/w4mkmeans_routines.R	Tue Aug 08 15:30:38 2017 -0400
+++ b/w4mkmeans_routines.R	Wed Aug 09 18:06:55 2017 -0400
@@ -10,23 +10,24 @@
      "w4mkmeans: bad input.",
      "# contract:",
      "    required - caller will provide an environment comprising:",
-     "      log_print        - a logging function with the signature function(x, ...) expecting strings as x and ...",
-     "      variableMetadata - the corresponding W4M data.frame having feature metadata",
-     "      sampleMetdata    - the corresponding W4M data.frame having sample metadata",
-     "      dataMatrix       - the corresponding W4M matrix",
-     "      slots            - the number of parallel slots for calculating kmeans",
+     "      log_print          - a logging function with the signature function(x, ...) expecting strings as x and ...",
+     "      variableMetadata   - the corresponding W4M data.frame having feature metadata",
+     "      sampleMetdata      - the corresponding W4M data.frame having sample metadata",
+     "      dataMatrix         - the corresponding W4M matrix",
+     "      slots              - the number of parallel slots for calculating kmeans",
      "    optional - environment may comprise:",
-     "      kfeatures        - an array of integers, the k's to apply for clustering by feature (default, empty array)",
-     "      ksamples         - an array of integers, the k's to apply for clustering by sample (default, empty array)",
-     "      iter.max         - the maximum number of iterations when calculating a cluster (default = 10)",
-     "      nstart           - how many random sets of centers should be chosen (default = 1)",
-     "      algorithm        - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)",
+     "      kfeatures          - an array of integers, the k's to apply for clustering by feature (default, empty array)",
+     "      ksamples           - an array of integers, the k's to apply for clustering by sample (default, empty array)",
+     "      iter.max           - the maximum number of iterations when calculating a cluster (default = 10)",
+     "      nstart             - how many random sets of centers should be chosen (default = 1)",
+     "      algorithm          - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)",
+     "      categorical_prefix - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)",
      "      ",
      "    this routine will return a list comprising:",
-     "      variableMetadata - the input variableMetadata data.frame with updates, if any",
-     "      sampleMetadata   - the input sampleMetadata data.frame with updates, if any",
-     "      scores           - an array of strings, each representing a line of a tsv having the following header:",
-     "                           clusterOn TAB k TAB totalSS TAB betweenSS TAB proportion"
+     "      variableMetadata   - the input variableMetadata data.frame with updates, if any",
+     "      sampleMetadata     - the input sampleMetadata data.frame with updates, if any",
+     "      scores             - an array of strings, each representing a line of a tsv having the following header:",
+     "                             clusterOn TAB k TAB totalSS TAB betweenSS TAB proportion"
     )
   )
 }
@@ -37,11 +38,12 @@
     lapply(w4kmeans_usage(),print)
   } 
   # supply default arguments
-  if ( ! exists("iter.max" , env) ) env$iter.max  <- 10
-  if ( ! exists("nstart"   , env) ) env$nstart    <- 1
-  if ( ! exists("algorithm", env) ) env$algorithm <- 'Hartigan-Wong'
-  if ( ! exists("ksamples" , env) ) env$ksamples  <- c()
-  if ( ! exists("kfeatures", env) ) env$kfeatures <- c()
+  if ( ! exists("iter.max"          , env) ) env$iter.max  <- 10
+  if ( ! exists("nstart"            , env) ) env$nstart    <- 1
+  if ( ! exists("algorithm"         , env) ) env$algorithm <- 'Hartigan-Wong'
+  if ( ! exists("categorical_prefix", env) ) env$categorical_prefix <- 'k'
+  if ( ! exists("ksamples"          , env) ) env$ksamples  <- c()
+  if ( ! exists("kfeatures"         , env) ) env$kfeatures <- c()
   # check mandatory arguments
   expected <- c(
     "log_print"
@@ -61,9 +63,19 @@
   scores          <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" )
   sampleMetadata  <- env$sampleMetadata
   featureMetadata <- env$variableMetadata
-  ksamples        <- as.numeric(env$ksamples)
-  kfeatures       <- as.numeric(env$kfeatures)
   slots           <- env$slots
+  positive_ints <- function(a, what) {
+    i <- as.integer(a)    # may introduce NAs by coercion
+    i <- i[!is.na(i)]     # eliminate NAs
+    i <- i[i > 0]         # eliminate non-positive integers
+    i <- unique(sort(i))  # eliminate redundancy and disorder
+    if (length(a)!=length(i)) {
+      failure_action("Some values for '", what, "' were skipped where not unique, not positive, or not convertible to an integer.")
+    }
+    return (i)            # return results, if any
+  }
+  ksamples        <- positive_ints(env$ksamples , "ksamples")
+  kfeatures       <- positive_ints(env$kfeatures, "kfeatures")
 
   myLapply <- parLapply
   # uncomment the next line to mimic parLapply, but without parallelization (for testing/experimentation)
@@ -113,7 +125,7 @@
         for ( i in 1:ksamples_length ) {
           result <- smpl_result_list[[i]]
           if (result$success) {
-            sampleMetadata[sprintf("k%d",ksamples[i])] <- result$value$clusters
+            sampleMetadata[sprintf("k%d",ksamples[i])] <- sprintf("%s%d", env$categorical_prefix, result$value$clusters)
             scores <- c(scores, result$value$scores)
           }
         }
@@ -132,7 +144,7 @@
         for ( i in 1:kfeatures_length ) {
           result <- feat_result_list[[i]]
           if (result$success) {
-            featureMetadata[sprintf("k%d",kfeatures[i])] <- result$value$clusters
+            featureMetadata[sprintf("k%d",kfeatures[i])] <- sprintf("%s%d", env$categorical_prefix, result$value$clusters)
             scores <- c(scores, result$value$scores)
           }
         }
author	eschen42
date	Wed, 09 Aug 2017 18:06:55 -0400
parents	6ccbe18131a6
children	c415b7dc6f37