Mercurial > repos > eschen42 > w4mkmeans
comparison w4m_general_purpose_routines.R @ 0:6ccbe18131a6 draft
planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 299e5c7fdb0d6eb0773f3660009f6d63c2082a8d
author | eschen42 |
---|---|
date | Tue, 08 Aug 2017 15:30:38 -0400 |
parents | |
children | c415b7dc6f37 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6ccbe18131a6 |
---|---|
1 # prepare.data.matrix - Prepare x.datamatrix for multivariate statistical analaysis (MVA) | |
2 # - Motivation: | |
3 # - Selection: | |
4 # - You may want to exclude several samples from your analysis: | |
5 # - If so, set the argument 'exclude.samples' to a vector of sample names | |
6 # - You may want to exclude several features or features from your analysis: | |
7 # - If so, set the argument 'exclude.features' to a vector of feature names | |
8 # - Renaming samples: | |
9 # - You may want to rename several samples from your analysis: | |
10 # - If so, set the argument 'sample.rename.function' to a function accepting a vector | |
11 # of sample names and producing a vector of strings of equivalent length | |
12 # - MVA is confounded by missing values. | |
13 # - By default, this function imputes missing values as zero. | |
14 # - For a different imputation, set the 'data.imputation' argument to a function | |
15 # accepting a single matrix argument and returning a matrix of the same | |
16 # dimensions as the argument. | |
17 # - Transformation | |
18 # - It may be desirable to transform the intensity data to reduce the range. | |
19 # - By default, this function performs an eigth-root transformation: | |
20 # - Any root-tranformation has the advantage of never being negative. | |
21 # - Calculation of the eight-root is four times faster in my hands than log10. | |
22 # - However, it has the disadvantage that calculation of fold-differences | |
23 # is not additive as with log-transformation. | |
24 # - Rather, you must divide the values and raise to the eighth power. | |
25 # - For a different transformation, set the 'data.transformation' argument | |
26 # to a function accepting a single matrix argument. | |
27 # - The function should be written to return a matrix of the same dimensions | |
28 # as the argument. | |
29 # arguments: | |
30 # - x.matrix - matrix of intensities (or data.frame of sample metadata) | |
31 # - one row per sample | |
32 # - one column per feature or metadata attribute | |
33 # - exclude.samples - vector of labels of matrix rows (samples) to omit from analysis | |
34 # - exclude.features - vector of labels of matrix columnss (features) to omit from analysis | |
35 # - sample.rename.function - function to be used to rename rows if necessary, or NULL | |
36 # - e.g., sample.rename.function = function(x) { | |
37 # sub("(.*)_.*","\\1", row.names(x)) | |
38 # } | |
39 # - data.imputation - function applied to matrix to impute missing values | |
40 # - e.g., data.imputation = function(m) { | |
41 # m[is.na(m)] <- min(m, na.rm = TRUE) / 100 | |
42 # return (m) | |
43 # } | |
44 # - data.transformation - function applied to matrix cells | |
45 # - e.g., data.transformation = function(x) { return( log10(x) ) } | |
46 # or, data.transformation = log10 | |
47 # result value: | |
48 # transformed, imputed x.datamatrix with renamed rows and with neither excluded values nor features | |
49 # | |
50 ################################ | |
51 ## | |
52 ## Notes regarding the effectiveness and performance of the data transformation method. | |
53 ## | |
54 ## The two transformations that I tried (log10 and 8th root) required different imputation methods. | |
55 ## | |
56 ## For the LCMS resin data set that I was working with, separation in MVA was nearly equivalent for: | |
57 ## data.imputation <- function(x.matrix) { | |
58 ## x.matrix[is.na(x.matrix)] <- 0 | |
59 ## return (x.matrix) | |
60 ## } | |
61 ## data.transformation <- function(x) { | |
62 ## sqrt( sqrt( sqrt(x) ) ) | |
63 ## } | |
64 ## and | |
65 ## data.imputation <- function(x.matrix) { | |
66 ## x.matrix[is.na(x.matrix)] <- min(x.matrix, na.rm = TRUE) / 100 | |
67 ## return (x.matrix) | |
68 ## } | |
69 ## data.transformation <- function(x) { | |
70 ## log10(x) | |
71 ## } | |
72 ## | |
73 ## Note further that triple application of the square root: | |
74 ## - may be four times faster than log10: | |
75 ## - may be three times faster than log2: | |
76 ## | |
77 ## system.time( junk <- sqrt( sqrt( sqrt(1:100000000) ) ) ) | |
78 ## user system elapsed | |
79 ## 0.832 0.236 1.069 | |
80 ## system.time( junk <- log10(1:100000000) ) | |
81 ## user system elapsed | |
82 ## 3.936 0.400 4.337 | |
83 ## system.time( junk <- log2(1:100000000) ) | |
84 ## user system elapsed | |
85 ## 2.784 0.320 3.101 | |
86 ## | |
87 ################################ | |
88 # | |
89 prepare.data.matrix <- function( | |
90 x.matrix | |
91 , exclude.samples = NULL | |
92 , exclude.features = NULL | |
93 , sample.rename.function = NULL | |
94 , data.imputation = | |
95 function(m) { | |
96 # replace NA values with zero | |
97 m[is.na(m)] <- 0 | |
98 # replace negative values with zero, if applicable (It should never be applicable!) | |
99 if (min(m < 0)) { | |
100 m <- matrix(lapply(X = m, FUN = function(z) {max(z,0)}), nrow = nrow(m) ) | |
101 } | |
102 # return matrix as the result | |
103 return (m) | |
104 } | |
105 , data.transformation = function(x) { | |
106 sqrt( sqrt( sqrt(x) ) ) | |
107 } | |
108 , en = new.env() | |
109 ) { | |
110 # MatVar - Compute variance of rows or columns of a matrix | |
111 # ref: http://stackoverflow.com/a/25100036 | |
112 # For row variance, dim == 1, for col variance, dim == 2 | |
113 MatVar <- function(x, dim = 1) { | |
114 if (dim == 1) { | |
115 dim.x.2 <- dim(x)[2] | |
116 if ( dim.x.2 == 0 ) | |
117 stop("MatVar: there are zero columns") | |
118 if ( dim.x.2 == 1 ) { | |
119 stop("MatVar: a single column is insufficient to calculate a variance") | |
120 # return ( rep.int(x = 0, times = nrow(x)) ) | |
121 } else { | |
122 return ( rowSums( (x - rowMeans(x))^2 ) / ( dim(x)[2] - 1 ) ) | |
123 } | |
124 } else if (dim == 2) { | |
125 dim.x.1 <- dim(x)[1] | |
126 if ( dim.x.1 == 0 ) { | |
127 stop("MatVar: there are zero rows") | |
128 } | |
129 if ( dim.x.1 == 1 ) { | |
130 stop("MatVar: a single row is insufficient to calculate a variance") | |
131 # return ( rep.int(x = 0, times = ncol(x)) ) | |
132 } else { | |
133 return ( rowSums( (t(x) - colMeans(x))^2 ) / ( dim(x)[1] - 1 ) ) | |
134 } | |
135 } else stop("Please enter valid dimension, for rows, dim = 1; for colums, dim = 2") | |
136 } | |
137 | |
138 nonzero.var <- function(x) { | |
139 if (nrow(x) == 0) { | |
140 print(str(x)) | |
141 stop("matrix has no rows") | |
142 } | |
143 if (ncol(x) == 0) { | |
144 print(str(x)) | |
145 stop("matrix has no columns") | |
146 } | |
147 if ( is.numeric(x) ) { | |
148 # exclude any rows with zero variance | |
149 row.vars <- MatVar(x, dim = 1) | |
150 nonzero.row.vars <- row.vars > 0 | |
151 nonzero.rows <- row.vars[nonzero.row.vars] | |
152 if ( length(rownames(x)) != length(rownames(nonzero.rows)) ) { | |
153 row.names <- attr(nonzero.rows,"names") | |
154 x <- x[ row.names, , drop = FALSE ] | |
155 } | |
156 | |
157 # exclude any columns with zero variance | |
158 column.vars <- MatVar(x, dim = 2) | |
159 nonzero.column.vars <- column.vars > 0 | |
160 nonzero.columns <- column.vars[nonzero.column.vars] | |
161 if ( length(colnames(x)) != length(colnames(nonzero.columns)) ) { | |
162 column.names <- attr(nonzero.columns,"names") | |
163 x <- x[ , column.names, drop = FALSE ] | |
164 } | |
165 } | |
166 return (x) | |
167 } | |
168 | |
169 if (is.null(x.matrix)) { | |
170 stop("FATAL ERROR - prepare.data.matrix was called with null x.matrix") | |
171 } | |
172 | |
173 en$xpre <- x <- x.matrix | |
174 | |
175 # exclude any samples as indicated | |
176 if ( !is.null(exclude.features) ) { | |
177 my.colnames <- colnames(x) | |
178 my.col.diff <- setdiff(my.colnames, exclude.features) | |
179 x <- x[ , my.col.diff , drop = FALSE ] | |
180 } | |
181 | |
182 # exclude any features as indicated | |
183 if ( !is.null(exclude.samples) ) { | |
184 my.rownames <- rownames(x) | |
185 my.row.diff <- setdiff(my.rownames, exclude.samples) | |
186 x <- x[ my.row.diff, , drop = FALSE ] | |
187 } | |
188 | |
189 # rename rows if desired | |
190 if ( !is.null(sample.rename.function) ) { | |
191 renamed <- sample.rename.function(x) | |
192 rownames(x) <- renamed | |
193 } | |
194 | |
195 # save redacted x.datamatrix to environment | |
196 en$redacted.data.matrix <- x | |
197 | |
198 # impute values missing from the x.datamatrix | |
199 if ( !is.null(data.imputation) ) { | |
200 x <- data.imputation(x) | |
201 } | |
202 | |
203 # perform transformation if desired | |
204 if ( !is.null(data.transformation) ) { | |
205 x <- data.transformation(x) | |
206 } else { | |
207 x <- x | |
208 } | |
209 | |
210 # purge rows and columns that have zero variance | |
211 if ( is.numeric(x) ) { | |
212 x <- nonzero.var(x) | |
213 } | |
214 | |
215 # save imputed, transformed x.datamatrix to environment | |
216 en$imputed.transformed.data.matrix <- x | |
217 | |
218 return(x) | |
219 } | |
220 | |
221 | |
222 ##----------------------------------------------- | |
223 ## helper functions for error detection/reporting | |
224 ##----------------------------------------------- | |
225 | |
226 # log-printing to stderr | |
227 log_print <- function(x, ...) { | |
228 cat( | |
229 format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z") | |
230 , " " | |
231 , c(x, ...) | |
232 , "\n" | |
233 , sep="" | |
234 , file=stderr() | |
235 ) | |
236 } | |
237 | |
238 # tryCatchFunc produces a list | |
239 # On success of expr(), tryCatchFunc produces | |
240 # list(success TRUE, value = expr(), msg = "") | |
241 # On failure of expr(), tryCatchFunc produces | |
242 # list(success = FALSE, value = NA, msg = "the error message") | |
243 tryCatchFunc <- function(expr) { | |
244 # format error for logging | |
245 format_error <- function(e) { | |
246 paste(c("Error { message:", e$message, ", call:", e$call, "}"), collapse = " ") | |
247 } | |
248 my_expr <- expr | |
249 retval <- NULL | |
250 tryCatch( | |
251 expr = { | |
252 retval <- ( list( success = TRUE, value = my_expr(), msg = "" ) ) | |
253 } | |
254 , error = function(e) { | |
255 retval <<- list( success = FALSE, value = NA, msg = format_error(e) ) | |
256 } | |
257 ) | |
258 return (retval) | |
259 } | |
260 | |
261 # tryCatchProc produces a list | |
262 # On success of expr(), tryCatchProc produces | |
263 # list(success TRUE, msg = "") | |
264 # On failure of expr(), tryCatchProc produces | |
265 # list(success = FALSE, msg = "the error message") | |
266 tryCatchProc <- function(expr) { | |
267 # format error for logging | |
268 format_error <- function(e) { | |
269 paste(c("Error { message:", e$message, ", call:", e$call, "}"), collapse = " ") | |
270 } | |
271 retval <- NULL | |
272 tryCatch( | |
273 expr = { | |
274 expr() | |
275 retval <- ( list( success = TRUE, msg = "" ) ) | |
276 } | |
277 , error = function(e) { | |
278 retval <<- list( success = FALSE, msg = format_error(e) ) | |
279 } | |
280 ) | |
281 return (retval) | |
282 } | |
283 |