Mercurial > repos > jasonxu > matrixeqtl
comparison MatrixEQTL/R/Matrix_eQTL_engine.R @ 3:ae74f8fb3aef draft
Uploaded
author | jasonxu |
---|---|
date | Fri, 12 Mar 2021 08:20:57 +0000 |
parents | cd4c8e4a4b5b |
children |
comparison
equal
deleted
inserted
replaced
2:5fafba5359eb | 3:ae74f8fb3aef |
---|---|
1 # Matrix eQTL by Andrey A. Shabalin | |
2 # http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ | |
3 | |
4 # http://cran.r-project.org/web/packages/policies.html | |
5 | |
6 library(methods) | |
7 | |
8 modelLINEAR = 117348L; | |
9 modelANOVA = 47074L; | |
10 modelLINEAR_CROSS = 1113461L; | |
11 | |
12 .seq = function(a,b){if(a<=b){a:b}else{integer(0)}}; | |
13 | |
14 SlicedData <- setRefClass( "SlicedData", | |
15 fields = list( | |
16 dataEnv = "environment", | |
17 nSlices1 = "numeric", | |
18 rowNameSlices = "list", | |
19 columnNames = "character", | |
20 fileDelimiter = "character", | |
21 fileSkipColumns = "numeric", | |
22 fileSkipRows = "numeric", | |
23 fileSliceSize = "numeric", | |
24 fileOmitCharacters = "character" | |
25 ), | |
26 methods = list( | |
27 initialize = function( mat = NULL ) { | |
28 dataEnv <<- new.env(hash = TRUE, size = 29L); | |
29 nSlices1 <<- 0L; | |
30 if(!is.null(mat)) { | |
31 CreateFromMatrix(mat); | |
32 } | |
33 fileSliceSize <<- 1000; | |
34 fileDelimiter <<- "\t"; | |
35 fileSkipColumns <<- 1L; | |
36 fileSkipRows <<- 1L; | |
37 fileOmitCharacters <<- "NA" | |
38 return(invisible(.self)); | |
39 }, | |
40 CreateFromMatrix = function( mat ) { | |
41 stopifnot( class(mat) == "matrix" ); | |
42 setSliceRaw( 1L ,mat ); | |
43 rns = rownames( mat, do.NULL = FALSE); | |
44 #if( is.null(rns) ) { | |
45 # rns = paste( "Row_",(1:nrow(mat)), sep="" ); | |
46 #} | |
47 rowNameSlices <<- list(rns); | |
48 cns = colnames( mat, do.NULL = FALSE ); | |
49 #if( is.null(cns) ){ | |
50 # cns = paste( "Col_",(1:ncol(mat)), sep="" ); | |
51 #} | |
52 columnNames <<- cns; | |
53 return(invisible(.self)); | |
54 }, | |
55 getSlice = function(sl) { | |
56 value = get(paste(sl), dataEnv); | |
57 if( is.raw(value) ) { | |
58 storage.mode(value) = "double"; | |
59 value[value == 255] = NA; | |
60 } | |
61 return( value ) | |
62 }, | |
63 getSliceRaw = function(sl) { | |
64 return( get(paste(sl), dataEnv) ) | |
65 }, | |
66 setSliceRaw = function(sl, value) { | |
67 assign( paste(sl), value, dataEnv ) | |
68 if( nSlices1 < sl ) { | |
69 nSlices1 <<- sl; | |
70 } | |
71 }, | |
72 setSlice = function(sl, value) { | |
73 if( length(value) > 0 ) { | |
74 if( all(as.integer(value) == value, na.rm = TRUE) ) { | |
75 if( (min(value, na.rm = TRUE) >= 0 ) && | |
76 (max(value, na.rm = TRUE) < 255) ) | |
77 { | |
78 nv = value; | |
79 suppressWarnings({storage.mode(nv) = "raw"}); | |
80 nv[ is.na(value)] = as.raw(255); | |
81 value = nv; | |
82 } else { | |
83 storage.mode(value) = "integer"; | |
84 } | |
85 } | |
86 } | |
87 setSliceRaw(sl, value); | |
88 }, | |
89 nSlices = function() { | |
90 return( nSlices1 ); | |
91 }, | |
92 LoadFile = function(filename, skipRows = NULL, skipColumns = NULL, sliceSize = NULL, omitCharacters = NULL, delimiter = NULL, rowNamesColumn = 1) { | |
93 if( !is.null(skipRows) ) { | |
94 fileSkipRows <<- skipRows; | |
95 } | |
96 if( !is.null(skipColumns) ) { | |
97 fileSkipColumns <<- skipColumns; | |
98 } | |
99 if( !is.null(omitCharacters) ) { | |
100 fileOmitCharacters <<- omitCharacters; | |
101 } | |
102 if( !is.null(sliceSize) ) { | |
103 fileSliceSize <<- sliceSize; | |
104 } | |
105 if( !is.null(delimiter) ) { | |
106 fileDelimiter <<- delimiter; | |
107 } | |
108 stopifnot( (fileSkipColumns == 0) || (rowNamesColumn <= fileSkipColumns) ) | |
109 stopifnot( (fileSkipColumns == 0) || (rowNamesColumn >= 1) ) | |
110 | |
111 fid = file(description = filename, open = "rt", blocking = FALSE, raw = FALSE) | |
112 # clean object if file is open | |
113 Clear(); | |
114 lines = readLines(con = fid, n = max(fileSkipRows,1L), ok = TRUE, warn = TRUE) | |
115 line1 = tail(lines,1); | |
116 splt = strsplit(line1, split = fileDelimiter, fixed = TRUE); | |
117 if( fileSkipRows > 0L ) { | |
118 columnNames <<- splt[[1]]; # [ -(1:fileSkipColumns) ]; | |
119 } else { | |
120 seek(fid, 0) | |
121 } | |
122 | |
123 rm( lines, line1, splt ); | |
124 | |
125 rowNameSlices <<- vector("list", 15); | |
126 | |
127 curSliceId = 0L; | |
128 repeat | |
129 { | |
130 # preallocate names and data | |
131 if(length(rowNameSlices) < curSliceId) { | |
132 rowNameSlices[[2L*curSliceId]] <<- NULL; | |
133 } | |
134 curSliceId = curSliceId + 1L; | |
135 | |
136 # read sliceSize rows | |
137 rowtag = vector("character",fileSliceSize); | |
138 rowvals = vector("list",fileSliceSize); | |
139 for(i in 1:fileSliceSize) { | |
140 temp = ""; | |
141 if( fileSkipColumns > 0L ) { | |
142 temp = scan(file = fid, what = character(), n = fileSkipColumns, quiet = TRUE,sep = fileDelimiter); | |
143 } | |
144 rowtag[i] = temp[rowNamesColumn];#paste(temp,collapse=" "); | |
145 rowvals[[i]] = scan(file = fid, what = double(), nlines = 1, quiet = TRUE, sep = fileDelimiter, na.strings = fileOmitCharacters); | |
146 if( length(rowvals[[i]]) == 0L ) { | |
147 if(i==1L) { | |
148 rowtag = matrix(0, 0, 0); | |
149 rowvals = character(0); | |
150 } else { | |
151 rowtag = rowtag[ 1:(i-1) ]; | |
152 rowvals = rowvals[ 1:(i-1) ]; | |
153 } | |
154 break; | |
155 } | |
156 } | |
157 if( length(rowtag) == 0L ) { | |
158 curSliceId = curSliceId - 1L; | |
159 break; | |
160 } | |
161 rowNameSlices[[curSliceId]] <<- rowtag; | |
162 data = c(rowvals, recursive = TRUE); | |
163 dim(data) = c(length(rowvals[[1]]), length(rowvals)); | |
164 data = t(data); | |
165 setSlice(curSliceId, data); | |
166 if( length(rowtag) < fileSliceSize ) { | |
167 break; | |
168 } | |
169 numtxt = formatC(curSliceId*fileSliceSize, big.mark=",", format = "f", digits = 0) | |
170 cat( "Rows read: ", numtxt, "\n"); | |
171 flush.console() | |
172 } | |
173 close(fid) | |
174 if( fileSkipRows == 0 ) { | |
175 columnNames <<- paste("Col_", (1:nCols()), sep=""); | |
176 } else { | |
177 columnNames <<- tail(columnNames, ncol(getSliceRaw(1))); | |
178 } | |
179 if( fileSkipColumns == 0 ) { | |
180 cnt = 0L; | |
181 for( sl in 1:nSlices() ) { | |
182 nr = length(getSliceRaw(sl)); | |
183 rowNameSlices[[sl]] <<- paste("Row_",cnt + (1:nr),sep=""); | |
184 cnt = cnt + nr; | |
185 } | |
186 } | |
187 rowNameSlices <<- rowNameSlices[1:curSliceId]; | |
188 cat("Rows read: ", nRows(), " done.\n"); | |
189 return(invisible(.self)); | |
190 }, | |
191 SaveFile = function(filename) { | |
192 if( nSlices() == 0 ) { | |
193 cat("No data to save"); | |
194 return(); | |
195 } | |
196 fid = file(filename,"wt"); | |
197 for( sl in 1:nSlices() ) { | |
198 z = getSlice(sl); | |
199 rownames(z) = rowNameSlices[[sl]]; | |
200 colnames(z) = columnNames; | |
201 write.table(z, file = fid, sep = "\t", | |
202 col.names = (if(sl == 1){NA}else{FALSE})); | |
203 } | |
204 close(fid); | |
205 }, | |
206 nRows = function() { | |
207 s = 0L; | |
208 for(sl in .seq(1,nSlices())) { | |
209 s = s + nrow(getSliceRaw(sl)); | |
210 } | |
211 return( s ) | |
212 }, | |
213 nCols = function() { | |
214 if( nSlices() == 0L ) { | |
215 return(0L); | |
216 } else { | |
217 return( ncol(getSliceRaw(1L)) ) | |
218 } | |
219 }, | |
220 Clear = function() { | |
221 for( sl in .seq(1,nSlices()) ) { | |
222 rm(list = paste(sl), envir = dataEnv) | |
223 } | |
224 nSlices1 <<- 0L; | |
225 rowNameSlices <<- list(); | |
226 columnNames <<- character(); | |
227 return(invisible(.self)); | |
228 }, | |
229 IsCombined = function() { | |
230 return( nSlices() <= 1L ); | |
231 }, | |
232 GetAllRowNames = function() { | |
233 return( c(rowNameSlices, recursive=TRUE) ); | |
234 }, | |
235 SetNanRowMean = function() { | |
236 if( (nCols() == 0L) ) { | |
237 return(invisible(.self)); | |
238 } | |
239 for( sl in .seq(1,nSlices()) ) { | |
240 slice = getSlice(sl); | |
241 if( any(is.na(slice)) ) { | |
242 rowmean = rowMeans(slice, na.rm = TRUE); | |
243 rowmean[is.na(rowmean)] = 0L; | |
244 for( j in which(!complete.cases(slice)) ) { | |
245 where1 = is.na(slice[j, ]); | |
246 slice[j, where1] = rowmean[j]; | |
247 } | |
248 setSlice(sl, slice); | |
249 } | |
250 } | |
251 return(invisible(.self)); | |
252 }, | |
253 RowStandardizeCentered = function() { | |
254 for(sl in .seq(1,nSlices()) ) { | |
255 slice = getSlice(sl); | |
256 div = sqrt( rowSums(slice^2) ); | |
257 div[ div == 0 ] = 1; | |
258 setSlice(sl, slice/div); | |
259 } | |
260 return(invisible(.self)); | |
261 }, | |
262 CombineInOneSlice = function() { | |
263 if( nSlices() <= 1L ) { | |
264 return(invisible(.self)); | |
265 } | |
266 nc = nCols(); | |
267 nr = nRows(); | |
268 datatypes = c("raw","integer","double"); | |
269 datafuns = c(as.raw, as.integer, as.double); | |
270 datatype = character(nSlices()); | |
271 for(sl in 1:nSlices()) { | |
272 datatype[sl] = typeof(getSliceRaw(sl)); | |
273 } | |
274 mch = max(match(datatype,datatypes,nomatch = length(datatypes))); | |
275 datafun = datafuns[[mch]]; | |
276 newData = matrix(datafun(0), nrow = nr, ncol = nc); | |
277 offset = 0; | |
278 for(sl in 1:nSlices()) { | |
279 if(mch==1) { | |
280 slice = getSliceRaw(sl); | |
281 } else { | |
282 slice = getSlice(sl); | |
283 } | |
284 newData[ offset + (1:nrow(slice)),] = datafun(slice); | |
285 setSlice(sl, numeric()); | |
286 offset = offset + nrow(slice); | |
287 } | |
288 | |
289 nSlices1 <<- 1L; | |
290 setSliceRaw(1L, newData); | |
291 rm(newData); | |
292 | |
293 newrowNameSlices = GetAllRowNames(); | |
294 rowNameSlices <<- list(newrowNameSlices) | |
295 return(invisible(.self)); | |
296 }, | |
297 ResliceCombined = function(sliceSize = -1) { | |
298 if( sliceSize > 0L ) { | |
299 fileSliceSize <<- sliceSize; | |
300 } | |
301 if( fileSliceSize <= 0 ) { | |
302 fileSliceSize <<- 1000; | |
303 } | |
304 if( IsCombined() ) { | |
305 nRows1 = nRows(); | |
306 if(nRows1 == 0L) { | |
307 return(invisible(.self)); | |
308 } | |
309 newNSlices = floor( (nRows1 + fileSliceSize - 1)/fileSliceSize ); | |
310 oldData = getSliceRaw(1L); | |
311 #oldNames = rowNameSlices[[1]]; | |
312 newNameslices = vector("list",newNSlices) | |
313 for( sl in 1:newNSlices ) { | |
314 range = (1+(sl-1)*fileSliceSize) : (min(nRows1,sl*fileSliceSize)); | |
315 newpart = oldData[range, ,drop = FALSE]; | |
316 if( is.raw(oldData) ) { | |
317 setSliceRaw( sl, newpart); | |
318 } else { | |
319 setSlice( sl, newpart); | |
320 } | |
321 newNameslices[[sl]] = rowNameSlices[[1]][range]; | |
322 } | |
323 rowNameSlices <<- newNameslices ; | |
324 } else { | |
325 stop("Reslice of a sliced matrix is not supported yet. Use CombineInOneSlice first."); | |
326 } | |
327 return(invisible(.self)); | |
328 }, | |
329 Clone = function() { | |
330 clone = SlicedData$new(); | |
331 for(sl in .seq(1,nSlices()) ) { | |
332 clone$setSliceRaw(sl,getSliceRaw(sl)); | |
333 } | |
334 clone$rowNameSlices = rowNameSlices; | |
335 clone$columnNames = columnNames; | |
336 clone$fileDelimiter = fileDelimiter; | |
337 clone$fileSkipColumns = fileSkipColumns; | |
338 clone$fileSkipRows = fileSkipRows; | |
339 clone$fileSliceSize = fileSliceSize; | |
340 clone$fileOmitCharacters = fileOmitCharacters; | |
341 return( clone ); | |
342 }, | |
343 RowMatrixMultiply = function(multiplier) { | |
344 for(sl in .seq(1,nSlices()) ) { | |
345 setSlice(sl, getSlice(sl) %*% multiplier); | |
346 } | |
347 return(invisible(.self)); | |
348 }, | |
349 ColumnSubsample = function(subset) { | |
350 for(sl in .seq(1,nSlices()) ) { | |
351 setSliceRaw(sl, getSliceRaw(sl)[ ,subset, drop = FALSE]); | |
352 } | |
353 columnNames <<- columnNames[subset]; | |
354 return(invisible(.self)); | |
355 }, | |
356 RowReorderSimple = function(ordr) { | |
357 # had to use an inefficient and dirty method | |
358 # due to horrible memory management in R | |
359 if( (typeof(ordr) == "logical") && all(ordr) ) { | |
360 return(invisible(.self)); | |
361 } | |
362 if( (length(ordr) == nRows()) && all(ordr == (1:length(ordr))) ) { | |
363 return(invisible(.self)); | |
364 } | |
365 CombineInOneSlice(); | |
366 gc(); | |
367 setSliceRaw( 1L, getSliceRaw(1L)[ordr, ] ); | |
368 rowNameSlices[[1]] <<- rowNameSlices[[1]][ordr]; | |
369 gc(); | |
370 ResliceCombined(); | |
371 gc(); | |
372 return(invisible(.self)); | |
373 }, | |
374 RowReorder = function(ordr) { | |
375 # transform logical into indices | |
376 if( typeof(ordr) == "logical" ) { | |
377 if( length(ordr) == nRows() ) { | |
378 ordr = which(ordr); | |
379 } else { | |
380 stop("Parameter \"ordr\" has wrong length") | |
381 } | |
382 } | |
383 ## first, check that anything has to be done at all | |
384 if( (length(ordr) == nRows()) && all(ordr == (1:length(ordr))) ) { | |
385 return(invisible(.self)); | |
386 } | |
387 ## check bounds | |
388 #if( (min(ordr) < 1) || (max(ordr) > nRows()) ) { | |
389 # stop("Parameter \"ordr\" is out of bounds"); | |
390 #} | |
391 ## slice the data into individual rows | |
392 all_rows = vector("list", nSlices()) | |
393 for( i in 1:nSlices() ) { | |
394 slice = getSliceRaw(i) | |
395 all_rows[[i]] = split(slice, 1:nrow(slice)) | |
396 setSliceRaw(i,numeric()) | |
397 } | |
398 gc(); | |
399 all_rows = unlist(all_rows, recursive=FALSE, use.names = FALSE); | |
400 ## Reorder the rows | |
401 all_rows = all_rows[ordr]; | |
402 ## get row names | |
403 all_names = GetAllRowNames(); | |
404 ## erase the set | |
405 rowNameSlices <<- list(); | |
406 ## sort names | |
407 all_names = all_names[ordr]; | |
408 ## | |
409 ## Make slices back | |
410 nrows = length(all_rows); | |
411 nSlices1 <<- as.integer((nrows+fileSliceSize-1)/fileSliceSize); | |
412 ##cat(nrows, " ", nSlices1); | |
413 rowNameSlices1 = vector("list", nSlices1); | |
414 for( i in 1:nSlices1 ) { | |
415 fr = 1 + fileSliceSize*(i-1); | |
416 to = min( fileSliceSize*i, nrows); | |
417 | |
418 subset = all_rows[fr:to]; | |
419 types = unlist(lapply(subset,typeof)); | |
420 israw = (types == "raw") | |
421 if(!all(israw == israw[1])) { | |
422 # some raw and some are not | |
423 subset = lapply(subset, function(x){if(is.raw(x)){x=as.integer(x);x[x==255] = NA;return(x)}else{return(x)}}); | |
424 } | |
425 subset = unlist(subset); | |
426 dim(subset) = c( length(all_rows[[fr]]) , to - fr + 1) | |
427 #subset = matrix(subset, ncol = (to-fr+1)); | |
428 if(is.raw(subset)) { | |
429 setSliceRaw(i, t(subset)); | |
430 } else { | |
431 setSlice(i, t(subset)); | |
432 } | |
433 rowNameSlices1[[i]] = all_names[fr:to]; | |
434 all_rows[fr:to] = 0; | |
435 all_names[fr:to] = 0; | |
436 } | |
437 rowNameSlices <<- rowNameSlices1; | |
438 gc(); | |
439 return(invisible(.self)); | |
440 }, | |
441 RowRemoveZeroEps = function(){ | |
442 for(sl in .seq(1,nSlices()) ) { | |
443 slice = getSlice(sl); | |
444 amean = rowMeans(abs(slice)); | |
445 remove = (amean < .Machine$double.eps*nCols()); | |
446 if(any(remove)) { | |
447 rowNameSlices[[sl]] <<- rowNameSlices[[sl]][!remove]; | |
448 setSlice(sl, slice[!remove, , drop = FALSE]); | |
449 } | |
450 } | |
451 return(invisible(.self)); | |
452 }, | |
453 FindRow = function(rowname) { | |
454 for(sl in .seq(1,nSlices()) ) { | |
455 mch = match(rowname,rowNameSlices[[sl]], nomatch = 0); | |
456 if( mch > 0 ) | |
457 { | |
458 row = getSlice(sl)[mch[1], , drop=FALSE]; | |
459 rownames(row) = rowname; | |
460 colnames(row) = columnNames; | |
461 return( list(slice = sl, item = mch, row = row) ); | |
462 } | |
463 } | |
464 return( NULL ); | |
465 }, | |
466 show = function() { | |
467 cat("SlicedData object. For more information type: ?SlicedData\n"); | |
468 cat("Number of columns:", nCols(), "\n"); | |
469 cat("Number of rows:", nRows(), "\n"); | |
470 cat("Data is stored in", nSlices(), "slices\n"); | |
471 if(nCols()>0) { | |
472 z = getSlice(1L); | |
473 if(nrow(z)>0) { | |
474 z = z[1:min(nrow(z),10L), 1:min(ncol(z),10L), drop = FALSE]; | |
475 rownames(z) = rowNameSlices[[1]][1:nrow(z)]; | |
476 colnames(z) = columnNames[1:ncol(z)]; | |
477 cat("Top left corner of the first slice (up to 10x10):\n"); | |
478 methods:::show(z) | |
479 } | |
480 } | |
481 } | |
482 )) | |
483 | |
484 setGeneric("nrow") | |
485 setMethod("nrow", "SlicedData", function(x) { | |
486 return( x$nRows() ); | |
487 }) | |
488 setGeneric("NROW") | |
489 setMethod("NROW", "SlicedData", function(x) { | |
490 return( x$nRows() ); | |
491 }) | |
492 setGeneric("ncol") | |
493 setMethod("ncol", "SlicedData", function(x) { | |
494 return( x$nCols() ); | |
495 }) | |
496 setGeneric("NCOL") | |
497 setMethod("NCOL", "SlicedData", function(x) { | |
498 return( x$nCols() ); | |
499 }) | |
500 setGeneric("dim") | |
501 setMethod("dim", "SlicedData", function(x) { | |
502 return( c(x$nRows(),x$nCols()) ); | |
503 }) | |
504 setGeneric("colnames") | |
505 setMethod("colnames", "SlicedData", function(x) { | |
506 return( x$columnNames ); | |
507 }) | |
508 setGeneric("rownames") | |
509 setMethod("rownames", "SlicedData", function(x) { | |
510 return( x$GetAllRowNames() ); | |
511 }) | |
512 setMethod("[[", "SlicedData", function(x,i) { | |
513 return( x$getSlice(i) ); | |
514 }) | |
515 setGeneric("length") | |
516 setMethod("length", "SlicedData", function(x) { | |
517 return( x$nSlices() ); | |
518 }) | |
519 setMethod("[[<-", "SlicedData", function(x,i,value) { | |
520 x$setSlice(i, value); | |
521 return(x); | |
522 }) | |
523 summary.SlicedData = function(object, ...) { | |
524 z = c(nCols = object$nCols(), nRows = object$nRows(), nSlices = object$nSlices()); | |
525 return(z); | |
526 } | |
527 | |
528 ##### setGeneric("summary") ##### | |
529 #setMethod("summary", "SlicedData", function(object, ...) { | |
530 # z = c(nCols = object$nCols(), nRows = object$nRows(), nSlices = object$nSlices()); | |
531 # return(z); | |
532 # }) | |
533 #setGeneric("show", standardGeneric("show")) | |
534 # setMethod("show", "SlicedData", function(object) { | |
535 # cat("SlicedData object. For more information type: ?SlicedData\n"); | |
536 # cat("Number of columns:", object$nCols(), "\n"); | |
537 # cat("Number of rows:", object$nRows(), "\n"); | |
538 # cat("Data is stored in", object$nSlices(), "slices\n"); | |
539 # if(object$nSlices()>0) { | |
540 # z = object$getSlice(1); | |
541 # if(nrow(z)>0) { | |
542 # z = z[1:min(nrow(z),10), 1:min(ncol(z),10), drop = FALSE]; | |
543 # rownames(z) = object$rowNameSlices[[1]][1:nrow(z)]; | |
544 # colnames(z) = object$columnNames[1:ncol(z)]; | |
545 # cat("Top left corner of the first slice (up to 10x10):\n"); | |
546 # show(z) | |
547 # } | |
548 # } | |
549 # }) | |
550 | |
551 setGeneric("as.matrix") | |
552 setMethod("as.matrix", "SlicedData", function(x) { | |
553 if(x$nSlices() == 0) { | |
554 return( matrix(0,0,0) ); | |
555 } | |
556 if(x$nSlices() > 1) { | |
557 copy = x$Clone(); | |
558 copy$CombineInOneSlice(); | |
559 } else { | |
560 copy = x; | |
561 } | |
562 mat = copy$getSlice(1L); | |
563 rownames(mat) = rownames(copy); | |
564 colnames(mat) = colnames(copy); | |
565 return( mat ); | |
566 }) | |
567 setGeneric("colnames<-") | |
568 setMethod("colnames<-", "SlicedData", function(x,value) { | |
569 stopifnot( class(value) == "character" ); | |
570 stopifnot( length(value) == x$nCols() ); | |
571 x$columnNames = value; | |
572 return(x); | |
573 }) | |
574 setGeneric("rownames<-") | |
575 setMethod("rownames<-", "SlicedData", function(x,value) { | |
576 stopifnot( class(value) == "character" ); | |
577 stopifnot( length(value) == x$nRows() ); | |
578 start = 1; | |
579 newNameSlices = vector("list", x$nSlices()); | |
580 for( i in .seq(1,x$nSlices()) ) { | |
581 nr = nrow(x$getSliceRaw(i)); | |
582 newNameSlices[[i]] = value[ start:(start+nr-1) ]; | |
583 start = start + nr; | |
584 } | |
585 x$rowNameSlices = newNameSlices; | |
586 return(x); | |
587 }) | |
588 setGeneric("rowSums") | |
589 setMethod("rowSums", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { | |
590 if(x$nSlices() == 0) { | |
591 return( numeric() ); | |
592 } | |
593 stopifnot( dims == 1 ); | |
594 thesum = vector("list", x$nSlices()); | |
595 for( i in 1:x$nSlices() ) { | |
596 thesum[[i]] = rowSums(x$getSlice(i), na.rm) | |
597 } | |
598 return(unlist(thesum, recursive = FALSE, use.names = FALSE)); | |
599 }) | |
600 setGeneric("rowMeans") | |
601 setMethod("rowMeans", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { | |
602 if(x$nSlices() == 0) { | |
603 return( numeric() ); | |
604 } | |
605 stopifnot( dims == 1 ); | |
606 thesum = vector("list", x$nSlices()); | |
607 for( i in 1:x$nSlices() ) { | |
608 thesum[[i]] = rowMeans(x$getSlice(i), na.rm) | |
609 } | |
610 return(unlist(thesum, recursive = FALSE, use.names = FALSE)); | |
611 }) | |
612 setGeneric("colSums") | |
613 setMethod("colSums", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { | |
614 if(x$nCols() == 0) { | |
615 return( numeric() ); | |
616 } | |
617 stopifnot( dims == 1 ); | |
618 thesum = 0; | |
619 for( i in .seq(1,x$nSlices()) ) { | |
620 thesum = thesum + colSums(x$getSlice(i), na.rm) | |
621 } | |
622 return(thesum); | |
623 }) | |
624 setGeneric("colMeans") | |
625 setMethod("colMeans", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { | |
626 if(x$nCols() == 0) { | |
627 return( numeric() ); | |
628 } | |
629 stopifnot( dims == 1 ); | |
630 thesum = 0; | |
631 thecounts = x$nRows(); | |
632 for( i in .seq(1,x$nSlices()) ) { | |
633 slice = x$getSlice(i); | |
634 thesum = thesum + colSums(slice, na.rm) | |
635 if( na.rm ) { | |
636 thecounts = thecounts - colSums(is.na(slice)) | |
637 } | |
638 } | |
639 return(thesum/thecounts); | |
640 }) | |
641 | |
642 .listBuilder <- setRefClass(".listBuilder", | |
643 fields = list( | |
644 dataEnv = "environment", | |
645 n = "numeric" | |
646 ), | |
647 methods = list( | |
648 initialize = function() { | |
649 dataEnv <<- new.env(hash = TRUE); | |
650 n <<- 0; | |
651 # cumlength <<- 0; | |
652 return(.self); | |
653 }, | |
654 add = function(x) { | |
655 if(length(x) > 0) { | |
656 n <<- n + 1; | |
657 # cumlength <<- cumlength + length(x); | |
658 assign(paste(n), x, dataEnv ); | |
659 } | |
660 return(.self); | |
661 }, | |
662 set = function(i,x) { | |
663 if(length(x) > 0) { | |
664 if(i>n) | |
665 n <<- i; | |
666 assign(paste(i), x, dataEnv ); | |
667 } | |
668 return(.self); | |
669 }, | |
670 get = function(i) { | |
671 return(base::get(paste(i),dataEnv)); | |
672 }, | |
673 list = function() { | |
674 if(n==0) return(list()); | |
675 result = vector('list',n); | |
676 for( i in 1:n) { | |
677 result[[i]] = .self$get(i); | |
678 } | |
679 return(result); | |
680 }, | |
681 unlist = function() { | |
682 return(base::unlist(.self$list(), recursive=FALSE, use.names = FALSE)); | |
683 }, | |
684 show = function() { | |
685 cat(".listBuilder object.\nIternal object in MatrixEQTL package.\n"); | |
686 cat("Number of elements:", object$n, "\n"); | |
687 } | |
688 )) | |
689 | |
690 .histogrammer <- setRefClass(".histogrammer", | |
691 fields = list( | |
692 pvbins1 = "numeric", | |
693 statbins1 = "numeric", | |
694 hist.count = "numeric" | |
695 ), | |
696 methods = list( | |
697 initialize = function (pvbins, statbins) { | |
698 if(length(pvbins)) { | |
699 ord = order(statbins); | |
700 pvbins1 <<- pvbins[ord]; | |
701 statbins1 <<- statbins[ord]; | |
702 hist.count <<- double(length(pvbins)-1); | |
703 } | |
704 return(.self); | |
705 }, | |
706 update = function(stats.for.hist) { | |
707 h = hist(stats.for.hist, breaks = statbins1, include.lowest = TRUE, right = TRUE, plot = FALSE)$counts; | |
708 hist.count <<- hist.count + h; | |
709 }, | |
710 getResults = function() { | |
711 if(!is.unsorted(pvbins1)) { | |
712 return(list(hist.bins = pvbins1 , hist.counts = hist.count )); | |
713 } else { | |
714 return(list(hist.bins = rev(pvbins1), hist.counts = rev(hist.count))); | |
715 } | |
716 } | |
717 )) | |
718 | |
719 | |
720 .minpvalue <- setRefClass(".minpvalue", | |
721 fields = list( | |
722 sdata = ".listBuilder", | |
723 gdata = ".listBuilder" | |
724 ), | |
725 methods = list( | |
726 initialize = function(snps, gene) { | |
727 sdata <<- .listBuilder$new(); | |
728 for( ss in 1:snps$nSlices() ) { | |
729 sdata$set( ss, double(nrow(snps$getSliceRaw(ss)))); | |
730 } | |
731 gdata <<- .listBuilder$new(); | |
732 for( gg in 1:gene$nSlices() ) { | |
733 gdata$set( gg, double(nrow(gene$getSliceRaw(gg)))); | |
734 } | |
735 return(.self); | |
736 }, | |
737 update = function(ss, gg, astat) { | |
738 gmax = gdata$get(gg) | |
739 z1 = max.col(astat,ties.method='first'); | |
740 z11 = astat[1:nrow(astat) + nrow(astat) * (z1 - 1)]; | |
741 gmax = pmax(gmax, z11); | |
742 gdata$set(gg, gmax); | |
743 | |
744 smax = sdata$get(ss) | |
745 z22 = apply(astat,2,max); | |
746 smax = pmax(smax, z22); | |
747 sdata$set(ss, smax); | |
748 return(.self); | |
749 }, | |
750 updatecis = function(ss, gg, select.cis, astat) { | |
751 if(length(astat)>0) | |
752 { | |
753 byrows = aggregate(x=astat, by=list(row=select.cis[,1]), FUN=max); | |
754 bycols = aggregate(x=astat, by=list(col=select.cis[,2]), FUN=max); | |
755 | |
756 gmax = gdata$get(gg); | |
757 gmax[byrows$row] = pmax(gmax[byrows$row], byrows$x) | |
758 gdata$set(gg, gmax); | |
759 | |
760 smax = sdata$get(ss) | |
761 smax[bycols$col] = pmax(smax[bycols$col], bycols$x) | |
762 sdata$set(ss, smax); | |
763 } | |
764 return(.self); | |
765 }, | |
766 getResults = function(snps, gene, pvfun) { | |
767 min.pv.snps = pvfun(sdata$unlist()); | |
768 names(min.pv.snps) = rownames(snps); | |
769 min.pv.gene = pvfun(gdata$unlist()); | |
770 names(min.pv.gene) = rownames(gene); | |
771 return(list(min.pv.snps = min.pv.snps, min.pv.gene = min.pv.gene)); | |
772 } | |
773 )) | |
774 | |
775 .OutputSaver_FRD <- setRefClass(".OutputSaver_FRD", | |
776 fields = list( | |
777 sdata = ".listBuilder", | |
778 gdata = ".listBuilder", | |
779 cdata = ".listBuilder", | |
780 bdata = ".listBuilder", | |
781 fid = "list", | |
782 testfun1 = "list", | |
783 pvfun1 = "list" | |
784 ), | |
785 methods = list( | |
786 initialize = function () { | |
787 sdata <<- .listBuilder$new(); | |
788 gdata <<- .listBuilder$new(); | |
789 cdata <<- .listBuilder$new(); | |
790 bdata <<- .listBuilder$new(); | |
791 fid <<- list(0); | |
792 testfun1 <<- list(0); | |
793 pvfun1 <<- list(0); | |
794 return(.self); | |
795 }, | |
796 start = function(filename, statistic_name, unused1, unused2, testfun, pvfun) { | |
797 testfun1 <<- list(testfun); | |
798 pvfun1 <<- list(pvfun); | |
799 if(length(filename) > 0) { | |
800 if(class(filename) == "character") { | |
801 fid <<- list(file(description = filename, open = "wt", blocking = FALSE, raw = FALSE), TRUE); | |
802 } else { | |
803 fid <<- list(filename, FALSE) | |
804 } | |
805 writeLines( paste("SNP\tgene\t",statistic_name,"\tp-value\tFDR", sep = ""), fid[[1]]); | |
806 } else { | |
807 fid <<- list(); | |
808 } | |
809 }, | |
810 update = function(spos, gpos, sta, beta = NULL) { | |
811 if(length(sta)>0) { | |
812 sdata$add(spos); | |
813 gdata$add(gpos); | |
814 cdata$add(sta ); | |
815 if(!is.null(beta )) | |
816 bdata$add(beta ); | |
817 } | |
818 return(.self); | |
819 }, | |
820 getResults = function( gene, snps, FDR_total_count) { | |
821 pvalues = NULL; | |
822 if(cdata$n > 0) { | |
823 tests = testfun1[[1]](cdata$unlist()); | |
824 cdata <<- .listBuilder$new(); | |
825 | |
826 pvalues = pvfun1[[1]](tests); | |
827 ord = order(pvalues); | |
828 | |
829 tests = tests[ord]; | |
830 pvalues = pvalues[ord]; | |
831 | |
832 FDR = pvalues * FDR_total_count / (1:length(pvalues)); | |
833 FDR[length(FDR)] = min(FDR[length(FDR)], 1); | |
834 FDR = rev(cummin(rev(FDR))); | |
835 | |
836 snps_names = snps$GetAllRowNames()[sdata$unlist()[ord]]; | |
837 sdata <<- .listBuilder$new(); | |
838 gene_names = gene$GetAllRowNames()[gdata$unlist()[ord]]; | |
839 gdata <<- .listBuilder$new(); | |
840 | |
841 beta = NULL; | |
842 if(bdata$n > 0) | |
843 beta = bdata$unlist()[ord]; | |
844 | |
845 if(length(fid)>0) { | |
846 step = 1000; ########### 100000 | |
847 for( part in 1:ceiling(length(FDR)/step) ) { | |
848 fr = (part-1)*step + 1; | |
849 to = min(part*step, length(FDR)); | |
850 dump = data.frame(snps_names[fr:to], | |
851 gene_names[fr:to], | |
852 if(is.null(beta)) tests[fr:to] else list(beta[fr:to],tests[fr:to]), | |
853 pvalues[fr:to], | |
854 FDR[fr:to], | |
855 row.names = NULL, | |
856 check.rows = FALSE, | |
857 check.names = FALSE, | |
858 stringsAsFactors = FALSE); | |
859 write.table(dump, file = fid[[1]], quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE); | |
860 } | |
861 } | |
862 } else { | |
863 cat("No significant associations were found.\n", file = if(length(fid)>0){fid[[1]]}else{""}); | |
864 } | |
865 if(length(fid)>0) { | |
866 if(fid[[2]]) { | |
867 close(fid[[1]]); | |
868 } | |
869 fid <<- list(); | |
870 } | |
871 | |
872 if(!is.null(pvalues)) { | |
873 eqtls = list( snps = snps_names, | |
874 gene = gene_names, | |
875 statistic = tests, | |
876 pvalue = pvalues, | |
877 FDR = FDR); | |
878 if(!is.null(beta)) | |
879 eqtls$beta = beta; | |
880 } else { | |
881 eqtls = list( snps = character(), | |
882 gene = character(), | |
883 beta = numeric(), | |
884 statistic = numeric(), | |
885 pvalue = numeric(), | |
886 FDR = numeric()); | |
887 } | |
888 return(list(eqtls = data.frame(eqtls))); | |
889 } | |
890 ) | |
891 ) | |
892 | |
893 | |
894 .OutputSaver_direct <- setRefClass(".OutputSaver_direct", | |
895 fields = list( | |
896 gene_names = "character", | |
897 snps_names = "character", | |
898 fid = "list", | |
899 testfun1 = "list", | |
900 pvfun1 = "list" | |
901 ), | |
902 methods = list( | |
903 initialize = function() { | |
904 gene_names <<- character(0); | |
905 snps_names <<- character(0); | |
906 fid <<- list(0); | |
907 testfun1 <<- list(0); | |
908 pvfun1 <<- list(0); | |
909 return(.self); | |
910 }, | |
911 start = function(filename, statistic_name, snps, gene, testfun, pvfun) { | |
912 # I hope the program stops if it fails to open the file | |
913 if(class(filename) == "character") { | |
914 fid <<- list(file(description = filename, open = "wt", blocking = FALSE, raw = FALSE), TRUE); | |
915 } else { | |
916 fid <<- list(filename, FALSE) | |
917 } | |
918 writeLines(paste("SNP\tgene\t", statistic_name, "\tp-value", sep = ""), fid[[1]]); | |
919 gene_names <<- gene$GetAllRowNames(); | |
920 snps_names <<- snps$GetAllRowNames(); | |
921 testfun1 <<- list(testfun); | |
922 pvfun1 <<- list(pvfun); | |
923 }, | |
924 update = function(spos, gpos, sta, beta = NULL) { | |
925 if( length(sta) == 0 ) | |
926 return(); | |
927 sta = testfun1[[1]](sta); | |
928 lst = list(snps = snps_names[spos], gene = gene_names[gpos], beta = beta, statistic = sta, pvalue = pvfun1[[1]](sta)); | |
929 lst$beta = lst$beta; | |
930 | |
931 dump2 = data.frame(lst, row.names = NULL, check.rows = FALSE, check.names = FALSE, stringsAsFactors = FALSE); | |
932 write.table(dump2, file = fid[[1]], quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE); | |
933 }, | |
934 getResults = function(...) { | |
935 if(length(fid)>0) { | |
936 if(fid[[2]]) { | |
937 close(fid[[1]]); | |
938 } | |
939 fid <<- list(); | |
940 } | |
941 gene_names <<- character(0); | |
942 snps_names <<- character(0); | |
943 return(list()); | |
944 } | |
945 ) | |
946 ) | |
947 | |
948 .my.pmin = function(x, val) { | |
949 # minimum "pmin" function that can handle empty array | |
950 if(length(x) == 0) { | |
951 return(x) | |
952 } else { | |
953 return(pmin.int(x,val)); | |
954 } | |
955 } | |
956 | |
957 .my.pmax = function(x, val) { | |
958 # minimum "pmin" function that can handle empty array | |
959 if(length(x) == 0) { | |
960 return(x) | |
961 } else { | |
962 return(pmax.int(x,val)); | |
963 } | |
964 } | |
965 | |
966 .pv.nz = function(x){return( .my.pmax(x,.Machine$double.xmin) )} | |
967 | |
968 .SetNanRowMean = function(x) { | |
969 if( any(is.na(x)) ) { | |
970 rowmean = rowMeans(x, na.rm = TRUE); | |
971 rowmean[ is.na(rowmean) ] = 0; | |
972 for( j in which(!complete.cases(x)) ) { | |
973 where1 = is.na( x[j, ] ); | |
974 x[j,where1] = rowmean[j]; | |
975 } | |
976 } | |
977 return(x); | |
978 } | |
979 | |
980 # .SNP_process_split_for_ANOVA = function(x) { | |
981 # # split into 2 dummy variables | |
982 # | |
983 # uniq = unique(c(x)); | |
984 # uniq = uniq[!is.na(uniq)]; | |
985 # | |
986 # if( length(uniq) > 3 ) { | |
987 # stop("More than three genotype categories is not handled by ANOVA"); | |
988 # } else if ( length(uniq) < 3 ) { | |
989 # uniq = c(uniq, min(uniq)-(1:(3-length(uniq)))); | |
990 # } | |
991 # | |
992 # freq = matrix(0, nrow(x), length(uniq)); | |
993 # for(i in 1:length(uniq)) { | |
994 # freq[ ,i] = rowSums(x==uniq[i], na.rm = TRUE); | |
995 # } | |
996 # | |
997 # md = apply(freq, 1, which.max); | |
998 # freq[ cbind(1:nrow(x),md) ] = -1; | |
999 # | |
1000 # md = apply(freq, 1, which.max); # min(freq[cbind(1:nrow(slice),md)] - rowSums(select,na.rm = TRUE )) | |
1001 # new_slice1 = (x == uniq[md]); | |
1002 # new_slice1[is.na(new_slice1)] = 0; | |
1003 # freq[ cbind(1:nrow(x),md) ] = -1; | |
1004 # | |
1005 # md = apply(freq,1,which.max); | |
1006 # new_slice2 = (x == uniq[md]); | |
1007 # new_slice2[ is.na(new_slice2) ] = 0; | |
1008 # rez = vector("list", 2); | |
1009 # rez[[1]] = new_slice1; | |
1010 # rez[[2]] = new_slice2; | |
1011 # return( rez ); | |
1012 # } | |
1013 | |
1014 .SNP_process_split_for_ANOVA = function(x,n.groups) { | |
1015 # split into 2 dummy variables (or more) | |
1016 | |
1017 # # Get the number of ANOVA groups | |
1018 # n.groups = options('MatrixEQTL.ANOVA.categories')[[1]]; | |
1019 # if( is.null(n.groups)) | |
1020 # n.groups = 3; | |
1021 | |
1022 # Unique values in x (make sure it has length of n.groups); | |
1023 uniq = unique(as.vector(x)); | |
1024 uniq = uniq[!is.na(uniq)]; | |
1025 if( length(uniq) > n.groups ) { | |
1026 stop("More than declared number of genotype categories is detected by ANOVA"); | |
1027 } else if ( length(uniq) < n.groups ) { | |
1028 uniq = c(uniq, rep(min(uniq)-1, n.groups-length(uniq))); | |
1029 } | |
1030 | |
1031 # Table of frequencies for each variable (row) | |
1032 freq = matrix(0, nrow(x), n.groups); | |
1033 for(i in 1:n.groups) { | |
1034 freq[ ,i] = rowSums(x==uniq[i], na.rm = TRUE); | |
1035 } | |
1036 # remove NA's from x for convenience | |
1037 x[is.na(x)] = min(uniq)-2; | |
1038 | |
1039 # Output list of matrices | |
1040 rez = vector('list',n.groups-1); | |
1041 | |
1042 # Skip the most frequent value | |
1043 md = apply(freq, 1, which.max); # most frequent value for each variable | |
1044 freq[ cbind(1:nrow(x),md) ] = -1; | |
1045 | |
1046 # The rest form dumm | |
1047 for(j in 1:(n.groups-1)){ | |
1048 md = apply(freq, 1, which.max); | |
1049 freq[ cbind(1:nrow(x),md) ] = -1; | |
1050 rez[[j]] = (x == uniq[md]); | |
1051 } | |
1052 return( rez ); | |
1053 } | |
1054 | |
1055 Matrix_eQTL_engine = function( | |
1056 snps, | |
1057 gene, | |
1058 cvrt = SlicedData$new(), | |
1059 output_file_name, | |
1060 pvOutputThreshold = 1e-5, | |
1061 useModel = modelLINEAR, | |
1062 errorCovariance = numeric(), | |
1063 verbose = TRUE, | |
1064 pvalue.hist = FALSE, | |
1065 min.pv.by.genesnp = FALSE, | |
1066 noFDRsaveMemory = FALSE) { | |
1067 rez = Matrix_eQTL_main( | |
1068 snps = snps, | |
1069 gene = gene, | |
1070 cvrt = cvrt, | |
1071 output_file_name = output_file_name, | |
1072 pvOutputThreshold = pvOutputThreshold, | |
1073 useModel = useModel, | |
1074 errorCovariance = errorCovariance, | |
1075 verbose = verbose, | |
1076 pvalue.hist = pvalue.hist, | |
1077 min.pv.by.genesnp = min.pv.by.genesnp, | |
1078 noFDRsaveMemory = noFDRsaveMemory); | |
1079 return( rez ); | |
1080 } | |
1081 | |
1082 Matrix_eQTL_main = function( | |
1083 snps, | |
1084 gene, | |
1085 cvrt = SlicedData$new(), | |
1086 output_file_name = "", | |
1087 pvOutputThreshold = 1e-5, | |
1088 useModel = modelLINEAR, | |
1089 errorCovariance = numeric(), | |
1090 verbose = TRUE, | |
1091 output_file_name.cis = "", | |
1092 pvOutputThreshold.cis = 0, | |
1093 snpspos = NULL, | |
1094 genepos = NULL, | |
1095 cisDist = 1e6, | |
1096 pvalue.hist = FALSE, | |
1097 min.pv.by.genesnp = FALSE, | |
1098 noFDRsaveMemory = FALSE) { | |
1099 ################################# Basic variable checks ################################# | |
1100 { | |
1101 # status("Performing basic checks of the input variables"); | |
1102 stopifnot( "SlicedData" %in% class(gene) ); | |
1103 stopifnot( "SlicedData" %in% class(snps) ); | |
1104 stopifnot( "SlicedData" %in% class(cvrt) ); | |
1105 | |
1106 # Check dimensions | |
1107 if( min(snps$nRows(),snps$nCols()) == 0 ) | |
1108 stop("Empty genotype dataset"); | |
1109 if( min(gene$nRows(),gene$nCols()) == 0 ) | |
1110 stop("Empty expression dataset"); | |
1111 if( snps$nCols() != gene$nCols() ) | |
1112 stop("Different number of samples in the genotype and gene expression files"); | |
1113 if( cvrt$nRows()>0 ) { | |
1114 if( snps$nCols() != cvrt$nCols() ) | |
1115 stop("Wrong number of samples in the matrix of covariates"); | |
1116 } | |
1117 | |
1118 stopifnot( class(pvOutputThreshold) == "numeric" ); | |
1119 stopifnot( length(pvOutputThreshold) == 1 ); | |
1120 stopifnot( pvOutputThreshold >= 0 ); | |
1121 stopifnot( pvOutputThreshold <= 1 ); | |
1122 | |
1123 stopifnot( class(noFDRsaveMemory) == "logical" ); | |
1124 stopifnot( length(noFDRsaveMemory) == 1 ); | |
1125 | |
1126 if( pvOutputThreshold > 0 ) { | |
1127 stopifnot( !((length(output_file_name) == 0) && noFDRsaveMemory) ) | |
1128 stopifnot( length(output_file_name) <= 1 ); | |
1129 if( length(output_file_name) == 1 ) { | |
1130 stopifnot( class(output_file_name) %in% c("character","connection") ); | |
1131 } | |
1132 } | |
1133 | |
1134 stopifnot( class(pvOutputThreshold.cis) == "numeric" ); | |
1135 stopifnot( length(pvOutputThreshold.cis) == 1 ); | |
1136 stopifnot( pvOutputThreshold.cis >= 0 ); | |
1137 stopifnot( pvOutputThreshold.cis <= 1 ); | |
1138 stopifnot( !((pvOutputThreshold > 0) & (pvOutputThreshold.cis > 0) & (pvOutputThreshold > pvOutputThreshold.cis)) ); | |
1139 stopifnot( (pvOutputThreshold > 0) | (pvOutputThreshold.cis > 0) ); | |
1140 | |
1141 stopifnot( class(useModel) == class(modelLINEAR) ); | |
1142 stopifnot( length(useModel) == 1 ); | |
1143 stopifnot( useModel %in% c(modelLINEAR, modelANOVA, modelLINEAR_CROSS) ); | |
1144 if( useModel %in% c(modelLINEAR, modelLINEAR_CROSS) ) { | |
1145 if( snps$nCols() <= cvrt$nRows() + 1 + 1) { | |
1146 stop('The number of covariates exceeds the number of samples.\nLinear regression can not be fit.') | |
1147 } | |
1148 } | |
1149 if( useModel == modelLINEAR_CROSS ) { | |
1150 if( cvrt$nRows() == 0 ) { | |
1151 stop( "Model \"modelLINEAR_CROSS\" requires at least one covariate" ); | |
1152 } | |
1153 } | |
1154 if( useModel == modelANOVA ) { | |
1155 n.anova.groups = getOption('MatrixEQTL.ANOVA.categories', 3); | |
1156 stopifnot( n.anova.groups == floor(n.anova.groups) ); | |
1157 stopifnot( n.anova.groups >= 3 ); | |
1158 # stopifnot( n.anova.groups < snps$nCols() - cvrt$nRows() - 2 ); | |
1159 if( snps$nCols() <= cvrt$nRows() + n.anova.groups) { | |
1160 stop('The number of covariates exceeds the number of samples.\nLinear regression (ANOVA) can not be fit.') | |
1161 } | |
1162 } | |
1163 | |
1164 stopifnot( class(verbose) == "logical" ); | |
1165 stopifnot( length(verbose) == 1 ); | |
1166 | |
1167 stopifnot( class(min.pv.by.genesnp) == "logical" ); | |
1168 stopifnot( length(min.pv.by.genesnp) == 1 ); | |
1169 | |
1170 if( pvOutputThreshold.cis > 0 ) { | |
1171 stopifnot( !((length(output_file_name.cis) == 0) && noFDRsaveMemory) ) | |
1172 stopifnot( length(output_file_name.cis) <= 1 ); | |
1173 if( length(output_file_name.cis) == 1 ) { | |
1174 stopifnot( class(output_file_name.cis) %in% c("character","connection") ); | |
1175 } | |
1176 | |
1177 # stopifnot( class(output_file_name.cis) == "character" ); | |
1178 # stopifnot( length(output_file_name.cis) == 1 ); | |
1179 stopifnot( class(snpspos) == "data.frame" ); | |
1180 stopifnot( ncol(snpspos) == 3 ); | |
1181 stopifnot( nrow(snpspos) > 0 ); | |
1182 stopifnot( class(snpspos[1,3]) %in% c("integer", "numeric") ) | |
1183 stopifnot( !any(is.na(snpspos[,3])) ) | |
1184 stopifnot( class(genepos) == "data.frame" ); | |
1185 stopifnot( ncol(genepos) == 4 ); | |
1186 stopifnot( nrow(genepos) > 0 ); | |
1187 stopifnot( class(genepos[1,3]) %in% c("integer", "numeric") ) | |
1188 stopifnot( class(genepos[1,4]) %in% c("integer", "numeric") ) | |
1189 stopifnot( !any(is.na(genepos[,3])) ) | |
1190 stopifnot( !any(is.na(genepos[,4])) ) | |
1191 stopifnot( nzchar(output_file_name.cis) ) | |
1192 } | |
1193 | |
1194 if( pvOutputThreshold > 0 ) { | |
1195 stopifnot( nzchar(output_file_name) ) | |
1196 } | |
1197 | |
1198 stopifnot( class(errorCovariance) %in% c("numeric", "matrix") ); | |
1199 errorCovariance = as.matrix(errorCovariance); | |
1200 if(length(errorCovariance)>0) { | |
1201 if( nrow(errorCovariance) != ncol(errorCovariance) ) { | |
1202 stop("The covariance matrix is not square"); | |
1203 } | |
1204 if( nrow(errorCovariance) != snps$nCols() ) { | |
1205 stop("The covariance matrix size does not match the number of samples"); | |
1206 } | |
1207 if( !all(errorCovariance == t(errorCovariance)) ) { | |
1208 stop("The covariance matrix is not symmetric"); | |
1209 } | |
1210 } | |
1211 } | |
1212 ################################# Initial setup ######################################### | |
1213 { | |
1214 gene.std = .listBuilder$new(); | |
1215 snps.std = .listBuilder$new(); | |
1216 | |
1217 dont.clone.gene = getOption('MatrixEQTL.dont.preserve.gene.object', FALSE) | |
1218 if(is.null(dont.clone.gene)) | |
1219 dont.clone.gene = FALSE; | |
1220 | |
1221 if( !dont.clone.gene ) | |
1222 gene = gene$Clone(); | |
1223 # snps = snps$Clone(); # snps is read only | |
1224 cvrt = cvrt$Clone(); | |
1225 | |
1226 params = list( | |
1227 output_file_name = output_file_name, | |
1228 pvOutputThreshold = pvOutputThreshold, | |
1229 useModel = useModel, | |
1230 errorCovariance = errorCovariance , | |
1231 verbose = verbose, | |
1232 output_file_name.cis = output_file_name.cis, | |
1233 pvOutputThreshold.cis = pvOutputThreshold.cis, | |
1234 cisDist = cisDist , | |
1235 pvalue.hist = pvalue.hist, | |
1236 min.pv.by.genesnp = min.pv.by.genesnp); | |
1237 | |
1238 if( verbose ) { | |
1239 lastTime = 0; | |
1240 status <- function(text) { | |
1241 # gc(); | |
1242 newTime = proc.time()[3]; | |
1243 if(lastTime != 0) { | |
1244 cat("Task finished in ", newTime-lastTime, " seconds\n"); | |
1245 } | |
1246 cat(text,"\n"); | |
1247 lastTime <<- newTime; | |
1248 unused = flush.console(); | |
1249 } | |
1250 } else { | |
1251 status = function(text){} | |
1252 } | |
1253 start.time = proc.time()[3]; | |
1254 } | |
1255 ################################# Error covariance matrix processing #################### | |
1256 { | |
1257 if( length(errorCovariance) > 0 ) { | |
1258 status("Processing the error covariance matrix"); | |
1259 eig = eigen(errorCovariance, symmetric = TRUE) | |
1260 d = eig$values; | |
1261 v = eig$vectors; | |
1262 # errorCovariance == v %*% diag(d) %*% t(v) | |
1263 # errorCovariance^0.5 == v*sqrt(d)*v" (no single quotes anymore) | |
1264 # errorCovariance^(-0.5) == v*diag(1./sqrt(diag(d)))*v" | |
1265 if( any(d<=0) ) { | |
1266 stop("The covariance matrix is not positive definite"); | |
1267 } | |
1268 correctionMatrix = v %*% diag(1./sqrt(d)) %*% t(v); | |
1269 rm( eig, v, d, errorCovariance ) | |
1270 } else { | |
1271 rm( errorCovariance ); | |
1272 correctionMatrix = numeric(); | |
1273 } | |
1274 } | |
1275 ################################# Matching gene and SNPs locations ###################### | |
1276 if( pvOutputThreshold.cis > 0 ) { | |
1277 status("Matching data files and location files") | |
1278 | |
1279 # names in the input data | |
1280 gene_names = gene$GetAllRowNames(); | |
1281 snps_names = snps$GetAllRowNames(); | |
1282 | |
1283 # gene range, set: left<right | |
1284 if(any(genepos[,3] > genepos[,4])) { | |
1285 temp3 = genepos[,3]; | |
1286 temp4 = genepos[,4]; | |
1287 genepos[,3] = pmin(temp3,temp4); | |
1288 genepos[,4] = pmax(temp3,temp4); | |
1289 rm(temp3, temp4); | |
1290 } | |
1291 | |
1292 # match with the location data | |
1293 genematch = match( gene_names, genepos[ ,1], nomatch = 0L); | |
1294 usedgene = matrix(FALSE, nrow(genepos), 1); # genes in "genepos" that are matching "gene_names" | |
1295 usedgene[ genematch ] = TRUE; | |
1296 if( !any(genematch) ) { | |
1297 stop("Gene names do not match those in the gene location file."); | |
1298 } | |
1299 cat( sum(genematch>0), "of", length(gene_names), " genes matched\n"); | |
1300 | |
1301 | |
1302 snpsmatch = match( snps_names, snpspos[ ,1], nomatch = 0L); | |
1303 usedsnps = matrix(FALSE, nrow(snpspos),1); | |
1304 usedsnps[ snpsmatch ] = TRUE; | |
1305 if( !any(snpsmatch) ) { | |
1306 stop("SNP names do not match those in the SNP location file."); | |
1307 } | |
1308 cat( sum(snpsmatch>0), "of", length(snps_names), " SNPs matched\n"); | |
1309 | |
1310 # list used chr names | |
1311 chrNames = unique(c( as.character(unique(snpspos[usedsnps,2])), | |
1312 as.character(unique(genepos[usedgene,2])) )) | |
1313 chrNames = chrNames[ sort.list( suppressWarnings(as.integer(chrNames)), | |
1314 method = "radix", na.last = TRUE ) ]; | |
1315 # match chr names | |
1316 genechr = match(genepos[,2],chrNames); | |
1317 snpschr = match(snpspos[,2],chrNames); | |
1318 | |
1319 # max length of a chromosome | |
1320 chrMax = max( snpspos[usedsnps, 3], genepos[usedgene, 4], na.rm = TRUE) + cisDist; | |
1321 | |
1322 # Single number location for all rows in "genepos" and "snpspos" | |
1323 genepos2 = as.matrix(genepos[ ,3:4, drop = FALSE] + (genechr-1)*chrMax); | |
1324 snpspos2 = as.matrix(snpspos[ ,3 , drop = FALSE] + (snpschr-1)*chrMax); | |
1325 | |
1326 # the final location arrays; | |
1327 snps_pos = matrix(0,length(snps_names),1); | |
1328 snps_pos[snpsmatch>0, ] = snpspos2[snpsmatch, , drop = FALSE]; | |
1329 snps_pos[rowSums(is.na(snps_pos))>0, ] = 0; | |
1330 snps_pos[snps_pos==0] = (length(chrNames)+1) * (chrMax+cisDist); | |
1331 rm(snps_names, snpsmatch, usedsnps, snpschr, snpspos2) | |
1332 | |
1333 gene_pos = matrix(0,length(gene_names),2); | |
1334 gene_pos[genematch>0, ] = genepos2[genematch, , drop = FALSE]; | |
1335 gene_pos[rowSums(is.na(gene_pos))>0, ] = 0; | |
1336 gene_pos[gene_pos==0] = (length(chrNames)+2) * (chrMax+cisDist); | |
1337 rm(gene_names, genematch, usedgene, genechr, genepos2) | |
1338 rm(chrNames, chrMax); | |
1339 | |
1340 if( is.unsorted(snps_pos) ) { | |
1341 status("Reordering SNPs\n"); | |
1342 ordr = sort.list(snps_pos); | |
1343 snps$RowReorder(ordr); | |
1344 snps_pos = snps_pos[ordr, , drop = FALSE]; | |
1345 rm(ordr); | |
1346 } | |
1347 if( is.unsorted(rowSums(gene_pos)) ) { | |
1348 status("Reordering genes\n"); | |
1349 ordr = sort.list(rowSums(gene_pos)); | |
1350 gene$RowReorder(ordr); | |
1351 gene_pos = gene_pos[ordr, , drop = FALSE]; | |
1352 rm(ordr); | |
1353 } | |
1354 | |
1355 # Slice it back. | |
1356 geneloc = vector("list", gene$nSlices()) | |
1357 gene_offset = 0; | |
1358 for(gc in 1:gene$nSlices()) { | |
1359 nr = length(gene$rowNameSlices[[gc]]); | |
1360 geneloc[[gc]] = gene_pos[gene_offset + (1:nr), , drop = FALSE]; | |
1361 gene_offset = gene_offset + nr; | |
1362 } | |
1363 rm(gc, gene_offset, gene_pos); | |
1364 | |
1365 snpsloc = vector("list", snps$nSlices()) | |
1366 snps_offset = 0; | |
1367 for(sc in 1:snps$nSlices()) { | |
1368 nr = length(snps$rowNameSlices[[sc]]); | |
1369 snpsloc[[sc]] = snps_pos[snps_offset + (1:nr), , drop = FALSE]; | |
1370 snps_offset = snps_offset + nr; | |
1371 } | |
1372 rm(nr, sc, snps_offset, snps_pos); | |
1373 } | |
1374 ################################# Covariates processing ################################# | |
1375 { | |
1376 status("Processing covariates"); | |
1377 if( useModel == modelLINEAR_CROSS ) { | |
1378 last.covariate = as.vector(tail( cvrt$getSlice(cvrt$nSlices()), n = 1)); | |
1379 } | |
1380 if( cvrt$nRows()>0 ) { | |
1381 cvrt$SetNanRowMean(); | |
1382 cvrt$CombineInOneSlice(); | |
1383 cvrt = rbind(matrix(1,1,snps$nCols()),cvrt$getSlice(1)); | |
1384 } else { | |
1385 cvrt = matrix(1,1,snps$nCols()); | |
1386 } | |
1387 # Correct for the error covariance structure | |
1388 if( length(correctionMatrix)>0 ) { | |
1389 cvrt = cvrt %*% correctionMatrix; | |
1390 } | |
1391 # Orthonormalize covariates | |
1392 # status("Orthonormalizing covariates"); | |
1393 q = qr(t(cvrt)); | |
1394 if( min(abs(diag(qr.R(q)))) < .Machine$double.eps * snps$nCols() ) { | |
1395 stop("Colinear or zero covariates detected"); | |
1396 } | |
1397 cvrt = t( qr.Q(q) ); | |
1398 rm( q ); | |
1399 } | |
1400 ################################# Gene expression processing ############################ | |
1401 { | |
1402 status("Processing gene expression data (imputation, residualization, etc.)"); | |
1403 # Impute gene expression | |
1404 gene$SetNanRowMean(); | |
1405 # Correct for the error covariance structure | |
1406 if( length(correctionMatrix)>0 ) { | |
1407 gene$RowMatrixMultiply(correctionMatrix); | |
1408 } | |
1409 # Orthogonolize expression w.r.t. covariates | |
1410 # status("Orthogonolizing expression w.r.t. covariates"); | |
1411 gene_offsets = double(gene$nSlices()+1); | |
1412 for( sl in 1:gene$nSlices() ) { | |
1413 slice = gene$getSlice(sl); | |
1414 gene_offsets[sl+1] = gene_offsets[sl] + nrow(slice); | |
1415 rowsq1 = rowSums(slice^2); | |
1416 slice = slice - tcrossprod(slice,cvrt) %*% cvrt; | |
1417 rowsq2 = rowSums(slice^2); | |
1418 # kill rows colinear with the covariates | |
1419 delete.rows = (rowsq2 <= rowsq1 * .Machine$double.eps ); | |
1420 slice[delete.rows] = 0; | |
1421 div = sqrt( rowSums(slice^2) ); | |
1422 div[ div == 0 ] = 1; | |
1423 gene.std$set(sl, div); | |
1424 gene$setSlice(sl, slice / div); | |
1425 } | |
1426 rm(rowsq1, rowsq2, delete.rows, div); | |
1427 rm( sl, slice ); | |
1428 #gene$RowRemoveZeroEps(); | |
1429 } | |
1430 ################################# snps_process, testfun, pvfun, threshfun, afun ######## | |
1431 { | |
1432 # snps_process - preprocess SNPs slice | |
1433 # | |
1434 # afun --- abs for signed stats, identity for non-negative | |
1435 # threshfun --- internal stat threshold for given p-value | |
1436 # testfun --- t or F statistic from the internal one | |
1437 # pvfun --- p-value from the t or F statistic | |
1438 | |
1439 nSamples = snps$nCols(); | |
1440 nGenes = gene$nRows(); | |
1441 nSnps = snps$nRows(); | |
1442 nCov = nrow(cvrt); | |
1443 # nVarTested = length(snps_list); # set in case(useModel) | |
1444 # dfNull = nSamples - nCov; | |
1445 # d.f. of the full model | |
1446 betafun = NULL; | |
1447 | |
1448 if( useModel == modelLINEAR ) { | |
1449 snps_process = function(x) { | |
1450 return( list(.SetNanRowMean(x)) ); | |
1451 }; | |
1452 nVarTested = 1; | |
1453 dfFull = nSamples - nCov - nVarTested; | |
1454 statistic.fun = function(mat_list) { | |
1455 return( mat_list[[1]] ); | |
1456 } | |
1457 afun = function(x) {return(abs(x))}; | |
1458 threshfun = function(pv) { | |
1459 thr = qt(pv/2, dfFull, lower.tail = FALSE); | |
1460 thr = thr^2; | |
1461 thr = sqrt( thr / (dfFull + thr) ); | |
1462 thr[pv >= 1] = 0; | |
1463 thr[pv <= 0] = 1; | |
1464 return( thr ); | |
1465 } | |
1466 testfun = function(x) { return( x * sqrt( dfFull / (1 - .my.pmin(x^2,1)))); } | |
1467 pvfun = function(x) { return( .pv.nz(pt(-abs(x),dfFull)*2)); } | |
1468 thresh.cis = threshfun(pvOutputThreshold.cis); | |
1469 thresh = threshfun(pvOutputThreshold); | |
1470 betafun = function(stat, ss, gg, select) { | |
1471 return(stat * gene.std$get(gg)[select[,1]] / snps.std$get(ss)[select[,2]]); | |
1472 } | |
1473 } else | |
1474 if( useModel == modelANOVA ) { | |
1475 snps_process = function(x).SNP_process_split_for_ANOVA(x,n.anova.groups); | |
1476 nVarTested = n.anova.groups - 1; | |
1477 dfFull = nSamples - nCov - nVarTested; | |
1478 # statistic.fun = function(mat_list) { | |
1479 # return( mat_list[[1]]^2 + mat_list[[2]]^2 ); | |
1480 # } | |
1481 statistic.fun = function(mat_list) { | |
1482 x = mat_list[[1]]^2; | |
1483 for( j in 2:length(mat_list) ) | |
1484 x = x + mat_list[[j]]^2; | |
1485 return( x ); | |
1486 } | |
1487 afun = identity; | |
1488 threshfun = function(pv) { | |
1489 thr = qf(pv, nVarTested, dfFull, lower.tail = FALSE); | |
1490 thr = thr / (dfFull/nVarTested + thr); | |
1491 thr[pv >= 1] = 0; | |
1492 thr[pv <= 0] = 1; | |
1493 return( thr ); | |
1494 } | |
1495 testfun = function(x) { return( x / (1 - .my.pmin(x,1)) * (dfFull/nVarTested) ); } | |
1496 pvfun = function(x) { return( .pv.nz(pf(x, nVarTested, dfFull, lower.tail = FALSE)) ); } | |
1497 thresh.cis = threshfun(pvOutputThreshold.cis); | |
1498 thresh = threshfun(pvOutputThreshold); | |
1499 } else | |
1500 if( useModel == modelLINEAR_CROSS ) { | |
1501 last.covariate = as.vector( last.covariate ); | |
1502 snps_process = .SNP_process_split_for_LINEAR_CROSS = function(x) { | |
1503 out = vector("list", 2); | |
1504 out[[1]] = .SetNanRowMean(x); | |
1505 out[[2]] = t( t(out[[1]]) * last.covariate ); | |
1506 return( out ); | |
1507 }; | |
1508 nVarTested = 1; | |
1509 dfFull = nSamples - nCov - nVarTested - 1; | |
1510 statistic.fun = function(mat_list) { | |
1511 return( mat_list[[2]] / sqrt(1 - mat_list[[1]]^2) ); | |
1512 } | |
1513 afun = function(x) {return(abs(x))}; | |
1514 threshfun = function(pv) { | |
1515 thr = qt(pv/2, dfFull, lower.tail = FALSE); | |
1516 thr = thr^2; | |
1517 thr = sqrt( thr / (dfFull + thr) ); | |
1518 thr[pv >= 1] = 0; | |
1519 thr[pv <= 0] = 1; | |
1520 return( thr ); | |
1521 } | |
1522 testfun = function(x) { return( x * sqrt( dfFull / (1 - .my.pmin(x^2,1)))); } | |
1523 pvfun = function(x) { return( .pv.nz(pt(-abs(x),dfFull)*2 )); } | |
1524 thresh.cis = threshfun(pvOutputThreshold.cis); | |
1525 thresh = threshfun(pvOutputThreshold); | |
1526 betafun = function(stat, ss, gg, select) { | |
1527 return(stat * gene.std$get(gg)[select[,1]] / snps.std$get(ss)[select[,2]]); | |
1528 } | |
1529 } | |
1530 params$dfFull = dfFull; | |
1531 } | |
1532 ################################# Saver class(es) creation ############################## | |
1533 { | |
1534 status("Creating output file(s)"); | |
1535 if(noFDRsaveMemory) { | |
1536 if( pvOutputThreshold > 0 ) { | |
1537 saver.tra = .OutputSaver_direct$new(); | |
1538 } | |
1539 if( pvOutputThreshold.cis > 0 ) { | |
1540 saver.cis = .OutputSaver_direct$new(); | |
1541 } | |
1542 } else { | |
1543 if( pvOutputThreshold > 0 ) { | |
1544 saver.tra = .OutputSaver_FRD$new(); | |
1545 } | |
1546 if( pvOutputThreshold.cis > 0 ) { | |
1547 saver.cis = .OutputSaver_FRD$new(); | |
1548 } | |
1549 } | |
1550 if( pvOutputThreshold > 0 ) | |
1551 if( pvOutputThreshold * gene$nRows() * snps$nRows() > 1000000 ) | |
1552 if(!noFDRsaveMemory) | |
1553 cat('Warning: pvOutputThreshold may be too large.\nExpected number of findings > ', | |
1554 pvOutputThreshold * gene$nRows() * snps$nRows(),'\n'); | |
1555 if( (useModel == modelLINEAR) || (useModel == modelLINEAR_CROSS) ) { | |
1556 statistic_name = "t-stat"; | |
1557 } else if( useModel == modelANOVA ) { | |
1558 statistic_name = "F-test"; | |
1559 } | |
1560 if(!is.null(betafun)) | |
1561 statistic_name = paste('beta\t',statistic_name, sep=''); | |
1562 if( pvOutputThreshold > 0 ) | |
1563 saver.tra$start(output_file_name, statistic_name, snps, gene, testfun, pvfun); | |
1564 if( pvOutputThreshold.cis > 0 ) | |
1565 saver.cis$start(output_file_name.cis, statistic_name, snps, gene, testfun, pvfun); | |
1566 rm( statistic_name ); | |
1567 } | |
1568 ################################# Some useful functions ################################# | |
1569 { | |
1570 orthonormalize.snps = function(cursnps, ss) { | |
1571 for(p in 1:length(cursnps)) { | |
1572 if(length(correctionMatrix)>0) { | |
1573 cursnps[[p]] = cursnps[[p]] %*% correctionMatrix; | |
1574 } | |
1575 cursnps[[p]] = cursnps[[p]] - tcrossprod(cursnps[[p]],cvrt) %*% cvrt; | |
1576 for(w in .seq(1,p-1L)) | |
1577 cursnps[[p]] = cursnps[[p]] - rowSums(cursnps[[p]]*cursnps[[w]]) * cursnps[[w]]; | |
1578 div = sqrt( rowSums(cursnps[[p]]^2) ); | |
1579 div[ div == 0 ] = 1; | |
1580 cursnps[[p]] = cursnps[[p]]/div; | |
1581 } | |
1582 snps.std$set(ss, div); | |
1583 return(cursnps); | |
1584 } | |
1585 # if( pvOutputThreshold.cis > 0 ) { | |
1586 # is.cis.pair = function(gg,ss) { | |
1587 # return(!( ( snpsloc[[ss]][1, 1] - tail( geneloc[[gg]][ , 2], n = 1L) > cisDist) | | |
1588 # ( geneloc[[gg]][1, 1] - tail( snpsloc[[ss]] , n = 1L) > cisDist) ) ); | |
1589 # } | |
1590 # } | |
1591 if( pvOutputThreshold.cis > 0 ) { | |
1592 # sn.l = sapply(snpsloc, function(x)x[1] ); | |
1593 # sn.r = sapply(snpsloc, function(x)tail(x,1) ); | |
1594 # ge.l = sapply(geneloc, function(x)x[1,1] ); | |
1595 # ge.r = sapply(geneloc, function(x)x[nrow(x) , 2] ); | |
1596 sn.l = sapply(snpsloc, '[', 1 ); | |
1597 sn.r = sapply(snpsloc, tail, 1 ); | |
1598 ge.l = sapply(geneloc, '[', 1, 1 ); | |
1599 ge.r = sapply( lapply(geneloc, tail.matrix, 1 ), '[', 2); | |
1600 gg.1 = findInterval( sn.l , ge.r + cisDist +1) + 1; | |
1601 # cat(gg.1,'\n') | |
1602 gg.2 = findInterval( sn.r , ge.l - cisDist ); | |
1603 # cat(gg.2,'\n') | |
1604 rm(sn.l, sn.r, ge.l, ge.r); | |
1605 } | |
1606 | |
1607 } | |
1608 ################################# Prepare counters and histogram bins ################### | |
1609 { | |
1610 pvbins = NULL; # bin edges for p-values | |
1611 statbins = 0; # bin edges for the test statistic (|t| or F) | |
1612 do.hist = FALSE; | |
1613 if( length(pvalue.hist) == 1 ) { | |
1614 if(pvalue.hist == "qqplot") { | |
1615 pvbins = c(0, 10^rev(seq(0, log10(.Machine$double.xmin)-1, -0.05))); | |
1616 } else | |
1617 if( is.numeric(pvalue.hist) ) { | |
1618 pvbins = seq(from = 0, to = 1, length.out = pvalue.hist+1); | |
1619 } else | |
1620 if( pvalue.hist == TRUE ) { | |
1621 pvbins = seq(from = 0, to = 1, length.out = 100+1); | |
1622 } | |
1623 } else | |
1624 if( is.numeric(pvalue.hist) && (length(pvalue.hist) > 1) ) { | |
1625 pvbins = pvalue.hist; | |
1626 } | |
1627 if( is.null(pvbins) && (pvalue.hist != FALSE) ) { | |
1628 stop("Wrong value of pvalue.hist. Must be FALSE, TRUE, \"qqplot\", or numerical"); | |
1629 } | |
1630 do.hist = !is.null(pvbins); | |
1631 if( do.hist ) { | |
1632 pvbins = sort(pvbins); | |
1633 statbins = threshfun(pvbins); | |
1634 if( pvOutputThreshold > 0) { | |
1635 hist.all = .histogrammer$new(pvbins, statbins); | |
1636 } | |
1637 if( pvOutputThreshold.cis > 0) { | |
1638 hist.cis = .histogrammer$new(pvbins, statbins); | |
1639 } | |
1640 } | |
1641 rm( pvbins, statbins); | |
1642 if(min.pv.by.genesnp) { | |
1643 if( pvOutputThreshold > 0) { | |
1644 minpv.tra = .minpvalue$new(snps,gene); | |
1645 } | |
1646 if( pvOutputThreshold.cis > 0) { | |
1647 minpv.cis = .minpvalue$new(snps,gene); | |
1648 } | |
1649 } | |
1650 } | |
1651 ################################# Main loop ############################################# | |
1652 { | |
1653 beta = NULL; | |
1654 n.tests.all = 0; | |
1655 n.tests.cis = 0; | |
1656 n.eqtls.tra = 0; | |
1657 n.eqtls.cis = 0; | |
1658 | |
1659 status("Performing eQTL analysis"); | |
1660 # ss = 1; gg = 1; | |
1661 # ss = snps$nSlices(); gg = gene$nSlices(); | |
1662 | |
1663 snps_offset = 0; | |
1664 for(ss in 1:snps$nSlices()) { | |
1665 # for(ss in 1:min(2,snps$nSlices())) { #for debug | |
1666 cursnps = NULL; | |
1667 nrcs = nrow(snps$getSliceRaw(ss)); | |
1668 | |
1669 # loop only through the useful stuff | |
1670 for(gg in if(pvOutputThreshold>0){1:gene$nSlices()}else{.seq(gg.1[ss],gg.2[ss])} ) { | |
1671 gene_offset = gene_offsets[gg]; | |
1672 curgene = gene$getSlice(gg); | |
1673 nrcg = nrow(curgene); | |
1674 if(nrcg == 0) next; | |
1675 | |
1676 rp = ""; | |
1677 | |
1678 statistic = NULL; | |
1679 select.cis.raw = NULL; | |
1680 ## do cis analysis | |
1681 # if( (pvOutputThreshold.cis > 0) && ( is.cis.pair(gg, ss) ) ) { | |
1682 if( (pvOutputThreshold.cis > 0) && (gg >= gg.1[ss]) && (gg <= gg.2[ss]) ) { | |
1683 | |
1684 if( is.null( statistic ) ) { | |
1685 if( is.null( cursnps ) ) { | |
1686 cursnps = orthonormalize.snps( snps_process( snps$getSlice(ss) ), ss ); | |
1687 } | |
1688 mat = vector("list", length(cursnps)); | |
1689 for(d in 1:length(cursnps)) { | |
1690 mat[[d]] = tcrossprod(curgene, cursnps[[d]]); | |
1691 } | |
1692 statistic = statistic.fun( mat ); | |
1693 astatistic = afun(statistic); | |
1694 # rm(mat); | |
1695 } | |
1696 | |
1697 # sn.l = findInterval(geneloc[[gg]][ ,1] - cisDist-1 +1 , snpsloc[[ss]]); | |
1698 # sn.r = findInterval(geneloc[[gg]][ ,2] + cisDist -1 , snpsloc[[ss]]); | |
1699 sn.l = findInterval(geneloc[[gg]][ ,1] - cisDist-1, snpsloc[[ss]]); | |
1700 sn.r = findInterval(geneloc[[gg]][ ,2] + cisDist, snpsloc[[ss]]); | |
1701 xx = unlist(lapply(which(sn.r>sn.l),FUN=function(x){(sn.l[x]:(sn.r[x]-1))*nrow(statistic)+x})) | |
1702 select.cis.raw = xx[ astatistic[xx] >= thresh.cis ]; | |
1703 select.cis = arrayInd(select.cis.raw, dim(statistic)) | |
1704 | |
1705 n.tests.cis = n.tests.cis + length(xx); | |
1706 n.eqtls.cis = n.eqtls.cis + length(select.cis.raw); | |
1707 | |
1708 if( do.hist ) | |
1709 hist.cis$update(astatistic[xx]); | |
1710 | |
1711 if( min.pv.by.genesnp ) { | |
1712 # minpv.cis$updatecis(ss, gg, arrayInd(xx, dim(statistic)), astatistic[xx]) | |
1713 temp = double(length(astatistic)); | |
1714 dim(temp) = dim(astatistic); | |
1715 temp[xx] = astatistic[xx]; | |
1716 minpv.cis$update(ss, gg, temp); | |
1717 } | |
1718 | |
1719 if(!is.null(betafun)) | |
1720 beta = betafun(mat[[length(mat)]][select.cis.raw], ss, gg, select.cis); | |
1721 | |
1722 saver.cis$update( snps_offset + select.cis[ , 2], | |
1723 gene_offset + select.cis[ , 1], | |
1724 statistic[select.cis.raw], | |
1725 beta); | |
1726 | |
1727 # statistic.select.cis = statistic[ select.cis ]; | |
1728 # test = testfun( statistic.select.cis ); | |
1729 # pv = pvfun(test); | |
1730 # Saver.cis$WriteBlock( cbind(snps_offset + select.cis[ , 2], gene_offset + select.cis[ , 1], test, pv) ); | |
1731 # counter.cis$Update(gg, ss, select.cis, pv, n.tests = length(xx), if(do.hist) afun(statistic[xx]) ) | |
1732 rp = paste(rp, ", ", formatC(n.eqtls.cis, big.mark=",", format = "f", digits = 0), " cis-eQTLs", sep = ""); | |
1733 } | |
1734 ## do trans/all analysis | |
1735 if(pvOutputThreshold>0) { | |
1736 if( is.null( statistic ) ) { | |
1737 if( is.null( cursnps ) ) { | |
1738 cursnps = orthonormalize.snps( snps_process( snps$getSlice(ss) ), ss ); | |
1739 } | |
1740 mat = vector("list", length(cursnps)); | |
1741 for(d in 1:length(cursnps)) { | |
1742 mat[[d]] = tcrossprod(curgene, cursnps[[d]]); | |
1743 } | |
1744 statistic = statistic.fun( mat ); | |
1745 astatistic = afun(statistic); | |
1746 # rm(mat); | |
1747 } | |
1748 | |
1749 if( do.hist ) | |
1750 hist.all$update(astatistic); | |
1751 | |
1752 if(!is.null(select.cis.raw)) | |
1753 astatistic[xx] = -1; | |
1754 # select.tra.raw = select.tra.raw[!(select.tra.raw %in% select.cis.raw)]; | |
1755 | |
1756 select.tra.raw = which( astatistic >= thresh); | |
1757 select.tra = arrayInd(select.tra.raw, dim(statistic)) | |
1758 | |
1759 n.eqtls.tra = n.eqtls.tra + length(select.tra.raw); | |
1760 n.tests.all = n.tests.all + length(statistic); | |
1761 | |
1762 if(!is.null(betafun)) | |
1763 beta = betafun(mat[[length(mat)]][select.tra.raw], ss, gg, select.tra); | |
1764 | |
1765 saver.tra$update( snps_offset + select.tra[ , 2], | |
1766 gene_offset + select.tra[ , 1], | |
1767 statistic[select.tra.raw], | |
1768 beta); | |
1769 | |
1770 if( min.pv.by.genesnp ) | |
1771 minpv.tra$update(ss, gg, astatistic) | |
1772 | |
1773 # statistic.select.tra = statistic[ select.tra ]; | |
1774 # test = testfun( statistic.select.tra ); | |
1775 # pv = pvfun( test ); | |
1776 # Saver$WriteBlock( cbind( snps_offset + select.tra[ , 2], gene_offset + select.tra[ , 1], test, pv) ); | |
1777 # counter$Update(gg, ss, select.tra, pv, n.tests = nrcs*nrcg, if(do.hist) afun(statistic) ) | |
1778 rp = paste(rp, ", ", formatC(n.eqtls.tra, big.mark=",", format = "f", digits = 0), if(pvOutputThreshold.cis > 0)" trans-"else" ","eQTLs", sep = "") | |
1779 } | |
1780 | |
1781 #gene_offset = gene_offset + nrcg; | |
1782 if( !is.null(statistic) ) { | |
1783 per = 100*(gg/gene$nSlices() + ss-1) / snps$nSlices(); | |
1784 cat( formatC(floor(per*100)/100, format = "f", width = 5, digits = 2), "% done" , rp, "\n", sep = ""); | |
1785 flush.console(); | |
1786 } | |
1787 } # gg in 1:gene$nSlices() | |
1788 snps_offset = snps_offset + nrow(snps$getSliceRaw(ss)); | |
1789 } # ss in 1:snps$nSlices() | |
1790 } | |
1791 ################################# Results collection #################################### | |
1792 { | |
1793 rez = list(time.in.sec = proc.time()[3] - start.time); | |
1794 rez$param = params; | |
1795 | |
1796 if(pvOutputThreshold.cis > 0) { | |
1797 rez.cis = list(ntests = n.tests.cis, neqtls = n.eqtls.cis); | |
1798 rez.cis = c(rez.cis, saver.cis$getResults( gene, snps, n.tests.cis) ); | |
1799 if(do.hist) | |
1800 rez.cis = c(rez.cis, hist.cis$getResults() ); | |
1801 if(min.pv.by.genesnp) | |
1802 rez.cis = c(rez.cis, minpv.cis$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); | |
1803 } | |
1804 | |
1805 if(pvOutputThreshold>0) { | |
1806 rez.all = list(ntests = n.tests.all, neqtls = n.eqtls.tra + n.eqtls.cis); | |
1807 if(pvOutputThreshold.cis > 0) { | |
1808 rez.tra = list(ntests = n.tests.all - n.tests.cis, neqtls = n.eqtls.tra); | |
1809 rez.tra = c(rez.tra, saver.tra$getResults( gene, snps, n.tests.all - n.tests.cis) ); | |
1810 } else { | |
1811 rez.all = c(rez.all, saver.tra$getResults( gene, snps, n.tests.all ) ); | |
1812 } | |
1813 if(do.hist) { | |
1814 rez.all = c(rez.all, hist.all$getResults() ); | |
1815 if(pvOutputThreshold.cis > 0) { | |
1816 rez.tra$hist.bins = rez.all$hist.bins; | |
1817 rez.tra$hist.counts = rez.all$hist.counts - rez.cis$hist.counts; | |
1818 } | |
1819 } | |
1820 if(min.pv.by.genesnp) { | |
1821 if(pvOutputThreshold.cis > 0) { | |
1822 rez.tra = c(rez.tra, minpv.tra$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); | |
1823 } else { | |
1824 rez.all = c(rez.all, minpv.tra$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); | |
1825 } | |
1826 } | |
1827 } | |
1828 | |
1829 if(exists('rez.all')>0) | |
1830 rez$all = rez.all; | |
1831 if(exists('rez.tra')>0) | |
1832 rez$trans = rez.tra; | |
1833 if(exists('rez.cis')>0) | |
1834 rez$cis = rez.cis; | |
1835 | |
1836 class(rez) = c(class(rez),"MatrixEQTL"); | |
1837 status(""); | |
1838 } | |
1839 # cat('s std ',snps.std$get(1),'\n'); | |
1840 # cat('g std ',gene.std$get(1),'\n'); | |
1841 ################################# Results collection #################################### | |
1842 return(rez); | |
1843 } | |
1844 | |
1845 .histme = function(m, name1, name2, ...) { | |
1846 cnts = m$hist.counts; | |
1847 bins = m$hist.bins; | |
1848 ntst = m$ntests; | |
1849 centers = 0.5 * (bins[-1L] + bins[-length(bins)]); | |
1850 density = 0.5 / (bins[-1L] - centers) * cnts / ntst; | |
1851 ntext = paste("P-value distribution for ", name1, formatC(ntst, big.mark=",", format = "f", digits = 0), name2, " gene-SNP pairs ",sep=""); | |
1852 r = structure(list(breaks = bins, counts = cnts, density = density, | |
1853 mids = centers, equidist = FALSE), class = "histogram"); | |
1854 plot(r, main = ntext, ylab = "Density", xlab = "P-values", ...) | |
1855 abline( h = 1, col = "blue"); | |
1856 return(invisible()); | |
1857 } | |
1858 | |
1859 .qqme = function(m, lcol, cex, pch, ...) { | |
1860 cnts = m$hist.counts; | |
1861 bins = m$hist.bins; | |
1862 ntst = m$ntests; | |
1863 | |
1864 cusu = cumsum(cnts) / ntst; | |
1865 ypos = bins[-1][is.finite(cusu)]; | |
1866 xpos = cusu[is.finite(cusu)]; | |
1867 lines(-log10(xpos), -log10(ypos), col = lcol, ...); | |
1868 # lines(xpos, ypos, col = lcol, ...); | |
1869 if(length(m$eqtls$pvalue)==0) | |
1870 return(); | |
1871 ypvs = -log10(m$eqtls$pvalue); | |
1872 xpvs = -log10(1:length(ypvs) / ntst); | |
1873 if(length(ypvs) > 1000) { | |
1874 # need to filter a bit, make the plotting faster | |
1875 levels = as.integer( xpvs/xpvs[1] * 1e3); | |
1876 keep = c(TRUE, diff(levels)!=0); | |
1877 levels = as.integer( ypvs/ypvs[1] * 1e3); | |
1878 keep = keep | c(TRUE, diff(levels)!=0); | |
1879 ypvs = ypvs[keep]; | |
1880 xpvs = xpvs[keep]; | |
1881 rm(keep) | |
1882 } | |
1883 points(xpvs, ypvs, col = lcol, pch = pch, cex = cex, ...); | |
1884 } | |
1885 | |
1886 plot.MatrixEQTL = function(x, cex = 0.5, pch = 19, xlim = NULL, ylim = NULL, ...) { | |
1887 if( x$param$pvalue.hist == FALSE ) { | |
1888 warning("Cannot plot p-value distribution: the information was not recorded.\nUse pvalue.hist!=FALSE."); | |
1889 return(invisible()); | |
1890 } | |
1891 if( x$param$pvalue.hist == "qqplot" ) { | |
1892 xmin = 1/max(x$cis$ntests, x$all$ntests); | |
1893 ymax = NULL; | |
1894 if(!is.null(ylim)) { | |
1895 ymax = ylim[2]; | |
1896 } else { | |
1897 ymax = -log10(min( | |
1898 x$cis$eqtls$pvalue[1], x$cis$hist.bins[ c(FALSE,x$cis$hist.counts>0)][1], | |
1899 x$all$eqtls$pvalue[1], x$all$hist.bins[ c(FALSE,x$all$hist.counts>0)][1], | |
1900 x$trans$eqtls$pvalue[1], x$trans$hist.bins[c(FALSE,x$trans$hist.counts>0)][1], | |
1901 na.rm = TRUE))+0.1; | |
1902 } | |
1903 if(ymax == 0) { | |
1904 ymax = -log10(.Machine$double.xmin) | |
1905 } | |
1906 if(!is.null(ymax)) | |
1907 ylim = c(0,ymax); | |
1908 | |
1909 if(is.null(xlim)) | |
1910 xlim = c(0, -log10(xmin/1.5)); | |
1911 | |
1912 plot(numeric(),numeric(), xlab = "-Log10(p-value), theoretical", | |
1913 ylab = "-Log10(p-value), observed", | |
1914 xlim = c(0, -log10(xmin/1.5)), | |
1915 ylim = ylim, | |
1916 xaxs="i", yaxs="i", ...); | |
1917 lines(c(0,1e3), c(0,1e3), col = "gray"); | |
1918 if((x$param$pvOutputThreshold > 0) && (x$param$pvOutputThreshold.cis > 0)) { | |
1919 .qqme( x$cis, "red", cex, pch, ...); | |
1920 .qqme( x$trans, "blue", cex, pch, ...); | |
1921 title(paste("QQ-plot for", | |
1922 formatC(x$cis$ntests, big.mark=",", format = "f", digits = 0), | |
1923 "local and", | |
1924 formatC(x$trans$ntests, big.mark=",", format = "f", digits = 0), | |
1925 "distant gene-SNP p-values")); | |
1926 lset = c(1,2,4); | |
1927 } else | |
1928 if(x$param$pvOutputThreshold.cis > 0) { | |
1929 .qqme(x$cis, "red", cex, pch, ...); | |
1930 title(paste("QQ-plot for", | |
1931 formatC(x$cis$ntests, big.mark=",", format = "f", digits = 0), | |
1932 "local gene-SNP p-values")); | |
1933 lset = c(1,4); | |
1934 } else { | |
1935 .qqme(x$all, "blue", cex, pch, ...); | |
1936 title(paste("QQ-plot for all", | |
1937 formatC(x$all$ntests, big.mark=",", format = "f", digits = 0), | |
1938 "gene-SNP p-values")); | |
1939 lset = c(3,4); | |
1940 } | |
1941 legend("topleft", | |
1942 c("Local p-values","Distant p-values","All p-values","diagonal")[lset], | |
1943 col = c("red","blue","blue","gray")[lset], | |
1944 text.col = c("red","blue","blue","gray")[lset], | |
1945 pch = 20, lwd = 1, pt.cex = c(1,1,1,0)[lset]) | |
1946 } else { | |
1947 if((x$param$pvOutputThreshold > 0) && (x$param$pvOutputThreshold.cis > 0)) { | |
1948 par(mfrow=c(2,1)); | |
1949 .histme(x$cis, "", " local", ...); | |
1950 tran = list(hist.counts = x$all$hist.counts - x$cis$hist.counts, | |
1951 hist.bins = x$all$hist.bins, | |
1952 ntests = x$all$ntests - x$cis$ntests); | |
1953 .histme(x$trans,""," distant", ...); | |
1954 par(mfrow=c(1,1)); | |
1955 } else | |
1956 if(x$param$pvOutputThreshold.cis > 0) { | |
1957 .histme(x$cis, "", " local", ...); | |
1958 } else { | |
1959 .histme(x$all, "all ", "" , ...); | |
1960 } | |
1961 } | |
1962 return(invisible()); | |
1963 } | |
1964 |