comparison MatrixEQTL/R/Matrix_eQTL_engine.R @ 0:cd4c8e4a4b5b draft

Uploaded
author jasonxu
date Fri, 12 Mar 2021 08:12:46 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:cd4c8e4a4b5b
1 # Matrix eQTL by Andrey A. Shabalin
2 # http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/
3
4 # http://cran.r-project.org/web/packages/policies.html
5
6 library(methods)
7
8 modelLINEAR = 117348L;
9 modelANOVA = 47074L;
10 modelLINEAR_CROSS = 1113461L;
11
12 .seq = function(a,b){if(a<=b){a:b}else{integer(0)}};
13
14 SlicedData <- setRefClass( "SlicedData",
15 fields = list(
16 dataEnv = "environment",
17 nSlices1 = "numeric",
18 rowNameSlices = "list",
19 columnNames = "character",
20 fileDelimiter = "character",
21 fileSkipColumns = "numeric",
22 fileSkipRows = "numeric",
23 fileSliceSize = "numeric",
24 fileOmitCharacters = "character"
25 ),
26 methods = list(
27 initialize = function( mat = NULL ) {
28 dataEnv <<- new.env(hash = TRUE, size = 29L);
29 nSlices1 <<- 0L;
30 if(!is.null(mat)) {
31 CreateFromMatrix(mat);
32 }
33 fileSliceSize <<- 1000;
34 fileDelimiter <<- "\t";
35 fileSkipColumns <<- 1L;
36 fileSkipRows <<- 1L;
37 fileOmitCharacters <<- "NA"
38 return(invisible(.self));
39 },
40 CreateFromMatrix = function( mat ) {
41 stopifnot( class(mat) == "matrix" );
42 setSliceRaw( 1L ,mat );
43 rns = rownames( mat, do.NULL = FALSE);
44 #if( is.null(rns) ) {
45 # rns = paste( "Row_",(1:nrow(mat)), sep="" );
46 #}
47 rowNameSlices <<- list(rns);
48 cns = colnames( mat, do.NULL = FALSE );
49 #if( is.null(cns) ){
50 # cns = paste( "Col_",(1:ncol(mat)), sep="" );
51 #}
52 columnNames <<- cns;
53 return(invisible(.self));
54 },
55 getSlice = function(sl) {
56 value = get(paste(sl), dataEnv);
57 if( is.raw(value) ) {
58 storage.mode(value) = "double";
59 value[value == 255] = NA;
60 }
61 return( value )
62 },
63 getSliceRaw = function(sl) {
64 return( get(paste(sl), dataEnv) )
65 },
66 setSliceRaw = function(sl, value) {
67 assign( paste(sl), value, dataEnv )
68 if( nSlices1 < sl ) {
69 nSlices1 <<- sl;
70 }
71 },
72 setSlice = function(sl, value) {
73 if( length(value) > 0 ) {
74 if( all(as.integer(value) == value, na.rm = TRUE) ) {
75 if( (min(value, na.rm = TRUE) >= 0 ) &&
76 (max(value, na.rm = TRUE) < 255) )
77 {
78 nv = value;
79 suppressWarnings({storage.mode(nv) = "raw"});
80 nv[ is.na(value)] = as.raw(255);
81 value = nv;
82 } else {
83 storage.mode(value) = "integer";
84 }
85 }
86 }
87 setSliceRaw(sl, value);
88 },
89 nSlices = function() {
90 return( nSlices1 );
91 },
92 LoadFile = function(filename, skipRows = NULL, skipColumns = NULL, sliceSize = NULL, omitCharacters = NULL, delimiter = NULL, rowNamesColumn = 1) {
93 if( !is.null(skipRows) ) {
94 fileSkipRows <<- skipRows;
95 }
96 if( !is.null(skipColumns) ) {
97 fileSkipColumns <<- skipColumns;
98 }
99 if( !is.null(omitCharacters) ) {
100 fileOmitCharacters <<- omitCharacters;
101 }
102 if( !is.null(sliceSize) ) {
103 fileSliceSize <<- sliceSize;
104 }
105 if( !is.null(delimiter) ) {
106 fileDelimiter <<- delimiter;
107 }
108 stopifnot( (fileSkipColumns == 0) || (rowNamesColumn <= fileSkipColumns) )
109 stopifnot( (fileSkipColumns == 0) || (rowNamesColumn >= 1) )
110
111 fid = file(description = filename, open = "rt", blocking = FALSE, raw = FALSE)
112 # clean object if file is open
113 Clear();
114 lines = readLines(con = fid, n = max(fileSkipRows,1L), ok = TRUE, warn = TRUE)
115 line1 = tail(lines,1);
116 splt = strsplit(line1, split = fileDelimiter, fixed = TRUE);
117 if( fileSkipRows > 0L ) {
118 columnNames <<- splt[[1]]; # [ -(1:fileSkipColumns) ];
119 } else {
120 seek(fid, 0)
121 }
122
123 rm( lines, line1, splt );
124
125 rowNameSlices <<- vector("list", 15);
126
127 curSliceId = 0L;
128 repeat
129 {
130 # preallocate names and data
131 if(length(rowNameSlices) < curSliceId) {
132 rowNameSlices[[2L*curSliceId]] <<- NULL;
133 }
134 curSliceId = curSliceId + 1L;
135
136 # read sliceSize rows
137 rowtag = vector("character",fileSliceSize);
138 rowvals = vector("list",fileSliceSize);
139 for(i in 1:fileSliceSize) {
140 temp = "";
141 if( fileSkipColumns > 0L ) {
142 temp = scan(file = fid, what = character(), n = fileSkipColumns, quiet = TRUE,sep = fileDelimiter);
143 }
144 rowtag[i] = temp[rowNamesColumn];#paste(temp,collapse=" ");
145 rowvals[[i]] = scan(file = fid, what = double(), nlines = 1, quiet = TRUE, sep = fileDelimiter, na.strings = fileOmitCharacters);
146 if( length(rowvals[[i]]) == 0L ) {
147 if(i==1L) {
148 rowtag = matrix(0, 0, 0);
149 rowvals = character(0);
150 } else {
151 rowtag = rowtag[ 1:(i-1) ];
152 rowvals = rowvals[ 1:(i-1) ];
153 }
154 break;
155 }
156 }
157 if( length(rowtag) == 0L ) {
158 curSliceId = curSliceId - 1L;
159 break;
160 }
161 rowNameSlices[[curSliceId]] <<- rowtag;
162 data = c(rowvals, recursive = TRUE);
163 dim(data) = c(length(rowvals[[1]]), length(rowvals));
164 data = t(data);
165 setSlice(curSliceId, data);
166 if( length(rowtag) < fileSliceSize ) {
167 break;
168 }
169 numtxt = formatC(curSliceId*fileSliceSize, big.mark=",", format = "f", digits = 0)
170 cat( "Rows read: ", numtxt, "\n");
171 flush.console()
172 }
173 close(fid)
174 if( fileSkipRows == 0 ) {
175 columnNames <<- paste("Col_", (1:nCols()), sep="");
176 } else {
177 columnNames <<- tail(columnNames, ncol(getSliceRaw(1)));
178 }
179 if( fileSkipColumns == 0 ) {
180 cnt = 0L;
181 for( sl in 1:nSlices() ) {
182 nr = length(getSliceRaw(sl));
183 rowNameSlices[[sl]] <<- paste("Row_",cnt + (1:nr),sep="");
184 cnt = cnt + nr;
185 }
186 }
187 rowNameSlices <<- rowNameSlices[1:curSliceId];
188 cat("Rows read: ", nRows(), " done.\n");
189 return(invisible(.self));
190 },
191 SaveFile = function(filename) {
192 if( nSlices() == 0 ) {
193 cat("No data to save");
194 return();
195 }
196 fid = file(filename,"wt");
197 for( sl in 1:nSlices() ) {
198 z = getSlice(sl);
199 rownames(z) = rowNameSlices[[sl]];
200 colnames(z) = columnNames;
201 write.table(z, file = fid, sep = "\t",
202 col.names = (if(sl == 1){NA}else{FALSE}));
203 }
204 close(fid);
205 },
206 nRows = function() {
207 s = 0L;
208 for(sl in .seq(1,nSlices())) {
209 s = s + nrow(getSliceRaw(sl));
210 }
211 return( s )
212 },
213 nCols = function() {
214 if( nSlices() == 0L ) {
215 return(0L);
216 } else {
217 return( ncol(getSliceRaw(1L)) )
218 }
219 },
220 Clear = function() {
221 for( sl in .seq(1,nSlices()) ) {
222 rm(list = paste(sl), envir = dataEnv)
223 }
224 nSlices1 <<- 0L;
225 rowNameSlices <<- list();
226 columnNames <<- character();
227 return(invisible(.self));
228 },
229 IsCombined = function() {
230 return( nSlices() <= 1L );
231 },
232 GetAllRowNames = function() {
233 return( c(rowNameSlices, recursive=TRUE) );
234 },
235 SetNanRowMean = function() {
236 if( (nCols() == 0L) ) {
237 return(invisible(.self));
238 }
239 for( sl in .seq(1,nSlices()) ) {
240 slice = getSlice(sl);
241 if( any(is.na(slice)) ) {
242 rowmean = rowMeans(slice, na.rm = TRUE);
243 rowmean[is.na(rowmean)] = 0L;
244 for( j in which(!complete.cases(slice)) ) {
245 where1 = is.na(slice[j, ]);
246 slice[j, where1] = rowmean[j];
247 }
248 setSlice(sl, slice);
249 }
250 }
251 return(invisible(.self));
252 },
253 RowStandardizeCentered = function() {
254 for(sl in .seq(1,nSlices()) ) {
255 slice = getSlice(sl);
256 div = sqrt( rowSums(slice^2) );
257 div[ div == 0 ] = 1;
258 setSlice(sl, slice/div);
259 }
260 return(invisible(.self));
261 },
262 CombineInOneSlice = function() {
263 if( nSlices() <= 1L ) {
264 return(invisible(.self));
265 }
266 nc = nCols();
267 nr = nRows();
268 datatypes = c("raw","integer","double");
269 datafuns = c(as.raw, as.integer, as.double);
270 datatype = character(nSlices());
271 for(sl in 1:nSlices()) {
272 datatype[sl] = typeof(getSliceRaw(sl));
273 }
274 mch = max(match(datatype,datatypes,nomatch = length(datatypes)));
275 datafun = datafuns[[mch]];
276 newData = matrix(datafun(0), nrow = nr, ncol = nc);
277 offset = 0;
278 for(sl in 1:nSlices()) {
279 if(mch==1) {
280 slice = getSliceRaw(sl);
281 } else {
282 slice = getSlice(sl);
283 }
284 newData[ offset + (1:nrow(slice)),] = datafun(slice);
285 setSlice(sl, numeric());
286 offset = offset + nrow(slice);
287 }
288
289 nSlices1 <<- 1L;
290 setSliceRaw(1L, newData);
291 rm(newData);
292
293 newrowNameSlices = GetAllRowNames();
294 rowNameSlices <<- list(newrowNameSlices)
295 return(invisible(.self));
296 },
297 ResliceCombined = function(sliceSize = -1) {
298 if( sliceSize > 0L ) {
299 fileSliceSize <<- sliceSize;
300 }
301 if( fileSliceSize <= 0 ) {
302 fileSliceSize <<- 1000;
303 }
304 if( IsCombined() ) {
305 nRows1 = nRows();
306 if(nRows1 == 0L) {
307 return(invisible(.self));
308 }
309 newNSlices = floor( (nRows1 + fileSliceSize - 1)/fileSliceSize );
310 oldData = getSliceRaw(1L);
311 #oldNames = rowNameSlices[[1]];
312 newNameslices = vector("list",newNSlices)
313 for( sl in 1:newNSlices ) {
314 range = (1+(sl-1)*fileSliceSize) : (min(nRows1,sl*fileSliceSize));
315 newpart = oldData[range, ,drop = FALSE];
316 if( is.raw(oldData) ) {
317 setSliceRaw( sl, newpart);
318 } else {
319 setSlice( sl, newpart);
320 }
321 newNameslices[[sl]] = rowNameSlices[[1]][range];
322 }
323 rowNameSlices <<- newNameslices ;
324 } else {
325 stop("Reslice of a sliced matrix is not supported yet. Use CombineInOneSlice first.");
326 }
327 return(invisible(.self));
328 },
329 Clone = function() {
330 clone = SlicedData$new();
331 for(sl in .seq(1,nSlices()) ) {
332 clone$setSliceRaw(sl,getSliceRaw(sl));
333 }
334 clone$rowNameSlices = rowNameSlices;
335 clone$columnNames = columnNames;
336 clone$fileDelimiter = fileDelimiter;
337 clone$fileSkipColumns = fileSkipColumns;
338 clone$fileSkipRows = fileSkipRows;
339 clone$fileSliceSize = fileSliceSize;
340 clone$fileOmitCharacters = fileOmitCharacters;
341 return( clone );
342 },
343 RowMatrixMultiply = function(multiplier) {
344 for(sl in .seq(1,nSlices()) ) {
345 setSlice(sl, getSlice(sl) %*% multiplier);
346 }
347 return(invisible(.self));
348 },
349 ColumnSubsample = function(subset) {
350 for(sl in .seq(1,nSlices()) ) {
351 setSliceRaw(sl, getSliceRaw(sl)[ ,subset, drop = FALSE]);
352 }
353 columnNames <<- columnNames[subset];
354 return(invisible(.self));
355 },
356 RowReorderSimple = function(ordr) {
357 # had to use an inefficient and dirty method
358 # due to horrible memory management in R
359 if( (typeof(ordr) == "logical") && all(ordr) ) {
360 return(invisible(.self));
361 }
362 if( (length(ordr) == nRows()) && all(ordr == (1:length(ordr))) ) {
363 return(invisible(.self));
364 }
365 CombineInOneSlice();
366 gc();
367 setSliceRaw( 1L, getSliceRaw(1L)[ordr, ] );
368 rowNameSlices[[1]] <<- rowNameSlices[[1]][ordr];
369 gc();
370 ResliceCombined();
371 gc();
372 return(invisible(.self));
373 },
374 RowReorder = function(ordr) {
375 # transform logical into indices
376 if( typeof(ordr) == "logical" ) {
377 if( length(ordr) == nRows() ) {
378 ordr = which(ordr);
379 } else {
380 stop("Parameter \"ordr\" has wrong length")
381 }
382 }
383 ## first, check that anything has to be done at all
384 if( (length(ordr) == nRows()) && all(ordr == (1:length(ordr))) ) {
385 return(invisible(.self));
386 }
387 ## check bounds
388 #if( (min(ordr) < 1) || (max(ordr) > nRows()) ) {
389 # stop("Parameter \"ordr\" is out of bounds");
390 #}
391 ## slice the data into individual rows
392 all_rows = vector("list", nSlices())
393 for( i in 1:nSlices() ) {
394 slice = getSliceRaw(i)
395 all_rows[[i]] = split(slice, 1:nrow(slice))
396 setSliceRaw(i,numeric())
397 }
398 gc();
399 all_rows = unlist(all_rows, recursive=FALSE, use.names = FALSE);
400 ## Reorder the rows
401 all_rows = all_rows[ordr];
402 ## get row names
403 all_names = GetAllRowNames();
404 ## erase the set
405 rowNameSlices <<- list();
406 ## sort names
407 all_names = all_names[ordr];
408 ##
409 ## Make slices back
410 nrows = length(all_rows);
411 nSlices1 <<- as.integer((nrows+fileSliceSize-1)/fileSliceSize);
412 ##cat(nrows, " ", nSlices1);
413 rowNameSlices1 = vector("list", nSlices1);
414 for( i in 1:nSlices1 ) {
415 fr = 1 + fileSliceSize*(i-1);
416 to = min( fileSliceSize*i, nrows);
417
418 subset = all_rows[fr:to];
419 types = unlist(lapply(subset,typeof));
420 israw = (types == "raw")
421 if(!all(israw == israw[1])) {
422 # some raw and some are not
423 subset = lapply(subset, function(x){if(is.raw(x)){x=as.integer(x);x[x==255] = NA;return(x)}else{return(x)}});
424 }
425 subset = unlist(subset);
426 dim(subset) = c( length(all_rows[[fr]]) , to - fr + 1)
427 #subset = matrix(subset, ncol = (to-fr+1));
428 if(is.raw(subset)) {
429 setSliceRaw(i, t(subset));
430 } else {
431 setSlice(i, t(subset));
432 }
433 rowNameSlices1[[i]] = all_names[fr:to];
434 all_rows[fr:to] = 0;
435 all_names[fr:to] = 0;
436 }
437 rowNameSlices <<- rowNameSlices1;
438 gc();
439 return(invisible(.self));
440 },
441 RowRemoveZeroEps = function(){
442 for(sl in .seq(1,nSlices()) ) {
443 slice = getSlice(sl);
444 amean = rowMeans(abs(slice));
445 remove = (amean < .Machine$double.eps*nCols());
446 if(any(remove)) {
447 rowNameSlices[[sl]] <<- rowNameSlices[[sl]][!remove];
448 setSlice(sl, slice[!remove, , drop = FALSE]);
449 }
450 }
451 return(invisible(.self));
452 },
453 FindRow = function(rowname) {
454 for(sl in .seq(1,nSlices()) ) {
455 mch = match(rowname,rowNameSlices[[sl]], nomatch = 0);
456 if( mch > 0 )
457 {
458 row = getSlice(sl)[mch[1], , drop=FALSE];
459 rownames(row) = rowname;
460 colnames(row) = columnNames;
461 return( list(slice = sl, item = mch, row = row) );
462 }
463 }
464 return( NULL );
465 },
466 show = function() {
467 cat("SlicedData object. For more information type: ?SlicedData\n");
468 cat("Number of columns:", nCols(), "\n");
469 cat("Number of rows:", nRows(), "\n");
470 cat("Data is stored in", nSlices(), "slices\n");
471 if(nCols()>0) {
472 z = getSlice(1L);
473 if(nrow(z)>0) {
474 z = z[1:min(nrow(z),10L), 1:min(ncol(z),10L), drop = FALSE];
475 rownames(z) = rowNameSlices[[1]][1:nrow(z)];
476 colnames(z) = columnNames[1:ncol(z)];
477 cat("Top left corner of the first slice (up to 10x10):\n");
478 methods:::show(z)
479 }
480 }
481 }
482 ))
483
484 setGeneric("nrow")
485 setMethod("nrow", "SlicedData", function(x) {
486 return( x$nRows() );
487 })
488 setGeneric("NROW")
489 setMethod("NROW", "SlicedData", function(x) {
490 return( x$nRows() );
491 })
492 setGeneric("ncol")
493 setMethod("ncol", "SlicedData", function(x) {
494 return( x$nCols() );
495 })
496 setGeneric("NCOL")
497 setMethod("NCOL", "SlicedData", function(x) {
498 return( x$nCols() );
499 })
500 setGeneric("dim")
501 setMethod("dim", "SlicedData", function(x) {
502 return( c(x$nRows(),x$nCols()) );
503 })
504 setGeneric("colnames")
505 setMethod("colnames", "SlicedData", function(x) {
506 return( x$columnNames );
507 })
508 setGeneric("rownames")
509 setMethod("rownames", "SlicedData", function(x) {
510 return( x$GetAllRowNames() );
511 })
512 setMethod("[[", "SlicedData", function(x,i) {
513 return( x$getSlice(i) );
514 })
515 setGeneric("length")
516 setMethod("length", "SlicedData", function(x) {
517 return( x$nSlices() );
518 })
519 setMethod("[[<-", "SlicedData", function(x,i,value) {
520 x$setSlice(i, value);
521 return(x);
522 })
523 summary.SlicedData = function(object, ...) {
524 z = c(nCols = object$nCols(), nRows = object$nRows(), nSlices = object$nSlices());
525 return(z);
526 }
527
528 ##### setGeneric("summary") #####
529 #setMethod("summary", "SlicedData", function(object, ...) {
530 # z = c(nCols = object$nCols(), nRows = object$nRows(), nSlices = object$nSlices());
531 # return(z);
532 # })
533 #setGeneric("show", standardGeneric("show"))
534 # setMethod("show", "SlicedData", function(object) {
535 # cat("SlicedData object. For more information type: ?SlicedData\n");
536 # cat("Number of columns:", object$nCols(), "\n");
537 # cat("Number of rows:", object$nRows(), "\n");
538 # cat("Data is stored in", object$nSlices(), "slices\n");
539 # if(object$nSlices()>0) {
540 # z = object$getSlice(1);
541 # if(nrow(z)>0) {
542 # z = z[1:min(nrow(z),10), 1:min(ncol(z),10), drop = FALSE];
543 # rownames(z) = object$rowNameSlices[[1]][1:nrow(z)];
544 # colnames(z) = object$columnNames[1:ncol(z)];
545 # cat("Top left corner of the first slice (up to 10x10):\n");
546 # show(z)
547 # }
548 # }
549 # })
550
551 setGeneric("as.matrix")
552 setMethod("as.matrix", "SlicedData", function(x) {
553 if(x$nSlices() == 0) {
554 return( matrix(0,0,0) );
555 }
556 if(x$nSlices() > 1) {
557 copy = x$Clone();
558 copy$CombineInOneSlice();
559 } else {
560 copy = x;
561 }
562 mat = copy$getSlice(1L);
563 rownames(mat) = rownames(copy);
564 colnames(mat) = colnames(copy);
565 return( mat );
566 })
567 setGeneric("colnames<-")
568 setMethod("colnames<-", "SlicedData", function(x,value) {
569 stopifnot( class(value) == "character" );
570 stopifnot( length(value) == x$nCols() );
571 x$columnNames = value;
572 return(x);
573 })
574 setGeneric("rownames<-")
575 setMethod("rownames<-", "SlicedData", function(x,value) {
576 stopifnot( class(value) == "character" );
577 stopifnot( length(value) == x$nRows() );
578 start = 1;
579 newNameSlices = vector("list", x$nSlices());
580 for( i in .seq(1,x$nSlices()) ) {
581 nr = nrow(x$getSliceRaw(i));
582 newNameSlices[[i]] = value[ start:(start+nr-1) ];
583 start = start + nr;
584 }
585 x$rowNameSlices = newNameSlices;
586 return(x);
587 })
588 setGeneric("rowSums")
589 setMethod("rowSums", "SlicedData", function(x, na.rm = FALSE, dims = 1L) {
590 if(x$nSlices() == 0) {
591 return( numeric() );
592 }
593 stopifnot( dims == 1 );
594 thesum = vector("list", x$nSlices());
595 for( i in 1:x$nSlices() ) {
596 thesum[[i]] = rowSums(x$getSlice(i), na.rm)
597 }
598 return(unlist(thesum, recursive = FALSE, use.names = FALSE));
599 })
600 setGeneric("rowMeans")
601 setMethod("rowMeans", "SlicedData", function(x, na.rm = FALSE, dims = 1L) {
602 if(x$nSlices() == 0) {
603 return( numeric() );
604 }
605 stopifnot( dims == 1 );
606 thesum = vector("list", x$nSlices());
607 for( i in 1:x$nSlices() ) {
608 thesum[[i]] = rowMeans(x$getSlice(i), na.rm)
609 }
610 return(unlist(thesum, recursive = FALSE, use.names = FALSE));
611 })
612 setGeneric("colSums")
613 setMethod("colSums", "SlicedData", function(x, na.rm = FALSE, dims = 1L) {
614 if(x$nCols() == 0) {
615 return( numeric() );
616 }
617 stopifnot( dims == 1 );
618 thesum = 0;
619 for( i in .seq(1,x$nSlices()) ) {
620 thesum = thesum + colSums(x$getSlice(i), na.rm)
621 }
622 return(thesum);
623 })
624 setGeneric("colMeans")
625 setMethod("colMeans", "SlicedData", function(x, na.rm = FALSE, dims = 1L) {
626 if(x$nCols() == 0) {
627 return( numeric() );
628 }
629 stopifnot( dims == 1 );
630 thesum = 0;
631 thecounts = x$nRows();
632 for( i in .seq(1,x$nSlices()) ) {
633 slice = x$getSlice(i);
634 thesum = thesum + colSums(slice, na.rm)
635 if( na.rm ) {
636 thecounts = thecounts - colSums(is.na(slice))
637 }
638 }
639 return(thesum/thecounts);
640 })
641
642 .listBuilder <- setRefClass(".listBuilder",
643 fields = list(
644 dataEnv = "environment",
645 n = "numeric"
646 ),
647 methods = list(
648 initialize = function() {
649 dataEnv <<- new.env(hash = TRUE);
650 n <<- 0;
651 # cumlength <<- 0;
652 return(.self);
653 },
654 add = function(x) {
655 if(length(x) > 0) {
656 n <<- n + 1;
657 # cumlength <<- cumlength + length(x);
658 assign(paste(n), x, dataEnv );
659 }
660 return(.self);
661 },
662 set = function(i,x) {
663 if(length(x) > 0) {
664 if(i>n)
665 n <<- i;
666 assign(paste(i), x, dataEnv );
667 }
668 return(.self);
669 },
670 get = function(i) {
671 return(base::get(paste(i),dataEnv));
672 },
673 list = function() {
674 if(n==0) return(list());
675 result = vector('list',n);
676 for( i in 1:n) {
677 result[[i]] = .self$get(i);
678 }
679 return(result);
680 },
681 unlist = function() {
682 return(base::unlist(.self$list(), recursive=FALSE, use.names = FALSE));
683 },
684 show = function() {
685 cat(".listBuilder object.\nIternal object in MatrixEQTL package.\n");
686 cat("Number of elements:", object$n, "\n");
687 }
688 ))
689
690 .histogrammer <- setRefClass(".histogrammer",
691 fields = list(
692 pvbins1 = "numeric",
693 statbins1 = "numeric",
694 hist.count = "numeric"
695 ),
696 methods = list(
697 initialize = function (pvbins, statbins) {
698 if(length(pvbins)) {
699 ord = order(statbins);
700 pvbins1 <<- pvbins[ord];
701 statbins1 <<- statbins[ord];
702 hist.count <<- double(length(pvbins)-1);
703 }
704 return(.self);
705 },
706 update = function(stats.for.hist) {
707 h = hist(stats.for.hist, breaks = statbins1, include.lowest = TRUE, right = TRUE, plot = FALSE)$counts;
708 hist.count <<- hist.count + h;
709 },
710 getResults = function() {
711 if(!is.unsorted(pvbins1)) {
712 return(list(hist.bins = pvbins1 , hist.counts = hist.count ));
713 } else {
714 return(list(hist.bins = rev(pvbins1), hist.counts = rev(hist.count)));
715 }
716 }
717 ))
718
719
720 .minpvalue <- setRefClass(".minpvalue",
721 fields = list(
722 sdata = ".listBuilder",
723 gdata = ".listBuilder"
724 ),
725 methods = list(
726 initialize = function(snps, gene) {
727 sdata <<- .listBuilder$new();
728 for( ss in 1:snps$nSlices() ) {
729 sdata$set( ss, double(nrow(snps$getSliceRaw(ss))));
730 }
731 gdata <<- .listBuilder$new();
732 for( gg in 1:gene$nSlices() ) {
733 gdata$set( gg, double(nrow(gene$getSliceRaw(gg))));
734 }
735 return(.self);
736 },
737 update = function(ss, gg, astat) {
738 gmax = gdata$get(gg)
739 z1 = max.col(astat,ties.method='first');
740 z11 = astat[1:nrow(astat) + nrow(astat) * (z1 - 1)];
741 gmax = pmax(gmax, z11);
742 gdata$set(gg, gmax);
743
744 smax = sdata$get(ss)
745 z22 = apply(astat,2,max);
746 smax = pmax(smax, z22);
747 sdata$set(ss, smax);
748 return(.self);
749 },
750 updatecis = function(ss, gg, select.cis, astat) {
751 if(length(astat)>0)
752 {
753 byrows = aggregate(x=astat, by=list(row=select.cis[,1]), FUN=max);
754 bycols = aggregate(x=astat, by=list(col=select.cis[,2]), FUN=max);
755
756 gmax = gdata$get(gg);
757 gmax[byrows$row] = pmax(gmax[byrows$row], byrows$x)
758 gdata$set(gg, gmax);
759
760 smax = sdata$get(ss)
761 smax[bycols$col] = pmax(smax[bycols$col], bycols$x)
762 sdata$set(ss, smax);
763 }
764 return(.self);
765 },
766 getResults = function(snps, gene, pvfun) {
767 min.pv.snps = pvfun(sdata$unlist());
768 names(min.pv.snps) = rownames(snps);
769 min.pv.gene = pvfun(gdata$unlist());
770 names(min.pv.gene) = rownames(gene);
771 return(list(min.pv.snps = min.pv.snps, min.pv.gene = min.pv.gene));
772 }
773 ))
774
775 .OutputSaver_FRD <- setRefClass(".OutputSaver_FRD",
776 fields = list(
777 sdata = ".listBuilder",
778 gdata = ".listBuilder",
779 cdata = ".listBuilder",
780 bdata = ".listBuilder",
781 fid = "list",
782 testfun1 = "list",
783 pvfun1 = "list"
784 ),
785 methods = list(
786 initialize = function () {
787 sdata <<- .listBuilder$new();
788 gdata <<- .listBuilder$new();
789 cdata <<- .listBuilder$new();
790 bdata <<- .listBuilder$new();
791 fid <<- list(0);
792 testfun1 <<- list(0);
793 pvfun1 <<- list(0);
794 return(.self);
795 },
796 start = function(filename, statistic_name, unused1, unused2, testfun, pvfun) {
797 testfun1 <<- list(testfun);
798 pvfun1 <<- list(pvfun);
799 if(length(filename) > 0) {
800 if(class(filename) == "character") {
801 fid <<- list(file(description = filename, open = "wt", blocking = FALSE, raw = FALSE), TRUE);
802 } else {
803 fid <<- list(filename, FALSE)
804 }
805 writeLines( paste("SNP\tgene\t",statistic_name,"\tp-value\tFDR", sep = ""), fid[[1]]);
806 } else {
807 fid <<- list();
808 }
809 },
810 update = function(spos, gpos, sta, beta = NULL) {
811 if(length(sta)>0) {
812 sdata$add(spos);
813 gdata$add(gpos);
814 cdata$add(sta );
815 if(!is.null(beta ))
816 bdata$add(beta );
817 }
818 return(.self);
819 },
820 getResults = function( gene, snps, FDR_total_count) {
821 pvalues = NULL;
822 if(cdata$n > 0) {
823 tests = testfun1[[1]](cdata$unlist());
824 cdata <<- .listBuilder$new();
825
826 pvalues = pvfun1[[1]](tests);
827 ord = order(pvalues);
828
829 tests = tests[ord];
830 pvalues = pvalues[ord];
831
832 FDR = pvalues * FDR_total_count / (1:length(pvalues));
833 FDR[length(FDR)] = min(FDR[length(FDR)], 1);
834 FDR = rev(cummin(rev(FDR)));
835
836 snps_names = snps$GetAllRowNames()[sdata$unlist()[ord]];
837 sdata <<- .listBuilder$new();
838 gene_names = gene$GetAllRowNames()[gdata$unlist()[ord]];
839 gdata <<- .listBuilder$new();
840
841 beta = NULL;
842 if(bdata$n > 0)
843 beta = bdata$unlist()[ord];
844
845 if(length(fid)>0) {
846 step = 1000; ########### 100000
847 for( part in 1:ceiling(length(FDR)/step) ) {
848 fr = (part-1)*step + 1;
849 to = min(part*step, length(FDR));
850 dump = data.frame(snps_names[fr:to],
851 gene_names[fr:to],
852 if(is.null(beta)) tests[fr:to] else list(beta[fr:to],tests[fr:to]),
853 pvalues[fr:to],
854 FDR[fr:to],
855 row.names = NULL,
856 check.rows = FALSE,
857 check.names = FALSE,
858 stringsAsFactors = FALSE);
859 write.table(dump, file = fid[[1]], quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE);
860 }
861 }
862 } else {
863 cat("No significant associations were found.\n", file = if(length(fid)>0){fid[[1]]}else{""});
864 }
865 if(length(fid)>0) {
866 if(fid[[2]]) {
867 close(fid[[1]]);
868 }
869 fid <<- list();
870 }
871
872 if(!is.null(pvalues)) {
873 eqtls = list( snps = snps_names,
874 gene = gene_names,
875 statistic = tests,
876 pvalue = pvalues,
877 FDR = FDR);
878 if(!is.null(beta))
879 eqtls$beta = beta;
880 } else {
881 eqtls = list( snps = character(),
882 gene = character(),
883 beta = numeric(),
884 statistic = numeric(),
885 pvalue = numeric(),
886 FDR = numeric());
887 }
888 return(list(eqtls = data.frame(eqtls)));
889 }
890 )
891 )
892
893
894 .OutputSaver_direct <- setRefClass(".OutputSaver_direct",
895 fields = list(
896 gene_names = "character",
897 snps_names = "character",
898 fid = "list",
899 testfun1 = "list",
900 pvfun1 = "list"
901 ),
902 methods = list(
903 initialize = function() {
904 gene_names <<- character(0);
905 snps_names <<- character(0);
906 fid <<- list(0);
907 testfun1 <<- list(0);
908 pvfun1 <<- list(0);
909 return(.self);
910 },
911 start = function(filename, statistic_name, snps, gene, testfun, pvfun) {
912 # I hope the program stops if it fails to open the file
913 if(class(filename) == "character") {
914 fid <<- list(file(description = filename, open = "wt", blocking = FALSE, raw = FALSE), TRUE);
915 } else {
916 fid <<- list(filename, FALSE)
917 }
918 writeLines(paste("SNP\tgene\t", statistic_name, "\tp-value", sep = ""), fid[[1]]);
919 gene_names <<- gene$GetAllRowNames();
920 snps_names <<- snps$GetAllRowNames();
921 testfun1 <<- list(testfun);
922 pvfun1 <<- list(pvfun);
923 },
924 update = function(spos, gpos, sta, beta = NULL) {
925 if( length(sta) == 0 )
926 return();
927 sta = testfun1[[1]](sta);
928 lst = list(snps = snps_names[spos], gene = gene_names[gpos], beta = beta, statistic = sta, pvalue = pvfun1[[1]](sta));
929 lst$beta = lst$beta;
930
931 dump2 = data.frame(lst, row.names = NULL, check.rows = FALSE, check.names = FALSE, stringsAsFactors = FALSE);
932 write.table(dump2, file = fid[[1]], quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE);
933 },
934 getResults = function(...) {
935 if(length(fid)>0) {
936 if(fid[[2]]) {
937 close(fid[[1]]);
938 }
939 fid <<- list();
940 }
941 gene_names <<- character(0);
942 snps_names <<- character(0);
943 return(list());
944 }
945 )
946 )
947
948 .my.pmin = function(x, val) {
949 # minimum "pmin" function that can handle empty array
950 if(length(x) == 0) {
951 return(x)
952 } else {
953 return(pmin.int(x,val));
954 }
955 }
956
957 .my.pmax = function(x, val) {
958 # minimum "pmin" function that can handle empty array
959 if(length(x) == 0) {
960 return(x)
961 } else {
962 return(pmax.int(x,val));
963 }
964 }
965
966 .pv.nz = function(x){return( .my.pmax(x,.Machine$double.xmin) )}
967
968 .SetNanRowMean = function(x) {
969 if( any(is.na(x)) ) {
970 rowmean = rowMeans(x, na.rm = TRUE);
971 rowmean[ is.na(rowmean) ] = 0;
972 for( j in which(!complete.cases(x)) ) {
973 where1 = is.na( x[j, ] );
974 x[j,where1] = rowmean[j];
975 }
976 }
977 return(x);
978 }
979
980 # .SNP_process_split_for_ANOVA = function(x) {
981 # # split into 2 dummy variables
982 #
983 # uniq = unique(c(x));
984 # uniq = uniq[!is.na(uniq)];
985 #
986 # if( length(uniq) > 3 ) {
987 # stop("More than three genotype categories is not handled by ANOVA");
988 # } else if ( length(uniq) < 3 ) {
989 # uniq = c(uniq, min(uniq)-(1:(3-length(uniq))));
990 # }
991 #
992 # freq = matrix(0, nrow(x), length(uniq));
993 # for(i in 1:length(uniq)) {
994 # freq[ ,i] = rowSums(x==uniq[i], na.rm = TRUE);
995 # }
996 #
997 # md = apply(freq, 1, which.max);
998 # freq[ cbind(1:nrow(x),md) ] = -1;
999 #
1000 # md = apply(freq, 1, which.max); # min(freq[cbind(1:nrow(slice),md)] - rowSums(select,na.rm = TRUE ))
1001 # new_slice1 = (x == uniq[md]);
1002 # new_slice1[is.na(new_slice1)] = 0;
1003 # freq[ cbind(1:nrow(x),md) ] = -1;
1004 #
1005 # md = apply(freq,1,which.max);
1006 # new_slice2 = (x == uniq[md]);
1007 # new_slice2[ is.na(new_slice2) ] = 0;
1008 # rez = vector("list", 2);
1009 # rez[[1]] = new_slice1;
1010 # rez[[2]] = new_slice2;
1011 # return( rez );
1012 # }
1013
1014 .SNP_process_split_for_ANOVA = function(x,n.groups) {
1015 # split into 2 dummy variables (or more)
1016
1017 # # Get the number of ANOVA groups
1018 # n.groups = options('MatrixEQTL.ANOVA.categories')[[1]];
1019 # if( is.null(n.groups))
1020 # n.groups = 3;
1021
1022 # Unique values in x (make sure it has length of n.groups);
1023 uniq = unique(as.vector(x));
1024 uniq = uniq[!is.na(uniq)];
1025 if( length(uniq) > n.groups ) {
1026 stop("More than declared number of genotype categories is detected by ANOVA");
1027 } else if ( length(uniq) < n.groups ) {
1028 uniq = c(uniq, rep(min(uniq)-1, n.groups-length(uniq)));
1029 }
1030
1031 # Table of frequencies for each variable (row)
1032 freq = matrix(0, nrow(x), n.groups);
1033 for(i in 1:n.groups) {
1034 freq[ ,i] = rowSums(x==uniq[i], na.rm = TRUE);
1035 }
1036 # remove NA's from x for convenience
1037 x[is.na(x)] = min(uniq)-2;
1038
1039 # Output list of matrices
1040 rez = vector('list',n.groups-1);
1041
1042 # Skip the most frequent value
1043 md = apply(freq, 1, which.max); # most frequent value for each variable
1044 freq[ cbind(1:nrow(x),md) ] = -1;
1045
1046 # The rest form dumm
1047 for(j in 1:(n.groups-1)){
1048 md = apply(freq, 1, which.max);
1049 freq[ cbind(1:nrow(x),md) ] = -1;
1050 rez[[j]] = (x == uniq[md]);
1051 }
1052 return( rez );
1053 }
1054
1055 Matrix_eQTL_engine = function(
1056 snps,
1057 gene,
1058 cvrt = SlicedData$new(),
1059 output_file_name,
1060 pvOutputThreshold = 1e-5,
1061 useModel = modelLINEAR,
1062 errorCovariance = numeric(),
1063 verbose = TRUE,
1064 pvalue.hist = FALSE,
1065 min.pv.by.genesnp = FALSE,
1066 noFDRsaveMemory = FALSE) {
1067 rez = Matrix_eQTL_main(
1068 snps = snps,
1069 gene = gene,
1070 cvrt = cvrt,
1071 output_file_name = output_file_name,
1072 pvOutputThreshold = pvOutputThreshold,
1073 useModel = useModel,
1074 errorCovariance = errorCovariance,
1075 verbose = verbose,
1076 pvalue.hist = pvalue.hist,
1077 min.pv.by.genesnp = min.pv.by.genesnp,
1078 noFDRsaveMemory = noFDRsaveMemory);
1079 return( rez );
1080 }
1081
1082 Matrix_eQTL_main = function(
1083 snps,
1084 gene,
1085 cvrt = SlicedData$new(),
1086 output_file_name = "",
1087 pvOutputThreshold = 1e-5,
1088 useModel = modelLINEAR,
1089 errorCovariance = numeric(),
1090 verbose = TRUE,
1091 output_file_name.cis = "",
1092 pvOutputThreshold.cis = 0,
1093 snpspos = NULL,
1094 genepos = NULL,
1095 cisDist = 1e6,
1096 pvalue.hist = FALSE,
1097 min.pv.by.genesnp = FALSE,
1098 noFDRsaveMemory = FALSE) {
1099 ################################# Basic variable checks #################################
1100 {
1101 # status("Performing basic checks of the input variables");
1102 stopifnot( "SlicedData" %in% class(gene) );
1103 stopifnot( "SlicedData" %in% class(snps) );
1104 stopifnot( "SlicedData" %in% class(cvrt) );
1105
1106 # Check dimensions
1107 if( min(snps$nRows(),snps$nCols()) == 0 )
1108 stop("Empty genotype dataset");
1109 if( min(gene$nRows(),gene$nCols()) == 0 )
1110 stop("Empty expression dataset");
1111 if( snps$nCols() != gene$nCols() )
1112 stop("Different number of samples in the genotype and gene expression files");
1113 if( cvrt$nRows()>0 ) {
1114 if( snps$nCols() != cvrt$nCols() )
1115 stop("Wrong number of samples in the matrix of covariates");
1116 }
1117
1118 stopifnot( class(pvOutputThreshold) == "numeric" );
1119 stopifnot( length(pvOutputThreshold) == 1 );
1120 stopifnot( pvOutputThreshold >= 0 );
1121 stopifnot( pvOutputThreshold <= 1 );
1122
1123 stopifnot( class(noFDRsaveMemory) == "logical" );
1124 stopifnot( length(noFDRsaveMemory) == 1 );
1125
1126 if( pvOutputThreshold > 0 ) {
1127 stopifnot( !((length(output_file_name) == 0) && noFDRsaveMemory) )
1128 stopifnot( length(output_file_name) <= 1 );
1129 if( length(output_file_name) == 1 ) {
1130 stopifnot( class(output_file_name) %in% c("character","connection") );
1131 }
1132 }
1133
1134 stopifnot( class(pvOutputThreshold.cis) == "numeric" );
1135 stopifnot( length(pvOutputThreshold.cis) == 1 );
1136 stopifnot( pvOutputThreshold.cis >= 0 );
1137 stopifnot( pvOutputThreshold.cis <= 1 );
1138 stopifnot( !((pvOutputThreshold > 0) & (pvOutputThreshold.cis > 0) & (pvOutputThreshold > pvOutputThreshold.cis)) );
1139 stopifnot( (pvOutputThreshold > 0) | (pvOutputThreshold.cis > 0) );
1140
1141 stopifnot( class(useModel) == class(modelLINEAR) );
1142 stopifnot( length(useModel) == 1 );
1143 stopifnot( useModel %in% c(modelLINEAR, modelANOVA, modelLINEAR_CROSS) );
1144 if( useModel %in% c(modelLINEAR, modelLINEAR_CROSS) ) {
1145 if( snps$nCols() <= cvrt$nRows() + 1 + 1) {
1146 stop('The number of covariates exceeds the number of samples.\nLinear regression can not be fit.')
1147 }
1148 }
1149 if( useModel == modelLINEAR_CROSS ) {
1150 if( cvrt$nRows() == 0 ) {
1151 stop( "Model \"modelLINEAR_CROSS\" requires at least one covariate" );
1152 }
1153 }
1154 if( useModel == modelANOVA ) {
1155 n.anova.groups = getOption('MatrixEQTL.ANOVA.categories', 3);
1156 stopifnot( n.anova.groups == floor(n.anova.groups) );
1157 stopifnot( n.anova.groups >= 3 );
1158 # stopifnot( n.anova.groups < snps$nCols() - cvrt$nRows() - 2 );
1159 if( snps$nCols() <= cvrt$nRows() + n.anova.groups) {
1160 stop('The number of covariates exceeds the number of samples.\nLinear regression (ANOVA) can not be fit.')
1161 }
1162 }
1163
1164 stopifnot( class(verbose) == "logical" );
1165 stopifnot( length(verbose) == 1 );
1166
1167 stopifnot( class(min.pv.by.genesnp) == "logical" );
1168 stopifnot( length(min.pv.by.genesnp) == 1 );
1169
1170 if( pvOutputThreshold.cis > 0 ) {
1171 stopifnot( !((length(output_file_name.cis) == 0) && noFDRsaveMemory) )
1172 stopifnot( length(output_file_name.cis) <= 1 );
1173 if( length(output_file_name.cis) == 1 ) {
1174 stopifnot( class(output_file_name.cis) %in% c("character","connection") );
1175 }
1176
1177 # stopifnot( class(output_file_name.cis) == "character" );
1178 # stopifnot( length(output_file_name.cis) == 1 );
1179 stopifnot( class(snpspos) == "data.frame" );
1180 stopifnot( ncol(snpspos) == 3 );
1181 stopifnot( nrow(snpspos) > 0 );
1182 stopifnot( class(snpspos[1,3]) %in% c("integer", "numeric") )
1183 stopifnot( !any(is.na(snpspos[,3])) )
1184 stopifnot( class(genepos) == "data.frame" );
1185 stopifnot( ncol(genepos) == 4 );
1186 stopifnot( nrow(genepos) > 0 );
1187 stopifnot( class(genepos[1,3]) %in% c("integer", "numeric") )
1188 stopifnot( class(genepos[1,4]) %in% c("integer", "numeric") )
1189 stopifnot( !any(is.na(genepos[,3])) )
1190 stopifnot( !any(is.na(genepos[,4])) )
1191 stopifnot( nzchar(output_file_name.cis) )
1192 }
1193
1194 if( pvOutputThreshold > 0 ) {
1195 stopifnot( nzchar(output_file_name) )
1196 }
1197
1198 stopifnot( class(errorCovariance) %in% c("numeric", "matrix") );
1199 errorCovariance = as.matrix(errorCovariance);
1200 if(length(errorCovariance)>0) {
1201 if( nrow(errorCovariance) != ncol(errorCovariance) ) {
1202 stop("The covariance matrix is not square");
1203 }
1204 if( nrow(errorCovariance) != snps$nCols() ) {
1205 stop("The covariance matrix size does not match the number of samples");
1206 }
1207 if( !all(errorCovariance == t(errorCovariance)) ) {
1208 stop("The covariance matrix is not symmetric");
1209 }
1210 }
1211 }
1212 ################################# Initial setup #########################################
1213 {
1214 gene.std = .listBuilder$new();
1215 snps.std = .listBuilder$new();
1216
1217 dont.clone.gene = getOption('MatrixEQTL.dont.preserve.gene.object', FALSE)
1218 if(is.null(dont.clone.gene))
1219 dont.clone.gene = FALSE;
1220
1221 if( !dont.clone.gene )
1222 gene = gene$Clone();
1223 # snps = snps$Clone(); # snps is read only
1224 cvrt = cvrt$Clone();
1225
1226 params = list(
1227 output_file_name = output_file_name,
1228 pvOutputThreshold = pvOutputThreshold,
1229 useModel = useModel,
1230 errorCovariance = errorCovariance ,
1231 verbose = verbose,
1232 output_file_name.cis = output_file_name.cis,
1233 pvOutputThreshold.cis = pvOutputThreshold.cis,
1234 cisDist = cisDist ,
1235 pvalue.hist = pvalue.hist,
1236 min.pv.by.genesnp = min.pv.by.genesnp);
1237
1238 if( verbose ) {
1239 lastTime = 0;
1240 status <- function(text) {
1241 # gc();
1242 newTime = proc.time()[3];
1243 if(lastTime != 0) {
1244 cat("Task finished in ", newTime-lastTime, " seconds\n");
1245 }
1246 cat(text,"\n");
1247 lastTime <<- newTime;
1248 unused = flush.console();
1249 }
1250 } else {
1251 status = function(text){}
1252 }
1253 start.time = proc.time()[3];
1254 }
1255 ################################# Error covariance matrix processing ####################
1256 {
1257 if( length(errorCovariance) > 0 ) {
1258 status("Processing the error covariance matrix");
1259 eig = eigen(errorCovariance, symmetric = TRUE)
1260 d = eig$values;
1261 v = eig$vectors;
1262 # errorCovariance == v %*% diag(d) %*% t(v)
1263 # errorCovariance^0.5 == v*sqrt(d)*v" (no single quotes anymore)
1264 # errorCovariance^(-0.5) == v*diag(1./sqrt(diag(d)))*v"
1265 if( any(d<=0) ) {
1266 stop("The covariance matrix is not positive definite");
1267 }
1268 correctionMatrix = v %*% diag(1./sqrt(d)) %*% t(v);
1269 rm( eig, v, d, errorCovariance )
1270 } else {
1271 rm( errorCovariance );
1272 correctionMatrix = numeric();
1273 }
1274 }
1275 ################################# Matching gene and SNPs locations ######################
1276 if( pvOutputThreshold.cis > 0 ) {
1277 status("Matching data files and location files")
1278
1279 # names in the input data
1280 gene_names = gene$GetAllRowNames();
1281 snps_names = snps$GetAllRowNames();
1282
1283 # gene range, set: left<right
1284 if(any(genepos[,3] > genepos[,4])) {
1285 temp3 = genepos[,3];
1286 temp4 = genepos[,4];
1287 genepos[,3] = pmin(temp3,temp4);
1288 genepos[,4] = pmax(temp3,temp4);
1289 rm(temp3, temp4);
1290 }
1291
1292 # match with the location data
1293 genematch = match( gene_names, genepos[ ,1], nomatch = 0L);
1294 usedgene = matrix(FALSE, nrow(genepos), 1); # genes in "genepos" that are matching "gene_names"
1295 usedgene[ genematch ] = TRUE;
1296 if( !any(genematch) ) {
1297 stop("Gene names do not match those in the gene location file.");
1298 }
1299 cat( sum(genematch>0), "of", length(gene_names), " genes matched\n");
1300
1301
1302 snpsmatch = match( snps_names, snpspos[ ,1], nomatch = 0L);
1303 usedsnps = matrix(FALSE, nrow(snpspos),1);
1304 usedsnps[ snpsmatch ] = TRUE;
1305 if( !any(snpsmatch) ) {
1306 stop("SNP names do not match those in the SNP location file.");
1307 }
1308 cat( sum(snpsmatch>0), "of", length(snps_names), " SNPs matched\n");
1309
1310 # list used chr names
1311 chrNames = unique(c( as.character(unique(snpspos[usedsnps,2])),
1312 as.character(unique(genepos[usedgene,2])) ))
1313 chrNames = chrNames[ sort.list( suppressWarnings(as.integer(chrNames)),
1314 method = "radix", na.last = TRUE ) ];
1315 # match chr names
1316 genechr = match(genepos[,2],chrNames);
1317 snpschr = match(snpspos[,2],chrNames);
1318
1319 # max length of a chromosome
1320 chrMax = max( snpspos[usedsnps, 3], genepos[usedgene, 4], na.rm = TRUE) + cisDist;
1321
1322 # Single number location for all rows in "genepos" and "snpspos"
1323 genepos2 = as.matrix(genepos[ ,3:4, drop = FALSE] + (genechr-1)*chrMax);
1324 snpspos2 = as.matrix(snpspos[ ,3 , drop = FALSE] + (snpschr-1)*chrMax);
1325
1326 # the final location arrays;
1327 snps_pos = matrix(0,length(snps_names),1);
1328 snps_pos[snpsmatch>0, ] = snpspos2[snpsmatch, , drop = FALSE];
1329 snps_pos[rowSums(is.na(snps_pos))>0, ] = 0;
1330 snps_pos[snps_pos==0] = (length(chrNames)+1) * (chrMax+cisDist);
1331 rm(snps_names, snpsmatch, usedsnps, snpschr, snpspos2)
1332
1333 gene_pos = matrix(0,length(gene_names),2);
1334 gene_pos[genematch>0, ] = genepos2[genematch, , drop = FALSE];
1335 gene_pos[rowSums(is.na(gene_pos))>0, ] = 0;
1336 gene_pos[gene_pos==0] = (length(chrNames)+2) * (chrMax+cisDist);
1337 rm(gene_names, genematch, usedgene, genechr, genepos2)
1338 rm(chrNames, chrMax);
1339
1340 if( is.unsorted(snps_pos) ) {
1341 status("Reordering SNPs\n");
1342 ordr = sort.list(snps_pos);
1343 snps$RowReorder(ordr);
1344 snps_pos = snps_pos[ordr, , drop = FALSE];
1345 rm(ordr);
1346 }
1347 if( is.unsorted(rowSums(gene_pos)) ) {
1348 status("Reordering genes\n");
1349 ordr = sort.list(rowSums(gene_pos));
1350 gene$RowReorder(ordr);
1351 gene_pos = gene_pos[ordr, , drop = FALSE];
1352 rm(ordr);
1353 }
1354
1355 # Slice it back.
1356 geneloc = vector("list", gene$nSlices())
1357 gene_offset = 0;
1358 for(gc in 1:gene$nSlices()) {
1359 nr = length(gene$rowNameSlices[[gc]]);
1360 geneloc[[gc]] = gene_pos[gene_offset + (1:nr), , drop = FALSE];
1361 gene_offset = gene_offset + nr;
1362 }
1363 rm(gc, gene_offset, gene_pos);
1364
1365 snpsloc = vector("list", snps$nSlices())
1366 snps_offset = 0;
1367 for(sc in 1:snps$nSlices()) {
1368 nr = length(snps$rowNameSlices[[sc]]);
1369 snpsloc[[sc]] = snps_pos[snps_offset + (1:nr), , drop = FALSE];
1370 snps_offset = snps_offset + nr;
1371 }
1372 rm(nr, sc, snps_offset, snps_pos);
1373 }
1374 ################################# Covariates processing #################################
1375 {
1376 status("Processing covariates");
1377 if( useModel == modelLINEAR_CROSS ) {
1378 last.covariate = as.vector(tail( cvrt$getSlice(cvrt$nSlices()), n = 1));
1379 }
1380 if( cvrt$nRows()>0 ) {
1381 cvrt$SetNanRowMean();
1382 cvrt$CombineInOneSlice();
1383 cvrt = rbind(matrix(1,1,snps$nCols()),cvrt$getSlice(1));
1384 } else {
1385 cvrt = matrix(1,1,snps$nCols());
1386 }
1387 # Correct for the error covariance structure
1388 if( length(correctionMatrix)>0 ) {
1389 cvrt = cvrt %*% correctionMatrix;
1390 }
1391 # Orthonormalize covariates
1392 # status("Orthonormalizing covariates");
1393 q = qr(t(cvrt));
1394 if( min(abs(diag(qr.R(q)))) < .Machine$double.eps * snps$nCols() ) {
1395 stop("Colinear or zero covariates detected");
1396 }
1397 cvrt = t( qr.Q(q) );
1398 rm( q );
1399 }
1400 ################################# Gene expression processing ############################
1401 {
1402 status("Processing gene expression data (imputation, residualization, etc.)");
1403 # Impute gene expression
1404 gene$SetNanRowMean();
1405 # Correct for the error covariance structure
1406 if( length(correctionMatrix)>0 ) {
1407 gene$RowMatrixMultiply(correctionMatrix);
1408 }
1409 # Orthogonolize expression w.r.t. covariates
1410 # status("Orthogonolizing expression w.r.t. covariates");
1411 gene_offsets = double(gene$nSlices()+1);
1412 for( sl in 1:gene$nSlices() ) {
1413 slice = gene$getSlice(sl);
1414 gene_offsets[sl+1] = gene_offsets[sl] + nrow(slice);
1415 rowsq1 = rowSums(slice^2);
1416 slice = slice - tcrossprod(slice,cvrt) %*% cvrt;
1417 rowsq2 = rowSums(slice^2);
1418 # kill rows colinear with the covariates
1419 delete.rows = (rowsq2 <= rowsq1 * .Machine$double.eps );
1420 slice[delete.rows] = 0;
1421 div = sqrt( rowSums(slice^2) );
1422 div[ div == 0 ] = 1;
1423 gene.std$set(sl, div);
1424 gene$setSlice(sl, slice / div);
1425 }
1426 rm(rowsq1, rowsq2, delete.rows, div);
1427 rm( sl, slice );
1428 #gene$RowRemoveZeroEps();
1429 }
1430 ################################# snps_process, testfun, pvfun, threshfun, afun ########
1431 {
1432 # snps_process - preprocess SNPs slice
1433 #
1434 # afun --- abs for signed stats, identity for non-negative
1435 # threshfun --- internal stat threshold for given p-value
1436 # testfun --- t or F statistic from the internal one
1437 # pvfun --- p-value from the t or F statistic
1438
1439 nSamples = snps$nCols();
1440 nGenes = gene$nRows();
1441 nSnps = snps$nRows();
1442 nCov = nrow(cvrt);
1443 # nVarTested = length(snps_list); # set in case(useModel)
1444 # dfNull = nSamples - nCov;
1445 # d.f. of the full model
1446 betafun = NULL;
1447
1448 if( useModel == modelLINEAR ) {
1449 snps_process = function(x) {
1450 return( list(.SetNanRowMean(x)) );
1451 };
1452 nVarTested = 1;
1453 dfFull = nSamples - nCov - nVarTested;
1454 statistic.fun = function(mat_list) {
1455 return( mat_list[[1]] );
1456 }
1457 afun = function(x) {return(abs(x))};
1458 threshfun = function(pv) {
1459 thr = qt(pv/2, dfFull, lower.tail = FALSE);
1460 thr = thr^2;
1461 thr = sqrt( thr / (dfFull + thr) );
1462 thr[pv >= 1] = 0;
1463 thr[pv <= 0] = 1;
1464 return( thr );
1465 }
1466 testfun = function(x) { return( x * sqrt( dfFull / (1 - .my.pmin(x^2,1)))); }
1467 pvfun = function(x) { return( .pv.nz(pt(-abs(x),dfFull)*2)); }
1468 thresh.cis = threshfun(pvOutputThreshold.cis);
1469 thresh = threshfun(pvOutputThreshold);
1470 betafun = function(stat, ss, gg, select) {
1471 return(stat * gene.std$get(gg)[select[,1]] / snps.std$get(ss)[select[,2]]);
1472 }
1473 } else
1474 if( useModel == modelANOVA ) {
1475 snps_process = function(x).SNP_process_split_for_ANOVA(x,n.anova.groups);
1476 nVarTested = n.anova.groups - 1;
1477 dfFull = nSamples - nCov - nVarTested;
1478 # statistic.fun = function(mat_list) {
1479 # return( mat_list[[1]]^2 + mat_list[[2]]^2 );
1480 # }
1481 statistic.fun = function(mat_list) {
1482 x = mat_list[[1]]^2;
1483 for( j in 2:length(mat_list) )
1484 x = x + mat_list[[j]]^2;
1485 return( x );
1486 }
1487 afun = identity;
1488 threshfun = function(pv) {
1489 thr = qf(pv, nVarTested, dfFull, lower.tail = FALSE);
1490 thr = thr / (dfFull/nVarTested + thr);
1491 thr[pv >= 1] = 0;
1492 thr[pv <= 0] = 1;
1493 return( thr );
1494 }
1495 testfun = function(x) { return( x / (1 - .my.pmin(x,1)) * (dfFull/nVarTested) ); }
1496 pvfun = function(x) { return( .pv.nz(pf(x, nVarTested, dfFull, lower.tail = FALSE)) ); }
1497 thresh.cis = threshfun(pvOutputThreshold.cis);
1498 thresh = threshfun(pvOutputThreshold);
1499 } else
1500 if( useModel == modelLINEAR_CROSS ) {
1501 last.covariate = as.vector( last.covariate );
1502 snps_process = .SNP_process_split_for_LINEAR_CROSS = function(x) {
1503 out = vector("list", 2);
1504 out[[1]] = .SetNanRowMean(x);
1505 out[[2]] = t( t(out[[1]]) * last.covariate );
1506 return( out );
1507 };
1508 nVarTested = 1;
1509 dfFull = nSamples - nCov - nVarTested - 1;
1510 statistic.fun = function(mat_list) {
1511 return( mat_list[[2]] / sqrt(1 - mat_list[[1]]^2) );
1512 }
1513 afun = function(x) {return(abs(x))};
1514 threshfun = function(pv) {
1515 thr = qt(pv/2, dfFull, lower.tail = FALSE);
1516 thr = thr^2;
1517 thr = sqrt( thr / (dfFull + thr) );
1518 thr[pv >= 1] = 0;
1519 thr[pv <= 0] = 1;
1520 return( thr );
1521 }
1522 testfun = function(x) { return( x * sqrt( dfFull / (1 - .my.pmin(x^2,1)))); }
1523 pvfun = function(x) { return( .pv.nz(pt(-abs(x),dfFull)*2 )); }
1524 thresh.cis = threshfun(pvOutputThreshold.cis);
1525 thresh = threshfun(pvOutputThreshold);
1526 betafun = function(stat, ss, gg, select) {
1527 return(stat * gene.std$get(gg)[select[,1]] / snps.std$get(ss)[select[,2]]);
1528 }
1529 }
1530 params$dfFull = dfFull;
1531 }
1532 ################################# Saver class(es) creation ##############################
1533 {
1534 status("Creating output file(s)");
1535 if(noFDRsaveMemory) {
1536 if( pvOutputThreshold > 0 ) {
1537 saver.tra = .OutputSaver_direct$new();
1538 }
1539 if( pvOutputThreshold.cis > 0 ) {
1540 saver.cis = .OutputSaver_direct$new();
1541 }
1542 } else {
1543 if( pvOutputThreshold > 0 ) {
1544 saver.tra = .OutputSaver_FRD$new();
1545 }
1546 if( pvOutputThreshold.cis > 0 ) {
1547 saver.cis = .OutputSaver_FRD$new();
1548 }
1549 }
1550 if( pvOutputThreshold > 0 )
1551 if( pvOutputThreshold * gene$nRows() * snps$nRows() > 1000000 )
1552 if(!noFDRsaveMemory)
1553 cat('Warning: pvOutputThreshold may be too large.\nExpected number of findings > ',
1554 pvOutputThreshold * gene$nRows() * snps$nRows(),'\n');
1555 if( (useModel == modelLINEAR) || (useModel == modelLINEAR_CROSS) ) {
1556 statistic_name = "t-stat";
1557 } else if( useModel == modelANOVA ) {
1558 statistic_name = "F-test";
1559 }
1560 if(!is.null(betafun))
1561 statistic_name = paste('beta\t',statistic_name, sep='');
1562 if( pvOutputThreshold > 0 )
1563 saver.tra$start(output_file_name, statistic_name, snps, gene, testfun, pvfun);
1564 if( pvOutputThreshold.cis > 0 )
1565 saver.cis$start(output_file_name.cis, statistic_name, snps, gene, testfun, pvfun);
1566 rm( statistic_name );
1567 }
1568 ################################# Some useful functions #################################
1569 {
1570 orthonormalize.snps = function(cursnps, ss) {
1571 for(p in 1:length(cursnps)) {
1572 if(length(correctionMatrix)>0) {
1573 cursnps[[p]] = cursnps[[p]] %*% correctionMatrix;
1574 }
1575 cursnps[[p]] = cursnps[[p]] - tcrossprod(cursnps[[p]],cvrt) %*% cvrt;
1576 for(w in .seq(1,p-1L))
1577 cursnps[[p]] = cursnps[[p]] - rowSums(cursnps[[p]]*cursnps[[w]]) * cursnps[[w]];
1578 div = sqrt( rowSums(cursnps[[p]]^2) );
1579 div[ div == 0 ] = 1;
1580 cursnps[[p]] = cursnps[[p]]/div;
1581 }
1582 snps.std$set(ss, div);
1583 return(cursnps);
1584 }
1585 # if( pvOutputThreshold.cis > 0 ) {
1586 # is.cis.pair = function(gg,ss) {
1587 # return(!( ( snpsloc[[ss]][1, 1] - tail( geneloc[[gg]][ , 2], n = 1L) > cisDist) |
1588 # ( geneloc[[gg]][1, 1] - tail( snpsloc[[ss]] , n = 1L) > cisDist) ) );
1589 # }
1590 # }
1591 if( pvOutputThreshold.cis > 0 ) {
1592 # sn.l = sapply(snpsloc, function(x)x[1] );
1593 # sn.r = sapply(snpsloc, function(x)tail(x,1) );
1594 # ge.l = sapply(geneloc, function(x)x[1,1] );
1595 # ge.r = sapply(geneloc, function(x)x[nrow(x) , 2] );
1596 sn.l = sapply(snpsloc, '[', 1 );
1597 sn.r = sapply(snpsloc, tail, 1 );
1598 ge.l = sapply(geneloc, '[', 1, 1 );
1599 ge.r = sapply( lapply(geneloc, tail.matrix, 1 ), '[', 2);
1600 gg.1 = findInterval( sn.l , ge.r + cisDist +1) + 1;
1601 # cat(gg.1,'\n')
1602 gg.2 = findInterval( sn.r , ge.l - cisDist );
1603 # cat(gg.2,'\n')
1604 rm(sn.l, sn.r, ge.l, ge.r);
1605 }
1606
1607 }
1608 ################################# Prepare counters and histogram bins ###################
1609 {
1610 pvbins = NULL; # bin edges for p-values
1611 statbins = 0; # bin edges for the test statistic (|t| or F)
1612 do.hist = FALSE;
1613 if( length(pvalue.hist) == 1 ) {
1614 if(pvalue.hist == "qqplot") {
1615 pvbins = c(0, 10^rev(seq(0, log10(.Machine$double.xmin)-1, -0.05)));
1616 } else
1617 if( is.numeric(pvalue.hist) ) {
1618 pvbins = seq(from = 0, to = 1, length.out = pvalue.hist+1);
1619 } else
1620 if( pvalue.hist == TRUE ) {
1621 pvbins = seq(from = 0, to = 1, length.out = 100+1);
1622 }
1623 } else
1624 if( is.numeric(pvalue.hist) && (length(pvalue.hist) > 1) ) {
1625 pvbins = pvalue.hist;
1626 }
1627 if( is.null(pvbins) && (pvalue.hist != FALSE) ) {
1628 stop("Wrong value of pvalue.hist. Must be FALSE, TRUE, \"qqplot\", or numerical");
1629 }
1630 do.hist = !is.null(pvbins);
1631 if( do.hist ) {
1632 pvbins = sort(pvbins);
1633 statbins = threshfun(pvbins);
1634 if( pvOutputThreshold > 0) {
1635 hist.all = .histogrammer$new(pvbins, statbins);
1636 }
1637 if( pvOutputThreshold.cis > 0) {
1638 hist.cis = .histogrammer$new(pvbins, statbins);
1639 }
1640 }
1641 rm( pvbins, statbins);
1642 if(min.pv.by.genesnp) {
1643 if( pvOutputThreshold > 0) {
1644 minpv.tra = .minpvalue$new(snps,gene);
1645 }
1646 if( pvOutputThreshold.cis > 0) {
1647 minpv.cis = .minpvalue$new(snps,gene);
1648 }
1649 }
1650 }
1651 ################################# Main loop #############################################
1652 {
1653 beta = NULL;
1654 n.tests.all = 0;
1655 n.tests.cis = 0;
1656 n.eqtls.tra = 0;
1657 n.eqtls.cis = 0;
1658
1659 status("Performing eQTL analysis");
1660 # ss = 1; gg = 1;
1661 # ss = snps$nSlices(); gg = gene$nSlices();
1662
1663 snps_offset = 0;
1664 for(ss in 1:snps$nSlices()) {
1665 # for(ss in 1:min(2,snps$nSlices())) { #for debug
1666 cursnps = NULL;
1667 nrcs = nrow(snps$getSliceRaw(ss));
1668
1669 # loop only through the useful stuff
1670 for(gg in if(pvOutputThreshold>0){1:gene$nSlices()}else{.seq(gg.1[ss],gg.2[ss])} ) {
1671 gene_offset = gene_offsets[gg];
1672 curgene = gene$getSlice(gg);
1673 nrcg = nrow(curgene);
1674 if(nrcg == 0) next;
1675
1676 rp = "";
1677
1678 statistic = NULL;
1679 select.cis.raw = NULL;
1680 ## do cis analysis
1681 # if( (pvOutputThreshold.cis > 0) && ( is.cis.pair(gg, ss) ) ) {
1682 if( (pvOutputThreshold.cis > 0) && (gg >= gg.1[ss]) && (gg <= gg.2[ss]) ) {
1683
1684 if( is.null( statistic ) ) {
1685 if( is.null( cursnps ) ) {
1686 cursnps = orthonormalize.snps( snps_process( snps$getSlice(ss) ), ss );
1687 }
1688 mat = vector("list", length(cursnps));
1689 for(d in 1:length(cursnps)) {
1690 mat[[d]] = tcrossprod(curgene, cursnps[[d]]);
1691 }
1692 statistic = statistic.fun( mat );
1693 astatistic = afun(statistic);
1694 # rm(mat);
1695 }
1696
1697 # sn.l = findInterval(geneloc[[gg]][ ,1] - cisDist-1 +1 , snpsloc[[ss]]);
1698 # sn.r = findInterval(geneloc[[gg]][ ,2] + cisDist -1 , snpsloc[[ss]]);
1699 sn.l = findInterval(geneloc[[gg]][ ,1] - cisDist-1, snpsloc[[ss]]);
1700 sn.r = findInterval(geneloc[[gg]][ ,2] + cisDist, snpsloc[[ss]]);
1701 xx = unlist(lapply(which(sn.r>sn.l),FUN=function(x){(sn.l[x]:(sn.r[x]-1))*nrow(statistic)+x}))
1702 select.cis.raw = xx[ astatistic[xx] >= thresh.cis ];
1703 select.cis = arrayInd(select.cis.raw, dim(statistic))
1704
1705 n.tests.cis = n.tests.cis + length(xx);
1706 n.eqtls.cis = n.eqtls.cis + length(select.cis.raw);
1707
1708 if( do.hist )
1709 hist.cis$update(astatistic[xx]);
1710
1711 if( min.pv.by.genesnp ) {
1712 # minpv.cis$updatecis(ss, gg, arrayInd(xx, dim(statistic)), astatistic[xx])
1713 temp = double(length(astatistic));
1714 dim(temp) = dim(astatistic);
1715 temp[xx] = astatistic[xx];
1716 minpv.cis$update(ss, gg, temp);
1717 }
1718
1719 if(!is.null(betafun))
1720 beta = betafun(mat[[length(mat)]][select.cis.raw], ss, gg, select.cis);
1721
1722 saver.cis$update( snps_offset + select.cis[ , 2],
1723 gene_offset + select.cis[ , 1],
1724 statistic[select.cis.raw],
1725 beta);
1726
1727 # statistic.select.cis = statistic[ select.cis ];
1728 # test = testfun( statistic.select.cis );
1729 # pv = pvfun(test);
1730 # Saver.cis$WriteBlock( cbind(snps_offset + select.cis[ , 2], gene_offset + select.cis[ , 1], test, pv) );
1731 # counter.cis$Update(gg, ss, select.cis, pv, n.tests = length(xx), if(do.hist) afun(statistic[xx]) )
1732 rp = paste(rp, ", ", formatC(n.eqtls.cis, big.mark=",", format = "f", digits = 0), " cis-eQTLs", sep = "");
1733 }
1734 ## do trans/all analysis
1735 if(pvOutputThreshold>0) {
1736 if( is.null( statistic ) ) {
1737 if( is.null( cursnps ) ) {
1738 cursnps = orthonormalize.snps( snps_process( snps$getSlice(ss) ), ss );
1739 }
1740 mat = vector("list", length(cursnps));
1741 for(d in 1:length(cursnps)) {
1742 mat[[d]] = tcrossprod(curgene, cursnps[[d]]);
1743 }
1744 statistic = statistic.fun( mat );
1745 astatistic = afun(statistic);
1746 # rm(mat);
1747 }
1748
1749 if( do.hist )
1750 hist.all$update(astatistic);
1751
1752 if(!is.null(select.cis.raw))
1753 astatistic[xx] = -1;
1754 # select.tra.raw = select.tra.raw[!(select.tra.raw %in% select.cis.raw)];
1755
1756 select.tra.raw = which( astatistic >= thresh);
1757 select.tra = arrayInd(select.tra.raw, dim(statistic))
1758
1759 n.eqtls.tra = n.eqtls.tra + length(select.tra.raw);
1760 n.tests.all = n.tests.all + length(statistic);
1761
1762 if(!is.null(betafun))
1763 beta = betafun(mat[[length(mat)]][select.tra.raw], ss, gg, select.tra);
1764
1765 saver.tra$update( snps_offset + select.tra[ , 2],
1766 gene_offset + select.tra[ , 1],
1767 statistic[select.tra.raw],
1768 beta);
1769
1770 if( min.pv.by.genesnp )
1771 minpv.tra$update(ss, gg, astatistic)
1772
1773 # statistic.select.tra = statistic[ select.tra ];
1774 # test = testfun( statistic.select.tra );
1775 # pv = pvfun( test );
1776 # Saver$WriteBlock( cbind( snps_offset + select.tra[ , 2], gene_offset + select.tra[ , 1], test, pv) );
1777 # counter$Update(gg, ss, select.tra, pv, n.tests = nrcs*nrcg, if(do.hist) afun(statistic) )
1778 rp = paste(rp, ", ", formatC(n.eqtls.tra, big.mark=",", format = "f", digits = 0), if(pvOutputThreshold.cis > 0)" trans-"else" ","eQTLs", sep = "")
1779 }
1780
1781 #gene_offset = gene_offset + nrcg;
1782 if( !is.null(statistic) ) {
1783 per = 100*(gg/gene$nSlices() + ss-1) / snps$nSlices();
1784 cat( formatC(floor(per*100)/100, format = "f", width = 5, digits = 2), "% done" , rp, "\n", sep = "");
1785 flush.console();
1786 }
1787 } # gg in 1:gene$nSlices()
1788 snps_offset = snps_offset + nrow(snps$getSliceRaw(ss));
1789 } # ss in 1:snps$nSlices()
1790 }
1791 ################################# Results collection ####################################
1792 {
1793 rez = list(time.in.sec = proc.time()[3] - start.time);
1794 rez$param = params;
1795
1796 if(pvOutputThreshold.cis > 0) {
1797 rez.cis = list(ntests = n.tests.cis, neqtls = n.eqtls.cis);
1798 rez.cis = c(rez.cis, saver.cis$getResults( gene, snps, n.tests.cis) );
1799 if(do.hist)
1800 rez.cis = c(rez.cis, hist.cis$getResults() );
1801 if(min.pv.by.genesnp)
1802 rez.cis = c(rez.cis, minpv.cis$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) );
1803 }
1804
1805 if(pvOutputThreshold>0) {
1806 rez.all = list(ntests = n.tests.all, neqtls = n.eqtls.tra + n.eqtls.cis);
1807 if(pvOutputThreshold.cis > 0) {
1808 rez.tra = list(ntests = n.tests.all - n.tests.cis, neqtls = n.eqtls.tra);
1809 rez.tra = c(rez.tra, saver.tra$getResults( gene, snps, n.tests.all - n.tests.cis) );
1810 } else {
1811 rez.all = c(rez.all, saver.tra$getResults( gene, snps, n.tests.all ) );
1812 }
1813 if(do.hist) {
1814 rez.all = c(rez.all, hist.all$getResults() );
1815 if(pvOutputThreshold.cis > 0) {
1816 rez.tra$hist.bins = rez.all$hist.bins;
1817 rez.tra$hist.counts = rez.all$hist.counts - rez.cis$hist.counts;
1818 }
1819 }
1820 if(min.pv.by.genesnp) {
1821 if(pvOutputThreshold.cis > 0) {
1822 rez.tra = c(rez.tra, minpv.tra$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) );
1823 } else {
1824 rez.all = c(rez.all, minpv.tra$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) );
1825 }
1826 }
1827 }
1828
1829 if(exists('rez.all')>0)
1830 rez$all = rez.all;
1831 if(exists('rez.tra')>0)
1832 rez$trans = rez.tra;
1833 if(exists('rez.cis')>0)
1834 rez$cis = rez.cis;
1835
1836 class(rez) = c(class(rez),"MatrixEQTL");
1837 status("");
1838 }
1839 # cat('s std ',snps.std$get(1),'\n');
1840 # cat('g std ',gene.std$get(1),'\n');
1841 ################################# Results collection ####################################
1842 return(rez);
1843 }
1844
1845 .histme = function(m, name1, name2, ...) {
1846 cnts = m$hist.counts;
1847 bins = m$hist.bins;
1848 ntst = m$ntests;
1849 centers = 0.5 * (bins[-1L] + bins[-length(bins)]);
1850 density = 0.5 / (bins[-1L] - centers) * cnts / ntst;
1851 ntext = paste("P-value distribution for ", name1, formatC(ntst, big.mark=",", format = "f", digits = 0), name2, " gene-SNP pairs ",sep="");
1852 r = structure(list(breaks = bins, counts = cnts, density = density,
1853 mids = centers, equidist = FALSE), class = "histogram");
1854 plot(r, main = ntext, ylab = "Density", xlab = "P-values", ...)
1855 abline( h = 1, col = "blue");
1856 return(invisible());
1857 }
1858
1859 .qqme = function(m, lcol, cex, pch, ...) {
1860 cnts = m$hist.counts;
1861 bins = m$hist.bins;
1862 ntst = m$ntests;
1863
1864 cusu = cumsum(cnts) / ntst;
1865 ypos = bins[-1][is.finite(cusu)];
1866 xpos = cusu[is.finite(cusu)];
1867 lines(-log10(xpos), -log10(ypos), col = lcol, ...);
1868 # lines(xpos, ypos, col = lcol, ...);
1869 if(length(m$eqtls$pvalue)==0)
1870 return();
1871 ypvs = -log10(m$eqtls$pvalue);
1872 xpvs = -log10(1:length(ypvs) / ntst);
1873 if(length(ypvs) > 1000) {
1874 # need to filter a bit, make the plotting faster
1875 levels = as.integer( xpvs/xpvs[1] * 1e3);
1876 keep = c(TRUE, diff(levels)!=0);
1877 levels = as.integer( ypvs/ypvs[1] * 1e3);
1878 keep = keep | c(TRUE, diff(levels)!=0);
1879 ypvs = ypvs[keep];
1880 xpvs = xpvs[keep];
1881 rm(keep)
1882 }
1883 points(xpvs, ypvs, col = lcol, pch = pch, cex = cex, ...);
1884 }
1885
1886 plot.MatrixEQTL = function(x, cex = 0.5, pch = 19, xlim = NULL, ylim = NULL, ...) {
1887 if( x$param$pvalue.hist == FALSE ) {
1888 warning("Cannot plot p-value distribution: the information was not recorded.\nUse pvalue.hist!=FALSE.");
1889 return(invisible());
1890 }
1891 if( x$param$pvalue.hist == "qqplot" ) {
1892 xmin = 1/max(x$cis$ntests, x$all$ntests);
1893 ymax = NULL;
1894 if(!is.null(ylim)) {
1895 ymax = ylim[2];
1896 } else {
1897 ymax = -log10(min(
1898 x$cis$eqtls$pvalue[1], x$cis$hist.bins[ c(FALSE,x$cis$hist.counts>0)][1],
1899 x$all$eqtls$pvalue[1], x$all$hist.bins[ c(FALSE,x$all$hist.counts>0)][1],
1900 x$trans$eqtls$pvalue[1], x$trans$hist.bins[c(FALSE,x$trans$hist.counts>0)][1],
1901 na.rm = TRUE))+0.1;
1902 }
1903 if(ymax == 0) {
1904 ymax = -log10(.Machine$double.xmin)
1905 }
1906 if(!is.null(ymax))
1907 ylim = c(0,ymax);
1908
1909 if(is.null(xlim))
1910 xlim = c(0, -log10(xmin/1.5));
1911
1912 plot(numeric(),numeric(), xlab = "-Log10(p-value), theoretical",
1913 ylab = "-Log10(p-value), observed",
1914 xlim = c(0, -log10(xmin/1.5)),
1915 ylim = ylim,
1916 xaxs="i", yaxs="i", ...);
1917 lines(c(0,1e3), c(0,1e3), col = "gray");
1918 if((x$param$pvOutputThreshold > 0) && (x$param$pvOutputThreshold.cis > 0)) {
1919 .qqme( x$cis, "red", cex, pch, ...);
1920 .qqme( x$trans, "blue", cex, pch, ...);
1921 title(paste("QQ-plot for",
1922 formatC(x$cis$ntests, big.mark=",", format = "f", digits = 0),
1923 "local and",
1924 formatC(x$trans$ntests, big.mark=",", format = "f", digits = 0),
1925 "distant gene-SNP p-values"));
1926 lset = c(1,2,4);
1927 } else
1928 if(x$param$pvOutputThreshold.cis > 0) {
1929 .qqme(x$cis, "red", cex, pch, ...);
1930 title(paste("QQ-plot for",
1931 formatC(x$cis$ntests, big.mark=",", format = "f", digits = 0),
1932 "local gene-SNP p-values"));
1933 lset = c(1,4);
1934 } else {
1935 .qqme(x$all, "blue", cex, pch, ...);
1936 title(paste("QQ-plot for all",
1937 formatC(x$all$ntests, big.mark=",", format = "f", digits = 0),
1938 "gene-SNP p-values"));
1939 lset = c(3,4);
1940 }
1941 legend("topleft",
1942 c("Local p-values","Distant p-values","All p-values","diagonal")[lset],
1943 col = c("red","blue","blue","gray")[lset],
1944 text.col = c("red","blue","blue","gray")[lset],
1945 pch = 20, lwd = 1, pt.cex = c(1,1,1,0)[lset])
1946 } else {
1947 if((x$param$pvOutputThreshold > 0) && (x$param$pvOutputThreshold.cis > 0)) {
1948 par(mfrow=c(2,1));
1949 .histme(x$cis, "", " local", ...);
1950 tran = list(hist.counts = x$all$hist.counts - x$cis$hist.counts,
1951 hist.bins = x$all$hist.bins,
1952 ntests = x$all$ntests - x$cis$ntests);
1953 .histme(x$trans,""," distant", ...);
1954 par(mfrow=c(1,1));
1955 } else
1956 if(x$param$pvOutputThreshold.cis > 0) {
1957 .histme(x$cis, "", " local", ...);
1958 } else {
1959 .histme(x$all, "all ", "" , ...);
1960 }
1961 }
1962 return(invisible());
1963 }
1964