Mercurial > repos > jasonxu > matrixeqtl
diff MatrixEQTL/R/Matrix_eQTL_engine.R @ 3:ae74f8fb3aef draft
Uploaded
author | jasonxu |
---|---|
date | Fri, 12 Mar 2021 08:20:57 +0000 |
parents | cd4c8e4a4b5b |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/R/Matrix_eQTL_engine.R Fri Mar 12 08:20:57 2021 +0000 @@ -0,0 +1,1964 @@ +# Matrix eQTL by Andrey A. Shabalin +# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ + +# http://cran.r-project.org/web/packages/policies.html + +library(methods) + +modelLINEAR = 117348L; +modelANOVA = 47074L; +modelLINEAR_CROSS = 1113461L; + +.seq = function(a,b){if(a<=b){a:b}else{integer(0)}}; + +SlicedData <- setRefClass( "SlicedData", + fields = list( + dataEnv = "environment", + nSlices1 = "numeric", + rowNameSlices = "list", + columnNames = "character", + fileDelimiter = "character", + fileSkipColumns = "numeric", + fileSkipRows = "numeric", + fileSliceSize = "numeric", + fileOmitCharacters = "character" + ), + methods = list( + initialize = function( mat = NULL ) { + dataEnv <<- new.env(hash = TRUE, size = 29L); + nSlices1 <<- 0L; + if(!is.null(mat)) { + CreateFromMatrix(mat); + } + fileSliceSize <<- 1000; + fileDelimiter <<- "\t"; + fileSkipColumns <<- 1L; + fileSkipRows <<- 1L; + fileOmitCharacters <<- "NA" + return(invisible(.self)); + }, + CreateFromMatrix = function( mat ) { + stopifnot( class(mat) == "matrix" ); + setSliceRaw( 1L ,mat ); + rns = rownames( mat, do.NULL = FALSE); + #if( is.null(rns) ) { + # rns = paste( "Row_",(1:nrow(mat)), sep="" ); + #} + rowNameSlices <<- list(rns); + cns = colnames( mat, do.NULL = FALSE ); + #if( is.null(cns) ){ + # cns = paste( "Col_",(1:ncol(mat)), sep="" ); + #} + columnNames <<- cns; + return(invisible(.self)); + }, + getSlice = function(sl) { + value = get(paste(sl), dataEnv); + if( is.raw(value) ) { + storage.mode(value) = "double"; + value[value == 255] = NA; + } + return( value ) + }, + getSliceRaw = function(sl) { + return( get(paste(sl), dataEnv) ) + }, + setSliceRaw = function(sl, value) { + assign( paste(sl), value, dataEnv ) + if( nSlices1 < sl ) { + nSlices1 <<- sl; + } + }, + setSlice = function(sl, value) { + if( length(value) > 0 ) { + if( all(as.integer(value) == value, na.rm = TRUE) ) { + if( (min(value, na.rm = TRUE) >= 0 ) && + (max(value, na.rm = TRUE) < 255) ) + { + nv = value; + suppressWarnings({storage.mode(nv) = "raw"}); + nv[ is.na(value)] = as.raw(255); + value = nv; + } else { + storage.mode(value) = "integer"; + } + } + } + setSliceRaw(sl, value); + }, + nSlices = function() { + return( nSlices1 ); + }, + LoadFile = function(filename, skipRows = NULL, skipColumns = NULL, sliceSize = NULL, omitCharacters = NULL, delimiter = NULL, rowNamesColumn = 1) { + if( !is.null(skipRows) ) { + fileSkipRows <<- skipRows; + } + if( !is.null(skipColumns) ) { + fileSkipColumns <<- skipColumns; + } + if( !is.null(omitCharacters) ) { + fileOmitCharacters <<- omitCharacters; + } + if( !is.null(sliceSize) ) { + fileSliceSize <<- sliceSize; + } + if( !is.null(delimiter) ) { + fileDelimiter <<- delimiter; + } + stopifnot( (fileSkipColumns == 0) || (rowNamesColumn <= fileSkipColumns) ) + stopifnot( (fileSkipColumns == 0) || (rowNamesColumn >= 1) ) + + fid = file(description = filename, open = "rt", blocking = FALSE, raw = FALSE) + # clean object if file is open + Clear(); + lines = readLines(con = fid, n = max(fileSkipRows,1L), ok = TRUE, warn = TRUE) + line1 = tail(lines,1); + splt = strsplit(line1, split = fileDelimiter, fixed = TRUE); + if( fileSkipRows > 0L ) { + columnNames <<- splt[[1]]; # [ -(1:fileSkipColumns) ]; + } else { + seek(fid, 0) + } + + rm( lines, line1, splt ); + + rowNameSlices <<- vector("list", 15); + + curSliceId = 0L; + repeat + { + # preallocate names and data + if(length(rowNameSlices) < curSliceId) { + rowNameSlices[[2L*curSliceId]] <<- NULL; + } + curSliceId = curSliceId + 1L; + + # read sliceSize rows + rowtag = vector("character",fileSliceSize); + rowvals = vector("list",fileSliceSize); + for(i in 1:fileSliceSize) { + temp = ""; + if( fileSkipColumns > 0L ) { + temp = scan(file = fid, what = character(), n = fileSkipColumns, quiet = TRUE,sep = fileDelimiter); + } + rowtag[i] = temp[rowNamesColumn];#paste(temp,collapse=" "); + rowvals[[i]] = scan(file = fid, what = double(), nlines = 1, quiet = TRUE, sep = fileDelimiter, na.strings = fileOmitCharacters); + if( length(rowvals[[i]]) == 0L ) { + if(i==1L) { + rowtag = matrix(0, 0, 0); + rowvals = character(0); + } else { + rowtag = rowtag[ 1:(i-1) ]; + rowvals = rowvals[ 1:(i-1) ]; + } + break; + } + } + if( length(rowtag) == 0L ) { + curSliceId = curSliceId - 1L; + break; + } + rowNameSlices[[curSliceId]] <<- rowtag; + data = c(rowvals, recursive = TRUE); + dim(data) = c(length(rowvals[[1]]), length(rowvals)); + data = t(data); + setSlice(curSliceId, data); + if( length(rowtag) < fileSliceSize ) { + break; + } + numtxt = formatC(curSliceId*fileSliceSize, big.mark=",", format = "f", digits = 0) + cat( "Rows read: ", numtxt, "\n"); + flush.console() + } + close(fid) + if( fileSkipRows == 0 ) { + columnNames <<- paste("Col_", (1:nCols()), sep=""); + } else { + columnNames <<- tail(columnNames, ncol(getSliceRaw(1))); + } + if( fileSkipColumns == 0 ) { + cnt = 0L; + for( sl in 1:nSlices() ) { + nr = length(getSliceRaw(sl)); + rowNameSlices[[sl]] <<- paste("Row_",cnt + (1:nr),sep=""); + cnt = cnt + nr; + } + } + rowNameSlices <<- rowNameSlices[1:curSliceId]; + cat("Rows read: ", nRows(), " done.\n"); + return(invisible(.self)); + }, + SaveFile = function(filename) { + if( nSlices() == 0 ) { + cat("No data to save"); + return(); + } + fid = file(filename,"wt"); + for( sl in 1:nSlices() ) { + z = getSlice(sl); + rownames(z) = rowNameSlices[[sl]]; + colnames(z) = columnNames; + write.table(z, file = fid, sep = "\t", + col.names = (if(sl == 1){NA}else{FALSE})); + } + close(fid); + }, + nRows = function() { + s = 0L; + for(sl in .seq(1,nSlices())) { + s = s + nrow(getSliceRaw(sl)); + } + return( s ) + }, + nCols = function() { + if( nSlices() == 0L ) { + return(0L); + } else { + return( ncol(getSliceRaw(1L)) ) + } + }, + Clear = function() { + for( sl in .seq(1,nSlices()) ) { + rm(list = paste(sl), envir = dataEnv) + } + nSlices1 <<- 0L; + rowNameSlices <<- list(); + columnNames <<- character(); + return(invisible(.self)); + }, + IsCombined = function() { + return( nSlices() <= 1L ); + }, + GetAllRowNames = function() { + return( c(rowNameSlices, recursive=TRUE) ); + }, + SetNanRowMean = function() { + if( (nCols() == 0L) ) { + return(invisible(.self)); + } + for( sl in .seq(1,nSlices()) ) { + slice = getSlice(sl); + if( any(is.na(slice)) ) { + rowmean = rowMeans(slice, na.rm = TRUE); + rowmean[is.na(rowmean)] = 0L; + for( j in which(!complete.cases(slice)) ) { + where1 = is.na(slice[j, ]); + slice[j, where1] = rowmean[j]; + } + setSlice(sl, slice); + } + } + return(invisible(.self)); + }, + RowStandardizeCentered = function() { + for(sl in .seq(1,nSlices()) ) { + slice = getSlice(sl); + div = sqrt( rowSums(slice^2) ); + div[ div == 0 ] = 1; + setSlice(sl, slice/div); + } + return(invisible(.self)); + }, + CombineInOneSlice = function() { + if( nSlices() <= 1L ) { + return(invisible(.self)); + } + nc = nCols(); + nr = nRows(); + datatypes = c("raw","integer","double"); + datafuns = c(as.raw, as.integer, as.double); + datatype = character(nSlices()); + for(sl in 1:nSlices()) { + datatype[sl] = typeof(getSliceRaw(sl)); + } + mch = max(match(datatype,datatypes,nomatch = length(datatypes))); + datafun = datafuns[[mch]]; + newData = matrix(datafun(0), nrow = nr, ncol = nc); + offset = 0; + for(sl in 1:nSlices()) { + if(mch==1) { + slice = getSliceRaw(sl); + } else { + slice = getSlice(sl); + } + newData[ offset + (1:nrow(slice)),] = datafun(slice); + setSlice(sl, numeric()); + offset = offset + nrow(slice); + } + + nSlices1 <<- 1L; + setSliceRaw(1L, newData); + rm(newData); + + newrowNameSlices = GetAllRowNames(); + rowNameSlices <<- list(newrowNameSlices) + return(invisible(.self)); + }, + ResliceCombined = function(sliceSize = -1) { + if( sliceSize > 0L ) { + fileSliceSize <<- sliceSize; + } + if( fileSliceSize <= 0 ) { + fileSliceSize <<- 1000; + } + if( IsCombined() ) { + nRows1 = nRows(); + if(nRows1 == 0L) { + return(invisible(.self)); + } + newNSlices = floor( (nRows1 + fileSliceSize - 1)/fileSliceSize ); + oldData = getSliceRaw(1L); + #oldNames = rowNameSlices[[1]]; + newNameslices = vector("list",newNSlices) + for( sl in 1:newNSlices ) { + range = (1+(sl-1)*fileSliceSize) : (min(nRows1,sl*fileSliceSize)); + newpart = oldData[range, ,drop = FALSE]; + if( is.raw(oldData) ) { + setSliceRaw( sl, newpart); + } else { + setSlice( sl, newpart); + } + newNameslices[[sl]] = rowNameSlices[[1]][range]; + } + rowNameSlices <<- newNameslices ; + } else { + stop("Reslice of a sliced matrix is not supported yet. Use CombineInOneSlice first."); + } + return(invisible(.self)); + }, + Clone = function() { + clone = SlicedData$new(); + for(sl in .seq(1,nSlices()) ) { + clone$setSliceRaw(sl,getSliceRaw(sl)); + } + clone$rowNameSlices = rowNameSlices; + clone$columnNames = columnNames; + clone$fileDelimiter = fileDelimiter; + clone$fileSkipColumns = fileSkipColumns; + clone$fileSkipRows = fileSkipRows; + clone$fileSliceSize = fileSliceSize; + clone$fileOmitCharacters = fileOmitCharacters; + return( clone ); + }, + RowMatrixMultiply = function(multiplier) { + for(sl in .seq(1,nSlices()) ) { + setSlice(sl, getSlice(sl) %*% multiplier); + } + return(invisible(.self)); + }, + ColumnSubsample = function(subset) { + for(sl in .seq(1,nSlices()) ) { + setSliceRaw(sl, getSliceRaw(sl)[ ,subset, drop = FALSE]); + } + columnNames <<- columnNames[subset]; + return(invisible(.self)); + }, + RowReorderSimple = function(ordr) { + # had to use an inefficient and dirty method + # due to horrible memory management in R + if( (typeof(ordr) == "logical") && all(ordr) ) { + return(invisible(.self)); + } + if( (length(ordr) == nRows()) && all(ordr == (1:length(ordr))) ) { + return(invisible(.self)); + } + CombineInOneSlice(); + gc(); + setSliceRaw( 1L, getSliceRaw(1L)[ordr, ] ); + rowNameSlices[[1]] <<- rowNameSlices[[1]][ordr]; + gc(); + ResliceCombined(); + gc(); + return(invisible(.self)); + }, + RowReorder = function(ordr) { + # transform logical into indices + if( typeof(ordr) == "logical" ) { + if( length(ordr) == nRows() ) { + ordr = which(ordr); + } else { + stop("Parameter \"ordr\" has wrong length") + } + } + ## first, check that anything has to be done at all + if( (length(ordr) == nRows()) && all(ordr == (1:length(ordr))) ) { + return(invisible(.self)); + } + ## check bounds + #if( (min(ordr) < 1) || (max(ordr) > nRows()) ) { + # stop("Parameter \"ordr\" is out of bounds"); + #} + ## slice the data into individual rows + all_rows = vector("list", nSlices()) + for( i in 1:nSlices() ) { + slice = getSliceRaw(i) + all_rows[[i]] = split(slice, 1:nrow(slice)) + setSliceRaw(i,numeric()) + } + gc(); + all_rows = unlist(all_rows, recursive=FALSE, use.names = FALSE); + ## Reorder the rows + all_rows = all_rows[ordr]; + ## get row names + all_names = GetAllRowNames(); + ## erase the set + rowNameSlices <<- list(); + ## sort names + all_names = all_names[ordr]; + ## + ## Make slices back + nrows = length(all_rows); + nSlices1 <<- as.integer((nrows+fileSliceSize-1)/fileSliceSize); + ##cat(nrows, " ", nSlices1); + rowNameSlices1 = vector("list", nSlices1); + for( i in 1:nSlices1 ) { + fr = 1 + fileSliceSize*(i-1); + to = min( fileSliceSize*i, nrows); + + subset = all_rows[fr:to]; + types = unlist(lapply(subset,typeof)); + israw = (types == "raw") + if(!all(israw == israw[1])) { + # some raw and some are not + subset = lapply(subset, function(x){if(is.raw(x)){x=as.integer(x);x[x==255] = NA;return(x)}else{return(x)}}); + } + subset = unlist(subset); + dim(subset) = c( length(all_rows[[fr]]) , to - fr + 1) + #subset = matrix(subset, ncol = (to-fr+1)); + if(is.raw(subset)) { + setSliceRaw(i, t(subset)); + } else { + setSlice(i, t(subset)); + } + rowNameSlices1[[i]] = all_names[fr:to]; + all_rows[fr:to] = 0; + all_names[fr:to] = 0; + } + rowNameSlices <<- rowNameSlices1; + gc(); + return(invisible(.self)); + }, + RowRemoveZeroEps = function(){ + for(sl in .seq(1,nSlices()) ) { + slice = getSlice(sl); + amean = rowMeans(abs(slice)); + remove = (amean < .Machine$double.eps*nCols()); + if(any(remove)) { + rowNameSlices[[sl]] <<- rowNameSlices[[sl]][!remove]; + setSlice(sl, slice[!remove, , drop = FALSE]); + } + } + return(invisible(.self)); + }, + FindRow = function(rowname) { + for(sl in .seq(1,nSlices()) ) { + mch = match(rowname,rowNameSlices[[sl]], nomatch = 0); + if( mch > 0 ) + { + row = getSlice(sl)[mch[1], , drop=FALSE]; + rownames(row) = rowname; + colnames(row) = columnNames; + return( list(slice = sl, item = mch, row = row) ); + } + } + return( NULL ); + }, + show = function() { + cat("SlicedData object. For more information type: ?SlicedData\n"); + cat("Number of columns:", nCols(), "\n"); + cat("Number of rows:", nRows(), "\n"); + cat("Data is stored in", nSlices(), "slices\n"); + if(nCols()>0) { + z = getSlice(1L); + if(nrow(z)>0) { + z = z[1:min(nrow(z),10L), 1:min(ncol(z),10L), drop = FALSE]; + rownames(z) = rowNameSlices[[1]][1:nrow(z)]; + colnames(z) = columnNames[1:ncol(z)]; + cat("Top left corner of the first slice (up to 10x10):\n"); + methods:::show(z) + } + } + } + )) + +setGeneric("nrow") +setMethod("nrow", "SlicedData", function(x) { + return( x$nRows() ); + }) +setGeneric("NROW") +setMethod("NROW", "SlicedData", function(x) { + return( x$nRows() ); + }) +setGeneric("ncol") +setMethod("ncol", "SlicedData", function(x) { + return( x$nCols() ); + }) +setGeneric("NCOL") +setMethod("NCOL", "SlicedData", function(x) { + return( x$nCols() ); + }) +setGeneric("dim") +setMethod("dim", "SlicedData", function(x) { + return( c(x$nRows(),x$nCols()) ); + }) +setGeneric("colnames") +setMethod("colnames", "SlicedData", function(x) { + return( x$columnNames ); + }) +setGeneric("rownames") +setMethod("rownames", "SlicedData", function(x) { + return( x$GetAllRowNames() ); + }) +setMethod("[[", "SlicedData", function(x,i) { + return( x$getSlice(i) ); + }) +setGeneric("length") +setMethod("length", "SlicedData", function(x) { + return( x$nSlices() ); + }) +setMethod("[[<-", "SlicedData", function(x,i,value) { + x$setSlice(i, value); + return(x); +}) +summary.SlicedData = function(object, ...) { + z = c(nCols = object$nCols(), nRows = object$nRows(), nSlices = object$nSlices()); + return(z); +} + +##### setGeneric("summary") ##### +#setMethod("summary", "SlicedData", function(object, ...) { +# z = c(nCols = object$nCols(), nRows = object$nRows(), nSlices = object$nSlices()); +# return(z); +# }) +#setGeneric("show", standardGeneric("show")) +# setMethod("show", "SlicedData", function(object) { +# cat("SlicedData object. For more information type: ?SlicedData\n"); +# cat("Number of columns:", object$nCols(), "\n"); +# cat("Number of rows:", object$nRows(), "\n"); +# cat("Data is stored in", object$nSlices(), "slices\n"); +# if(object$nSlices()>0) { +# z = object$getSlice(1); +# if(nrow(z)>0) { +# z = z[1:min(nrow(z),10), 1:min(ncol(z),10), drop = FALSE]; +# rownames(z) = object$rowNameSlices[[1]][1:nrow(z)]; +# colnames(z) = object$columnNames[1:ncol(z)]; +# cat("Top left corner of the first slice (up to 10x10):\n"); +# show(z) +# } +# } +# }) + +setGeneric("as.matrix") +setMethod("as.matrix", "SlicedData", function(x) { + if(x$nSlices() == 0) { + return( matrix(0,0,0) ); + } + if(x$nSlices() > 1) { + copy = x$Clone(); + copy$CombineInOneSlice(); + } else { + copy = x; + } + mat = copy$getSlice(1L); + rownames(mat) = rownames(copy); + colnames(mat) = colnames(copy); + return( mat ); + }) +setGeneric("colnames<-") +setMethod("colnames<-", "SlicedData", function(x,value) { + stopifnot( class(value) == "character" ); + stopifnot( length(value) == x$nCols() ); + x$columnNames = value; + return(x); + }) +setGeneric("rownames<-") +setMethod("rownames<-", "SlicedData", function(x,value) { + stopifnot( class(value) == "character" ); + stopifnot( length(value) == x$nRows() ); + start = 1; + newNameSlices = vector("list", x$nSlices()); + for( i in .seq(1,x$nSlices()) ) { + nr = nrow(x$getSliceRaw(i)); + newNameSlices[[i]] = value[ start:(start+nr-1) ]; + start = start + nr; + } + x$rowNameSlices = newNameSlices; + return(x); + }) +setGeneric("rowSums") +setMethod("rowSums", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { + if(x$nSlices() == 0) { + return( numeric() ); + } + stopifnot( dims == 1 ); + thesum = vector("list", x$nSlices()); + for( i in 1:x$nSlices() ) { + thesum[[i]] = rowSums(x$getSlice(i), na.rm) + } + return(unlist(thesum, recursive = FALSE, use.names = FALSE)); + }) +setGeneric("rowMeans") +setMethod("rowMeans", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { + if(x$nSlices() == 0) { + return( numeric() ); + } + stopifnot( dims == 1 ); + thesum = vector("list", x$nSlices()); + for( i in 1:x$nSlices() ) { + thesum[[i]] = rowMeans(x$getSlice(i), na.rm) + } + return(unlist(thesum, recursive = FALSE, use.names = FALSE)); + }) +setGeneric("colSums") +setMethod("colSums", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { + if(x$nCols() == 0) { + return( numeric() ); + } + stopifnot( dims == 1 ); + thesum = 0; + for( i in .seq(1,x$nSlices()) ) { + thesum = thesum + colSums(x$getSlice(i), na.rm) + } + return(thesum); + }) +setGeneric("colMeans") +setMethod("colMeans", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { + if(x$nCols() == 0) { + return( numeric() ); + } + stopifnot( dims == 1 ); + thesum = 0; + thecounts = x$nRows(); + for( i in .seq(1,x$nSlices()) ) { + slice = x$getSlice(i); + thesum = thesum + colSums(slice, na.rm) + if( na.rm ) { + thecounts = thecounts - colSums(is.na(slice)) + } + } + return(thesum/thecounts); + }) + +.listBuilder <- setRefClass(".listBuilder", + fields = list( + dataEnv = "environment", + n = "numeric" + ), + methods = list( + initialize = function() { + dataEnv <<- new.env(hash = TRUE); + n <<- 0; +# cumlength <<- 0; + return(.self); + }, + add = function(x) { + if(length(x) > 0) { + n <<- n + 1; +# cumlength <<- cumlength + length(x); + assign(paste(n), x, dataEnv ); + } + return(.self); + }, + set = function(i,x) { + if(length(x) > 0) { + if(i>n) + n <<- i; + assign(paste(i), x, dataEnv ); + } + return(.self); + }, + get = function(i) { + return(base::get(paste(i),dataEnv)); + }, + list = function() { + if(n==0) return(list()); + result = vector('list',n); + for( i in 1:n) { + result[[i]] = .self$get(i); + } + return(result); + }, + unlist = function() { + return(base::unlist(.self$list(), recursive=FALSE, use.names = FALSE)); + }, + show = function() { + cat(".listBuilder object.\nIternal object in MatrixEQTL package.\n"); + cat("Number of elements:", object$n, "\n"); + } + )) + +.histogrammer <- setRefClass(".histogrammer", + fields = list( + pvbins1 = "numeric", + statbins1 = "numeric", + hist.count = "numeric" + ), + methods = list( + initialize = function (pvbins, statbins) { + if(length(pvbins)) { + ord = order(statbins); + pvbins1 <<- pvbins[ord]; + statbins1 <<- statbins[ord]; + hist.count <<- double(length(pvbins)-1); + } + return(.self); + }, + update = function(stats.for.hist) { + h = hist(stats.for.hist, breaks = statbins1, include.lowest = TRUE, right = TRUE, plot = FALSE)$counts; + hist.count <<- hist.count + h; + }, + getResults = function() { + if(!is.unsorted(pvbins1)) { + return(list(hist.bins = pvbins1 , hist.counts = hist.count )); + } else { + return(list(hist.bins = rev(pvbins1), hist.counts = rev(hist.count))); + } + } + )) + + +.minpvalue <- setRefClass(".minpvalue", + fields = list( + sdata = ".listBuilder", + gdata = ".listBuilder" + ), + methods = list( + initialize = function(snps, gene) { + sdata <<- .listBuilder$new(); + for( ss in 1:snps$nSlices() ) { + sdata$set( ss, double(nrow(snps$getSliceRaw(ss)))); + } + gdata <<- .listBuilder$new(); + for( gg in 1:gene$nSlices() ) { + gdata$set( gg, double(nrow(gene$getSliceRaw(gg)))); + } + return(.self); + }, + update = function(ss, gg, astat) { + gmax = gdata$get(gg) + z1 = max.col(astat,ties.method='first'); + z11 = astat[1:nrow(astat) + nrow(astat) * (z1 - 1)]; + gmax = pmax(gmax, z11); + gdata$set(gg, gmax); + + smax = sdata$get(ss) + z22 = apply(astat,2,max); + smax = pmax(smax, z22); + sdata$set(ss, smax); + return(.self); + }, + updatecis = function(ss, gg, select.cis, astat) { + if(length(astat)>0) + { + byrows = aggregate(x=astat, by=list(row=select.cis[,1]), FUN=max); + bycols = aggregate(x=astat, by=list(col=select.cis[,2]), FUN=max); + + gmax = gdata$get(gg); + gmax[byrows$row] = pmax(gmax[byrows$row], byrows$x) + gdata$set(gg, gmax); + + smax = sdata$get(ss) + smax[bycols$col] = pmax(smax[bycols$col], bycols$x) + sdata$set(ss, smax); + } + return(.self); + }, + getResults = function(snps, gene, pvfun) { + min.pv.snps = pvfun(sdata$unlist()); + names(min.pv.snps) = rownames(snps); + min.pv.gene = pvfun(gdata$unlist()); + names(min.pv.gene) = rownames(gene); + return(list(min.pv.snps = min.pv.snps, min.pv.gene = min.pv.gene)); + } + )) + +.OutputSaver_FRD <- setRefClass(".OutputSaver_FRD", + fields = list( + sdata = ".listBuilder", + gdata = ".listBuilder", + cdata = ".listBuilder", + bdata = ".listBuilder", + fid = "list", + testfun1 = "list", + pvfun1 = "list" + ), + methods = list( + initialize = function () { + sdata <<- .listBuilder$new(); + gdata <<- .listBuilder$new(); + cdata <<- .listBuilder$new(); + bdata <<- .listBuilder$new(); + fid <<- list(0); + testfun1 <<- list(0); + pvfun1 <<- list(0); + return(.self); + }, + start = function(filename, statistic_name, unused1, unused2, testfun, pvfun) { + testfun1 <<- list(testfun); + pvfun1 <<- list(pvfun); + if(length(filename) > 0) { + if(class(filename) == "character") { + fid <<- list(file(description = filename, open = "wt", blocking = FALSE, raw = FALSE), TRUE); + } else { + fid <<- list(filename, FALSE) + } + writeLines( paste("SNP\tgene\t",statistic_name,"\tp-value\tFDR", sep = ""), fid[[1]]); + } else { + fid <<- list(); + } + }, + update = function(spos, gpos, sta, beta = NULL) { + if(length(sta)>0) { + sdata$add(spos); + gdata$add(gpos); + cdata$add(sta ); + if(!is.null(beta )) + bdata$add(beta ); + } + return(.self); + }, + getResults = function( gene, snps, FDR_total_count) { + pvalues = NULL; + if(cdata$n > 0) { + tests = testfun1[[1]](cdata$unlist()); + cdata <<- .listBuilder$new(); + + pvalues = pvfun1[[1]](tests); + ord = order(pvalues); + + tests = tests[ord]; + pvalues = pvalues[ord]; + + FDR = pvalues * FDR_total_count / (1:length(pvalues)); + FDR[length(FDR)] = min(FDR[length(FDR)], 1); + FDR = rev(cummin(rev(FDR))); + + snps_names = snps$GetAllRowNames()[sdata$unlist()[ord]]; + sdata <<- .listBuilder$new(); + gene_names = gene$GetAllRowNames()[gdata$unlist()[ord]]; + gdata <<- .listBuilder$new(); + + beta = NULL; + if(bdata$n > 0) + beta = bdata$unlist()[ord]; + + if(length(fid)>0) { + step = 1000; ########### 100000 + for( part in 1:ceiling(length(FDR)/step) ) { + fr = (part-1)*step + 1; + to = min(part*step, length(FDR)); + dump = data.frame(snps_names[fr:to], + gene_names[fr:to], + if(is.null(beta)) tests[fr:to] else list(beta[fr:to],tests[fr:to]), + pvalues[fr:to], + FDR[fr:to], + row.names = NULL, + check.rows = FALSE, + check.names = FALSE, + stringsAsFactors = FALSE); + write.table(dump, file = fid[[1]], quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE); + } + } + } else { + cat("No significant associations were found.\n", file = if(length(fid)>0){fid[[1]]}else{""}); + } + if(length(fid)>0) { + if(fid[[2]]) { + close(fid[[1]]); + } + fid <<- list(); + } + + if(!is.null(pvalues)) { + eqtls = list( snps = snps_names, + gene = gene_names, + statistic = tests, + pvalue = pvalues, + FDR = FDR); + if(!is.null(beta)) + eqtls$beta = beta; + } else { + eqtls = list( snps = character(), + gene = character(), + beta = numeric(), + statistic = numeric(), + pvalue = numeric(), + FDR = numeric()); + } + return(list(eqtls = data.frame(eqtls))); + } + ) +) + + +.OutputSaver_direct <- setRefClass(".OutputSaver_direct", + fields = list( + gene_names = "character", + snps_names = "character", + fid = "list", + testfun1 = "list", + pvfun1 = "list" + ), + methods = list( + initialize = function() { + gene_names <<- character(0); + snps_names <<- character(0); + fid <<- list(0); + testfun1 <<- list(0); + pvfun1 <<- list(0); + return(.self); + }, + start = function(filename, statistic_name, snps, gene, testfun, pvfun) { + # I hope the program stops if it fails to open the file + if(class(filename) == "character") { + fid <<- list(file(description = filename, open = "wt", blocking = FALSE, raw = FALSE), TRUE); + } else { + fid <<- list(filename, FALSE) + } + writeLines(paste("SNP\tgene\t", statistic_name, "\tp-value", sep = ""), fid[[1]]); + gene_names <<- gene$GetAllRowNames(); + snps_names <<- snps$GetAllRowNames(); + testfun1 <<- list(testfun); + pvfun1 <<- list(pvfun); + }, + update = function(spos, gpos, sta, beta = NULL) { + if( length(sta) == 0 ) + return(); + sta = testfun1[[1]](sta); + lst = list(snps = snps_names[spos], gene = gene_names[gpos], beta = beta, statistic = sta, pvalue = pvfun1[[1]](sta)); + lst$beta = lst$beta; + + dump2 = data.frame(lst, row.names = NULL, check.rows = FALSE, check.names = FALSE, stringsAsFactors = FALSE); + write.table(dump2, file = fid[[1]], quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE); + }, + getResults = function(...) { + if(length(fid)>0) { + if(fid[[2]]) { + close(fid[[1]]); + } + fid <<- list(); + } + gene_names <<- character(0); + snps_names <<- character(0); + return(list()); + } + ) +) + +.my.pmin = function(x, val) { + # minimum "pmin" function that can handle empty array + if(length(x) == 0) { + return(x) + } else { + return(pmin.int(x,val)); + } +} + +.my.pmax = function(x, val) { + # minimum "pmin" function that can handle empty array + if(length(x) == 0) { + return(x) + } else { + return(pmax.int(x,val)); + } +} + +.pv.nz = function(x){return( .my.pmax(x,.Machine$double.xmin) )} + +.SetNanRowMean = function(x) { + if( any(is.na(x)) ) { + rowmean = rowMeans(x, na.rm = TRUE); + rowmean[ is.na(rowmean) ] = 0; + for( j in which(!complete.cases(x)) ) { + where1 = is.na( x[j, ] ); + x[j,where1] = rowmean[j]; + } + } + return(x); +} + +# .SNP_process_split_for_ANOVA = function(x) { +# # split into 2 dummy variables +# +# uniq = unique(c(x)); +# uniq = uniq[!is.na(uniq)]; +# +# if( length(uniq) > 3 ) { +# stop("More than three genotype categories is not handled by ANOVA"); +# } else if ( length(uniq) < 3 ) { +# uniq = c(uniq, min(uniq)-(1:(3-length(uniq)))); +# } +# +# freq = matrix(0, nrow(x), length(uniq)); +# for(i in 1:length(uniq)) { +# freq[ ,i] = rowSums(x==uniq[i], na.rm = TRUE); +# } +# +# md = apply(freq, 1, which.max); +# freq[ cbind(1:nrow(x),md) ] = -1; +# +# md = apply(freq, 1, which.max); # min(freq[cbind(1:nrow(slice),md)] - rowSums(select,na.rm = TRUE )) +# new_slice1 = (x == uniq[md]); +# new_slice1[is.na(new_slice1)] = 0; +# freq[ cbind(1:nrow(x),md) ] = -1; +# +# md = apply(freq,1,which.max); +# new_slice2 = (x == uniq[md]); +# new_slice2[ is.na(new_slice2) ] = 0; +# rez = vector("list", 2); +# rez[[1]] = new_slice1; +# rez[[2]] = new_slice2; +# return( rez ); +# } + +.SNP_process_split_for_ANOVA = function(x,n.groups) { + # split into 2 dummy variables (or more) + +# # Get the number of ANOVA groups +# n.groups = options('MatrixEQTL.ANOVA.categories')[[1]]; +# if( is.null(n.groups)) +# n.groups = 3; + + # Unique values in x (make sure it has length of n.groups); + uniq = unique(as.vector(x)); + uniq = uniq[!is.na(uniq)]; + if( length(uniq) > n.groups ) { + stop("More than declared number of genotype categories is detected by ANOVA"); + } else if ( length(uniq) < n.groups ) { + uniq = c(uniq, rep(min(uniq)-1, n.groups-length(uniq))); + } + + # Table of frequencies for each variable (row) + freq = matrix(0, nrow(x), n.groups); + for(i in 1:n.groups) { + freq[ ,i] = rowSums(x==uniq[i], na.rm = TRUE); + } + # remove NA's from x for convenience + x[is.na(x)] = min(uniq)-2; + + # Output list of matrices + rez = vector('list',n.groups-1); + + # Skip the most frequent value + md = apply(freq, 1, which.max); # most frequent value for each variable + freq[ cbind(1:nrow(x),md) ] = -1; + + # The rest form dumm + for(j in 1:(n.groups-1)){ + md = apply(freq, 1, which.max); + freq[ cbind(1:nrow(x),md) ] = -1; + rez[[j]] = (x == uniq[md]); + } + return( rez ); +} + +Matrix_eQTL_engine = function( + snps, + gene, + cvrt = SlicedData$new(), + output_file_name, + pvOutputThreshold = 1e-5, + useModel = modelLINEAR, + errorCovariance = numeric(), + verbose = TRUE, + pvalue.hist = FALSE, + min.pv.by.genesnp = FALSE, + noFDRsaveMemory = FALSE) { + rez = Matrix_eQTL_main( + snps = snps, + gene = gene, + cvrt = cvrt, + output_file_name = output_file_name, + pvOutputThreshold = pvOutputThreshold, + useModel = useModel, + errorCovariance = errorCovariance, + verbose = verbose, + pvalue.hist = pvalue.hist, + min.pv.by.genesnp = min.pv.by.genesnp, + noFDRsaveMemory = noFDRsaveMemory); + return( rez ); +} + +Matrix_eQTL_main = function( + snps, + gene, + cvrt = SlicedData$new(), + output_file_name = "", + pvOutputThreshold = 1e-5, + useModel = modelLINEAR, + errorCovariance = numeric(), + verbose = TRUE, + output_file_name.cis = "", + pvOutputThreshold.cis = 0, + snpspos = NULL, + genepos = NULL, + cisDist = 1e6, + pvalue.hist = FALSE, + min.pv.by.genesnp = FALSE, + noFDRsaveMemory = FALSE) { + ################################# Basic variable checks ################################# + { + # status("Performing basic checks of the input variables"); + stopifnot( "SlicedData" %in% class(gene) ); + stopifnot( "SlicedData" %in% class(snps) ); + stopifnot( "SlicedData" %in% class(cvrt) ); + + # Check dimensions + if( min(snps$nRows(),snps$nCols()) == 0 ) + stop("Empty genotype dataset"); + if( min(gene$nRows(),gene$nCols()) == 0 ) + stop("Empty expression dataset"); + if( snps$nCols() != gene$nCols() ) + stop("Different number of samples in the genotype and gene expression files"); + if( cvrt$nRows()>0 ) { + if( snps$nCols() != cvrt$nCols() ) + stop("Wrong number of samples in the matrix of covariates"); + } + + stopifnot( class(pvOutputThreshold) == "numeric" ); + stopifnot( length(pvOutputThreshold) == 1 ); + stopifnot( pvOutputThreshold >= 0 ); + stopifnot( pvOutputThreshold <= 1 ); + + stopifnot( class(noFDRsaveMemory) == "logical" ); + stopifnot( length(noFDRsaveMemory) == 1 ); + + if( pvOutputThreshold > 0 ) { + stopifnot( !((length(output_file_name) == 0) && noFDRsaveMemory) ) + stopifnot( length(output_file_name) <= 1 ); + if( length(output_file_name) == 1 ) { + stopifnot( class(output_file_name) %in% c("character","connection") ); + } + } + + stopifnot( class(pvOutputThreshold.cis) == "numeric" ); + stopifnot( length(pvOutputThreshold.cis) == 1 ); + stopifnot( pvOutputThreshold.cis >= 0 ); + stopifnot( pvOutputThreshold.cis <= 1 ); + stopifnot( !((pvOutputThreshold > 0) & (pvOutputThreshold.cis > 0) & (pvOutputThreshold > pvOutputThreshold.cis)) ); + stopifnot( (pvOutputThreshold > 0) | (pvOutputThreshold.cis > 0) ); + + stopifnot( class(useModel) == class(modelLINEAR) ); + stopifnot( length(useModel) == 1 ); + stopifnot( useModel %in% c(modelLINEAR, modelANOVA, modelLINEAR_CROSS) ); + if( useModel %in% c(modelLINEAR, modelLINEAR_CROSS) ) { + if( snps$nCols() <= cvrt$nRows() + 1 + 1) { + stop('The number of covariates exceeds the number of samples.\nLinear regression can not be fit.') + } + } + if( useModel == modelLINEAR_CROSS ) { + if( cvrt$nRows() == 0 ) { + stop( "Model \"modelLINEAR_CROSS\" requires at least one covariate" ); + } + } + if( useModel == modelANOVA ) { + n.anova.groups = getOption('MatrixEQTL.ANOVA.categories', 3); + stopifnot( n.anova.groups == floor(n.anova.groups) ); + stopifnot( n.anova.groups >= 3 ); +# stopifnot( n.anova.groups < snps$nCols() - cvrt$nRows() - 2 ); + if( snps$nCols() <= cvrt$nRows() + n.anova.groups) { + stop('The number of covariates exceeds the number of samples.\nLinear regression (ANOVA) can not be fit.') + } + } + + stopifnot( class(verbose) == "logical" ); + stopifnot( length(verbose) == 1 ); + + stopifnot( class(min.pv.by.genesnp) == "logical" ); + stopifnot( length(min.pv.by.genesnp) == 1 ); + + if( pvOutputThreshold.cis > 0 ) { + stopifnot( !((length(output_file_name.cis) == 0) && noFDRsaveMemory) ) + stopifnot( length(output_file_name.cis) <= 1 ); + if( length(output_file_name.cis) == 1 ) { + stopifnot( class(output_file_name.cis) %in% c("character","connection") ); + } + +# stopifnot( class(output_file_name.cis) == "character" ); +# stopifnot( length(output_file_name.cis) == 1 ); + stopifnot( class(snpspos) == "data.frame" ); + stopifnot( ncol(snpspos) == 3 ); + stopifnot( nrow(snpspos) > 0 ); + stopifnot( class(snpspos[1,3]) %in% c("integer", "numeric") ) + stopifnot( !any(is.na(snpspos[,3])) ) + stopifnot( class(genepos) == "data.frame" ); + stopifnot( ncol(genepos) == 4 ); + stopifnot( nrow(genepos) > 0 ); + stopifnot( class(genepos[1,3]) %in% c("integer", "numeric") ) + stopifnot( class(genepos[1,4]) %in% c("integer", "numeric") ) + stopifnot( !any(is.na(genepos[,3])) ) + stopifnot( !any(is.na(genepos[,4])) ) + stopifnot( nzchar(output_file_name.cis) ) + } + + if( pvOutputThreshold > 0 ) { + stopifnot( nzchar(output_file_name) ) + } + + stopifnot( class(errorCovariance) %in% c("numeric", "matrix") ); + errorCovariance = as.matrix(errorCovariance); + if(length(errorCovariance)>0) { + if( nrow(errorCovariance) != ncol(errorCovariance) ) { + stop("The covariance matrix is not square"); + } + if( nrow(errorCovariance) != snps$nCols() ) { + stop("The covariance matrix size does not match the number of samples"); + } + if( !all(errorCovariance == t(errorCovariance)) ) { + stop("The covariance matrix is not symmetric"); + } + } + } + ################################# Initial setup ######################################### + { + gene.std = .listBuilder$new(); + snps.std = .listBuilder$new(); + + dont.clone.gene = getOption('MatrixEQTL.dont.preserve.gene.object', FALSE) + if(is.null(dont.clone.gene)) + dont.clone.gene = FALSE; + + if( !dont.clone.gene ) + gene = gene$Clone(); + # snps = snps$Clone(); # snps is read only + cvrt = cvrt$Clone(); + + params = list( + output_file_name = output_file_name, + pvOutputThreshold = pvOutputThreshold, + useModel = useModel, + errorCovariance = errorCovariance , + verbose = verbose, + output_file_name.cis = output_file_name.cis, + pvOutputThreshold.cis = pvOutputThreshold.cis, + cisDist = cisDist , + pvalue.hist = pvalue.hist, + min.pv.by.genesnp = min.pv.by.genesnp); + + if( verbose ) { + lastTime = 0; + status <- function(text) { + # gc(); + newTime = proc.time()[3]; + if(lastTime != 0) { + cat("Task finished in ", newTime-lastTime, " seconds\n"); + } + cat(text,"\n"); + lastTime <<- newTime; + unused = flush.console(); + } + } else { + status = function(text){} + } + start.time = proc.time()[3]; + } + ################################# Error covariance matrix processing #################### + { + if( length(errorCovariance) > 0 ) { + status("Processing the error covariance matrix"); + eig = eigen(errorCovariance, symmetric = TRUE) + d = eig$values; + v = eig$vectors; + # errorCovariance == v %*% diag(d) %*% t(v) + # errorCovariance^0.5 == v*sqrt(d)*v" (no single quotes anymore) + # errorCovariance^(-0.5) == v*diag(1./sqrt(diag(d)))*v" + if( any(d<=0) ) { + stop("The covariance matrix is not positive definite"); + } + correctionMatrix = v %*% diag(1./sqrt(d)) %*% t(v); + rm( eig, v, d, errorCovariance ) + } else { + rm( errorCovariance ); + correctionMatrix = numeric(); + } + } + ################################# Matching gene and SNPs locations ###################### + if( pvOutputThreshold.cis > 0 ) { + status("Matching data files and location files") + + # names in the input data + gene_names = gene$GetAllRowNames(); + snps_names = snps$GetAllRowNames(); + + # gene range, set: left<right + if(any(genepos[,3] > genepos[,4])) { + temp3 = genepos[,3]; + temp4 = genepos[,4]; + genepos[,3] = pmin(temp3,temp4); + genepos[,4] = pmax(temp3,temp4); + rm(temp3, temp4); + } + + # match with the location data + genematch = match( gene_names, genepos[ ,1], nomatch = 0L); + usedgene = matrix(FALSE, nrow(genepos), 1); # genes in "genepos" that are matching "gene_names" + usedgene[ genematch ] = TRUE; + if( !any(genematch) ) { + stop("Gene names do not match those in the gene location file."); + } + cat( sum(genematch>0), "of", length(gene_names), " genes matched\n"); + + + snpsmatch = match( snps_names, snpspos[ ,1], nomatch = 0L); + usedsnps = matrix(FALSE, nrow(snpspos),1); + usedsnps[ snpsmatch ] = TRUE; + if( !any(snpsmatch) ) { + stop("SNP names do not match those in the SNP location file."); + } + cat( sum(snpsmatch>0), "of", length(snps_names), " SNPs matched\n"); + + # list used chr names + chrNames = unique(c( as.character(unique(snpspos[usedsnps,2])), + as.character(unique(genepos[usedgene,2])) )) + chrNames = chrNames[ sort.list( suppressWarnings(as.integer(chrNames)), + method = "radix", na.last = TRUE ) ]; + # match chr names + genechr = match(genepos[,2],chrNames); + snpschr = match(snpspos[,2],chrNames); + + # max length of a chromosome + chrMax = max( snpspos[usedsnps, 3], genepos[usedgene, 4], na.rm = TRUE) + cisDist; + + # Single number location for all rows in "genepos" and "snpspos" + genepos2 = as.matrix(genepos[ ,3:4, drop = FALSE] + (genechr-1)*chrMax); + snpspos2 = as.matrix(snpspos[ ,3 , drop = FALSE] + (snpschr-1)*chrMax); + + # the final location arrays; + snps_pos = matrix(0,length(snps_names),1); + snps_pos[snpsmatch>0, ] = snpspos2[snpsmatch, , drop = FALSE]; + snps_pos[rowSums(is.na(snps_pos))>0, ] = 0; + snps_pos[snps_pos==0] = (length(chrNames)+1) * (chrMax+cisDist); + rm(snps_names, snpsmatch, usedsnps, snpschr, snpspos2) + + gene_pos = matrix(0,length(gene_names),2); + gene_pos[genematch>0, ] = genepos2[genematch, , drop = FALSE]; + gene_pos[rowSums(is.na(gene_pos))>0, ] = 0; + gene_pos[gene_pos==0] = (length(chrNames)+2) * (chrMax+cisDist); + rm(gene_names, genematch, usedgene, genechr, genepos2) + rm(chrNames, chrMax); + + if( is.unsorted(snps_pos) ) { + status("Reordering SNPs\n"); + ordr = sort.list(snps_pos); + snps$RowReorder(ordr); + snps_pos = snps_pos[ordr, , drop = FALSE]; + rm(ordr); + } + if( is.unsorted(rowSums(gene_pos)) ) { + status("Reordering genes\n"); + ordr = sort.list(rowSums(gene_pos)); + gene$RowReorder(ordr); + gene_pos = gene_pos[ordr, , drop = FALSE]; + rm(ordr); + } + + # Slice it back. + geneloc = vector("list", gene$nSlices()) + gene_offset = 0; + for(gc in 1:gene$nSlices()) { + nr = length(gene$rowNameSlices[[gc]]); + geneloc[[gc]] = gene_pos[gene_offset + (1:nr), , drop = FALSE]; + gene_offset = gene_offset + nr; + } + rm(gc, gene_offset, gene_pos); + + snpsloc = vector("list", snps$nSlices()) + snps_offset = 0; + for(sc in 1:snps$nSlices()) { + nr = length(snps$rowNameSlices[[sc]]); + snpsloc[[sc]] = snps_pos[snps_offset + (1:nr), , drop = FALSE]; + snps_offset = snps_offset + nr; + } + rm(nr, sc, snps_offset, snps_pos); + } + ################################# Covariates processing ################################# + { + status("Processing covariates"); + if( useModel == modelLINEAR_CROSS ) { + last.covariate = as.vector(tail( cvrt$getSlice(cvrt$nSlices()), n = 1)); + } + if( cvrt$nRows()>0 ) { + cvrt$SetNanRowMean(); + cvrt$CombineInOneSlice(); + cvrt = rbind(matrix(1,1,snps$nCols()),cvrt$getSlice(1)); + } else { + cvrt = matrix(1,1,snps$nCols()); + } + # Correct for the error covariance structure + if( length(correctionMatrix)>0 ) { + cvrt = cvrt %*% correctionMatrix; + } + # Orthonormalize covariates + # status("Orthonormalizing covariates"); + q = qr(t(cvrt)); + if( min(abs(diag(qr.R(q)))) < .Machine$double.eps * snps$nCols() ) { + stop("Colinear or zero covariates detected"); + } + cvrt = t( qr.Q(q) ); + rm( q ); + } + ################################# Gene expression processing ############################ + { + status("Processing gene expression data (imputation, residualization, etc.)"); + # Impute gene expression + gene$SetNanRowMean(); + # Correct for the error covariance structure + if( length(correctionMatrix)>0 ) { + gene$RowMatrixMultiply(correctionMatrix); + } + # Orthogonolize expression w.r.t. covariates + # status("Orthogonolizing expression w.r.t. covariates"); + gene_offsets = double(gene$nSlices()+1); + for( sl in 1:gene$nSlices() ) { + slice = gene$getSlice(sl); + gene_offsets[sl+1] = gene_offsets[sl] + nrow(slice); + rowsq1 = rowSums(slice^2); + slice = slice - tcrossprod(slice,cvrt) %*% cvrt; + rowsq2 = rowSums(slice^2); + # kill rows colinear with the covariates + delete.rows = (rowsq2 <= rowsq1 * .Machine$double.eps ); + slice[delete.rows] = 0; + div = sqrt( rowSums(slice^2) ); + div[ div == 0 ] = 1; + gene.std$set(sl, div); + gene$setSlice(sl, slice / div); + } + rm(rowsq1, rowsq2, delete.rows, div); + rm( sl, slice ); + #gene$RowRemoveZeroEps(); + } + ################################# snps_process, testfun, pvfun, threshfun, afun ######## + { + # snps_process - preprocess SNPs slice + # + # afun --- abs for signed stats, identity for non-negative + # threshfun --- internal stat threshold for given p-value + # testfun --- t or F statistic from the internal one + # pvfun --- p-value from the t or F statistic + + nSamples = snps$nCols(); + nGenes = gene$nRows(); + nSnps = snps$nRows(); + nCov = nrow(cvrt); + # nVarTested = length(snps_list); # set in case(useModel) + # dfNull = nSamples - nCov; + # d.f. of the full model + betafun = NULL; + + if( useModel == modelLINEAR ) { + snps_process = function(x) { + return( list(.SetNanRowMean(x)) ); + }; + nVarTested = 1; + dfFull = nSamples - nCov - nVarTested; + statistic.fun = function(mat_list) { + return( mat_list[[1]] ); + } + afun = function(x) {return(abs(x))}; + threshfun = function(pv) { + thr = qt(pv/2, dfFull, lower.tail = FALSE); + thr = thr^2; + thr = sqrt( thr / (dfFull + thr) ); + thr[pv >= 1] = 0; + thr[pv <= 0] = 1; + return( thr ); + } + testfun = function(x) { return( x * sqrt( dfFull / (1 - .my.pmin(x^2,1)))); } + pvfun = function(x) { return( .pv.nz(pt(-abs(x),dfFull)*2)); } + thresh.cis = threshfun(pvOutputThreshold.cis); + thresh = threshfun(pvOutputThreshold); + betafun = function(stat, ss, gg, select) { + return(stat * gene.std$get(gg)[select[,1]] / snps.std$get(ss)[select[,2]]); + } + } else + if( useModel == modelANOVA ) { + snps_process = function(x).SNP_process_split_for_ANOVA(x,n.anova.groups); + nVarTested = n.anova.groups - 1; + dfFull = nSamples - nCov - nVarTested; +# statistic.fun = function(mat_list) { +# return( mat_list[[1]]^2 + mat_list[[2]]^2 ); +# } + statistic.fun = function(mat_list) { + x = mat_list[[1]]^2; + for( j in 2:length(mat_list) ) + x = x + mat_list[[j]]^2; + return( x ); + } + afun = identity; + threshfun = function(pv) { + thr = qf(pv, nVarTested, dfFull, lower.tail = FALSE); + thr = thr / (dfFull/nVarTested + thr); + thr[pv >= 1] = 0; + thr[pv <= 0] = 1; + return( thr ); + } + testfun = function(x) { return( x / (1 - .my.pmin(x,1)) * (dfFull/nVarTested) ); } + pvfun = function(x) { return( .pv.nz(pf(x, nVarTested, dfFull, lower.tail = FALSE)) ); } + thresh.cis = threshfun(pvOutputThreshold.cis); + thresh = threshfun(pvOutputThreshold); + } else + if( useModel == modelLINEAR_CROSS ) { + last.covariate = as.vector( last.covariate ); + snps_process = .SNP_process_split_for_LINEAR_CROSS = function(x) { + out = vector("list", 2); + out[[1]] = .SetNanRowMean(x); + out[[2]] = t( t(out[[1]]) * last.covariate ); + return( out ); + }; + nVarTested = 1; + dfFull = nSamples - nCov - nVarTested - 1; + statistic.fun = function(mat_list) { + return( mat_list[[2]] / sqrt(1 - mat_list[[1]]^2) ); + } + afun = function(x) {return(abs(x))}; + threshfun = function(pv) { + thr = qt(pv/2, dfFull, lower.tail = FALSE); + thr = thr^2; + thr = sqrt( thr / (dfFull + thr) ); + thr[pv >= 1] = 0; + thr[pv <= 0] = 1; + return( thr ); + } + testfun = function(x) { return( x * sqrt( dfFull / (1 - .my.pmin(x^2,1)))); } + pvfun = function(x) { return( .pv.nz(pt(-abs(x),dfFull)*2 )); } + thresh.cis = threshfun(pvOutputThreshold.cis); + thresh = threshfun(pvOutputThreshold); + betafun = function(stat, ss, gg, select) { + return(stat * gene.std$get(gg)[select[,1]] / snps.std$get(ss)[select[,2]]); + } + } + params$dfFull = dfFull; + } + ################################# Saver class(es) creation ############################## + { + status("Creating output file(s)"); + if(noFDRsaveMemory) { + if( pvOutputThreshold > 0 ) { + saver.tra = .OutputSaver_direct$new(); + } + if( pvOutputThreshold.cis > 0 ) { + saver.cis = .OutputSaver_direct$new(); + } + } else { + if( pvOutputThreshold > 0 ) { + saver.tra = .OutputSaver_FRD$new(); + } + if( pvOutputThreshold.cis > 0 ) { + saver.cis = .OutputSaver_FRD$new(); + } + } + if( pvOutputThreshold > 0 ) + if( pvOutputThreshold * gene$nRows() * snps$nRows() > 1000000 ) + if(!noFDRsaveMemory) + cat('Warning: pvOutputThreshold may be too large.\nExpected number of findings > ', + pvOutputThreshold * gene$nRows() * snps$nRows(),'\n'); + if( (useModel == modelLINEAR) || (useModel == modelLINEAR_CROSS) ) { + statistic_name = "t-stat"; + } else if( useModel == modelANOVA ) { + statistic_name = "F-test"; + } + if(!is.null(betafun)) + statistic_name = paste('beta\t',statistic_name, sep=''); + if( pvOutputThreshold > 0 ) + saver.tra$start(output_file_name, statistic_name, snps, gene, testfun, pvfun); + if( pvOutputThreshold.cis > 0 ) + saver.cis$start(output_file_name.cis, statistic_name, snps, gene, testfun, pvfun); + rm( statistic_name ); + } + ################################# Some useful functions ################################# + { + orthonormalize.snps = function(cursnps, ss) { + for(p in 1:length(cursnps)) { + if(length(correctionMatrix)>0) { + cursnps[[p]] = cursnps[[p]] %*% correctionMatrix; + } + cursnps[[p]] = cursnps[[p]] - tcrossprod(cursnps[[p]],cvrt) %*% cvrt; + for(w in .seq(1,p-1L)) + cursnps[[p]] = cursnps[[p]] - rowSums(cursnps[[p]]*cursnps[[w]]) * cursnps[[w]]; + div = sqrt( rowSums(cursnps[[p]]^2) ); + div[ div == 0 ] = 1; + cursnps[[p]] = cursnps[[p]]/div; + } + snps.std$set(ss, div); + return(cursnps); + } +# if( pvOutputThreshold.cis > 0 ) { +# is.cis.pair = function(gg,ss) { +# return(!( ( snpsloc[[ss]][1, 1] - tail( geneloc[[gg]][ , 2], n = 1L) > cisDist) | +# ( geneloc[[gg]][1, 1] - tail( snpsloc[[ss]] , n = 1L) > cisDist) ) ); +# } +# } + if( pvOutputThreshold.cis > 0 ) { +# sn.l = sapply(snpsloc, function(x)x[1] ); +# sn.r = sapply(snpsloc, function(x)tail(x,1) ); +# ge.l = sapply(geneloc, function(x)x[1,1] ); +# ge.r = sapply(geneloc, function(x)x[nrow(x) , 2] ); + sn.l = sapply(snpsloc, '[', 1 ); + sn.r = sapply(snpsloc, tail, 1 ); + ge.l = sapply(geneloc, '[', 1, 1 ); + ge.r = sapply( lapply(geneloc, tail.matrix, 1 ), '[', 2); + gg.1 = findInterval( sn.l , ge.r + cisDist +1) + 1; +# cat(gg.1,'\n') + gg.2 = findInterval( sn.r , ge.l - cisDist ); +# cat(gg.2,'\n') + rm(sn.l, sn.r, ge.l, ge.r); + } + + } + ################################# Prepare counters and histogram bins ################### + { + pvbins = NULL; # bin edges for p-values + statbins = 0; # bin edges for the test statistic (|t| or F) + do.hist = FALSE; + if( length(pvalue.hist) == 1 ) { + if(pvalue.hist == "qqplot") { + pvbins = c(0, 10^rev(seq(0, log10(.Machine$double.xmin)-1, -0.05))); + } else + if( is.numeric(pvalue.hist) ) { + pvbins = seq(from = 0, to = 1, length.out = pvalue.hist+1); + } else + if( pvalue.hist == TRUE ) { + pvbins = seq(from = 0, to = 1, length.out = 100+1); + } + } else + if( is.numeric(pvalue.hist) && (length(pvalue.hist) > 1) ) { + pvbins = pvalue.hist; + } + if( is.null(pvbins) && (pvalue.hist != FALSE) ) { + stop("Wrong value of pvalue.hist. Must be FALSE, TRUE, \"qqplot\", or numerical"); + } + do.hist = !is.null(pvbins); + if( do.hist ) { + pvbins = sort(pvbins); + statbins = threshfun(pvbins); + if( pvOutputThreshold > 0) { + hist.all = .histogrammer$new(pvbins, statbins); + } + if( pvOutputThreshold.cis > 0) { + hist.cis = .histogrammer$new(pvbins, statbins); + } + } + rm( pvbins, statbins); + if(min.pv.by.genesnp) { + if( pvOutputThreshold > 0) { + minpv.tra = .minpvalue$new(snps,gene); + } + if( pvOutputThreshold.cis > 0) { + minpv.cis = .minpvalue$new(snps,gene); + } + } + } + ################################# Main loop ############################################# + { + beta = NULL; + n.tests.all = 0; + n.tests.cis = 0; + n.eqtls.tra = 0; + n.eqtls.cis = 0; + + status("Performing eQTL analysis"); + # ss = 1; gg = 1; + # ss = snps$nSlices(); gg = gene$nSlices(); + + snps_offset = 0; + for(ss in 1:snps$nSlices()) { +# for(ss in 1:min(2,snps$nSlices())) { #for debug + cursnps = NULL; + nrcs = nrow(snps$getSliceRaw(ss)); + + # loop only through the useful stuff + for(gg in if(pvOutputThreshold>0){1:gene$nSlices()}else{.seq(gg.1[ss],gg.2[ss])} ) { + gene_offset = gene_offsets[gg]; + curgene = gene$getSlice(gg); + nrcg = nrow(curgene); + if(nrcg == 0) next; + + rp = ""; + + statistic = NULL; + select.cis.raw = NULL; + ## do cis analysis +# if( (pvOutputThreshold.cis > 0) && ( is.cis.pair(gg, ss) ) ) { + if( (pvOutputThreshold.cis > 0) && (gg >= gg.1[ss]) && (gg <= gg.2[ss]) ) { + + if( is.null( statistic ) ) { + if( is.null( cursnps ) ) { + cursnps = orthonormalize.snps( snps_process( snps$getSlice(ss) ), ss ); + } + mat = vector("list", length(cursnps)); + for(d in 1:length(cursnps)) { + mat[[d]] = tcrossprod(curgene, cursnps[[d]]); + } + statistic = statistic.fun( mat ); + astatistic = afun(statistic); +# rm(mat); + } + +# sn.l = findInterval(geneloc[[gg]][ ,1] - cisDist-1 +1 , snpsloc[[ss]]); +# sn.r = findInterval(geneloc[[gg]][ ,2] + cisDist -1 , snpsloc[[ss]]); + sn.l = findInterval(geneloc[[gg]][ ,1] - cisDist-1, snpsloc[[ss]]); + sn.r = findInterval(geneloc[[gg]][ ,2] + cisDist, snpsloc[[ss]]); + xx = unlist(lapply(which(sn.r>sn.l),FUN=function(x){(sn.l[x]:(sn.r[x]-1))*nrow(statistic)+x})) + select.cis.raw = xx[ astatistic[xx] >= thresh.cis ]; + select.cis = arrayInd(select.cis.raw, dim(statistic)) + + n.tests.cis = n.tests.cis + length(xx); + n.eqtls.cis = n.eqtls.cis + length(select.cis.raw); + + if( do.hist ) + hist.cis$update(astatistic[xx]); + + if( min.pv.by.genesnp ) { + # minpv.cis$updatecis(ss, gg, arrayInd(xx, dim(statistic)), astatistic[xx]) + temp = double(length(astatistic)); + dim(temp) = dim(astatistic); + temp[xx] = astatistic[xx]; + minpv.cis$update(ss, gg, temp); + } + + if(!is.null(betafun)) + beta = betafun(mat[[length(mat)]][select.cis.raw], ss, gg, select.cis); + + saver.cis$update( snps_offset + select.cis[ , 2], + gene_offset + select.cis[ , 1], + statistic[select.cis.raw], + beta); + + # statistic.select.cis = statistic[ select.cis ]; + # test = testfun( statistic.select.cis ); + # pv = pvfun(test); + # Saver.cis$WriteBlock( cbind(snps_offset + select.cis[ , 2], gene_offset + select.cis[ , 1], test, pv) ); + # counter.cis$Update(gg, ss, select.cis, pv, n.tests = length(xx), if(do.hist) afun(statistic[xx]) ) + rp = paste(rp, ", ", formatC(n.eqtls.cis, big.mark=",", format = "f", digits = 0), " cis-eQTLs", sep = ""); + } + ## do trans/all analysis + if(pvOutputThreshold>0) { + if( is.null( statistic ) ) { + if( is.null( cursnps ) ) { + cursnps = orthonormalize.snps( snps_process( snps$getSlice(ss) ), ss ); + } + mat = vector("list", length(cursnps)); + for(d in 1:length(cursnps)) { + mat[[d]] = tcrossprod(curgene, cursnps[[d]]); + } + statistic = statistic.fun( mat ); + astatistic = afun(statistic); +# rm(mat); + } + + if( do.hist ) + hist.all$update(astatistic); + + if(!is.null(select.cis.raw)) + astatistic[xx] = -1; + # select.tra.raw = select.tra.raw[!(select.tra.raw %in% select.cis.raw)]; + + select.tra.raw = which( astatistic >= thresh); + select.tra = arrayInd(select.tra.raw, dim(statistic)) + + n.eqtls.tra = n.eqtls.tra + length(select.tra.raw); + n.tests.all = n.tests.all + length(statistic); + + if(!is.null(betafun)) + beta = betafun(mat[[length(mat)]][select.tra.raw], ss, gg, select.tra); + + saver.tra$update( snps_offset + select.tra[ , 2], + gene_offset + select.tra[ , 1], + statistic[select.tra.raw], + beta); + + if( min.pv.by.genesnp ) + minpv.tra$update(ss, gg, astatistic) + + # statistic.select.tra = statistic[ select.tra ]; + # test = testfun( statistic.select.tra ); + # pv = pvfun( test ); + # Saver$WriteBlock( cbind( snps_offset + select.tra[ , 2], gene_offset + select.tra[ , 1], test, pv) ); + # counter$Update(gg, ss, select.tra, pv, n.tests = nrcs*nrcg, if(do.hist) afun(statistic) ) + rp = paste(rp, ", ", formatC(n.eqtls.tra, big.mark=",", format = "f", digits = 0), if(pvOutputThreshold.cis > 0)" trans-"else" ","eQTLs", sep = "") + } + + #gene_offset = gene_offset + nrcg; + if( !is.null(statistic) ) { + per = 100*(gg/gene$nSlices() + ss-1) / snps$nSlices(); + cat( formatC(floor(per*100)/100, format = "f", width = 5, digits = 2), "% done" , rp, "\n", sep = ""); + flush.console(); + } + } # gg in 1:gene$nSlices() + snps_offset = snps_offset + nrow(snps$getSliceRaw(ss)); + } # ss in 1:snps$nSlices() + } + ################################# Results collection #################################### + { + rez = list(time.in.sec = proc.time()[3] - start.time); + rez$param = params; + + if(pvOutputThreshold.cis > 0) { + rez.cis = list(ntests = n.tests.cis, neqtls = n.eqtls.cis); + rez.cis = c(rez.cis, saver.cis$getResults( gene, snps, n.tests.cis) ); + if(do.hist) + rez.cis = c(rez.cis, hist.cis$getResults() ); + if(min.pv.by.genesnp) + rez.cis = c(rez.cis, minpv.cis$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); + } + + if(pvOutputThreshold>0) { + rez.all = list(ntests = n.tests.all, neqtls = n.eqtls.tra + n.eqtls.cis); + if(pvOutputThreshold.cis > 0) { + rez.tra = list(ntests = n.tests.all - n.tests.cis, neqtls = n.eqtls.tra); + rez.tra = c(rez.tra, saver.tra$getResults( gene, snps, n.tests.all - n.tests.cis) ); + } else { + rez.all = c(rez.all, saver.tra$getResults( gene, snps, n.tests.all ) ); + } + if(do.hist) { + rez.all = c(rez.all, hist.all$getResults() ); + if(pvOutputThreshold.cis > 0) { + rez.tra$hist.bins = rez.all$hist.bins; + rez.tra$hist.counts = rez.all$hist.counts - rez.cis$hist.counts; + } + } + if(min.pv.by.genesnp) { + if(pvOutputThreshold.cis > 0) { + rez.tra = c(rez.tra, minpv.tra$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); + } else { + rez.all = c(rez.all, minpv.tra$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); + } + } + } + + if(exists('rez.all')>0) + rez$all = rez.all; + if(exists('rez.tra')>0) + rez$trans = rez.tra; + if(exists('rez.cis')>0) + rez$cis = rez.cis; + + class(rez) = c(class(rez),"MatrixEQTL"); + status(""); + } +# cat('s std ',snps.std$get(1),'\n'); +# cat('g std ',gene.std$get(1),'\n'); + ################################# Results collection #################################### + return(rez); +} + +.histme = function(m, name1, name2, ...) { + cnts = m$hist.counts; + bins = m$hist.bins; + ntst = m$ntests; + centers = 0.5 * (bins[-1L] + bins[-length(bins)]); + density = 0.5 / (bins[-1L] - centers) * cnts / ntst; + ntext = paste("P-value distribution for ", name1, formatC(ntst, big.mark=",", format = "f", digits = 0), name2, " gene-SNP pairs ",sep=""); + r = structure(list(breaks = bins, counts = cnts, density = density, + mids = centers, equidist = FALSE), class = "histogram"); + plot(r, main = ntext, ylab = "Density", xlab = "P-values", ...) + abline( h = 1, col = "blue"); + return(invisible()); +} + +.qqme = function(m, lcol, cex, pch, ...) { + cnts = m$hist.counts; + bins = m$hist.bins; + ntst = m$ntests; + + cusu = cumsum(cnts) / ntst; + ypos = bins[-1][is.finite(cusu)]; + xpos = cusu[is.finite(cusu)]; + lines(-log10(xpos), -log10(ypos), col = lcol, ...); +# lines(xpos, ypos, col = lcol, ...); + if(length(m$eqtls$pvalue)==0) + return(); + ypvs = -log10(m$eqtls$pvalue); + xpvs = -log10(1:length(ypvs) / ntst); + if(length(ypvs) > 1000) { + # need to filter a bit, make the plotting faster + levels = as.integer( xpvs/xpvs[1] * 1e3); + keep = c(TRUE, diff(levels)!=0); + levels = as.integer( ypvs/ypvs[1] * 1e3); + keep = keep | c(TRUE, diff(levels)!=0); + ypvs = ypvs[keep]; + xpvs = xpvs[keep]; + rm(keep) + } + points(xpvs, ypvs, col = lcol, pch = pch, cex = cex, ...); +} + +plot.MatrixEQTL = function(x, cex = 0.5, pch = 19, xlim = NULL, ylim = NULL, ...) { + if( x$param$pvalue.hist == FALSE ) { + warning("Cannot plot p-value distribution: the information was not recorded.\nUse pvalue.hist!=FALSE."); + return(invisible()); + } + if( x$param$pvalue.hist == "qqplot" ) { + xmin = 1/max(x$cis$ntests, x$all$ntests); + ymax = NULL; + if(!is.null(ylim)) { + ymax = ylim[2]; + } else { + ymax = -log10(min( + x$cis$eqtls$pvalue[1], x$cis$hist.bins[ c(FALSE,x$cis$hist.counts>0)][1], + x$all$eqtls$pvalue[1], x$all$hist.bins[ c(FALSE,x$all$hist.counts>0)][1], + x$trans$eqtls$pvalue[1], x$trans$hist.bins[c(FALSE,x$trans$hist.counts>0)][1], + na.rm = TRUE))+0.1; + } + if(ymax == 0) { + ymax = -log10(.Machine$double.xmin) + } + if(!is.null(ymax)) + ylim = c(0,ymax); + + if(is.null(xlim)) + xlim = c(0, -log10(xmin/1.5)); + + plot(numeric(),numeric(), xlab = "-Log10(p-value), theoretical", + ylab = "-Log10(p-value), observed", + xlim = c(0, -log10(xmin/1.5)), + ylim = ylim, + xaxs="i", yaxs="i", ...); + lines(c(0,1e3), c(0,1e3), col = "gray"); + if((x$param$pvOutputThreshold > 0) && (x$param$pvOutputThreshold.cis > 0)) { + .qqme( x$cis, "red", cex, pch, ...); + .qqme( x$trans, "blue", cex, pch, ...); + title(paste("QQ-plot for", + formatC(x$cis$ntests, big.mark=",", format = "f", digits = 0), + "local and", + formatC(x$trans$ntests, big.mark=",", format = "f", digits = 0), + "distant gene-SNP p-values")); + lset = c(1,2,4); + } else + if(x$param$pvOutputThreshold.cis > 0) { + .qqme(x$cis, "red", cex, pch, ...); + title(paste("QQ-plot for", + formatC(x$cis$ntests, big.mark=",", format = "f", digits = 0), + "local gene-SNP p-values")); + lset = c(1,4); + } else { + .qqme(x$all, "blue", cex, pch, ...); + title(paste("QQ-plot for all", + formatC(x$all$ntests, big.mark=",", format = "f", digits = 0), + "gene-SNP p-values")); + lset = c(3,4); + } + legend("topleft", + c("Local p-values","Distant p-values","All p-values","diagonal")[lset], + col = c("red","blue","blue","gray")[lset], + text.col = c("red","blue","blue","gray")[lset], + pch = 20, lwd = 1, pt.cex = c(1,1,1,0)[lset]) + } else { + if((x$param$pvOutputThreshold > 0) && (x$param$pvOutputThreshold.cis > 0)) { + par(mfrow=c(2,1)); + .histme(x$cis, "", " local", ...); + tran = list(hist.counts = x$all$hist.counts - x$cis$hist.counts, + hist.bins = x$all$hist.bins, + ntests = x$all$ntests - x$cis$ntests); + .histme(x$trans,""," distant", ...); + par(mfrow=c(1,1)); + } else + if(x$param$pvOutputThreshold.cis > 0) { + .histme(x$cis, "", " local", ...); + } else { + .histme(x$all, "all ", "" , ...); + } + } + return(invisible()); +} +