Mercurial > repos > jasonxu > matrixeqtl
view MatrixEQTL/R/Matrix_eQTL_engine.R @ 0:cd4c8e4a4b5b draft
Uploaded
author | jasonxu |
---|---|
date | Fri, 12 Mar 2021 08:12:46 +0000 |
parents | |
children |
line wrap: on
line source
# Matrix eQTL by Andrey A. Shabalin # http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ # http://cran.r-project.org/web/packages/policies.html library(methods) modelLINEAR = 117348L; modelANOVA = 47074L; modelLINEAR_CROSS = 1113461L; .seq = function(a,b){if(a<=b){a:b}else{integer(0)}}; SlicedData <- setRefClass( "SlicedData", fields = list( dataEnv = "environment", nSlices1 = "numeric", rowNameSlices = "list", columnNames = "character", fileDelimiter = "character", fileSkipColumns = "numeric", fileSkipRows = "numeric", fileSliceSize = "numeric", fileOmitCharacters = "character" ), methods = list( initialize = function( mat = NULL ) { dataEnv <<- new.env(hash = TRUE, size = 29L); nSlices1 <<- 0L; if(!is.null(mat)) { CreateFromMatrix(mat); } fileSliceSize <<- 1000; fileDelimiter <<- "\t"; fileSkipColumns <<- 1L; fileSkipRows <<- 1L; fileOmitCharacters <<- "NA" return(invisible(.self)); }, CreateFromMatrix = function( mat ) { stopifnot( class(mat) == "matrix" ); setSliceRaw( 1L ,mat ); rns = rownames( mat, do.NULL = FALSE); #if( is.null(rns) ) { # rns = paste( "Row_",(1:nrow(mat)), sep="" ); #} rowNameSlices <<- list(rns); cns = colnames( mat, do.NULL = FALSE ); #if( is.null(cns) ){ # cns = paste( "Col_",(1:ncol(mat)), sep="" ); #} columnNames <<- cns; return(invisible(.self)); }, getSlice = function(sl) { value = get(paste(sl), dataEnv); if( is.raw(value) ) { storage.mode(value) = "double"; value[value == 255] = NA; } return( value ) }, getSliceRaw = function(sl) { return( get(paste(sl), dataEnv) ) }, setSliceRaw = function(sl, value) { assign( paste(sl), value, dataEnv ) if( nSlices1 < sl ) { nSlices1 <<- sl; } }, setSlice = function(sl, value) { if( length(value) > 0 ) { if( all(as.integer(value) == value, na.rm = TRUE) ) { if( (min(value, na.rm = TRUE) >= 0 ) && (max(value, na.rm = TRUE) < 255) ) { nv = value; suppressWarnings({storage.mode(nv) = "raw"}); nv[ is.na(value)] = as.raw(255); value = nv; } else { storage.mode(value) = "integer"; } } } setSliceRaw(sl, value); }, nSlices = function() { return( nSlices1 ); }, LoadFile = function(filename, skipRows = NULL, skipColumns = NULL, sliceSize = NULL, omitCharacters = NULL, delimiter = NULL, rowNamesColumn = 1) { if( !is.null(skipRows) ) { fileSkipRows <<- skipRows; } if( !is.null(skipColumns) ) { fileSkipColumns <<- skipColumns; } if( !is.null(omitCharacters) ) { fileOmitCharacters <<- omitCharacters; } if( !is.null(sliceSize) ) { fileSliceSize <<- sliceSize; } if( !is.null(delimiter) ) { fileDelimiter <<- delimiter; } stopifnot( (fileSkipColumns == 0) || (rowNamesColumn <= fileSkipColumns) ) stopifnot( (fileSkipColumns == 0) || (rowNamesColumn >= 1) ) fid = file(description = filename, open = "rt", blocking = FALSE, raw = FALSE) # clean object if file is open Clear(); lines = readLines(con = fid, n = max(fileSkipRows,1L), ok = TRUE, warn = TRUE) line1 = tail(lines,1); splt = strsplit(line1, split = fileDelimiter, fixed = TRUE); if( fileSkipRows > 0L ) { columnNames <<- splt[[1]]; # [ -(1:fileSkipColumns) ]; } else { seek(fid, 0) } rm( lines, line1, splt ); rowNameSlices <<- vector("list", 15); curSliceId = 0L; repeat { # preallocate names and data if(length(rowNameSlices) < curSliceId) { rowNameSlices[[2L*curSliceId]] <<- NULL; } curSliceId = curSliceId + 1L; # read sliceSize rows rowtag = vector("character",fileSliceSize); rowvals = vector("list",fileSliceSize); for(i in 1:fileSliceSize) { temp = ""; if( fileSkipColumns > 0L ) { temp = scan(file = fid, what = character(), n = fileSkipColumns, quiet = TRUE,sep = fileDelimiter); } rowtag[i] = temp[rowNamesColumn];#paste(temp,collapse=" "); rowvals[[i]] = scan(file = fid, what = double(), nlines = 1, quiet = TRUE, sep = fileDelimiter, na.strings = fileOmitCharacters); if( length(rowvals[[i]]) == 0L ) { if(i==1L) { rowtag = matrix(0, 0, 0); rowvals = character(0); } else { rowtag = rowtag[ 1:(i-1) ]; rowvals = rowvals[ 1:(i-1) ]; } break; } } if( length(rowtag) == 0L ) { curSliceId = curSliceId - 1L; break; } rowNameSlices[[curSliceId]] <<- rowtag; data = c(rowvals, recursive = TRUE); dim(data) = c(length(rowvals[[1]]), length(rowvals)); data = t(data); setSlice(curSliceId, data); if( length(rowtag) < fileSliceSize ) { break; } numtxt = formatC(curSliceId*fileSliceSize, big.mark=",", format = "f", digits = 0) cat( "Rows read: ", numtxt, "\n"); flush.console() } close(fid) if( fileSkipRows == 0 ) { columnNames <<- paste("Col_", (1:nCols()), sep=""); } else { columnNames <<- tail(columnNames, ncol(getSliceRaw(1))); } if( fileSkipColumns == 0 ) { cnt = 0L; for( sl in 1:nSlices() ) { nr = length(getSliceRaw(sl)); rowNameSlices[[sl]] <<- paste("Row_",cnt + (1:nr),sep=""); cnt = cnt + nr; } } rowNameSlices <<- rowNameSlices[1:curSliceId]; cat("Rows read: ", nRows(), " done.\n"); return(invisible(.self)); }, SaveFile = function(filename) { if( nSlices() == 0 ) { cat("No data to save"); return(); } fid = file(filename,"wt"); for( sl in 1:nSlices() ) { z = getSlice(sl); rownames(z) = rowNameSlices[[sl]]; colnames(z) = columnNames; write.table(z, file = fid, sep = "\t", col.names = (if(sl == 1){NA}else{FALSE})); } close(fid); }, nRows = function() { s = 0L; for(sl in .seq(1,nSlices())) { s = s + nrow(getSliceRaw(sl)); } return( s ) }, nCols = function() { if( nSlices() == 0L ) { return(0L); } else { return( ncol(getSliceRaw(1L)) ) } }, Clear = function() { for( sl in .seq(1,nSlices()) ) { rm(list = paste(sl), envir = dataEnv) } nSlices1 <<- 0L; rowNameSlices <<- list(); columnNames <<- character(); return(invisible(.self)); }, IsCombined = function() { return( nSlices() <= 1L ); }, GetAllRowNames = function() { return( c(rowNameSlices, recursive=TRUE) ); }, SetNanRowMean = function() { if( (nCols() == 0L) ) { return(invisible(.self)); } for( sl in .seq(1,nSlices()) ) { slice = getSlice(sl); if( any(is.na(slice)) ) { rowmean = rowMeans(slice, na.rm = TRUE); rowmean[is.na(rowmean)] = 0L; for( j in which(!complete.cases(slice)) ) { where1 = is.na(slice[j, ]); slice[j, where1] = rowmean[j]; } setSlice(sl, slice); } } return(invisible(.self)); }, RowStandardizeCentered = function() { for(sl in .seq(1,nSlices()) ) { slice = getSlice(sl); div = sqrt( rowSums(slice^2) ); div[ div == 0 ] = 1; setSlice(sl, slice/div); } return(invisible(.self)); }, CombineInOneSlice = function() { if( nSlices() <= 1L ) { return(invisible(.self)); } nc = nCols(); nr = nRows(); datatypes = c("raw","integer","double"); datafuns = c(as.raw, as.integer, as.double); datatype = character(nSlices()); for(sl in 1:nSlices()) { datatype[sl] = typeof(getSliceRaw(sl)); } mch = max(match(datatype,datatypes,nomatch = length(datatypes))); datafun = datafuns[[mch]]; newData = matrix(datafun(0), nrow = nr, ncol = nc); offset = 0; for(sl in 1:nSlices()) { if(mch==1) { slice = getSliceRaw(sl); } else { slice = getSlice(sl); } newData[ offset + (1:nrow(slice)),] = datafun(slice); setSlice(sl, numeric()); offset = offset + nrow(slice); } nSlices1 <<- 1L; setSliceRaw(1L, newData); rm(newData); newrowNameSlices = GetAllRowNames(); rowNameSlices <<- list(newrowNameSlices) return(invisible(.self)); }, ResliceCombined = function(sliceSize = -1) { if( sliceSize > 0L ) { fileSliceSize <<- sliceSize; } if( fileSliceSize <= 0 ) { fileSliceSize <<- 1000; } if( IsCombined() ) { nRows1 = nRows(); if(nRows1 == 0L) { return(invisible(.self)); } newNSlices = floor( (nRows1 + fileSliceSize - 1)/fileSliceSize ); oldData = getSliceRaw(1L); #oldNames = rowNameSlices[[1]]; newNameslices = vector("list",newNSlices) for( sl in 1:newNSlices ) { range = (1+(sl-1)*fileSliceSize) : (min(nRows1,sl*fileSliceSize)); newpart = oldData[range, ,drop = FALSE]; if( is.raw(oldData) ) { setSliceRaw( sl, newpart); } else { setSlice( sl, newpart); } newNameslices[[sl]] = rowNameSlices[[1]][range]; } rowNameSlices <<- newNameslices ; } else { stop("Reslice of a sliced matrix is not supported yet. Use CombineInOneSlice first."); } return(invisible(.self)); }, Clone = function() { clone = SlicedData$new(); for(sl in .seq(1,nSlices()) ) { clone$setSliceRaw(sl,getSliceRaw(sl)); } clone$rowNameSlices = rowNameSlices; clone$columnNames = columnNames; clone$fileDelimiter = fileDelimiter; clone$fileSkipColumns = fileSkipColumns; clone$fileSkipRows = fileSkipRows; clone$fileSliceSize = fileSliceSize; clone$fileOmitCharacters = fileOmitCharacters; return( clone ); }, RowMatrixMultiply = function(multiplier) { for(sl in .seq(1,nSlices()) ) { setSlice(sl, getSlice(sl) %*% multiplier); } return(invisible(.self)); }, ColumnSubsample = function(subset) { for(sl in .seq(1,nSlices()) ) { setSliceRaw(sl, getSliceRaw(sl)[ ,subset, drop = FALSE]); } columnNames <<- columnNames[subset]; return(invisible(.self)); }, RowReorderSimple = function(ordr) { # had to use an inefficient and dirty method # due to horrible memory management in R if( (typeof(ordr) == "logical") && all(ordr) ) { return(invisible(.self)); } if( (length(ordr) == nRows()) && all(ordr == (1:length(ordr))) ) { return(invisible(.self)); } CombineInOneSlice(); gc(); setSliceRaw( 1L, getSliceRaw(1L)[ordr, ] ); rowNameSlices[[1]] <<- rowNameSlices[[1]][ordr]; gc(); ResliceCombined(); gc(); return(invisible(.self)); }, RowReorder = function(ordr) { # transform logical into indices if( typeof(ordr) == "logical" ) { if( length(ordr) == nRows() ) { ordr = which(ordr); } else { stop("Parameter \"ordr\" has wrong length") } } ## first, check that anything has to be done at all if( (length(ordr) == nRows()) && all(ordr == (1:length(ordr))) ) { return(invisible(.self)); } ## check bounds #if( (min(ordr) < 1) || (max(ordr) > nRows()) ) { # stop("Parameter \"ordr\" is out of bounds"); #} ## slice the data into individual rows all_rows = vector("list", nSlices()) for( i in 1:nSlices() ) { slice = getSliceRaw(i) all_rows[[i]] = split(slice, 1:nrow(slice)) setSliceRaw(i,numeric()) } gc(); all_rows = unlist(all_rows, recursive=FALSE, use.names = FALSE); ## Reorder the rows all_rows = all_rows[ordr]; ## get row names all_names = GetAllRowNames(); ## erase the set rowNameSlices <<- list(); ## sort names all_names = all_names[ordr]; ## ## Make slices back nrows = length(all_rows); nSlices1 <<- as.integer((nrows+fileSliceSize-1)/fileSliceSize); ##cat(nrows, " ", nSlices1); rowNameSlices1 = vector("list", nSlices1); for( i in 1:nSlices1 ) { fr = 1 + fileSliceSize*(i-1); to = min( fileSliceSize*i, nrows); subset = all_rows[fr:to]; types = unlist(lapply(subset,typeof)); israw = (types == "raw") if(!all(israw == israw[1])) { # some raw and some are not subset = lapply(subset, function(x){if(is.raw(x)){x=as.integer(x);x[x==255] = NA;return(x)}else{return(x)}}); } subset = unlist(subset); dim(subset) = c( length(all_rows[[fr]]) , to - fr + 1) #subset = matrix(subset, ncol = (to-fr+1)); if(is.raw(subset)) { setSliceRaw(i, t(subset)); } else { setSlice(i, t(subset)); } rowNameSlices1[[i]] = all_names[fr:to]; all_rows[fr:to] = 0; all_names[fr:to] = 0; } rowNameSlices <<- rowNameSlices1; gc(); return(invisible(.self)); }, RowRemoveZeroEps = function(){ for(sl in .seq(1,nSlices()) ) { slice = getSlice(sl); amean = rowMeans(abs(slice)); remove = (amean < .Machine$double.eps*nCols()); if(any(remove)) { rowNameSlices[[sl]] <<- rowNameSlices[[sl]][!remove]; setSlice(sl, slice[!remove, , drop = FALSE]); } } return(invisible(.self)); }, FindRow = function(rowname) { for(sl in .seq(1,nSlices()) ) { mch = match(rowname,rowNameSlices[[sl]], nomatch = 0); if( mch > 0 ) { row = getSlice(sl)[mch[1], , drop=FALSE]; rownames(row) = rowname; colnames(row) = columnNames; return( list(slice = sl, item = mch, row = row) ); } } return( NULL ); }, show = function() { cat("SlicedData object. For more information type: ?SlicedData\n"); cat("Number of columns:", nCols(), "\n"); cat("Number of rows:", nRows(), "\n"); cat("Data is stored in", nSlices(), "slices\n"); if(nCols()>0) { z = getSlice(1L); if(nrow(z)>0) { z = z[1:min(nrow(z),10L), 1:min(ncol(z),10L), drop = FALSE]; rownames(z) = rowNameSlices[[1]][1:nrow(z)]; colnames(z) = columnNames[1:ncol(z)]; cat("Top left corner of the first slice (up to 10x10):\n"); methods:::show(z) } } } )) setGeneric("nrow") setMethod("nrow", "SlicedData", function(x) { return( x$nRows() ); }) setGeneric("NROW") setMethod("NROW", "SlicedData", function(x) { return( x$nRows() ); }) setGeneric("ncol") setMethod("ncol", "SlicedData", function(x) { return( x$nCols() ); }) setGeneric("NCOL") setMethod("NCOL", "SlicedData", function(x) { return( x$nCols() ); }) setGeneric("dim") setMethod("dim", "SlicedData", function(x) { return( c(x$nRows(),x$nCols()) ); }) setGeneric("colnames") setMethod("colnames", "SlicedData", function(x) { return( x$columnNames ); }) setGeneric("rownames") setMethod("rownames", "SlicedData", function(x) { return( x$GetAllRowNames() ); }) setMethod("[[", "SlicedData", function(x,i) { return( x$getSlice(i) ); }) setGeneric("length") setMethod("length", "SlicedData", function(x) { return( x$nSlices() ); }) setMethod("[[<-", "SlicedData", function(x,i,value) { x$setSlice(i, value); return(x); }) summary.SlicedData = function(object, ...) { z = c(nCols = object$nCols(), nRows = object$nRows(), nSlices = object$nSlices()); return(z); } ##### setGeneric("summary") ##### #setMethod("summary", "SlicedData", function(object, ...) { # z = c(nCols = object$nCols(), nRows = object$nRows(), nSlices = object$nSlices()); # return(z); # }) #setGeneric("show", standardGeneric("show")) # setMethod("show", "SlicedData", function(object) { # cat("SlicedData object. For more information type: ?SlicedData\n"); # cat("Number of columns:", object$nCols(), "\n"); # cat("Number of rows:", object$nRows(), "\n"); # cat("Data is stored in", object$nSlices(), "slices\n"); # if(object$nSlices()>0) { # z = object$getSlice(1); # if(nrow(z)>0) { # z = z[1:min(nrow(z),10), 1:min(ncol(z),10), drop = FALSE]; # rownames(z) = object$rowNameSlices[[1]][1:nrow(z)]; # colnames(z) = object$columnNames[1:ncol(z)]; # cat("Top left corner of the first slice (up to 10x10):\n"); # show(z) # } # } # }) setGeneric("as.matrix") setMethod("as.matrix", "SlicedData", function(x) { if(x$nSlices() == 0) { return( matrix(0,0,0) ); } if(x$nSlices() > 1) { copy = x$Clone(); copy$CombineInOneSlice(); } else { copy = x; } mat = copy$getSlice(1L); rownames(mat) = rownames(copy); colnames(mat) = colnames(copy); return( mat ); }) setGeneric("colnames<-") setMethod("colnames<-", "SlicedData", function(x,value) { stopifnot( class(value) == "character" ); stopifnot( length(value) == x$nCols() ); x$columnNames = value; return(x); }) setGeneric("rownames<-") setMethod("rownames<-", "SlicedData", function(x,value) { stopifnot( class(value) == "character" ); stopifnot( length(value) == x$nRows() ); start = 1; newNameSlices = vector("list", x$nSlices()); for( i in .seq(1,x$nSlices()) ) { nr = nrow(x$getSliceRaw(i)); newNameSlices[[i]] = value[ start:(start+nr-1) ]; start = start + nr; } x$rowNameSlices = newNameSlices; return(x); }) setGeneric("rowSums") setMethod("rowSums", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { if(x$nSlices() == 0) { return( numeric() ); } stopifnot( dims == 1 ); thesum = vector("list", x$nSlices()); for( i in 1:x$nSlices() ) { thesum[[i]] = rowSums(x$getSlice(i), na.rm) } return(unlist(thesum, recursive = FALSE, use.names = FALSE)); }) setGeneric("rowMeans") setMethod("rowMeans", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { if(x$nSlices() == 0) { return( numeric() ); } stopifnot( dims == 1 ); thesum = vector("list", x$nSlices()); for( i in 1:x$nSlices() ) { thesum[[i]] = rowMeans(x$getSlice(i), na.rm) } return(unlist(thesum, recursive = FALSE, use.names = FALSE)); }) setGeneric("colSums") setMethod("colSums", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { if(x$nCols() == 0) { return( numeric() ); } stopifnot( dims == 1 ); thesum = 0; for( i in .seq(1,x$nSlices()) ) { thesum = thesum + colSums(x$getSlice(i), na.rm) } return(thesum); }) setGeneric("colMeans") setMethod("colMeans", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { if(x$nCols() == 0) { return( numeric() ); } stopifnot( dims == 1 ); thesum = 0; thecounts = x$nRows(); for( i in .seq(1,x$nSlices()) ) { slice = x$getSlice(i); thesum = thesum + colSums(slice, na.rm) if( na.rm ) { thecounts = thecounts - colSums(is.na(slice)) } } return(thesum/thecounts); }) .listBuilder <- setRefClass(".listBuilder", fields = list( dataEnv = "environment", n = "numeric" ), methods = list( initialize = function() { dataEnv <<- new.env(hash = TRUE); n <<- 0; # cumlength <<- 0; return(.self); }, add = function(x) { if(length(x) > 0) { n <<- n + 1; # cumlength <<- cumlength + length(x); assign(paste(n), x, dataEnv ); } return(.self); }, set = function(i,x) { if(length(x) > 0) { if(i>n) n <<- i; assign(paste(i), x, dataEnv ); } return(.self); }, get = function(i) { return(base::get(paste(i),dataEnv)); }, list = function() { if(n==0) return(list()); result = vector('list',n); for( i in 1:n) { result[[i]] = .self$get(i); } return(result); }, unlist = function() { return(base::unlist(.self$list(), recursive=FALSE, use.names = FALSE)); }, show = function() { cat(".listBuilder object.\nIternal object in MatrixEQTL package.\n"); cat("Number of elements:", object$n, "\n"); } )) .histogrammer <- setRefClass(".histogrammer", fields = list( pvbins1 = "numeric", statbins1 = "numeric", hist.count = "numeric" ), methods = list( initialize = function (pvbins, statbins) { if(length(pvbins)) { ord = order(statbins); pvbins1 <<- pvbins[ord]; statbins1 <<- statbins[ord]; hist.count <<- double(length(pvbins)-1); } return(.self); }, update = function(stats.for.hist) { h = hist(stats.for.hist, breaks = statbins1, include.lowest = TRUE, right = TRUE, plot = FALSE)$counts; hist.count <<- hist.count + h; }, getResults = function() { if(!is.unsorted(pvbins1)) { return(list(hist.bins = pvbins1 , hist.counts = hist.count )); } else { return(list(hist.bins = rev(pvbins1), hist.counts = rev(hist.count))); } } )) .minpvalue <- setRefClass(".minpvalue", fields = list( sdata = ".listBuilder", gdata = ".listBuilder" ), methods = list( initialize = function(snps, gene) { sdata <<- .listBuilder$new(); for( ss in 1:snps$nSlices() ) { sdata$set( ss, double(nrow(snps$getSliceRaw(ss)))); } gdata <<- .listBuilder$new(); for( gg in 1:gene$nSlices() ) { gdata$set( gg, double(nrow(gene$getSliceRaw(gg)))); } return(.self); }, update = function(ss, gg, astat) { gmax = gdata$get(gg) z1 = max.col(astat,ties.method='first'); z11 = astat[1:nrow(astat) + nrow(astat) * (z1 - 1)]; gmax = pmax(gmax, z11); gdata$set(gg, gmax); smax = sdata$get(ss) z22 = apply(astat,2,max); smax = pmax(smax, z22); sdata$set(ss, smax); return(.self); }, updatecis = function(ss, gg, select.cis, astat) { if(length(astat)>0) { byrows = aggregate(x=astat, by=list(row=select.cis[,1]), FUN=max); bycols = aggregate(x=astat, by=list(col=select.cis[,2]), FUN=max); gmax = gdata$get(gg); gmax[byrows$row] = pmax(gmax[byrows$row], byrows$x) gdata$set(gg, gmax); smax = sdata$get(ss) smax[bycols$col] = pmax(smax[bycols$col], bycols$x) sdata$set(ss, smax); } return(.self); }, getResults = function(snps, gene, pvfun) { min.pv.snps = pvfun(sdata$unlist()); names(min.pv.snps) = rownames(snps); min.pv.gene = pvfun(gdata$unlist()); names(min.pv.gene) = rownames(gene); return(list(min.pv.snps = min.pv.snps, min.pv.gene = min.pv.gene)); } )) .OutputSaver_FRD <- setRefClass(".OutputSaver_FRD", fields = list( sdata = ".listBuilder", gdata = ".listBuilder", cdata = ".listBuilder", bdata = ".listBuilder", fid = "list", testfun1 = "list", pvfun1 = "list" ), methods = list( initialize = function () { sdata <<- .listBuilder$new(); gdata <<- .listBuilder$new(); cdata <<- .listBuilder$new(); bdata <<- .listBuilder$new(); fid <<- list(0); testfun1 <<- list(0); pvfun1 <<- list(0); return(.self); }, start = function(filename, statistic_name, unused1, unused2, testfun, pvfun) { testfun1 <<- list(testfun); pvfun1 <<- list(pvfun); if(length(filename) > 0) { if(class(filename) == "character") { fid <<- list(file(description = filename, open = "wt", blocking = FALSE, raw = FALSE), TRUE); } else { fid <<- list(filename, FALSE) } writeLines( paste("SNP\tgene\t",statistic_name,"\tp-value\tFDR", sep = ""), fid[[1]]); } else { fid <<- list(); } }, update = function(spos, gpos, sta, beta = NULL) { if(length(sta)>0) { sdata$add(spos); gdata$add(gpos); cdata$add(sta ); if(!is.null(beta )) bdata$add(beta ); } return(.self); }, getResults = function( gene, snps, FDR_total_count) { pvalues = NULL; if(cdata$n > 0) { tests = testfun1[[1]](cdata$unlist()); cdata <<- .listBuilder$new(); pvalues = pvfun1[[1]](tests); ord = order(pvalues); tests = tests[ord]; pvalues = pvalues[ord]; FDR = pvalues * FDR_total_count / (1:length(pvalues)); FDR[length(FDR)] = min(FDR[length(FDR)], 1); FDR = rev(cummin(rev(FDR))); snps_names = snps$GetAllRowNames()[sdata$unlist()[ord]]; sdata <<- .listBuilder$new(); gene_names = gene$GetAllRowNames()[gdata$unlist()[ord]]; gdata <<- .listBuilder$new(); beta = NULL; if(bdata$n > 0) beta = bdata$unlist()[ord]; if(length(fid)>0) { step = 1000; ########### 100000 for( part in 1:ceiling(length(FDR)/step) ) { fr = (part-1)*step + 1; to = min(part*step, length(FDR)); dump = data.frame(snps_names[fr:to], gene_names[fr:to], if(is.null(beta)) tests[fr:to] else list(beta[fr:to],tests[fr:to]), pvalues[fr:to], FDR[fr:to], row.names = NULL, check.rows = FALSE, check.names = FALSE, stringsAsFactors = FALSE); write.table(dump, file = fid[[1]], quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE); } } } else { cat("No significant associations were found.\n", file = if(length(fid)>0){fid[[1]]}else{""}); } if(length(fid)>0) { if(fid[[2]]) { close(fid[[1]]); } fid <<- list(); } if(!is.null(pvalues)) { eqtls = list( snps = snps_names, gene = gene_names, statistic = tests, pvalue = pvalues, FDR = FDR); if(!is.null(beta)) eqtls$beta = beta; } else { eqtls = list( snps = character(), gene = character(), beta = numeric(), statistic = numeric(), pvalue = numeric(), FDR = numeric()); } return(list(eqtls = data.frame(eqtls))); } ) ) .OutputSaver_direct <- setRefClass(".OutputSaver_direct", fields = list( gene_names = "character", snps_names = "character", fid = "list", testfun1 = "list", pvfun1 = "list" ), methods = list( initialize = function() { gene_names <<- character(0); snps_names <<- character(0); fid <<- list(0); testfun1 <<- list(0); pvfun1 <<- list(0); return(.self); }, start = function(filename, statistic_name, snps, gene, testfun, pvfun) { # I hope the program stops if it fails to open the file if(class(filename) == "character") { fid <<- list(file(description = filename, open = "wt", blocking = FALSE, raw = FALSE), TRUE); } else { fid <<- list(filename, FALSE) } writeLines(paste("SNP\tgene\t", statistic_name, "\tp-value", sep = ""), fid[[1]]); gene_names <<- gene$GetAllRowNames(); snps_names <<- snps$GetAllRowNames(); testfun1 <<- list(testfun); pvfun1 <<- list(pvfun); }, update = function(spos, gpos, sta, beta = NULL) { if( length(sta) == 0 ) return(); sta = testfun1[[1]](sta); lst = list(snps = snps_names[spos], gene = gene_names[gpos], beta = beta, statistic = sta, pvalue = pvfun1[[1]](sta)); lst$beta = lst$beta; dump2 = data.frame(lst, row.names = NULL, check.rows = FALSE, check.names = FALSE, stringsAsFactors = FALSE); write.table(dump2, file = fid[[1]], quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE); }, getResults = function(...) { if(length(fid)>0) { if(fid[[2]]) { close(fid[[1]]); } fid <<- list(); } gene_names <<- character(0); snps_names <<- character(0); return(list()); } ) ) .my.pmin = function(x, val) { # minimum "pmin" function that can handle empty array if(length(x) == 0) { return(x) } else { return(pmin.int(x,val)); } } .my.pmax = function(x, val) { # minimum "pmin" function that can handle empty array if(length(x) == 0) { return(x) } else { return(pmax.int(x,val)); } } .pv.nz = function(x){return( .my.pmax(x,.Machine$double.xmin) )} .SetNanRowMean = function(x) { if( any(is.na(x)) ) { rowmean = rowMeans(x, na.rm = TRUE); rowmean[ is.na(rowmean) ] = 0; for( j in which(!complete.cases(x)) ) { where1 = is.na( x[j, ] ); x[j,where1] = rowmean[j]; } } return(x); } # .SNP_process_split_for_ANOVA = function(x) { # # split into 2 dummy variables # # uniq = unique(c(x)); # uniq = uniq[!is.na(uniq)]; # # if( length(uniq) > 3 ) { # stop("More than three genotype categories is not handled by ANOVA"); # } else if ( length(uniq) < 3 ) { # uniq = c(uniq, min(uniq)-(1:(3-length(uniq)))); # } # # freq = matrix(0, nrow(x), length(uniq)); # for(i in 1:length(uniq)) { # freq[ ,i] = rowSums(x==uniq[i], na.rm = TRUE); # } # # md = apply(freq, 1, which.max); # freq[ cbind(1:nrow(x),md) ] = -1; # # md = apply(freq, 1, which.max); # min(freq[cbind(1:nrow(slice),md)] - rowSums(select,na.rm = TRUE )) # new_slice1 = (x == uniq[md]); # new_slice1[is.na(new_slice1)] = 0; # freq[ cbind(1:nrow(x),md) ] = -1; # # md = apply(freq,1,which.max); # new_slice2 = (x == uniq[md]); # new_slice2[ is.na(new_slice2) ] = 0; # rez = vector("list", 2); # rez[[1]] = new_slice1; # rez[[2]] = new_slice2; # return( rez ); # } .SNP_process_split_for_ANOVA = function(x,n.groups) { # split into 2 dummy variables (or more) # # Get the number of ANOVA groups # n.groups = options('MatrixEQTL.ANOVA.categories')[[1]]; # if( is.null(n.groups)) # n.groups = 3; # Unique values in x (make sure it has length of n.groups); uniq = unique(as.vector(x)); uniq = uniq[!is.na(uniq)]; if( length(uniq) > n.groups ) { stop("More than declared number of genotype categories is detected by ANOVA"); } else if ( length(uniq) < n.groups ) { uniq = c(uniq, rep(min(uniq)-1, n.groups-length(uniq))); } # Table of frequencies for each variable (row) freq = matrix(0, nrow(x), n.groups); for(i in 1:n.groups) { freq[ ,i] = rowSums(x==uniq[i], na.rm = TRUE); } # remove NA's from x for convenience x[is.na(x)] = min(uniq)-2; # Output list of matrices rez = vector('list',n.groups-1); # Skip the most frequent value md = apply(freq, 1, which.max); # most frequent value for each variable freq[ cbind(1:nrow(x),md) ] = -1; # The rest form dumm for(j in 1:(n.groups-1)){ md = apply(freq, 1, which.max); freq[ cbind(1:nrow(x),md) ] = -1; rez[[j]] = (x == uniq[md]); } return( rez ); } Matrix_eQTL_engine = function( snps, gene, cvrt = SlicedData$new(), output_file_name, pvOutputThreshold = 1e-5, useModel = modelLINEAR, errorCovariance = numeric(), verbose = TRUE, pvalue.hist = FALSE, min.pv.by.genesnp = FALSE, noFDRsaveMemory = FALSE) { rez = Matrix_eQTL_main( snps = snps, gene = gene, cvrt = cvrt, output_file_name = output_file_name, pvOutputThreshold = pvOutputThreshold, useModel = useModel, errorCovariance = errorCovariance, verbose = verbose, pvalue.hist = pvalue.hist, min.pv.by.genesnp = min.pv.by.genesnp, noFDRsaveMemory = noFDRsaveMemory); return( rez ); } Matrix_eQTL_main = function( snps, gene, cvrt = SlicedData$new(), output_file_name = "", pvOutputThreshold = 1e-5, useModel = modelLINEAR, errorCovariance = numeric(), verbose = TRUE, output_file_name.cis = "", pvOutputThreshold.cis = 0, snpspos = NULL, genepos = NULL, cisDist = 1e6, pvalue.hist = FALSE, min.pv.by.genesnp = FALSE, noFDRsaveMemory = FALSE) { ################################# Basic variable checks ################################# { # status("Performing basic checks of the input variables"); stopifnot( "SlicedData" %in% class(gene) ); stopifnot( "SlicedData" %in% class(snps) ); stopifnot( "SlicedData" %in% class(cvrt) ); # Check dimensions if( min(snps$nRows(),snps$nCols()) == 0 ) stop("Empty genotype dataset"); if( min(gene$nRows(),gene$nCols()) == 0 ) stop("Empty expression dataset"); if( snps$nCols() != gene$nCols() ) stop("Different number of samples in the genotype and gene expression files"); if( cvrt$nRows()>0 ) { if( snps$nCols() != cvrt$nCols() ) stop("Wrong number of samples in the matrix of covariates"); } stopifnot( class(pvOutputThreshold) == "numeric" ); stopifnot( length(pvOutputThreshold) == 1 ); stopifnot( pvOutputThreshold >= 0 ); stopifnot( pvOutputThreshold <= 1 ); stopifnot( class(noFDRsaveMemory) == "logical" ); stopifnot( length(noFDRsaveMemory) == 1 ); if( pvOutputThreshold > 0 ) { stopifnot( !((length(output_file_name) == 0) && noFDRsaveMemory) ) stopifnot( length(output_file_name) <= 1 ); if( length(output_file_name) == 1 ) { stopifnot( class(output_file_name) %in% c("character","connection") ); } } stopifnot( class(pvOutputThreshold.cis) == "numeric" ); stopifnot( length(pvOutputThreshold.cis) == 1 ); stopifnot( pvOutputThreshold.cis >= 0 ); stopifnot( pvOutputThreshold.cis <= 1 ); stopifnot( !((pvOutputThreshold > 0) & (pvOutputThreshold.cis > 0) & (pvOutputThreshold > pvOutputThreshold.cis)) ); stopifnot( (pvOutputThreshold > 0) | (pvOutputThreshold.cis > 0) ); stopifnot( class(useModel) == class(modelLINEAR) ); stopifnot( length(useModel) == 1 ); stopifnot( useModel %in% c(modelLINEAR, modelANOVA, modelLINEAR_CROSS) ); if( useModel %in% c(modelLINEAR, modelLINEAR_CROSS) ) { if( snps$nCols() <= cvrt$nRows() + 1 + 1) { stop('The number of covariates exceeds the number of samples.\nLinear regression can not be fit.') } } if( useModel == modelLINEAR_CROSS ) { if( cvrt$nRows() == 0 ) { stop( "Model \"modelLINEAR_CROSS\" requires at least one covariate" ); } } if( useModel == modelANOVA ) { n.anova.groups = getOption('MatrixEQTL.ANOVA.categories', 3); stopifnot( n.anova.groups == floor(n.anova.groups) ); stopifnot( n.anova.groups >= 3 ); # stopifnot( n.anova.groups < snps$nCols() - cvrt$nRows() - 2 ); if( snps$nCols() <= cvrt$nRows() + n.anova.groups) { stop('The number of covariates exceeds the number of samples.\nLinear regression (ANOVA) can not be fit.') } } stopifnot( class(verbose) == "logical" ); stopifnot( length(verbose) == 1 ); stopifnot( class(min.pv.by.genesnp) == "logical" ); stopifnot( length(min.pv.by.genesnp) == 1 ); if( pvOutputThreshold.cis > 0 ) { stopifnot( !((length(output_file_name.cis) == 0) && noFDRsaveMemory) ) stopifnot( length(output_file_name.cis) <= 1 ); if( length(output_file_name.cis) == 1 ) { stopifnot( class(output_file_name.cis) %in% c("character","connection") ); } # stopifnot( class(output_file_name.cis) == "character" ); # stopifnot( length(output_file_name.cis) == 1 ); stopifnot( class(snpspos) == "data.frame" ); stopifnot( ncol(snpspos) == 3 ); stopifnot( nrow(snpspos) > 0 ); stopifnot( class(snpspos[1,3]) %in% c("integer", "numeric") ) stopifnot( !any(is.na(snpspos[,3])) ) stopifnot( class(genepos) == "data.frame" ); stopifnot( ncol(genepos) == 4 ); stopifnot( nrow(genepos) > 0 ); stopifnot( class(genepos[1,3]) %in% c("integer", "numeric") ) stopifnot( class(genepos[1,4]) %in% c("integer", "numeric") ) stopifnot( !any(is.na(genepos[,3])) ) stopifnot( !any(is.na(genepos[,4])) ) stopifnot( nzchar(output_file_name.cis) ) } if( pvOutputThreshold > 0 ) { stopifnot( nzchar(output_file_name) ) } stopifnot( class(errorCovariance) %in% c("numeric", "matrix") ); errorCovariance = as.matrix(errorCovariance); if(length(errorCovariance)>0) { if( nrow(errorCovariance) != ncol(errorCovariance) ) { stop("The covariance matrix is not square"); } if( nrow(errorCovariance) != snps$nCols() ) { stop("The covariance matrix size does not match the number of samples"); } if( !all(errorCovariance == t(errorCovariance)) ) { stop("The covariance matrix is not symmetric"); } } } ################################# Initial setup ######################################### { gene.std = .listBuilder$new(); snps.std = .listBuilder$new(); dont.clone.gene = getOption('MatrixEQTL.dont.preserve.gene.object', FALSE) if(is.null(dont.clone.gene)) dont.clone.gene = FALSE; if( !dont.clone.gene ) gene = gene$Clone(); # snps = snps$Clone(); # snps is read only cvrt = cvrt$Clone(); params = list( output_file_name = output_file_name, pvOutputThreshold = pvOutputThreshold, useModel = useModel, errorCovariance = errorCovariance , verbose = verbose, output_file_name.cis = output_file_name.cis, pvOutputThreshold.cis = pvOutputThreshold.cis, cisDist = cisDist , pvalue.hist = pvalue.hist, min.pv.by.genesnp = min.pv.by.genesnp); if( verbose ) { lastTime = 0; status <- function(text) { # gc(); newTime = proc.time()[3]; if(lastTime != 0) { cat("Task finished in ", newTime-lastTime, " seconds\n"); } cat(text,"\n"); lastTime <<- newTime; unused = flush.console(); } } else { status = function(text){} } start.time = proc.time()[3]; } ################################# Error covariance matrix processing #################### { if( length(errorCovariance) > 0 ) { status("Processing the error covariance matrix"); eig = eigen(errorCovariance, symmetric = TRUE) d = eig$values; v = eig$vectors; # errorCovariance == v %*% diag(d) %*% t(v) # errorCovariance^0.5 == v*sqrt(d)*v" (no single quotes anymore) # errorCovariance^(-0.5) == v*diag(1./sqrt(diag(d)))*v" if( any(d<=0) ) { stop("The covariance matrix is not positive definite"); } correctionMatrix = v %*% diag(1./sqrt(d)) %*% t(v); rm( eig, v, d, errorCovariance ) } else { rm( errorCovariance ); correctionMatrix = numeric(); } } ################################# Matching gene and SNPs locations ###################### if( pvOutputThreshold.cis > 0 ) { status("Matching data files and location files") # names in the input data gene_names = gene$GetAllRowNames(); snps_names = snps$GetAllRowNames(); # gene range, set: left<right if(any(genepos[,3] > genepos[,4])) { temp3 = genepos[,3]; temp4 = genepos[,4]; genepos[,3] = pmin(temp3,temp4); genepos[,4] = pmax(temp3,temp4); rm(temp3, temp4); } # match with the location data genematch = match( gene_names, genepos[ ,1], nomatch = 0L); usedgene = matrix(FALSE, nrow(genepos), 1); # genes in "genepos" that are matching "gene_names" usedgene[ genematch ] = TRUE; if( !any(genematch) ) { stop("Gene names do not match those in the gene location file."); } cat( sum(genematch>0), "of", length(gene_names), " genes matched\n"); snpsmatch = match( snps_names, snpspos[ ,1], nomatch = 0L); usedsnps = matrix(FALSE, nrow(snpspos),1); usedsnps[ snpsmatch ] = TRUE; if( !any(snpsmatch) ) { stop("SNP names do not match those in the SNP location file."); } cat( sum(snpsmatch>0), "of", length(snps_names), " SNPs matched\n"); # list used chr names chrNames = unique(c( as.character(unique(snpspos[usedsnps,2])), as.character(unique(genepos[usedgene,2])) )) chrNames = chrNames[ sort.list( suppressWarnings(as.integer(chrNames)), method = "radix", na.last = TRUE ) ]; # match chr names genechr = match(genepos[,2],chrNames); snpschr = match(snpspos[,2],chrNames); # max length of a chromosome chrMax = max( snpspos[usedsnps, 3], genepos[usedgene, 4], na.rm = TRUE) + cisDist; # Single number location for all rows in "genepos" and "snpspos" genepos2 = as.matrix(genepos[ ,3:4, drop = FALSE] + (genechr-1)*chrMax); snpspos2 = as.matrix(snpspos[ ,3 , drop = FALSE] + (snpschr-1)*chrMax); # the final location arrays; snps_pos = matrix(0,length(snps_names),1); snps_pos[snpsmatch>0, ] = snpspos2[snpsmatch, , drop = FALSE]; snps_pos[rowSums(is.na(snps_pos))>0, ] = 0; snps_pos[snps_pos==0] = (length(chrNames)+1) * (chrMax+cisDist); rm(snps_names, snpsmatch, usedsnps, snpschr, snpspos2) gene_pos = matrix(0,length(gene_names),2); gene_pos[genematch>0, ] = genepos2[genematch, , drop = FALSE]; gene_pos[rowSums(is.na(gene_pos))>0, ] = 0; gene_pos[gene_pos==0] = (length(chrNames)+2) * (chrMax+cisDist); rm(gene_names, genematch, usedgene, genechr, genepos2) rm(chrNames, chrMax); if( is.unsorted(snps_pos) ) { status("Reordering SNPs\n"); ordr = sort.list(snps_pos); snps$RowReorder(ordr); snps_pos = snps_pos[ordr, , drop = FALSE]; rm(ordr); } if( is.unsorted(rowSums(gene_pos)) ) { status("Reordering genes\n"); ordr = sort.list(rowSums(gene_pos)); gene$RowReorder(ordr); gene_pos = gene_pos[ordr, , drop = FALSE]; rm(ordr); } # Slice it back. geneloc = vector("list", gene$nSlices()) gene_offset = 0; for(gc in 1:gene$nSlices()) { nr = length(gene$rowNameSlices[[gc]]); geneloc[[gc]] = gene_pos[gene_offset + (1:nr), , drop = FALSE]; gene_offset = gene_offset + nr; } rm(gc, gene_offset, gene_pos); snpsloc = vector("list", snps$nSlices()) snps_offset = 0; for(sc in 1:snps$nSlices()) { nr = length(snps$rowNameSlices[[sc]]); snpsloc[[sc]] = snps_pos[snps_offset + (1:nr), , drop = FALSE]; snps_offset = snps_offset + nr; } rm(nr, sc, snps_offset, snps_pos); } ################################# Covariates processing ################################# { status("Processing covariates"); if( useModel == modelLINEAR_CROSS ) { last.covariate = as.vector(tail( cvrt$getSlice(cvrt$nSlices()), n = 1)); } if( cvrt$nRows()>0 ) { cvrt$SetNanRowMean(); cvrt$CombineInOneSlice(); cvrt = rbind(matrix(1,1,snps$nCols()),cvrt$getSlice(1)); } else { cvrt = matrix(1,1,snps$nCols()); } # Correct for the error covariance structure if( length(correctionMatrix)>0 ) { cvrt = cvrt %*% correctionMatrix; } # Orthonormalize covariates # status("Orthonormalizing covariates"); q = qr(t(cvrt)); if( min(abs(diag(qr.R(q)))) < .Machine$double.eps * snps$nCols() ) { stop("Colinear or zero covariates detected"); } cvrt = t( qr.Q(q) ); rm( q ); } ################################# Gene expression processing ############################ { status("Processing gene expression data (imputation, residualization, etc.)"); # Impute gene expression gene$SetNanRowMean(); # Correct for the error covariance structure if( length(correctionMatrix)>0 ) { gene$RowMatrixMultiply(correctionMatrix); } # Orthogonolize expression w.r.t. covariates # status("Orthogonolizing expression w.r.t. covariates"); gene_offsets = double(gene$nSlices()+1); for( sl in 1:gene$nSlices() ) { slice = gene$getSlice(sl); gene_offsets[sl+1] = gene_offsets[sl] + nrow(slice); rowsq1 = rowSums(slice^2); slice = slice - tcrossprod(slice,cvrt) %*% cvrt; rowsq2 = rowSums(slice^2); # kill rows colinear with the covariates delete.rows = (rowsq2 <= rowsq1 * .Machine$double.eps ); slice[delete.rows] = 0; div = sqrt( rowSums(slice^2) ); div[ div == 0 ] = 1; gene.std$set(sl, div); gene$setSlice(sl, slice / div); } rm(rowsq1, rowsq2, delete.rows, div); rm( sl, slice ); #gene$RowRemoveZeroEps(); } ################################# snps_process, testfun, pvfun, threshfun, afun ######## { # snps_process - preprocess SNPs slice # # afun --- abs for signed stats, identity for non-negative # threshfun --- internal stat threshold for given p-value # testfun --- t or F statistic from the internal one # pvfun --- p-value from the t or F statistic nSamples = snps$nCols(); nGenes = gene$nRows(); nSnps = snps$nRows(); nCov = nrow(cvrt); # nVarTested = length(snps_list); # set in case(useModel) # dfNull = nSamples - nCov; # d.f. of the full model betafun = NULL; if( useModel == modelLINEAR ) { snps_process = function(x) { return( list(.SetNanRowMean(x)) ); }; nVarTested = 1; dfFull = nSamples - nCov - nVarTested; statistic.fun = function(mat_list) { return( mat_list[[1]] ); } afun = function(x) {return(abs(x))}; threshfun = function(pv) { thr = qt(pv/2, dfFull, lower.tail = FALSE); thr = thr^2; thr = sqrt( thr / (dfFull + thr) ); thr[pv >= 1] = 0; thr[pv <= 0] = 1; return( thr ); } testfun = function(x) { return( x * sqrt( dfFull / (1 - .my.pmin(x^2,1)))); } pvfun = function(x) { return( .pv.nz(pt(-abs(x),dfFull)*2)); } thresh.cis = threshfun(pvOutputThreshold.cis); thresh = threshfun(pvOutputThreshold); betafun = function(stat, ss, gg, select) { return(stat * gene.std$get(gg)[select[,1]] / snps.std$get(ss)[select[,2]]); } } else if( useModel == modelANOVA ) { snps_process = function(x).SNP_process_split_for_ANOVA(x,n.anova.groups); nVarTested = n.anova.groups - 1; dfFull = nSamples - nCov - nVarTested; # statistic.fun = function(mat_list) { # return( mat_list[[1]]^2 + mat_list[[2]]^2 ); # } statistic.fun = function(mat_list) { x = mat_list[[1]]^2; for( j in 2:length(mat_list) ) x = x + mat_list[[j]]^2; return( x ); } afun = identity; threshfun = function(pv) { thr = qf(pv, nVarTested, dfFull, lower.tail = FALSE); thr = thr / (dfFull/nVarTested + thr); thr[pv >= 1] = 0; thr[pv <= 0] = 1; return( thr ); } testfun = function(x) { return( x / (1 - .my.pmin(x,1)) * (dfFull/nVarTested) ); } pvfun = function(x) { return( .pv.nz(pf(x, nVarTested, dfFull, lower.tail = FALSE)) ); } thresh.cis = threshfun(pvOutputThreshold.cis); thresh = threshfun(pvOutputThreshold); } else if( useModel == modelLINEAR_CROSS ) { last.covariate = as.vector( last.covariate ); snps_process = .SNP_process_split_for_LINEAR_CROSS = function(x) { out = vector("list", 2); out[[1]] = .SetNanRowMean(x); out[[2]] = t( t(out[[1]]) * last.covariate ); return( out ); }; nVarTested = 1; dfFull = nSamples - nCov - nVarTested - 1; statistic.fun = function(mat_list) { return( mat_list[[2]] / sqrt(1 - mat_list[[1]]^2) ); } afun = function(x) {return(abs(x))}; threshfun = function(pv) { thr = qt(pv/2, dfFull, lower.tail = FALSE); thr = thr^2; thr = sqrt( thr / (dfFull + thr) ); thr[pv >= 1] = 0; thr[pv <= 0] = 1; return( thr ); } testfun = function(x) { return( x * sqrt( dfFull / (1 - .my.pmin(x^2,1)))); } pvfun = function(x) { return( .pv.nz(pt(-abs(x),dfFull)*2 )); } thresh.cis = threshfun(pvOutputThreshold.cis); thresh = threshfun(pvOutputThreshold); betafun = function(stat, ss, gg, select) { return(stat * gene.std$get(gg)[select[,1]] / snps.std$get(ss)[select[,2]]); } } params$dfFull = dfFull; } ################################# Saver class(es) creation ############################## { status("Creating output file(s)"); if(noFDRsaveMemory) { if( pvOutputThreshold > 0 ) { saver.tra = .OutputSaver_direct$new(); } if( pvOutputThreshold.cis > 0 ) { saver.cis = .OutputSaver_direct$new(); } } else { if( pvOutputThreshold > 0 ) { saver.tra = .OutputSaver_FRD$new(); } if( pvOutputThreshold.cis > 0 ) { saver.cis = .OutputSaver_FRD$new(); } } if( pvOutputThreshold > 0 ) if( pvOutputThreshold * gene$nRows() * snps$nRows() > 1000000 ) if(!noFDRsaveMemory) cat('Warning: pvOutputThreshold may be too large.\nExpected number of findings > ', pvOutputThreshold * gene$nRows() * snps$nRows(),'\n'); if( (useModel == modelLINEAR) || (useModel == modelLINEAR_CROSS) ) { statistic_name = "t-stat"; } else if( useModel == modelANOVA ) { statistic_name = "F-test"; } if(!is.null(betafun)) statistic_name = paste('beta\t',statistic_name, sep=''); if( pvOutputThreshold > 0 ) saver.tra$start(output_file_name, statistic_name, snps, gene, testfun, pvfun); if( pvOutputThreshold.cis > 0 ) saver.cis$start(output_file_name.cis, statistic_name, snps, gene, testfun, pvfun); rm( statistic_name ); } ################################# Some useful functions ################################# { orthonormalize.snps = function(cursnps, ss) { for(p in 1:length(cursnps)) { if(length(correctionMatrix)>0) { cursnps[[p]] = cursnps[[p]] %*% correctionMatrix; } cursnps[[p]] = cursnps[[p]] - tcrossprod(cursnps[[p]],cvrt) %*% cvrt; for(w in .seq(1,p-1L)) cursnps[[p]] = cursnps[[p]] - rowSums(cursnps[[p]]*cursnps[[w]]) * cursnps[[w]]; div = sqrt( rowSums(cursnps[[p]]^2) ); div[ div == 0 ] = 1; cursnps[[p]] = cursnps[[p]]/div; } snps.std$set(ss, div); return(cursnps); } # if( pvOutputThreshold.cis > 0 ) { # is.cis.pair = function(gg,ss) { # return(!( ( snpsloc[[ss]][1, 1] - tail( geneloc[[gg]][ , 2], n = 1L) > cisDist) | # ( geneloc[[gg]][1, 1] - tail( snpsloc[[ss]] , n = 1L) > cisDist) ) ); # } # } if( pvOutputThreshold.cis > 0 ) { # sn.l = sapply(snpsloc, function(x)x[1] ); # sn.r = sapply(snpsloc, function(x)tail(x,1) ); # ge.l = sapply(geneloc, function(x)x[1,1] ); # ge.r = sapply(geneloc, function(x)x[nrow(x) , 2] ); sn.l = sapply(snpsloc, '[', 1 ); sn.r = sapply(snpsloc, tail, 1 ); ge.l = sapply(geneloc, '[', 1, 1 ); ge.r = sapply( lapply(geneloc, tail.matrix, 1 ), '[', 2); gg.1 = findInterval( sn.l , ge.r + cisDist +1) + 1; # cat(gg.1,'\n') gg.2 = findInterval( sn.r , ge.l - cisDist ); # cat(gg.2,'\n') rm(sn.l, sn.r, ge.l, ge.r); } } ################################# Prepare counters and histogram bins ################### { pvbins = NULL; # bin edges for p-values statbins = 0; # bin edges for the test statistic (|t| or F) do.hist = FALSE; if( length(pvalue.hist) == 1 ) { if(pvalue.hist == "qqplot") { pvbins = c(0, 10^rev(seq(0, log10(.Machine$double.xmin)-1, -0.05))); } else if( is.numeric(pvalue.hist) ) { pvbins = seq(from = 0, to = 1, length.out = pvalue.hist+1); } else if( pvalue.hist == TRUE ) { pvbins = seq(from = 0, to = 1, length.out = 100+1); } } else if( is.numeric(pvalue.hist) && (length(pvalue.hist) > 1) ) { pvbins = pvalue.hist; } if( is.null(pvbins) && (pvalue.hist != FALSE) ) { stop("Wrong value of pvalue.hist. Must be FALSE, TRUE, \"qqplot\", or numerical"); } do.hist = !is.null(pvbins); if( do.hist ) { pvbins = sort(pvbins); statbins = threshfun(pvbins); if( pvOutputThreshold > 0) { hist.all = .histogrammer$new(pvbins, statbins); } if( pvOutputThreshold.cis > 0) { hist.cis = .histogrammer$new(pvbins, statbins); } } rm( pvbins, statbins); if(min.pv.by.genesnp) { if( pvOutputThreshold > 0) { minpv.tra = .minpvalue$new(snps,gene); } if( pvOutputThreshold.cis > 0) { minpv.cis = .minpvalue$new(snps,gene); } } } ################################# Main loop ############################################# { beta = NULL; n.tests.all = 0; n.tests.cis = 0; n.eqtls.tra = 0; n.eqtls.cis = 0; status("Performing eQTL analysis"); # ss = 1; gg = 1; # ss = snps$nSlices(); gg = gene$nSlices(); snps_offset = 0; for(ss in 1:snps$nSlices()) { # for(ss in 1:min(2,snps$nSlices())) { #for debug cursnps = NULL; nrcs = nrow(snps$getSliceRaw(ss)); # loop only through the useful stuff for(gg in if(pvOutputThreshold>0){1:gene$nSlices()}else{.seq(gg.1[ss],gg.2[ss])} ) { gene_offset = gene_offsets[gg]; curgene = gene$getSlice(gg); nrcg = nrow(curgene); if(nrcg == 0) next; rp = ""; statistic = NULL; select.cis.raw = NULL; ## do cis analysis # if( (pvOutputThreshold.cis > 0) && ( is.cis.pair(gg, ss) ) ) { if( (pvOutputThreshold.cis > 0) && (gg >= gg.1[ss]) && (gg <= gg.2[ss]) ) { if( is.null( statistic ) ) { if( is.null( cursnps ) ) { cursnps = orthonormalize.snps( snps_process( snps$getSlice(ss) ), ss ); } mat = vector("list", length(cursnps)); for(d in 1:length(cursnps)) { mat[[d]] = tcrossprod(curgene, cursnps[[d]]); } statistic = statistic.fun( mat ); astatistic = afun(statistic); # rm(mat); } # sn.l = findInterval(geneloc[[gg]][ ,1] - cisDist-1 +1 , snpsloc[[ss]]); # sn.r = findInterval(geneloc[[gg]][ ,2] + cisDist -1 , snpsloc[[ss]]); sn.l = findInterval(geneloc[[gg]][ ,1] - cisDist-1, snpsloc[[ss]]); sn.r = findInterval(geneloc[[gg]][ ,2] + cisDist, snpsloc[[ss]]); xx = unlist(lapply(which(sn.r>sn.l),FUN=function(x){(sn.l[x]:(sn.r[x]-1))*nrow(statistic)+x})) select.cis.raw = xx[ astatistic[xx] >= thresh.cis ]; select.cis = arrayInd(select.cis.raw, dim(statistic)) n.tests.cis = n.tests.cis + length(xx); n.eqtls.cis = n.eqtls.cis + length(select.cis.raw); if( do.hist ) hist.cis$update(astatistic[xx]); if( min.pv.by.genesnp ) { # minpv.cis$updatecis(ss, gg, arrayInd(xx, dim(statistic)), astatistic[xx]) temp = double(length(astatistic)); dim(temp) = dim(astatistic); temp[xx] = astatistic[xx]; minpv.cis$update(ss, gg, temp); } if(!is.null(betafun)) beta = betafun(mat[[length(mat)]][select.cis.raw], ss, gg, select.cis); saver.cis$update( snps_offset + select.cis[ , 2], gene_offset + select.cis[ , 1], statistic[select.cis.raw], beta); # statistic.select.cis = statistic[ select.cis ]; # test = testfun( statistic.select.cis ); # pv = pvfun(test); # Saver.cis$WriteBlock( cbind(snps_offset + select.cis[ , 2], gene_offset + select.cis[ , 1], test, pv) ); # counter.cis$Update(gg, ss, select.cis, pv, n.tests = length(xx), if(do.hist) afun(statistic[xx]) ) rp = paste(rp, ", ", formatC(n.eqtls.cis, big.mark=",", format = "f", digits = 0), " cis-eQTLs", sep = ""); } ## do trans/all analysis if(pvOutputThreshold>0) { if( is.null( statistic ) ) { if( is.null( cursnps ) ) { cursnps = orthonormalize.snps( snps_process( snps$getSlice(ss) ), ss ); } mat = vector("list", length(cursnps)); for(d in 1:length(cursnps)) { mat[[d]] = tcrossprod(curgene, cursnps[[d]]); } statistic = statistic.fun( mat ); astatistic = afun(statistic); # rm(mat); } if( do.hist ) hist.all$update(astatistic); if(!is.null(select.cis.raw)) astatistic[xx] = -1; # select.tra.raw = select.tra.raw[!(select.tra.raw %in% select.cis.raw)]; select.tra.raw = which( astatistic >= thresh); select.tra = arrayInd(select.tra.raw, dim(statistic)) n.eqtls.tra = n.eqtls.tra + length(select.tra.raw); n.tests.all = n.tests.all + length(statistic); if(!is.null(betafun)) beta = betafun(mat[[length(mat)]][select.tra.raw], ss, gg, select.tra); saver.tra$update( snps_offset + select.tra[ , 2], gene_offset + select.tra[ , 1], statistic[select.tra.raw], beta); if( min.pv.by.genesnp ) minpv.tra$update(ss, gg, astatistic) # statistic.select.tra = statistic[ select.tra ]; # test = testfun( statistic.select.tra ); # pv = pvfun( test ); # Saver$WriteBlock( cbind( snps_offset + select.tra[ , 2], gene_offset + select.tra[ , 1], test, pv) ); # counter$Update(gg, ss, select.tra, pv, n.tests = nrcs*nrcg, if(do.hist) afun(statistic) ) rp = paste(rp, ", ", formatC(n.eqtls.tra, big.mark=",", format = "f", digits = 0), if(pvOutputThreshold.cis > 0)" trans-"else" ","eQTLs", sep = "") } #gene_offset = gene_offset + nrcg; if( !is.null(statistic) ) { per = 100*(gg/gene$nSlices() + ss-1) / snps$nSlices(); cat( formatC(floor(per*100)/100, format = "f", width = 5, digits = 2), "% done" , rp, "\n", sep = ""); flush.console(); } } # gg in 1:gene$nSlices() snps_offset = snps_offset + nrow(snps$getSliceRaw(ss)); } # ss in 1:snps$nSlices() } ################################# Results collection #################################### { rez = list(time.in.sec = proc.time()[3] - start.time); rez$param = params; if(pvOutputThreshold.cis > 0) { rez.cis = list(ntests = n.tests.cis, neqtls = n.eqtls.cis); rez.cis = c(rez.cis, saver.cis$getResults( gene, snps, n.tests.cis) ); if(do.hist) rez.cis = c(rez.cis, hist.cis$getResults() ); if(min.pv.by.genesnp) rez.cis = c(rez.cis, minpv.cis$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); } if(pvOutputThreshold>0) { rez.all = list(ntests = n.tests.all, neqtls = n.eqtls.tra + n.eqtls.cis); if(pvOutputThreshold.cis > 0) { rez.tra = list(ntests = n.tests.all - n.tests.cis, neqtls = n.eqtls.tra); rez.tra = c(rez.tra, saver.tra$getResults( gene, snps, n.tests.all - n.tests.cis) ); } else { rez.all = c(rez.all, saver.tra$getResults( gene, snps, n.tests.all ) ); } if(do.hist) { rez.all = c(rez.all, hist.all$getResults() ); if(pvOutputThreshold.cis > 0) { rez.tra$hist.bins = rez.all$hist.bins; rez.tra$hist.counts = rez.all$hist.counts - rez.cis$hist.counts; } } if(min.pv.by.genesnp) { if(pvOutputThreshold.cis > 0) { rez.tra = c(rez.tra, minpv.tra$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); } else { rez.all = c(rez.all, minpv.tra$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); } } } if(exists('rez.all')>0) rez$all = rez.all; if(exists('rez.tra')>0) rez$trans = rez.tra; if(exists('rez.cis')>0) rez$cis = rez.cis; class(rez) = c(class(rez),"MatrixEQTL"); status(""); } # cat('s std ',snps.std$get(1),'\n'); # cat('g std ',gene.std$get(1),'\n'); ################################# Results collection #################################### return(rez); } .histme = function(m, name1, name2, ...) { cnts = m$hist.counts; bins = m$hist.bins; ntst = m$ntests; centers = 0.5 * (bins[-1L] + bins[-length(bins)]); density = 0.5 / (bins[-1L] - centers) * cnts / ntst; ntext = paste("P-value distribution for ", name1, formatC(ntst, big.mark=",", format = "f", digits = 0), name2, " gene-SNP pairs ",sep=""); r = structure(list(breaks = bins, counts = cnts, density = density, mids = centers, equidist = FALSE), class = "histogram"); plot(r, main = ntext, ylab = "Density", xlab = "P-values", ...) abline( h = 1, col = "blue"); return(invisible()); } .qqme = function(m, lcol, cex, pch, ...) { cnts = m$hist.counts; bins = m$hist.bins; ntst = m$ntests; cusu = cumsum(cnts) / ntst; ypos = bins[-1][is.finite(cusu)]; xpos = cusu[is.finite(cusu)]; lines(-log10(xpos), -log10(ypos), col = lcol, ...); # lines(xpos, ypos, col = lcol, ...); if(length(m$eqtls$pvalue)==0) return(); ypvs = -log10(m$eqtls$pvalue); xpvs = -log10(1:length(ypvs) / ntst); if(length(ypvs) > 1000) { # need to filter a bit, make the plotting faster levels = as.integer( xpvs/xpvs[1] * 1e3); keep = c(TRUE, diff(levels)!=0); levels = as.integer( ypvs/ypvs[1] * 1e3); keep = keep | c(TRUE, diff(levels)!=0); ypvs = ypvs[keep]; xpvs = xpvs[keep]; rm(keep) } points(xpvs, ypvs, col = lcol, pch = pch, cex = cex, ...); } plot.MatrixEQTL = function(x, cex = 0.5, pch = 19, xlim = NULL, ylim = NULL, ...) { if( x$param$pvalue.hist == FALSE ) { warning("Cannot plot p-value distribution: the information was not recorded.\nUse pvalue.hist!=FALSE."); return(invisible()); } if( x$param$pvalue.hist == "qqplot" ) { xmin = 1/max(x$cis$ntests, x$all$ntests); ymax = NULL; if(!is.null(ylim)) { ymax = ylim[2]; } else { ymax = -log10(min( x$cis$eqtls$pvalue[1], x$cis$hist.bins[ c(FALSE,x$cis$hist.counts>0)][1], x$all$eqtls$pvalue[1], x$all$hist.bins[ c(FALSE,x$all$hist.counts>0)][1], x$trans$eqtls$pvalue[1], x$trans$hist.bins[c(FALSE,x$trans$hist.counts>0)][1], na.rm = TRUE))+0.1; } if(ymax == 0) { ymax = -log10(.Machine$double.xmin) } if(!is.null(ymax)) ylim = c(0,ymax); if(is.null(xlim)) xlim = c(0, -log10(xmin/1.5)); plot(numeric(),numeric(), xlab = "-Log10(p-value), theoretical", ylab = "-Log10(p-value), observed", xlim = c(0, -log10(xmin/1.5)), ylim = ylim, xaxs="i", yaxs="i", ...); lines(c(0,1e3), c(0,1e3), col = "gray"); if((x$param$pvOutputThreshold > 0) && (x$param$pvOutputThreshold.cis > 0)) { .qqme( x$cis, "red", cex, pch, ...); .qqme( x$trans, "blue", cex, pch, ...); title(paste("QQ-plot for", formatC(x$cis$ntests, big.mark=",", format = "f", digits = 0), "local and", formatC(x$trans$ntests, big.mark=",", format = "f", digits = 0), "distant gene-SNP p-values")); lset = c(1,2,4); } else if(x$param$pvOutputThreshold.cis > 0) { .qqme(x$cis, "red", cex, pch, ...); title(paste("QQ-plot for", formatC(x$cis$ntests, big.mark=",", format = "f", digits = 0), "local gene-SNP p-values")); lset = c(1,4); } else { .qqme(x$all, "blue", cex, pch, ...); title(paste("QQ-plot for all", formatC(x$all$ntests, big.mark=",", format = "f", digits = 0), "gene-SNP p-values")); lset = c(3,4); } legend("topleft", c("Local p-values","Distant p-values","All p-values","diagonal")[lset], col = c("red","blue","blue","gray")[lset], text.col = c("red","blue","blue","gray")[lset], pch = 20, lwd = 1, pt.cex = c(1,1,1,0)[lset]) } else { if((x$param$pvOutputThreshold > 0) && (x$param$pvOutputThreshold.cis > 0)) { par(mfrow=c(2,1)); .histme(x$cis, "", " local", ...); tran = list(hist.counts = x$all$hist.counts - x$cis$hist.counts, hist.bins = x$all$hist.bins, ntests = x$all$ntests - x$cis$ntests); .histme(x$trans,""," distant", ...); par(mfrow=c(1,1)); } else if(x$param$pvOutputThreshold.cis > 0) { .histme(x$cis, "", " local", ...); } else { .histme(x$all, "all ", "" , ...); } } return(invisible()); }