# HG changeset patch # User jasonxu # Date 1615536844 0 # Node ID 2d1118ddc31d25c1492988098f85af3d41b0b501 # Parent cd4c8e4a4b5be7232cc5feb2a668e5b84c2dc16d Uploaded diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/DESCRIPTION --- a/MatrixEQTL/DESCRIPTION Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,20 +0,0 @@ -Package: MatrixEQTL -Type: Package -Title: Matrix eQTL: Ultra fast eQTL analysis via large matrix - operations -Version: 2.1.1 -Date: 2014-02-24 -Author: Andrey Shabalin -Maintainer: Andrey Shabalin -Description: Matrix eQTL is designed for fast eQTL analysis on large datasets. - Matrix eQTL can test for association between genotype and gene expression using linear regression - with either additive or ANOVA genotype effects. - The models can include covariates to account for factors - as population stratification, gender, and clinical variables. - It also supports models with heteroscedastic and/or correlated errors, - false discovery rate estimation and separate treatment of local (cis) and distant (trans) eQTLs. -License: LGPL-3 -LazyLoad: yes -URL: http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ -Depends: R (>= 2.12.0), methods -Packaged: 2014-03-01 10:58:21 UTC; Andrey diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/DESCRIPTION --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/DESCRIPTION Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,20 @@ +Package: MatrixEQTL +Type: Package +Title: Matrix eQTL: Ultra fast eQTL analysis via large matrix + operations +Version: 2.1.1 +Date: 2014-02-24 +Author: Andrey Shabalin +Maintainer: Andrey Shabalin +Description: Matrix eQTL is designed for fast eQTL analysis on large datasets. + Matrix eQTL can test for association between genotype and gene expression using linear regression + with either additive or ANOVA genotype effects. + The models can include covariates to account for factors + as population stratification, gender, and clinical variables. + It also supports models with heteroscedastic and/or correlated errors, + false discovery rate estimation and separate treatment of local (cis) and distant (trans) eQTLs. +License: LGPL-3 +LazyLoad: yes +URL: http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ +Depends: R (>= 2.12.0), methods +Packaged: 2014-03-01 10:58:21 UTC; Andrey diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/NAMESPACE --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/NAMESPACE Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,27 @@ +importFrom(methods, setRefClass) +exportPattern("^[[:alpha:]]+") +exportClasses( + "SlicedData" +) +S3method(plot, MatrixEQTL) +S3method(summary, SlicedData) +exportMethods( + "[[", + "[[<-", + "colnames", + "colnames<-", + "dim", + "length", + "ncol", + "NCOL", + "nrow", + "NROW", + "rownames", + "rownames<-", + "as.matrix", + "rowMeans", + "rowSums", + "colMeans", + "colSums" +) + diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/R/Matrix_eQTL_engine.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/R/Matrix_eQTL_engine.R Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,1964 @@ +# Matrix eQTL by Andrey A. Shabalin +# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ + +# http://cran.r-project.org/web/packages/policies.html + +library(methods) + +modelLINEAR = 117348L; +modelANOVA = 47074L; +modelLINEAR_CROSS = 1113461L; + +.seq = function(a,b){if(a<=b){a:b}else{integer(0)}}; + +SlicedData <- setRefClass( "SlicedData", + fields = list( + dataEnv = "environment", + nSlices1 = "numeric", + rowNameSlices = "list", + columnNames = "character", + fileDelimiter = "character", + fileSkipColumns = "numeric", + fileSkipRows = "numeric", + fileSliceSize = "numeric", + fileOmitCharacters = "character" + ), + methods = list( + initialize = function( mat = NULL ) { + dataEnv <<- new.env(hash = TRUE, size = 29L); + nSlices1 <<- 0L; + if(!is.null(mat)) { + CreateFromMatrix(mat); + } + fileSliceSize <<- 1000; + fileDelimiter <<- "\t"; + fileSkipColumns <<- 1L; + fileSkipRows <<- 1L; + fileOmitCharacters <<- "NA" + return(invisible(.self)); + }, + CreateFromMatrix = function( mat ) { + stopifnot( class(mat) == "matrix" ); + setSliceRaw( 1L ,mat ); + rns = rownames( mat, do.NULL = FALSE); + #if( is.null(rns) ) { + # rns = paste( "Row_",(1:nrow(mat)), sep="" ); + #} + rowNameSlices <<- list(rns); + cns = colnames( mat, do.NULL = FALSE ); + #if( is.null(cns) ){ + # cns = paste( "Col_",(1:ncol(mat)), sep="" ); + #} + columnNames <<- cns; + return(invisible(.self)); + }, + getSlice = function(sl) { + value = get(paste(sl), dataEnv); + if( is.raw(value) ) { + storage.mode(value) = "double"; + value[value == 255] = NA; + } + return( value ) + }, + getSliceRaw = function(sl) { + return( get(paste(sl), dataEnv) ) + }, + setSliceRaw = function(sl, value) { + assign( paste(sl), value, dataEnv ) + if( nSlices1 < sl ) { + nSlices1 <<- sl; + } + }, + setSlice = function(sl, value) { + if( length(value) > 0 ) { + if( all(as.integer(value) == value, na.rm = TRUE) ) { + if( (min(value, na.rm = TRUE) >= 0 ) && + (max(value, na.rm = TRUE) < 255) ) + { + nv = value; + suppressWarnings({storage.mode(nv) = "raw"}); + nv[ is.na(value)] = as.raw(255); + value = nv; + } else { + storage.mode(value) = "integer"; + } + } + } + setSliceRaw(sl, value); + }, + nSlices = function() { + return( nSlices1 ); + }, + LoadFile = function(filename, skipRows = NULL, skipColumns = NULL, sliceSize = NULL, omitCharacters = NULL, delimiter = NULL, rowNamesColumn = 1) { + if( !is.null(skipRows) ) { + fileSkipRows <<- skipRows; + } + if( !is.null(skipColumns) ) { + fileSkipColumns <<- skipColumns; + } + if( !is.null(omitCharacters) ) { + fileOmitCharacters <<- omitCharacters; + } + if( !is.null(sliceSize) ) { + fileSliceSize <<- sliceSize; + } + if( !is.null(delimiter) ) { + fileDelimiter <<- delimiter; + } + stopifnot( (fileSkipColumns == 0) || (rowNamesColumn <= fileSkipColumns) ) + stopifnot( (fileSkipColumns == 0) || (rowNamesColumn >= 1) ) + + fid = file(description = filename, open = "rt", blocking = FALSE, raw = FALSE) + # clean object if file is open + Clear(); + lines = readLines(con = fid, n = max(fileSkipRows,1L), ok = TRUE, warn = TRUE) + line1 = tail(lines,1); + splt = strsplit(line1, split = fileDelimiter, fixed = TRUE); + if( fileSkipRows > 0L ) { + columnNames <<- splt[[1]]; # [ -(1:fileSkipColumns) ]; + } else { + seek(fid, 0) + } + + rm( lines, line1, splt ); + + rowNameSlices <<- vector("list", 15); + + curSliceId = 0L; + repeat + { + # preallocate names and data + if(length(rowNameSlices) < curSliceId) { + rowNameSlices[[2L*curSliceId]] <<- NULL; + } + curSliceId = curSliceId + 1L; + + # read sliceSize rows + rowtag = vector("character",fileSliceSize); + rowvals = vector("list",fileSliceSize); + for(i in 1:fileSliceSize) { + temp = ""; + if( fileSkipColumns > 0L ) { + temp = scan(file = fid, what = character(), n = fileSkipColumns, quiet = TRUE,sep = fileDelimiter); + } + rowtag[i] = temp[rowNamesColumn];#paste(temp,collapse=" "); + rowvals[[i]] = scan(file = fid, what = double(), nlines = 1, quiet = TRUE, sep = fileDelimiter, na.strings = fileOmitCharacters); + if( length(rowvals[[i]]) == 0L ) { + if(i==1L) { + rowtag = matrix(0, 0, 0); + rowvals = character(0); + } else { + rowtag = rowtag[ 1:(i-1) ]; + rowvals = rowvals[ 1:(i-1) ]; + } + break; + } + } + if( length(rowtag) == 0L ) { + curSliceId = curSliceId - 1L; + break; + } + rowNameSlices[[curSliceId]] <<- rowtag; + data = c(rowvals, recursive = TRUE); + dim(data) = c(length(rowvals[[1]]), length(rowvals)); + data = t(data); + setSlice(curSliceId, data); + if( length(rowtag) < fileSliceSize ) { + break; + } + numtxt = formatC(curSliceId*fileSliceSize, big.mark=",", format = "f", digits = 0) + cat( "Rows read: ", numtxt, "\n"); + flush.console() + } + close(fid) + if( fileSkipRows == 0 ) { + columnNames <<- paste("Col_", (1:nCols()), sep=""); + } else { + columnNames <<- tail(columnNames, ncol(getSliceRaw(1))); + } + if( fileSkipColumns == 0 ) { + cnt = 0L; + for( sl in 1:nSlices() ) { + nr = length(getSliceRaw(sl)); + rowNameSlices[[sl]] <<- paste("Row_",cnt + (1:nr),sep=""); + cnt = cnt + nr; + } + } + rowNameSlices <<- rowNameSlices[1:curSliceId]; + cat("Rows read: ", nRows(), " done.\n"); + return(invisible(.self)); + }, + SaveFile = function(filename) { + if( nSlices() == 0 ) { + cat("No data to save"); + return(); + } + fid = file(filename,"wt"); + for( sl in 1:nSlices() ) { + z = getSlice(sl); + rownames(z) = rowNameSlices[[sl]]; + colnames(z) = columnNames; + write.table(z, file = fid, sep = "\t", + col.names = (if(sl == 1){NA}else{FALSE})); + } + close(fid); + }, + nRows = function() { + s = 0L; + for(sl in .seq(1,nSlices())) { + s = s + nrow(getSliceRaw(sl)); + } + return( s ) + }, + nCols = function() { + if( nSlices() == 0L ) { + return(0L); + } else { + return( ncol(getSliceRaw(1L)) ) + } + }, + Clear = function() { + for( sl in .seq(1,nSlices()) ) { + rm(list = paste(sl), envir = dataEnv) + } + nSlices1 <<- 0L; + rowNameSlices <<- list(); + columnNames <<- character(); + return(invisible(.self)); + }, + IsCombined = function() { + return( nSlices() <= 1L ); + }, + GetAllRowNames = function() { + return( c(rowNameSlices, recursive=TRUE) ); + }, + SetNanRowMean = function() { + if( (nCols() == 0L) ) { + return(invisible(.self)); + } + for( sl in .seq(1,nSlices()) ) { + slice = getSlice(sl); + if( any(is.na(slice)) ) { + rowmean = rowMeans(slice, na.rm = TRUE); + rowmean[is.na(rowmean)] = 0L; + for( j in which(!complete.cases(slice)) ) { + where1 = is.na(slice[j, ]); + slice[j, where1] = rowmean[j]; + } + setSlice(sl, slice); + } + } + return(invisible(.self)); + }, + RowStandardizeCentered = function() { + for(sl in .seq(1,nSlices()) ) { + slice = getSlice(sl); + div = sqrt( rowSums(slice^2) ); + div[ div == 0 ] = 1; + setSlice(sl, slice/div); + } + return(invisible(.self)); + }, + CombineInOneSlice = function() { + if( nSlices() <= 1L ) { + return(invisible(.self)); + } + nc = nCols(); + nr = nRows(); + datatypes = c("raw","integer","double"); + datafuns = c(as.raw, as.integer, as.double); + datatype = character(nSlices()); + for(sl in 1:nSlices()) { + datatype[sl] = typeof(getSliceRaw(sl)); + } + mch = max(match(datatype,datatypes,nomatch = length(datatypes))); + datafun = datafuns[[mch]]; + newData = matrix(datafun(0), nrow = nr, ncol = nc); + offset = 0; + for(sl in 1:nSlices()) { + if(mch==1) { + slice = getSliceRaw(sl); + } else { + slice = getSlice(sl); + } + newData[ offset + (1:nrow(slice)),] = datafun(slice); + setSlice(sl, numeric()); + offset = offset + nrow(slice); + } + + nSlices1 <<- 1L; + setSliceRaw(1L, newData); + rm(newData); + + newrowNameSlices = GetAllRowNames(); + rowNameSlices <<- list(newrowNameSlices) + return(invisible(.self)); + }, + ResliceCombined = function(sliceSize = -1) { + if( sliceSize > 0L ) { + fileSliceSize <<- sliceSize; + } + if( fileSliceSize <= 0 ) { + fileSliceSize <<- 1000; + } + if( IsCombined() ) { + nRows1 = nRows(); + if(nRows1 == 0L) { + return(invisible(.self)); + } + newNSlices = floor( (nRows1 + fileSliceSize - 1)/fileSliceSize ); + oldData = getSliceRaw(1L); + #oldNames = rowNameSlices[[1]]; + newNameslices = vector("list",newNSlices) + for( sl in 1:newNSlices ) { + range = (1+(sl-1)*fileSliceSize) : (min(nRows1,sl*fileSliceSize)); + newpart = oldData[range, ,drop = FALSE]; + if( is.raw(oldData) ) { + setSliceRaw( sl, newpart); + } else { + setSlice( sl, newpart); + } + newNameslices[[sl]] = rowNameSlices[[1]][range]; + } + rowNameSlices <<- newNameslices ; + } else { + stop("Reslice of a sliced matrix is not supported yet. Use CombineInOneSlice first."); + } + return(invisible(.self)); + }, + Clone = function() { + clone = SlicedData$new(); + for(sl in .seq(1,nSlices()) ) { + clone$setSliceRaw(sl,getSliceRaw(sl)); + } + clone$rowNameSlices = rowNameSlices; + clone$columnNames = columnNames; + clone$fileDelimiter = fileDelimiter; + clone$fileSkipColumns = fileSkipColumns; + clone$fileSkipRows = fileSkipRows; + clone$fileSliceSize = fileSliceSize; + clone$fileOmitCharacters = fileOmitCharacters; + return( clone ); + }, + RowMatrixMultiply = function(multiplier) { + for(sl in .seq(1,nSlices()) ) { + setSlice(sl, getSlice(sl) %*% multiplier); + } + return(invisible(.self)); + }, + ColumnSubsample = function(subset) { + for(sl in .seq(1,nSlices()) ) { + setSliceRaw(sl, getSliceRaw(sl)[ ,subset, drop = FALSE]); + } + columnNames <<- columnNames[subset]; + return(invisible(.self)); + }, + RowReorderSimple = function(ordr) { + # had to use an inefficient and dirty method + # due to horrible memory management in R + if( (typeof(ordr) == "logical") && all(ordr) ) { + return(invisible(.self)); + } + if( (length(ordr) == nRows()) && all(ordr == (1:length(ordr))) ) { + return(invisible(.self)); + } + CombineInOneSlice(); + gc(); + setSliceRaw( 1L, getSliceRaw(1L)[ordr, ] ); + rowNameSlices[[1]] <<- rowNameSlices[[1]][ordr]; + gc(); + ResliceCombined(); + gc(); + return(invisible(.self)); + }, + RowReorder = function(ordr) { + # transform logical into indices + if( typeof(ordr) == "logical" ) { + if( length(ordr) == nRows() ) { + ordr = which(ordr); + } else { + stop("Parameter \"ordr\" has wrong length") + } + } + ## first, check that anything has to be done at all + if( (length(ordr) == nRows()) && all(ordr == (1:length(ordr))) ) { + return(invisible(.self)); + } + ## check bounds + #if( (min(ordr) < 1) || (max(ordr) > nRows()) ) { + # stop("Parameter \"ordr\" is out of bounds"); + #} + ## slice the data into individual rows + all_rows = vector("list", nSlices()) + for( i in 1:nSlices() ) { + slice = getSliceRaw(i) + all_rows[[i]] = split(slice, 1:nrow(slice)) + setSliceRaw(i,numeric()) + } + gc(); + all_rows = unlist(all_rows, recursive=FALSE, use.names = FALSE); + ## Reorder the rows + all_rows = all_rows[ordr]; + ## get row names + all_names = GetAllRowNames(); + ## erase the set + rowNameSlices <<- list(); + ## sort names + all_names = all_names[ordr]; + ## + ## Make slices back + nrows = length(all_rows); + nSlices1 <<- as.integer((nrows+fileSliceSize-1)/fileSliceSize); + ##cat(nrows, " ", nSlices1); + rowNameSlices1 = vector("list", nSlices1); + for( i in 1:nSlices1 ) { + fr = 1 + fileSliceSize*(i-1); + to = min( fileSliceSize*i, nrows); + + subset = all_rows[fr:to]; + types = unlist(lapply(subset,typeof)); + israw = (types == "raw") + if(!all(israw == israw[1])) { + # some raw and some are not + subset = lapply(subset, function(x){if(is.raw(x)){x=as.integer(x);x[x==255] = NA;return(x)}else{return(x)}}); + } + subset = unlist(subset); + dim(subset) = c( length(all_rows[[fr]]) , to - fr + 1) + #subset = matrix(subset, ncol = (to-fr+1)); + if(is.raw(subset)) { + setSliceRaw(i, t(subset)); + } else { + setSlice(i, t(subset)); + } + rowNameSlices1[[i]] = all_names[fr:to]; + all_rows[fr:to] = 0; + all_names[fr:to] = 0; + } + rowNameSlices <<- rowNameSlices1; + gc(); + return(invisible(.self)); + }, + RowRemoveZeroEps = function(){ + for(sl in .seq(1,nSlices()) ) { + slice = getSlice(sl); + amean = rowMeans(abs(slice)); + remove = (amean < .Machine$double.eps*nCols()); + if(any(remove)) { + rowNameSlices[[sl]] <<- rowNameSlices[[sl]][!remove]; + setSlice(sl, slice[!remove, , drop = FALSE]); + } + } + return(invisible(.self)); + }, + FindRow = function(rowname) { + for(sl in .seq(1,nSlices()) ) { + mch = match(rowname,rowNameSlices[[sl]], nomatch = 0); + if( mch > 0 ) + { + row = getSlice(sl)[mch[1], , drop=FALSE]; + rownames(row) = rowname; + colnames(row) = columnNames; + return( list(slice = sl, item = mch, row = row) ); + } + } + return( NULL ); + }, + show = function() { + cat("SlicedData object. For more information type: ?SlicedData\n"); + cat("Number of columns:", nCols(), "\n"); + cat("Number of rows:", nRows(), "\n"); + cat("Data is stored in", nSlices(), "slices\n"); + if(nCols()>0) { + z = getSlice(1L); + if(nrow(z)>0) { + z = z[1:min(nrow(z),10L), 1:min(ncol(z),10L), drop = FALSE]; + rownames(z) = rowNameSlices[[1]][1:nrow(z)]; + colnames(z) = columnNames[1:ncol(z)]; + cat("Top left corner of the first slice (up to 10x10):\n"); + methods:::show(z) + } + } + } + )) + +setGeneric("nrow") +setMethod("nrow", "SlicedData", function(x) { + return( x$nRows() ); + }) +setGeneric("NROW") +setMethod("NROW", "SlicedData", function(x) { + return( x$nRows() ); + }) +setGeneric("ncol") +setMethod("ncol", "SlicedData", function(x) { + return( x$nCols() ); + }) +setGeneric("NCOL") +setMethod("NCOL", "SlicedData", function(x) { + return( x$nCols() ); + }) +setGeneric("dim") +setMethod("dim", "SlicedData", function(x) { + return( c(x$nRows(),x$nCols()) ); + }) +setGeneric("colnames") +setMethod("colnames", "SlicedData", function(x) { + return( x$columnNames ); + }) +setGeneric("rownames") +setMethod("rownames", "SlicedData", function(x) { + return( x$GetAllRowNames() ); + }) +setMethod("[[", "SlicedData", function(x,i) { + return( x$getSlice(i) ); + }) +setGeneric("length") +setMethod("length", "SlicedData", function(x) { + return( x$nSlices() ); + }) +setMethod("[[<-", "SlicedData", function(x,i,value) { + x$setSlice(i, value); + return(x); +}) +summary.SlicedData = function(object, ...) { + z = c(nCols = object$nCols(), nRows = object$nRows(), nSlices = object$nSlices()); + return(z); +} + +##### setGeneric("summary") ##### +#setMethod("summary", "SlicedData", function(object, ...) { +# z = c(nCols = object$nCols(), nRows = object$nRows(), nSlices = object$nSlices()); +# return(z); +# }) +#setGeneric("show", standardGeneric("show")) +# setMethod("show", "SlicedData", function(object) { +# cat("SlicedData object. For more information type: ?SlicedData\n"); +# cat("Number of columns:", object$nCols(), "\n"); +# cat("Number of rows:", object$nRows(), "\n"); +# cat("Data is stored in", object$nSlices(), "slices\n"); +# if(object$nSlices()>0) { +# z = object$getSlice(1); +# if(nrow(z)>0) { +# z = z[1:min(nrow(z),10), 1:min(ncol(z),10), drop = FALSE]; +# rownames(z) = object$rowNameSlices[[1]][1:nrow(z)]; +# colnames(z) = object$columnNames[1:ncol(z)]; +# cat("Top left corner of the first slice (up to 10x10):\n"); +# show(z) +# } +# } +# }) + +setGeneric("as.matrix") +setMethod("as.matrix", "SlicedData", function(x) { + if(x$nSlices() == 0) { + return( matrix(0,0,0) ); + } + if(x$nSlices() > 1) { + copy = x$Clone(); + copy$CombineInOneSlice(); + } else { + copy = x; + } + mat = copy$getSlice(1L); + rownames(mat) = rownames(copy); + colnames(mat) = colnames(copy); + return( mat ); + }) +setGeneric("colnames<-") +setMethod("colnames<-", "SlicedData", function(x,value) { + stopifnot( class(value) == "character" ); + stopifnot( length(value) == x$nCols() ); + x$columnNames = value; + return(x); + }) +setGeneric("rownames<-") +setMethod("rownames<-", "SlicedData", function(x,value) { + stopifnot( class(value) == "character" ); + stopifnot( length(value) == x$nRows() ); + start = 1; + newNameSlices = vector("list", x$nSlices()); + for( i in .seq(1,x$nSlices()) ) { + nr = nrow(x$getSliceRaw(i)); + newNameSlices[[i]] = value[ start:(start+nr-1) ]; + start = start + nr; + } + x$rowNameSlices = newNameSlices; + return(x); + }) +setGeneric("rowSums") +setMethod("rowSums", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { + if(x$nSlices() == 0) { + return( numeric() ); + } + stopifnot( dims == 1 ); + thesum = vector("list", x$nSlices()); + for( i in 1:x$nSlices() ) { + thesum[[i]] = rowSums(x$getSlice(i), na.rm) + } + return(unlist(thesum, recursive = FALSE, use.names = FALSE)); + }) +setGeneric("rowMeans") +setMethod("rowMeans", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { + if(x$nSlices() == 0) { + return( numeric() ); + } + stopifnot( dims == 1 ); + thesum = vector("list", x$nSlices()); + for( i in 1:x$nSlices() ) { + thesum[[i]] = rowMeans(x$getSlice(i), na.rm) + } + return(unlist(thesum, recursive = FALSE, use.names = FALSE)); + }) +setGeneric("colSums") +setMethod("colSums", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { + if(x$nCols() == 0) { + return( numeric() ); + } + stopifnot( dims == 1 ); + thesum = 0; + for( i in .seq(1,x$nSlices()) ) { + thesum = thesum + colSums(x$getSlice(i), na.rm) + } + return(thesum); + }) +setGeneric("colMeans") +setMethod("colMeans", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { + if(x$nCols() == 0) { + return( numeric() ); + } + stopifnot( dims == 1 ); + thesum = 0; + thecounts = x$nRows(); + for( i in .seq(1,x$nSlices()) ) { + slice = x$getSlice(i); + thesum = thesum + colSums(slice, na.rm) + if( na.rm ) { + thecounts = thecounts - colSums(is.na(slice)) + } + } + return(thesum/thecounts); + }) + +.listBuilder <- setRefClass(".listBuilder", + fields = list( + dataEnv = "environment", + n = "numeric" + ), + methods = list( + initialize = function() { + dataEnv <<- new.env(hash = TRUE); + n <<- 0; +# cumlength <<- 0; + return(.self); + }, + add = function(x) { + if(length(x) > 0) { + n <<- n + 1; +# cumlength <<- cumlength + length(x); + assign(paste(n), x, dataEnv ); + } + return(.self); + }, + set = function(i,x) { + if(length(x) > 0) { + if(i>n) + n <<- i; + assign(paste(i), x, dataEnv ); + } + return(.self); + }, + get = function(i) { + return(base::get(paste(i),dataEnv)); + }, + list = function() { + if(n==0) return(list()); + result = vector('list',n); + for( i in 1:n) { + result[[i]] = .self$get(i); + } + return(result); + }, + unlist = function() { + return(base::unlist(.self$list(), recursive=FALSE, use.names = FALSE)); + }, + show = function() { + cat(".listBuilder object.\nIternal object in MatrixEQTL package.\n"); + cat("Number of elements:", object$n, "\n"); + } + )) + +.histogrammer <- setRefClass(".histogrammer", + fields = list( + pvbins1 = "numeric", + statbins1 = "numeric", + hist.count = "numeric" + ), + methods = list( + initialize = function (pvbins, statbins) { + if(length(pvbins)) { + ord = order(statbins); + pvbins1 <<- pvbins[ord]; + statbins1 <<- statbins[ord]; + hist.count <<- double(length(pvbins)-1); + } + return(.self); + }, + update = function(stats.for.hist) { + h = hist(stats.for.hist, breaks = statbins1, include.lowest = TRUE, right = TRUE, plot = FALSE)$counts; + hist.count <<- hist.count + h; + }, + getResults = function() { + if(!is.unsorted(pvbins1)) { + return(list(hist.bins = pvbins1 , hist.counts = hist.count )); + } else { + return(list(hist.bins = rev(pvbins1), hist.counts = rev(hist.count))); + } + } + )) + + +.minpvalue <- setRefClass(".minpvalue", + fields = list( + sdata = ".listBuilder", + gdata = ".listBuilder" + ), + methods = list( + initialize = function(snps, gene) { + sdata <<- .listBuilder$new(); + for( ss in 1:snps$nSlices() ) { + sdata$set( ss, double(nrow(snps$getSliceRaw(ss)))); + } + gdata <<- .listBuilder$new(); + for( gg in 1:gene$nSlices() ) { + gdata$set( gg, double(nrow(gene$getSliceRaw(gg)))); + } + return(.self); + }, + update = function(ss, gg, astat) { + gmax = gdata$get(gg) + z1 = max.col(astat,ties.method='first'); + z11 = astat[1:nrow(astat) + nrow(astat) * (z1 - 1)]; + gmax = pmax(gmax, z11); + gdata$set(gg, gmax); + + smax = sdata$get(ss) + z22 = apply(astat,2,max); + smax = pmax(smax, z22); + sdata$set(ss, smax); + return(.self); + }, + updatecis = function(ss, gg, select.cis, astat) { + if(length(astat)>0) + { + byrows = aggregate(x=astat, by=list(row=select.cis[,1]), FUN=max); + bycols = aggregate(x=astat, by=list(col=select.cis[,2]), FUN=max); + + gmax = gdata$get(gg); + gmax[byrows$row] = pmax(gmax[byrows$row], byrows$x) + gdata$set(gg, gmax); + + smax = sdata$get(ss) + smax[bycols$col] = pmax(smax[bycols$col], bycols$x) + sdata$set(ss, smax); + } + return(.self); + }, + getResults = function(snps, gene, pvfun) { + min.pv.snps = pvfun(sdata$unlist()); + names(min.pv.snps) = rownames(snps); + min.pv.gene = pvfun(gdata$unlist()); + names(min.pv.gene) = rownames(gene); + return(list(min.pv.snps = min.pv.snps, min.pv.gene = min.pv.gene)); + } + )) + +.OutputSaver_FRD <- setRefClass(".OutputSaver_FRD", + fields = list( + sdata = ".listBuilder", + gdata = ".listBuilder", + cdata = ".listBuilder", + bdata = ".listBuilder", + fid = "list", + testfun1 = "list", + pvfun1 = "list" + ), + methods = list( + initialize = function () { + sdata <<- .listBuilder$new(); + gdata <<- .listBuilder$new(); + cdata <<- .listBuilder$new(); + bdata <<- .listBuilder$new(); + fid <<- list(0); + testfun1 <<- list(0); + pvfun1 <<- list(0); + return(.self); + }, + start = function(filename, statistic_name, unused1, unused2, testfun, pvfun) { + testfun1 <<- list(testfun); + pvfun1 <<- list(pvfun); + if(length(filename) > 0) { + if(class(filename) == "character") { + fid <<- list(file(description = filename, open = "wt", blocking = FALSE, raw = FALSE), TRUE); + } else { + fid <<- list(filename, FALSE) + } + writeLines( paste("SNP\tgene\t",statistic_name,"\tp-value\tFDR", sep = ""), fid[[1]]); + } else { + fid <<- list(); + } + }, + update = function(spos, gpos, sta, beta = NULL) { + if(length(sta)>0) { + sdata$add(spos); + gdata$add(gpos); + cdata$add(sta ); + if(!is.null(beta )) + bdata$add(beta ); + } + return(.self); + }, + getResults = function( gene, snps, FDR_total_count) { + pvalues = NULL; + if(cdata$n > 0) { + tests = testfun1[[1]](cdata$unlist()); + cdata <<- .listBuilder$new(); + + pvalues = pvfun1[[1]](tests); + ord = order(pvalues); + + tests = tests[ord]; + pvalues = pvalues[ord]; + + FDR = pvalues * FDR_total_count / (1:length(pvalues)); + FDR[length(FDR)] = min(FDR[length(FDR)], 1); + FDR = rev(cummin(rev(FDR))); + + snps_names = snps$GetAllRowNames()[sdata$unlist()[ord]]; + sdata <<- .listBuilder$new(); + gene_names = gene$GetAllRowNames()[gdata$unlist()[ord]]; + gdata <<- .listBuilder$new(); + + beta = NULL; + if(bdata$n > 0) + beta = bdata$unlist()[ord]; + + if(length(fid)>0) { + step = 1000; ########### 100000 + for( part in 1:ceiling(length(FDR)/step) ) { + fr = (part-1)*step + 1; + to = min(part*step, length(FDR)); + dump = data.frame(snps_names[fr:to], + gene_names[fr:to], + if(is.null(beta)) tests[fr:to] else list(beta[fr:to],tests[fr:to]), + pvalues[fr:to], + FDR[fr:to], + row.names = NULL, + check.rows = FALSE, + check.names = FALSE, + stringsAsFactors = FALSE); + write.table(dump, file = fid[[1]], quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE); + } + } + } else { + cat("No significant associations were found.\n", file = if(length(fid)>0){fid[[1]]}else{""}); + } + if(length(fid)>0) { + if(fid[[2]]) { + close(fid[[1]]); + } + fid <<- list(); + } + + if(!is.null(pvalues)) { + eqtls = list( snps = snps_names, + gene = gene_names, + statistic = tests, + pvalue = pvalues, + FDR = FDR); + if(!is.null(beta)) + eqtls$beta = beta; + } else { + eqtls = list( snps = character(), + gene = character(), + beta = numeric(), + statistic = numeric(), + pvalue = numeric(), + FDR = numeric()); + } + return(list(eqtls = data.frame(eqtls))); + } + ) +) + + +.OutputSaver_direct <- setRefClass(".OutputSaver_direct", + fields = list( + gene_names = "character", + snps_names = "character", + fid = "list", + testfun1 = "list", + pvfun1 = "list" + ), + methods = list( + initialize = function() { + gene_names <<- character(0); + snps_names <<- character(0); + fid <<- list(0); + testfun1 <<- list(0); + pvfun1 <<- list(0); + return(.self); + }, + start = function(filename, statistic_name, snps, gene, testfun, pvfun) { + # I hope the program stops if it fails to open the file + if(class(filename) == "character") { + fid <<- list(file(description = filename, open = "wt", blocking = FALSE, raw = FALSE), TRUE); + } else { + fid <<- list(filename, FALSE) + } + writeLines(paste("SNP\tgene\t", statistic_name, "\tp-value", sep = ""), fid[[1]]); + gene_names <<- gene$GetAllRowNames(); + snps_names <<- snps$GetAllRowNames(); + testfun1 <<- list(testfun); + pvfun1 <<- list(pvfun); + }, + update = function(spos, gpos, sta, beta = NULL) { + if( length(sta) == 0 ) + return(); + sta = testfun1[[1]](sta); + lst = list(snps = snps_names[spos], gene = gene_names[gpos], beta = beta, statistic = sta, pvalue = pvfun1[[1]](sta)); + lst$beta = lst$beta; + + dump2 = data.frame(lst, row.names = NULL, check.rows = FALSE, check.names = FALSE, stringsAsFactors = FALSE); + write.table(dump2, file = fid[[1]], quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE); + }, + getResults = function(...) { + if(length(fid)>0) { + if(fid[[2]]) { + close(fid[[1]]); + } + fid <<- list(); + } + gene_names <<- character(0); + snps_names <<- character(0); + return(list()); + } + ) +) + +.my.pmin = function(x, val) { + # minimum "pmin" function that can handle empty array + if(length(x) == 0) { + return(x) + } else { + return(pmin.int(x,val)); + } +} + +.my.pmax = function(x, val) { + # minimum "pmin" function that can handle empty array + if(length(x) == 0) { + return(x) + } else { + return(pmax.int(x,val)); + } +} + +.pv.nz = function(x){return( .my.pmax(x,.Machine$double.xmin) )} + +.SetNanRowMean = function(x) { + if( any(is.na(x)) ) { + rowmean = rowMeans(x, na.rm = TRUE); + rowmean[ is.na(rowmean) ] = 0; + for( j in which(!complete.cases(x)) ) { + where1 = is.na( x[j, ] ); + x[j,where1] = rowmean[j]; + } + } + return(x); +} + +# .SNP_process_split_for_ANOVA = function(x) { +# # split into 2 dummy variables +# +# uniq = unique(c(x)); +# uniq = uniq[!is.na(uniq)]; +# +# if( length(uniq) > 3 ) { +# stop("More than three genotype categories is not handled by ANOVA"); +# } else if ( length(uniq) < 3 ) { +# uniq = c(uniq, min(uniq)-(1:(3-length(uniq)))); +# } +# +# freq = matrix(0, nrow(x), length(uniq)); +# for(i in 1:length(uniq)) { +# freq[ ,i] = rowSums(x==uniq[i], na.rm = TRUE); +# } +# +# md = apply(freq, 1, which.max); +# freq[ cbind(1:nrow(x),md) ] = -1; +# +# md = apply(freq, 1, which.max); # min(freq[cbind(1:nrow(slice),md)] - rowSums(select,na.rm = TRUE )) +# new_slice1 = (x == uniq[md]); +# new_slice1[is.na(new_slice1)] = 0; +# freq[ cbind(1:nrow(x),md) ] = -1; +# +# md = apply(freq,1,which.max); +# new_slice2 = (x == uniq[md]); +# new_slice2[ is.na(new_slice2) ] = 0; +# rez = vector("list", 2); +# rez[[1]] = new_slice1; +# rez[[2]] = new_slice2; +# return( rez ); +# } + +.SNP_process_split_for_ANOVA = function(x,n.groups) { + # split into 2 dummy variables (or more) + +# # Get the number of ANOVA groups +# n.groups = options('MatrixEQTL.ANOVA.categories')[[1]]; +# if( is.null(n.groups)) +# n.groups = 3; + + # Unique values in x (make sure it has length of n.groups); + uniq = unique(as.vector(x)); + uniq = uniq[!is.na(uniq)]; + if( length(uniq) > n.groups ) { + stop("More than declared number of genotype categories is detected by ANOVA"); + } else if ( length(uniq) < n.groups ) { + uniq = c(uniq, rep(min(uniq)-1, n.groups-length(uniq))); + } + + # Table of frequencies for each variable (row) + freq = matrix(0, nrow(x), n.groups); + for(i in 1:n.groups) { + freq[ ,i] = rowSums(x==uniq[i], na.rm = TRUE); + } + # remove NA's from x for convenience + x[is.na(x)] = min(uniq)-2; + + # Output list of matrices + rez = vector('list',n.groups-1); + + # Skip the most frequent value + md = apply(freq, 1, which.max); # most frequent value for each variable + freq[ cbind(1:nrow(x),md) ] = -1; + + # The rest form dumm + for(j in 1:(n.groups-1)){ + md = apply(freq, 1, which.max); + freq[ cbind(1:nrow(x),md) ] = -1; + rez[[j]] = (x == uniq[md]); + } + return( rez ); +} + +Matrix_eQTL_engine = function( + snps, + gene, + cvrt = SlicedData$new(), + output_file_name, + pvOutputThreshold = 1e-5, + useModel = modelLINEAR, + errorCovariance = numeric(), + verbose = TRUE, + pvalue.hist = FALSE, + min.pv.by.genesnp = FALSE, + noFDRsaveMemory = FALSE) { + rez = Matrix_eQTL_main( + snps = snps, + gene = gene, + cvrt = cvrt, + output_file_name = output_file_name, + pvOutputThreshold = pvOutputThreshold, + useModel = useModel, + errorCovariance = errorCovariance, + verbose = verbose, + pvalue.hist = pvalue.hist, + min.pv.by.genesnp = min.pv.by.genesnp, + noFDRsaveMemory = noFDRsaveMemory); + return( rez ); +} + +Matrix_eQTL_main = function( + snps, + gene, + cvrt = SlicedData$new(), + output_file_name = "", + pvOutputThreshold = 1e-5, + useModel = modelLINEAR, + errorCovariance = numeric(), + verbose = TRUE, + output_file_name.cis = "", + pvOutputThreshold.cis = 0, + snpspos = NULL, + genepos = NULL, + cisDist = 1e6, + pvalue.hist = FALSE, + min.pv.by.genesnp = FALSE, + noFDRsaveMemory = FALSE) { + ################################# Basic variable checks ################################# + { + # status("Performing basic checks of the input variables"); + stopifnot( "SlicedData" %in% class(gene) ); + stopifnot( "SlicedData" %in% class(snps) ); + stopifnot( "SlicedData" %in% class(cvrt) ); + + # Check dimensions + if( min(snps$nRows(),snps$nCols()) == 0 ) + stop("Empty genotype dataset"); + if( min(gene$nRows(),gene$nCols()) == 0 ) + stop("Empty expression dataset"); + if( snps$nCols() != gene$nCols() ) + stop("Different number of samples in the genotype and gene expression files"); + if( cvrt$nRows()>0 ) { + if( snps$nCols() != cvrt$nCols() ) + stop("Wrong number of samples in the matrix of covariates"); + } + + stopifnot( class(pvOutputThreshold) == "numeric" ); + stopifnot( length(pvOutputThreshold) == 1 ); + stopifnot( pvOutputThreshold >= 0 ); + stopifnot( pvOutputThreshold <= 1 ); + + stopifnot( class(noFDRsaveMemory) == "logical" ); + stopifnot( length(noFDRsaveMemory) == 1 ); + + if( pvOutputThreshold > 0 ) { + stopifnot( !((length(output_file_name) == 0) && noFDRsaveMemory) ) + stopifnot( length(output_file_name) <= 1 ); + if( length(output_file_name) == 1 ) { + stopifnot( class(output_file_name) %in% c("character","connection") ); + } + } + + stopifnot( class(pvOutputThreshold.cis) == "numeric" ); + stopifnot( length(pvOutputThreshold.cis) == 1 ); + stopifnot( pvOutputThreshold.cis >= 0 ); + stopifnot( pvOutputThreshold.cis <= 1 ); + stopifnot( !((pvOutputThreshold > 0) & (pvOutputThreshold.cis > 0) & (pvOutputThreshold > pvOutputThreshold.cis)) ); + stopifnot( (pvOutputThreshold > 0) | (pvOutputThreshold.cis > 0) ); + + stopifnot( class(useModel) == class(modelLINEAR) ); + stopifnot( length(useModel) == 1 ); + stopifnot( useModel %in% c(modelLINEAR, modelANOVA, modelLINEAR_CROSS) ); + if( useModel %in% c(modelLINEAR, modelLINEAR_CROSS) ) { + if( snps$nCols() <= cvrt$nRows() + 1 + 1) { + stop('The number of covariates exceeds the number of samples.\nLinear regression can not be fit.') + } + } + if( useModel == modelLINEAR_CROSS ) { + if( cvrt$nRows() == 0 ) { + stop( "Model \"modelLINEAR_CROSS\" requires at least one covariate" ); + } + } + if( useModel == modelANOVA ) { + n.anova.groups = getOption('MatrixEQTL.ANOVA.categories', 3); + stopifnot( n.anova.groups == floor(n.anova.groups) ); + stopifnot( n.anova.groups >= 3 ); +# stopifnot( n.anova.groups < snps$nCols() - cvrt$nRows() - 2 ); + if( snps$nCols() <= cvrt$nRows() + n.anova.groups) { + stop('The number of covariates exceeds the number of samples.\nLinear regression (ANOVA) can not be fit.') + } + } + + stopifnot( class(verbose) == "logical" ); + stopifnot( length(verbose) == 1 ); + + stopifnot( class(min.pv.by.genesnp) == "logical" ); + stopifnot( length(min.pv.by.genesnp) == 1 ); + + if( pvOutputThreshold.cis > 0 ) { + stopifnot( !((length(output_file_name.cis) == 0) && noFDRsaveMemory) ) + stopifnot( length(output_file_name.cis) <= 1 ); + if( length(output_file_name.cis) == 1 ) { + stopifnot( class(output_file_name.cis) %in% c("character","connection") ); + } + +# stopifnot( class(output_file_name.cis) == "character" ); +# stopifnot( length(output_file_name.cis) == 1 ); + stopifnot( class(snpspos) == "data.frame" ); + stopifnot( ncol(snpspos) == 3 ); + stopifnot( nrow(snpspos) > 0 ); + stopifnot( class(snpspos[1,3]) %in% c("integer", "numeric") ) + stopifnot( !any(is.na(snpspos[,3])) ) + stopifnot( class(genepos) == "data.frame" ); + stopifnot( ncol(genepos) == 4 ); + stopifnot( nrow(genepos) > 0 ); + stopifnot( class(genepos[1,3]) %in% c("integer", "numeric") ) + stopifnot( class(genepos[1,4]) %in% c("integer", "numeric") ) + stopifnot( !any(is.na(genepos[,3])) ) + stopifnot( !any(is.na(genepos[,4])) ) + stopifnot( nzchar(output_file_name.cis) ) + } + + if( pvOutputThreshold > 0 ) { + stopifnot( nzchar(output_file_name) ) + } + + stopifnot( class(errorCovariance) %in% c("numeric", "matrix") ); + errorCovariance = as.matrix(errorCovariance); + if(length(errorCovariance)>0) { + if( nrow(errorCovariance) != ncol(errorCovariance) ) { + stop("The covariance matrix is not square"); + } + if( nrow(errorCovariance) != snps$nCols() ) { + stop("The covariance matrix size does not match the number of samples"); + } + if( !all(errorCovariance == t(errorCovariance)) ) { + stop("The covariance matrix is not symmetric"); + } + } + } + ################################# Initial setup ######################################### + { + gene.std = .listBuilder$new(); + snps.std = .listBuilder$new(); + + dont.clone.gene = getOption('MatrixEQTL.dont.preserve.gene.object', FALSE) + if(is.null(dont.clone.gene)) + dont.clone.gene = FALSE; + + if( !dont.clone.gene ) + gene = gene$Clone(); + # snps = snps$Clone(); # snps is read only + cvrt = cvrt$Clone(); + + params = list( + output_file_name = output_file_name, + pvOutputThreshold = pvOutputThreshold, + useModel = useModel, + errorCovariance = errorCovariance , + verbose = verbose, + output_file_name.cis = output_file_name.cis, + pvOutputThreshold.cis = pvOutputThreshold.cis, + cisDist = cisDist , + pvalue.hist = pvalue.hist, + min.pv.by.genesnp = min.pv.by.genesnp); + + if( verbose ) { + lastTime = 0; + status <- function(text) { + # gc(); + newTime = proc.time()[3]; + if(lastTime != 0) { + cat("Task finished in ", newTime-lastTime, " seconds\n"); + } + cat(text,"\n"); + lastTime <<- newTime; + unused = flush.console(); + } + } else { + status = function(text){} + } + start.time = proc.time()[3]; + } + ################################# Error covariance matrix processing #################### + { + if( length(errorCovariance) > 0 ) { + status("Processing the error covariance matrix"); + eig = eigen(errorCovariance, symmetric = TRUE) + d = eig$values; + v = eig$vectors; + # errorCovariance == v %*% diag(d) %*% t(v) + # errorCovariance^0.5 == v*sqrt(d)*v" (no single quotes anymore) + # errorCovariance^(-0.5) == v*diag(1./sqrt(diag(d)))*v" + if( any(d<=0) ) { + stop("The covariance matrix is not positive definite"); + } + correctionMatrix = v %*% diag(1./sqrt(d)) %*% t(v); + rm( eig, v, d, errorCovariance ) + } else { + rm( errorCovariance ); + correctionMatrix = numeric(); + } + } + ################################# Matching gene and SNPs locations ###################### + if( pvOutputThreshold.cis > 0 ) { + status("Matching data files and location files") + + # names in the input data + gene_names = gene$GetAllRowNames(); + snps_names = snps$GetAllRowNames(); + + # gene range, set: left genepos[,4])) { + temp3 = genepos[,3]; + temp4 = genepos[,4]; + genepos[,3] = pmin(temp3,temp4); + genepos[,4] = pmax(temp3,temp4); + rm(temp3, temp4); + } + + # match with the location data + genematch = match( gene_names, genepos[ ,1], nomatch = 0L); + usedgene = matrix(FALSE, nrow(genepos), 1); # genes in "genepos" that are matching "gene_names" + usedgene[ genematch ] = TRUE; + if( !any(genematch) ) { + stop("Gene names do not match those in the gene location file."); + } + cat( sum(genematch>0), "of", length(gene_names), " genes matched\n"); + + + snpsmatch = match( snps_names, snpspos[ ,1], nomatch = 0L); + usedsnps = matrix(FALSE, nrow(snpspos),1); + usedsnps[ snpsmatch ] = TRUE; + if( !any(snpsmatch) ) { + stop("SNP names do not match those in the SNP location file."); + } + cat( sum(snpsmatch>0), "of", length(snps_names), " SNPs matched\n"); + + # list used chr names + chrNames = unique(c( as.character(unique(snpspos[usedsnps,2])), + as.character(unique(genepos[usedgene,2])) )) + chrNames = chrNames[ sort.list( suppressWarnings(as.integer(chrNames)), + method = "radix", na.last = TRUE ) ]; + # match chr names + genechr = match(genepos[,2],chrNames); + snpschr = match(snpspos[,2],chrNames); + + # max length of a chromosome + chrMax = max( snpspos[usedsnps, 3], genepos[usedgene, 4], na.rm = TRUE) + cisDist; + + # Single number location for all rows in "genepos" and "snpspos" + genepos2 = as.matrix(genepos[ ,3:4, drop = FALSE] + (genechr-1)*chrMax); + snpspos2 = as.matrix(snpspos[ ,3 , drop = FALSE] + (snpschr-1)*chrMax); + + # the final location arrays; + snps_pos = matrix(0,length(snps_names),1); + snps_pos[snpsmatch>0, ] = snpspos2[snpsmatch, , drop = FALSE]; + snps_pos[rowSums(is.na(snps_pos))>0, ] = 0; + snps_pos[snps_pos==0] = (length(chrNames)+1) * (chrMax+cisDist); + rm(snps_names, snpsmatch, usedsnps, snpschr, snpspos2) + + gene_pos = matrix(0,length(gene_names),2); + gene_pos[genematch>0, ] = genepos2[genematch, , drop = FALSE]; + gene_pos[rowSums(is.na(gene_pos))>0, ] = 0; + gene_pos[gene_pos==0] = (length(chrNames)+2) * (chrMax+cisDist); + rm(gene_names, genematch, usedgene, genechr, genepos2) + rm(chrNames, chrMax); + + if( is.unsorted(snps_pos) ) { + status("Reordering SNPs\n"); + ordr = sort.list(snps_pos); + snps$RowReorder(ordr); + snps_pos = snps_pos[ordr, , drop = FALSE]; + rm(ordr); + } + if( is.unsorted(rowSums(gene_pos)) ) { + status("Reordering genes\n"); + ordr = sort.list(rowSums(gene_pos)); + gene$RowReorder(ordr); + gene_pos = gene_pos[ordr, , drop = FALSE]; + rm(ordr); + } + + # Slice it back. + geneloc = vector("list", gene$nSlices()) + gene_offset = 0; + for(gc in 1:gene$nSlices()) { + nr = length(gene$rowNameSlices[[gc]]); + geneloc[[gc]] = gene_pos[gene_offset + (1:nr), , drop = FALSE]; + gene_offset = gene_offset + nr; + } + rm(gc, gene_offset, gene_pos); + + snpsloc = vector("list", snps$nSlices()) + snps_offset = 0; + for(sc in 1:snps$nSlices()) { + nr = length(snps$rowNameSlices[[sc]]); + snpsloc[[sc]] = snps_pos[snps_offset + (1:nr), , drop = FALSE]; + snps_offset = snps_offset + nr; + } + rm(nr, sc, snps_offset, snps_pos); + } + ################################# Covariates processing ################################# + { + status("Processing covariates"); + if( useModel == modelLINEAR_CROSS ) { + last.covariate = as.vector(tail( cvrt$getSlice(cvrt$nSlices()), n = 1)); + } + if( cvrt$nRows()>0 ) { + cvrt$SetNanRowMean(); + cvrt$CombineInOneSlice(); + cvrt = rbind(matrix(1,1,snps$nCols()),cvrt$getSlice(1)); + } else { + cvrt = matrix(1,1,snps$nCols()); + } + # Correct for the error covariance structure + if( length(correctionMatrix)>0 ) { + cvrt = cvrt %*% correctionMatrix; + } + # Orthonormalize covariates + # status("Orthonormalizing covariates"); + q = qr(t(cvrt)); + if( min(abs(diag(qr.R(q)))) < .Machine$double.eps * snps$nCols() ) { + stop("Colinear or zero covariates detected"); + } + cvrt = t( qr.Q(q) ); + rm( q ); + } + ################################# Gene expression processing ############################ + { + status("Processing gene expression data (imputation, residualization, etc.)"); + # Impute gene expression + gene$SetNanRowMean(); + # Correct for the error covariance structure + if( length(correctionMatrix)>0 ) { + gene$RowMatrixMultiply(correctionMatrix); + } + # Orthogonolize expression w.r.t. covariates + # status("Orthogonolizing expression w.r.t. covariates"); + gene_offsets = double(gene$nSlices()+1); + for( sl in 1:gene$nSlices() ) { + slice = gene$getSlice(sl); + gene_offsets[sl+1] = gene_offsets[sl] + nrow(slice); + rowsq1 = rowSums(slice^2); + slice = slice - tcrossprod(slice,cvrt) %*% cvrt; + rowsq2 = rowSums(slice^2); + # kill rows colinear with the covariates + delete.rows = (rowsq2 <= rowsq1 * .Machine$double.eps ); + slice[delete.rows] = 0; + div = sqrt( rowSums(slice^2) ); + div[ div == 0 ] = 1; + gene.std$set(sl, div); + gene$setSlice(sl, slice / div); + } + rm(rowsq1, rowsq2, delete.rows, div); + rm( sl, slice ); + #gene$RowRemoveZeroEps(); + } + ################################# snps_process, testfun, pvfun, threshfun, afun ######## + { + # snps_process - preprocess SNPs slice + # + # afun --- abs for signed stats, identity for non-negative + # threshfun --- internal stat threshold for given p-value + # testfun --- t or F statistic from the internal one + # pvfun --- p-value from the t or F statistic + + nSamples = snps$nCols(); + nGenes = gene$nRows(); + nSnps = snps$nRows(); + nCov = nrow(cvrt); + # nVarTested = length(snps_list); # set in case(useModel) + # dfNull = nSamples - nCov; + # d.f. of the full model + betafun = NULL; + + if( useModel == modelLINEAR ) { + snps_process = function(x) { + return( list(.SetNanRowMean(x)) ); + }; + nVarTested = 1; + dfFull = nSamples - nCov - nVarTested; + statistic.fun = function(mat_list) { + return( mat_list[[1]] ); + } + afun = function(x) {return(abs(x))}; + threshfun = function(pv) { + thr = qt(pv/2, dfFull, lower.tail = FALSE); + thr = thr^2; + thr = sqrt( thr / (dfFull + thr) ); + thr[pv >= 1] = 0; + thr[pv <= 0] = 1; + return( thr ); + } + testfun = function(x) { return( x * sqrt( dfFull / (1 - .my.pmin(x^2,1)))); } + pvfun = function(x) { return( .pv.nz(pt(-abs(x),dfFull)*2)); } + thresh.cis = threshfun(pvOutputThreshold.cis); + thresh = threshfun(pvOutputThreshold); + betafun = function(stat, ss, gg, select) { + return(stat * gene.std$get(gg)[select[,1]] / snps.std$get(ss)[select[,2]]); + } + } else + if( useModel == modelANOVA ) { + snps_process = function(x).SNP_process_split_for_ANOVA(x,n.anova.groups); + nVarTested = n.anova.groups - 1; + dfFull = nSamples - nCov - nVarTested; +# statistic.fun = function(mat_list) { +# return( mat_list[[1]]^2 + mat_list[[2]]^2 ); +# } + statistic.fun = function(mat_list) { + x = mat_list[[1]]^2; + for( j in 2:length(mat_list) ) + x = x + mat_list[[j]]^2; + return( x ); + } + afun = identity; + threshfun = function(pv) { + thr = qf(pv, nVarTested, dfFull, lower.tail = FALSE); + thr = thr / (dfFull/nVarTested + thr); + thr[pv >= 1] = 0; + thr[pv <= 0] = 1; + return( thr ); + } + testfun = function(x) { return( x / (1 - .my.pmin(x,1)) * (dfFull/nVarTested) ); } + pvfun = function(x) { return( .pv.nz(pf(x, nVarTested, dfFull, lower.tail = FALSE)) ); } + thresh.cis = threshfun(pvOutputThreshold.cis); + thresh = threshfun(pvOutputThreshold); + } else + if( useModel == modelLINEAR_CROSS ) { + last.covariate = as.vector( last.covariate ); + snps_process = .SNP_process_split_for_LINEAR_CROSS = function(x) { + out = vector("list", 2); + out[[1]] = .SetNanRowMean(x); + out[[2]] = t( t(out[[1]]) * last.covariate ); + return( out ); + }; + nVarTested = 1; + dfFull = nSamples - nCov - nVarTested - 1; + statistic.fun = function(mat_list) { + return( mat_list[[2]] / sqrt(1 - mat_list[[1]]^2) ); + } + afun = function(x) {return(abs(x))}; + threshfun = function(pv) { + thr = qt(pv/2, dfFull, lower.tail = FALSE); + thr = thr^2; + thr = sqrt( thr / (dfFull + thr) ); + thr[pv >= 1] = 0; + thr[pv <= 0] = 1; + return( thr ); + } + testfun = function(x) { return( x * sqrt( dfFull / (1 - .my.pmin(x^2,1)))); } + pvfun = function(x) { return( .pv.nz(pt(-abs(x),dfFull)*2 )); } + thresh.cis = threshfun(pvOutputThreshold.cis); + thresh = threshfun(pvOutputThreshold); + betafun = function(stat, ss, gg, select) { + return(stat * gene.std$get(gg)[select[,1]] / snps.std$get(ss)[select[,2]]); + } + } + params$dfFull = dfFull; + } + ################################# Saver class(es) creation ############################## + { + status("Creating output file(s)"); + if(noFDRsaveMemory) { + if( pvOutputThreshold > 0 ) { + saver.tra = .OutputSaver_direct$new(); + } + if( pvOutputThreshold.cis > 0 ) { + saver.cis = .OutputSaver_direct$new(); + } + } else { + if( pvOutputThreshold > 0 ) { + saver.tra = .OutputSaver_FRD$new(); + } + if( pvOutputThreshold.cis > 0 ) { + saver.cis = .OutputSaver_FRD$new(); + } + } + if( pvOutputThreshold > 0 ) + if( pvOutputThreshold * gene$nRows() * snps$nRows() > 1000000 ) + if(!noFDRsaveMemory) + cat('Warning: pvOutputThreshold may be too large.\nExpected number of findings > ', + pvOutputThreshold * gene$nRows() * snps$nRows(),'\n'); + if( (useModel == modelLINEAR) || (useModel == modelLINEAR_CROSS) ) { + statistic_name = "t-stat"; + } else if( useModel == modelANOVA ) { + statistic_name = "F-test"; + } + if(!is.null(betafun)) + statistic_name = paste('beta\t',statistic_name, sep=''); + if( pvOutputThreshold > 0 ) + saver.tra$start(output_file_name, statistic_name, snps, gene, testfun, pvfun); + if( pvOutputThreshold.cis > 0 ) + saver.cis$start(output_file_name.cis, statistic_name, snps, gene, testfun, pvfun); + rm( statistic_name ); + } + ################################# Some useful functions ################################# + { + orthonormalize.snps = function(cursnps, ss) { + for(p in 1:length(cursnps)) { + if(length(correctionMatrix)>0) { + cursnps[[p]] = cursnps[[p]] %*% correctionMatrix; + } + cursnps[[p]] = cursnps[[p]] - tcrossprod(cursnps[[p]],cvrt) %*% cvrt; + for(w in .seq(1,p-1L)) + cursnps[[p]] = cursnps[[p]] - rowSums(cursnps[[p]]*cursnps[[w]]) * cursnps[[w]]; + div = sqrt( rowSums(cursnps[[p]]^2) ); + div[ div == 0 ] = 1; + cursnps[[p]] = cursnps[[p]]/div; + } + snps.std$set(ss, div); + return(cursnps); + } +# if( pvOutputThreshold.cis > 0 ) { +# is.cis.pair = function(gg,ss) { +# return(!( ( snpsloc[[ss]][1, 1] - tail( geneloc[[gg]][ , 2], n = 1L) > cisDist) | +# ( geneloc[[gg]][1, 1] - tail( snpsloc[[ss]] , n = 1L) > cisDist) ) ); +# } +# } + if( pvOutputThreshold.cis > 0 ) { +# sn.l = sapply(snpsloc, function(x)x[1] ); +# sn.r = sapply(snpsloc, function(x)tail(x,1) ); +# ge.l = sapply(geneloc, function(x)x[1,1] ); +# ge.r = sapply(geneloc, function(x)x[nrow(x) , 2] ); + sn.l = sapply(snpsloc, '[', 1 ); + sn.r = sapply(snpsloc, tail, 1 ); + ge.l = sapply(geneloc, '[', 1, 1 ); + ge.r = sapply( lapply(geneloc, tail.matrix, 1 ), '[', 2); + gg.1 = findInterval( sn.l , ge.r + cisDist +1) + 1; +# cat(gg.1,'\n') + gg.2 = findInterval( sn.r , ge.l - cisDist ); +# cat(gg.2,'\n') + rm(sn.l, sn.r, ge.l, ge.r); + } + + } + ################################# Prepare counters and histogram bins ################### + { + pvbins = NULL; # bin edges for p-values + statbins = 0; # bin edges for the test statistic (|t| or F) + do.hist = FALSE; + if( length(pvalue.hist) == 1 ) { + if(pvalue.hist == "qqplot") { + pvbins = c(0, 10^rev(seq(0, log10(.Machine$double.xmin)-1, -0.05))); + } else + if( is.numeric(pvalue.hist) ) { + pvbins = seq(from = 0, to = 1, length.out = pvalue.hist+1); + } else + if( pvalue.hist == TRUE ) { + pvbins = seq(from = 0, to = 1, length.out = 100+1); + } + } else + if( is.numeric(pvalue.hist) && (length(pvalue.hist) > 1) ) { + pvbins = pvalue.hist; + } + if( is.null(pvbins) && (pvalue.hist != FALSE) ) { + stop("Wrong value of pvalue.hist. Must be FALSE, TRUE, \"qqplot\", or numerical"); + } + do.hist = !is.null(pvbins); + if( do.hist ) { + pvbins = sort(pvbins); + statbins = threshfun(pvbins); + if( pvOutputThreshold > 0) { + hist.all = .histogrammer$new(pvbins, statbins); + } + if( pvOutputThreshold.cis > 0) { + hist.cis = .histogrammer$new(pvbins, statbins); + } + } + rm( pvbins, statbins); + if(min.pv.by.genesnp) { + if( pvOutputThreshold > 0) { + minpv.tra = .minpvalue$new(snps,gene); + } + if( pvOutputThreshold.cis > 0) { + minpv.cis = .minpvalue$new(snps,gene); + } + } + } + ################################# Main loop ############################################# + { + beta = NULL; + n.tests.all = 0; + n.tests.cis = 0; + n.eqtls.tra = 0; + n.eqtls.cis = 0; + + status("Performing eQTL analysis"); + # ss = 1; gg = 1; + # ss = snps$nSlices(); gg = gene$nSlices(); + + snps_offset = 0; + for(ss in 1:snps$nSlices()) { +# for(ss in 1:min(2,snps$nSlices())) { #for debug + cursnps = NULL; + nrcs = nrow(snps$getSliceRaw(ss)); + + # loop only through the useful stuff + for(gg in if(pvOutputThreshold>0){1:gene$nSlices()}else{.seq(gg.1[ss],gg.2[ss])} ) { + gene_offset = gene_offsets[gg]; + curgene = gene$getSlice(gg); + nrcg = nrow(curgene); + if(nrcg == 0) next; + + rp = ""; + + statistic = NULL; + select.cis.raw = NULL; + ## do cis analysis +# if( (pvOutputThreshold.cis > 0) && ( is.cis.pair(gg, ss) ) ) { + if( (pvOutputThreshold.cis > 0) && (gg >= gg.1[ss]) && (gg <= gg.2[ss]) ) { + + if( is.null( statistic ) ) { + if( is.null( cursnps ) ) { + cursnps = orthonormalize.snps( snps_process( snps$getSlice(ss) ), ss ); + } + mat = vector("list", length(cursnps)); + for(d in 1:length(cursnps)) { + mat[[d]] = tcrossprod(curgene, cursnps[[d]]); + } + statistic = statistic.fun( mat ); + astatistic = afun(statistic); +# rm(mat); + } + +# sn.l = findInterval(geneloc[[gg]][ ,1] - cisDist-1 +1 , snpsloc[[ss]]); +# sn.r = findInterval(geneloc[[gg]][ ,2] + cisDist -1 , snpsloc[[ss]]); + sn.l = findInterval(geneloc[[gg]][ ,1] - cisDist-1, snpsloc[[ss]]); + sn.r = findInterval(geneloc[[gg]][ ,2] + cisDist, snpsloc[[ss]]); + xx = unlist(lapply(which(sn.r>sn.l),FUN=function(x){(sn.l[x]:(sn.r[x]-1))*nrow(statistic)+x})) + select.cis.raw = xx[ astatistic[xx] >= thresh.cis ]; + select.cis = arrayInd(select.cis.raw, dim(statistic)) + + n.tests.cis = n.tests.cis + length(xx); + n.eqtls.cis = n.eqtls.cis + length(select.cis.raw); + + if( do.hist ) + hist.cis$update(astatistic[xx]); + + if( min.pv.by.genesnp ) { + # minpv.cis$updatecis(ss, gg, arrayInd(xx, dim(statistic)), astatistic[xx]) + temp = double(length(astatistic)); + dim(temp) = dim(astatistic); + temp[xx] = astatistic[xx]; + minpv.cis$update(ss, gg, temp); + } + + if(!is.null(betafun)) + beta = betafun(mat[[length(mat)]][select.cis.raw], ss, gg, select.cis); + + saver.cis$update( snps_offset + select.cis[ , 2], + gene_offset + select.cis[ , 1], + statistic[select.cis.raw], + beta); + + # statistic.select.cis = statistic[ select.cis ]; + # test = testfun( statistic.select.cis ); + # pv = pvfun(test); + # Saver.cis$WriteBlock( cbind(snps_offset + select.cis[ , 2], gene_offset + select.cis[ , 1], test, pv) ); + # counter.cis$Update(gg, ss, select.cis, pv, n.tests = length(xx), if(do.hist) afun(statistic[xx]) ) + rp = paste(rp, ", ", formatC(n.eqtls.cis, big.mark=",", format = "f", digits = 0), " cis-eQTLs", sep = ""); + } + ## do trans/all analysis + if(pvOutputThreshold>0) { + if( is.null( statistic ) ) { + if( is.null( cursnps ) ) { + cursnps = orthonormalize.snps( snps_process( snps$getSlice(ss) ), ss ); + } + mat = vector("list", length(cursnps)); + for(d in 1:length(cursnps)) { + mat[[d]] = tcrossprod(curgene, cursnps[[d]]); + } + statistic = statistic.fun( mat ); + astatistic = afun(statistic); +# rm(mat); + } + + if( do.hist ) + hist.all$update(astatistic); + + if(!is.null(select.cis.raw)) + astatistic[xx] = -1; + # select.tra.raw = select.tra.raw[!(select.tra.raw %in% select.cis.raw)]; + + select.tra.raw = which( astatistic >= thresh); + select.tra = arrayInd(select.tra.raw, dim(statistic)) + + n.eqtls.tra = n.eqtls.tra + length(select.tra.raw); + n.tests.all = n.tests.all + length(statistic); + + if(!is.null(betafun)) + beta = betafun(mat[[length(mat)]][select.tra.raw], ss, gg, select.tra); + + saver.tra$update( snps_offset + select.tra[ , 2], + gene_offset + select.tra[ , 1], + statistic[select.tra.raw], + beta); + + if( min.pv.by.genesnp ) + minpv.tra$update(ss, gg, astatistic) + + # statistic.select.tra = statistic[ select.tra ]; + # test = testfun( statistic.select.tra ); + # pv = pvfun( test ); + # Saver$WriteBlock( cbind( snps_offset + select.tra[ , 2], gene_offset + select.tra[ , 1], test, pv) ); + # counter$Update(gg, ss, select.tra, pv, n.tests = nrcs*nrcg, if(do.hist) afun(statistic) ) + rp = paste(rp, ", ", formatC(n.eqtls.tra, big.mark=",", format = "f", digits = 0), if(pvOutputThreshold.cis > 0)" trans-"else" ","eQTLs", sep = "") + } + + #gene_offset = gene_offset + nrcg; + if( !is.null(statistic) ) { + per = 100*(gg/gene$nSlices() + ss-1) / snps$nSlices(); + cat( formatC(floor(per*100)/100, format = "f", width = 5, digits = 2), "% done" , rp, "\n", sep = ""); + flush.console(); + } + } # gg in 1:gene$nSlices() + snps_offset = snps_offset + nrow(snps$getSliceRaw(ss)); + } # ss in 1:snps$nSlices() + } + ################################# Results collection #################################### + { + rez = list(time.in.sec = proc.time()[3] - start.time); + rez$param = params; + + if(pvOutputThreshold.cis > 0) { + rez.cis = list(ntests = n.tests.cis, neqtls = n.eqtls.cis); + rez.cis = c(rez.cis, saver.cis$getResults( gene, snps, n.tests.cis) ); + if(do.hist) + rez.cis = c(rez.cis, hist.cis$getResults() ); + if(min.pv.by.genesnp) + rez.cis = c(rez.cis, minpv.cis$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); + } + + if(pvOutputThreshold>0) { + rez.all = list(ntests = n.tests.all, neqtls = n.eqtls.tra + n.eqtls.cis); + if(pvOutputThreshold.cis > 0) { + rez.tra = list(ntests = n.tests.all - n.tests.cis, neqtls = n.eqtls.tra); + rez.tra = c(rez.tra, saver.tra$getResults( gene, snps, n.tests.all - n.tests.cis) ); + } else { + rez.all = c(rez.all, saver.tra$getResults( gene, snps, n.tests.all ) ); + } + if(do.hist) { + rez.all = c(rez.all, hist.all$getResults() ); + if(pvOutputThreshold.cis > 0) { + rez.tra$hist.bins = rez.all$hist.bins; + rez.tra$hist.counts = rez.all$hist.counts - rez.cis$hist.counts; + } + } + if(min.pv.by.genesnp) { + if(pvOutputThreshold.cis > 0) { + rez.tra = c(rez.tra, minpv.tra$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); + } else { + rez.all = c(rez.all, minpv.tra$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); + } + } + } + + if(exists('rez.all')>0) + rez$all = rez.all; + if(exists('rez.tra')>0) + rez$trans = rez.tra; + if(exists('rez.cis')>0) + rez$cis = rez.cis; + + class(rez) = c(class(rez),"MatrixEQTL"); + status(""); + } +# cat('s std ',snps.std$get(1),'\n'); +# cat('g std ',gene.std$get(1),'\n'); + ################################# Results collection #################################### + return(rez); +} + +.histme = function(m, name1, name2, ...) { + cnts = m$hist.counts; + bins = m$hist.bins; + ntst = m$ntests; + centers = 0.5 * (bins[-1L] + bins[-length(bins)]); + density = 0.5 / (bins[-1L] - centers) * cnts / ntst; + ntext = paste("P-value distribution for ", name1, formatC(ntst, big.mark=",", format = "f", digits = 0), name2, " gene-SNP pairs ",sep=""); + r = structure(list(breaks = bins, counts = cnts, density = density, + mids = centers, equidist = FALSE), class = "histogram"); + plot(r, main = ntext, ylab = "Density", xlab = "P-values", ...) + abline( h = 1, col = "blue"); + return(invisible()); +} + +.qqme = function(m, lcol, cex, pch, ...) { + cnts = m$hist.counts; + bins = m$hist.bins; + ntst = m$ntests; + + cusu = cumsum(cnts) / ntst; + ypos = bins[-1][is.finite(cusu)]; + xpos = cusu[is.finite(cusu)]; + lines(-log10(xpos), -log10(ypos), col = lcol, ...); +# lines(xpos, ypos, col = lcol, ...); + if(length(m$eqtls$pvalue)==0) + return(); + ypvs = -log10(m$eqtls$pvalue); + xpvs = -log10(1:length(ypvs) / ntst); + if(length(ypvs) > 1000) { + # need to filter a bit, make the plotting faster + levels = as.integer( xpvs/xpvs[1] * 1e3); + keep = c(TRUE, diff(levels)!=0); + levels = as.integer( ypvs/ypvs[1] * 1e3); + keep = keep | c(TRUE, diff(levels)!=0); + ypvs = ypvs[keep]; + xpvs = xpvs[keep]; + rm(keep) + } + points(xpvs, ypvs, col = lcol, pch = pch, cex = cex, ...); +} + +plot.MatrixEQTL = function(x, cex = 0.5, pch = 19, xlim = NULL, ylim = NULL, ...) { + if( x$param$pvalue.hist == FALSE ) { + warning("Cannot plot p-value distribution: the information was not recorded.\nUse pvalue.hist!=FALSE."); + return(invisible()); + } + if( x$param$pvalue.hist == "qqplot" ) { + xmin = 1/max(x$cis$ntests, x$all$ntests); + ymax = NULL; + if(!is.null(ylim)) { + ymax = ylim[2]; + } else { + ymax = -log10(min( + x$cis$eqtls$pvalue[1], x$cis$hist.bins[ c(FALSE,x$cis$hist.counts>0)][1], + x$all$eqtls$pvalue[1], x$all$hist.bins[ c(FALSE,x$all$hist.counts>0)][1], + x$trans$eqtls$pvalue[1], x$trans$hist.bins[c(FALSE,x$trans$hist.counts>0)][1], + na.rm = TRUE))+0.1; + } + if(ymax == 0) { + ymax = -log10(.Machine$double.xmin) + } + if(!is.null(ymax)) + ylim = c(0,ymax); + + if(is.null(xlim)) + xlim = c(0, -log10(xmin/1.5)); + + plot(numeric(),numeric(), xlab = "-Log10(p-value), theoretical", + ylab = "-Log10(p-value), observed", + xlim = c(0, -log10(xmin/1.5)), + ylim = ylim, + xaxs="i", yaxs="i", ...); + lines(c(0,1e3), c(0,1e3), col = "gray"); + if((x$param$pvOutputThreshold > 0) && (x$param$pvOutputThreshold.cis > 0)) { + .qqme( x$cis, "red", cex, pch, ...); + .qqme( x$trans, "blue", cex, pch, ...); + title(paste("QQ-plot for", + formatC(x$cis$ntests, big.mark=",", format = "f", digits = 0), + "local and", + formatC(x$trans$ntests, big.mark=",", format = "f", digits = 0), + "distant gene-SNP p-values")); + lset = c(1,2,4); + } else + if(x$param$pvOutputThreshold.cis > 0) { + .qqme(x$cis, "red", cex, pch, ...); + title(paste("QQ-plot for", + formatC(x$cis$ntests, big.mark=",", format = "f", digits = 0), + "local gene-SNP p-values")); + lset = c(1,4); + } else { + .qqme(x$all, "blue", cex, pch, ...); + title(paste("QQ-plot for all", + formatC(x$all$ntests, big.mark=",", format = "f", digits = 0), + "gene-SNP p-values")); + lset = c(3,4); + } + legend("topleft", + c("Local p-values","Distant p-values","All p-values","diagonal")[lset], + col = c("red","blue","blue","gray")[lset], + text.col = c("red","blue","blue","gray")[lset], + pch = 20, lwd = 1, pt.cex = c(1,1,1,0)[lset]) + } else { + if((x$param$pvOutputThreshold > 0) && (x$param$pvOutputThreshold.cis > 0)) { + par(mfrow=c(2,1)); + .histme(x$cis, "", " local", ...); + tran = list(hist.counts = x$all$hist.counts - x$cis$hist.counts, + hist.bins = x$all$hist.bins, + ntests = x$all$ntests - x$cis$ntests); + .histme(x$trans,""," distant", ...); + par(mfrow=c(1,1)); + } else + if(x$param$pvOutputThreshold.cis > 0) { + .histme(x$cis, "", " local", ...); + } else { + .histme(x$all, "all ", "" , ...); + } + } + return(invisible()); +} + diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/data/Covariates.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/data/Covariates.txt Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,3 @@ +id Sam_01 Sam_02 Sam_03 Sam_04 Sam_05 Sam_06 Sam_07 Sam_08 Sam_09 Sam_10 Sam_11 Sam_12 Sam_13 Sam_14 Sam_15 Sam_16 +gender 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 +age 36 40 46 65 69 43 40 54 44 70 24 34 55 62 48 40 diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/data/GE.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/data/GE.txt Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,11 @@ +geneid Sam_01 Sam_02 Sam_03 Sam_04 Sam_05 Sam_06 Sam_07 Sam_08 Sam_09 Sam_10 Sam_11 Sam_12 Sam_13 Sam_14 Sam_15 Sam_16 +Gene_01 4.91 4.63 5.18 5.07 5.74 5.09 5.31 5.29 4.73 5.72 4.75 4.54 5.01 5.03 4.84 4.44 +Gene_02 13.78 13.14 13.18 13.04 12.85 13.07 13.09 12.83 14.94 13.86 15.26 14.73 14.08 14.33 14.72 14.97 +Gene_03 12.06 12.29 13.07 13.65 13.86 12.84 12.29 13.03 13.13 14.93 12.40 13.38 13.70 14.49 14.14 13.35 +Gene_04 11.63 11.88 12.74 12.66 13.16 11.99 11.97 12.81 11.74 13.29 10.88 11.37 12.09 12.50 11.40 11.45 +Gene_05 14.72 14.66 14.63 15.91 15.46 14.74 15.17 15.01 14.05 14.90 12.96 13.56 14.06 14.44 14.12 13.76 +Gene_06 12.29 12.23 12.47 13.21 12.63 12.18 12.44 12.45 13.22 12.70 12.93 12.84 13.20 13.19 12.64 12.80 +Gene_07 12.56 12.71 12.49 13.41 13.60 12.35 12.27 13.42 14.73 15.47 13.85 14.31 15.25 15.55 14.90 14.04 +Gene_08 12.27 12.58 12.61 13.02 12.86 12.32 12.30 13.19 15.21 14.60 14.72 14.56 14.98 14.92 14.53 14.72 +Gene_09 9.82 9.29 8.95 8.18 8.11 9.43 9.17 9.25 10.95 9.82 12.60 11.99 11.20 10.31 11.13 11.69 +Gene_10 14.24 14.52 14.62 13.65 13.46 14.04 13.97 13.73 12.29 12.01 13.47 13.30 12.34 12.23 12.80 12.47 diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/data/SNP.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/data/SNP.txt Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,16 @@ +snpid Sam_01 Sam_02 Sam_03 Sam_04 Sam_05 Sam_06 Sam_07 Sam_08 Sam_09 Sam_10 Sam_11 Sam_12 Sam_13 Sam_14 Sam_15 Sam_16 +Snp_01 2 0 2 0 2 1 2 1 1 1 2 2 1 2 2 1 +Snp_02 0 1 1 2 2 1 0 0 0 1 1 1 1 0 1 1 +Snp_03 1 0 1 0 1 1 1 1 0 1 1 0 1 1 1 2 +Snp_04 0 1 2 2 2 1 1 0 0 0 1 2 1 1 1 0 +Snp_05 1 1 2 1 1 2 1 1 0 1 1 2 0 1 2 1 +Snp_06 2 2 2 1 1 0 1 0 2 1 1 1 2 0 2 1 +Snp_07 1 1 2 2 0 1 1 1 1 0 2 2 0 1 1 1 +Snp_08 1 0 1 0 1 0 0 1 1 1 0 2 0 1 1 1 +Snp_09 2 1 2 2 0 1 1 0 2 1 1 0 1 1 0 0 +Snp_10 1 1 0 0 0 2 2 1 1 2 1 1 1 1 1 0 +Snp_11 2 2 2 0 2 1 1 2 1 2 0 1 0 1 1 2 +Snp_12 1 1 2 2 2 1 1 1 1 0 2 0 1 1 0 2 +Snp_13 0 1 1 1 1 1 2 1 2 2 0 0 0 1 1 1 +Snp_14 0 0 1 0 1 2 2 2 1 1 1 0 1 0 1 0 +Snp_15 1 2 1 2 2 1 2 1 1 2 1 1 1 2 0 2 diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/data/geneloc.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/data/geneloc.txt Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,11 @@ +geneid chr left right +Gene_01 chr1 721289 731289 +Gene_02 chr1 752565 762565 +Gene_03 chr1 777121 787121 +Gene_04 chr1 785988 795988 +Gene_05 chr1 792479 802479 +Gene_06 chr1 798958 808958 +Gene_07 chr1 888658 898658 +Gene_08 chr1 918572 928572 +Gene_09 chr1 926430 936430 +Gene_10 chr1 1000000 1010000 diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/data/snpsloc.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/data/snpsloc.txt Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,16 @@ +snpid chr pos +Snp_01 chr1 721289 +Snp_02 chr1 752565 +Snp_03 chr1 777121 +Snp_04 chr1 785988 +Snp_05 chr1 792479 +Snp_06 chr1 798958 +Snp_07 chr1 888658 +Snp_08 chr1 918572 +Snp_09 chr1 926430 +Snp_10 chr1 947033 +Snp_11 chr2 721289 +Snp_12 chr2 752565 +Snp_13 chr2 777121 +Snp_14 chr2 785988 +Snp_15 chr2 792479 diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/demo/00Index --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/demo/00Index Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,10 @@ +a.nocvrt Test that Matrix eQTL gives correct estimates for additive linear model. Homoscedastic case without covariates. +b.cvrt Test that Matrix eQTL gives correct estimates for additive linear model. Homoscedastic case with covariates. +c.weights Test that Matrix eQTL gives correct estimates for additive linear model. Heteroscedastic case with covariates. +d.ANOVA Test that Matrix eQTL gives correct estimates for ANOVA linear model with 3 groups. Heteroscedastic case with covariates. +d.ANOVA5 Test that Matrix eQTL gives correct estimates for ANOVA linear model with 5 groups. Heteroscedastic case with covariates. +e.interaction Test that Matrix eQTL gives correct estimates for linear model with genotype-covariate interaction. Heteroscedastic case with covariates. +p.hist Sample code for plotting histogram of p-values +q.qqplot Sample code for plotting QQ-plot of p-values +sample.all Sample code for eQTL analysis of the toy data set. +sample.cis Sample code for eQTL analysis of the toy data set with separate p-value thresholds and FDR estimation for cis- and trans-eQTLs. diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/demo/a.nocvrt.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/demo/a.nocvrt.r Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,56 @@ +library("MatrixEQTL"); + +# Number of columns (samples) +n = 100; + + + + + + + + + + +# Generate the vectors with genotype and expression variables +snps.mat = rnorm(n); +gene.mat = rnorm(n) + 0.5 * snps.mat; + +# Create 3 SlicedData objects for the analysis +snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); +gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); +cvrt1 = SlicedData$new(); + +# Produce no output files +filename = NULL; # tempfile() + +# Call the main analysis function +me = Matrix_eQTL_main( + snps = snps1, + gene = gene1, + cvrt = cvrt1, + output_file_name = filename, + pvOutputThreshold = 1, + useModel = modelLINEAR, + errorCovariance = numeric(), + verbose = TRUE, + pvalue.hist = FALSE ); + +# Pull Matrix eQTL results - t-statistic and p-value +beta = me$all$eqtls$beta; +tstat = me$all$eqtls$statistic; +pvalue = me$all$eqtls$pvalue; +rez = c(beta = beta, tstat = tstat, pvalue = pvalue); +# And compare to those from the linear regression in R +{ + cat("\n\n Matrix eQTL: \n"); + print(rez); + cat("\n R summary(lm()) output: \n"); + lmdl = lm( gene.mat ~ snps.mat ); + + lmout = summary(lmdl)$coefficients[2,c("Estimate","t value","Pr(>|t|)")]; + print( lmout ); +} + +# Results from Matrix eQTL and "lm" must agree +stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/demo/b.cvrt.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/demo/b.cvrt.r Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,56 @@ +library("MatrixEQTL"); + +# Number of columns (samples) +n = 100; + +# Number of covariates +nc = 10; + + + + +# Generate the covariates +cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); + +# Generate the vectors with genotype and expression variables +snps.mat = cvrt.mat %*% rnorm(nc) + rnorm(n); +gene.mat = cvrt.mat %*% rnorm(nc) + rnorm(n) + 0.5 * snps.mat + 1; + +# Create 3 SlicedData objects for the analysis +snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); +gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); +cvrt1 = SlicedData$new( t(cvrt.mat) ); + +# Produce no output files +filename = NULL; # tempfile() + +# Call the main analysis function +me = Matrix_eQTL_main( + snps = snps1, + gene = gene1, + cvrt = cvrt1, + output_file_name = filename, + pvOutputThreshold = 1, + useModel = modelLINEAR, + errorCovariance = numeric(), + verbose = TRUE, + pvalue.hist = FALSE ); + +# Pull Matrix eQTL results - t-statistic and p-value +beta = me$all$eqtls$beta; +tstat = me$all$eqtls$statistic; +pvalue = me$all$eqtls$pvalue; +rez = c(beta = beta, tstat = tstat, pvalue = pvalue); +# And compare to those from the linear regression in R +{ + cat("\n\n Matrix eQTL: \n"); + print(rez); + cat("\n R summary(lm()) output: \n"); + lmdl = lm( gene.mat ~ snps.mat + cvrt.mat ); + + lmout = summary(lmdl)$coefficients[2,c("Estimate","t value","Pr(>|t|)")]; + print( lmout ); +} + +# Results from Matrix eQTL and "lm" must agree +stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/demo/c.weights.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/demo/c.weights.r Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,56 @@ +library("MatrixEQTL"); + +# Number of columns (samples) +n = 100; + +# Number of covariates +nc = 10; + +# Generate the standard deviation of the noise +noise.std = 0.1 + rnorm(n)^2; + +# Generate the covariates +cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); + +# Generate the vectors with genotype and expression variables +snps.mat = cvrt.mat %*% rnorm(nc) + rnorm(n); +gene.mat = cvrt.mat %*% rnorm(nc) + rnorm(n) * noise.std + 0.5 * snps.mat + 1; + +# Create 3 SlicedData objects for the analysis +snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); +gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); +cvrt1 = SlicedData$new( t(cvrt.mat) ); + +# Produce no output files +filename = NULL; # tempfile() + +# Call the main analysis function +me = Matrix_eQTL_main( + snps = snps1, + gene = gene1, + cvrt = cvrt1, + output_file_name = filename, + pvOutputThreshold = 1, + useModel = modelLINEAR, + errorCovariance = diag(noise.std^2), + verbose = TRUE, + pvalue.hist = FALSE ); + +# Pull Matrix eQTL results - t-statistic and p-value +beta = me$all$eqtls$beta; +tstat = me$all$eqtls$statistic; +pvalue = me$all$eqtls$pvalue; +rez = c(beta = beta, tstat = tstat, pvalue = pvalue); +# And compare to those from the linear regression in R +{ + cat("\n\n Matrix eQTL: \n"); + print(rez); + cat("\n R summary(lm()) output: \n"); + lmdl = lm( gene.mat ~ snps.mat + cvrt.mat, + weights = 1/noise.std^2 ); + lmout = summary(lmdl)$coefficients[2,c("Estimate","t value","Pr(>|t|)")]; + print( lmout ); +} + +# Results from Matrix eQTL and "lm" must agree +stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/demo/d.ANOVA.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/demo/d.ANOVA.r Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,56 @@ +library("MatrixEQTL"); + +# Number of columns (samples) +n = 100; + +# Number of covariates +nc = 10; + +# Generate the standard deviation of the noise +noise.std = 0.1 + rnorm(n)^2; + +# Generate the covariates +cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); + +# Generate the vectors with single genotype and expression variables +snps.mat = floor(runif(n, min = 0, max = 3)); +gene.mat = 1 + (snps.mat==1) + cvrt.mat %*% rnorm(nc) + rnorm(n) * noise.std; + +# Create 3 SlicedData objects for the analysis +snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); +gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); +cvrt1 = SlicedData$new( t(cvrt.mat) ); + +# Produce no output files +filename = NULL; # tempfile() + +# Call the main analysis function +me = Matrix_eQTL_main( + snps = snps1, + gene = gene1, + cvrt = cvrt1, + output_file_name = filename, + pvOutputThreshold = 1, + useModel = modelANOVA, + errorCovariance = diag(noise.std^2), + verbose = TRUE, + pvalue.hist = FALSE ); + +# Pull Matrix eQTL results - t-statistic and p-value + +fstat = me$all$eqtls$statistic; +pvalue = me$all$eqtls$pvalue; +rez = c( Fstat = fstat, pvalue = pvalue); +# And compare to those from ANOVA in R +{ + cat("\n\n Matrix eQTL: \n"); + print(rez); + cat("\n R anova(lm()) output: \n"); + lmdl = lm( gene.mat ~ cvrt.mat + factor(snps.mat), + weights = 1/noise.std^2 ); + lmout = anova(lmdl)[2, c("F value","Pr(>F)")]; + print( lmout ); +} + +# Results from Matrix eQTL and "lm" must agree +stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/demo/d.ANOVA5.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/demo/d.ANOVA5.r Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,56 @@ +library("MatrixEQTL"); + +anova.groups = 5; +options(MatrixEQTL.ANOVA.categories = anova.groups); + +# Number of columns (samples) +n = 100; +# Number of covariates +nc = 10; +# Generate the standard deviation of the noise +noise.std = 0.1 + rnorm(n)^2; +# Generate the covariates +cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); + +# Generate the vectors with single genotype and expression variables +snps.mat = floor(runif(n, min = 0, max = anova.groups)); +gene.mat = 1 + (snps.mat==1) + cvrt.mat %*% rnorm(nc) + rnorm(n) * noise.std; + +# Create 3 SlicedData objects for the analysis +snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); +gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); +cvrt1 = SlicedData$new( t(cvrt.mat) ); + +# Produce no output files +filename = NULL; # tempfile() + +# Call the main analysis function +me = Matrix_eQTL_main( + snps = snps1, + gene = gene1, + cvrt = cvrt1, + output_file_name = filename, + pvOutputThreshold = 1, + useModel = modelANOVA, + errorCovariance = diag(noise.std^2), + verbose = TRUE, + pvalue.hist = FALSE ); + +# Pull Matrix eQTL results - t-statistic and p-value + +fstat = me$all$eqtls$statistic; +pvalue = me$all$eqtls$pvalue; +rez = c( Fstat = fstat, pvalue = pvalue); +# And compare to those from ANOVA in R +{ + cat("\n\n Matrix eQTL: \n"); + print(rez); + cat("\n R anova(lm()) output: \n"); + lmdl = lm( gene.mat ~ cvrt.mat + factor(snps.mat), + weights = 1/noise.std^2 ); + lmout = anova(lmdl)[2, c("F value","Pr(>F)")]; + print( lmout ); +} + +# Results from Matrix eQTL and "lm" must agree +stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/demo/e.interaction.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/demo/e.interaction.r Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,56 @@ +library("MatrixEQTL"); + +# Number of columns (samples) +n = 25; + +# Number of covariates +nc = 10; + +# Generate the standard deviation of the noise +noise.std = 0.1 + rnorm(n)^2; + +# Generate the covariates +cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); + +# Generate the vectors with single genotype and expression variables +snps.mat = cvrt.mat %*% rnorm(nc) + rnorm(n); +gene.mat = cvrt.mat %*% rnorm(nc) + rnorm(n) * noise.std + + 1 + 0.5 * snps.mat + snps.mat * cvrt.mat[,nc]; +# Create 3 SlicedData objects for the analysis +snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); +gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); +cvrt1 = SlicedData$new( t(cvrt.mat) ); + +# Produce no output files +filename = NULL; # tempfile() + +# Call the main analysis function +me = Matrix_eQTL_main( + snps = snps1, + gene = gene1, + cvrt = cvrt1, + output_file_name = filename, + pvOutputThreshold = 1, + useModel = modelLINEAR_CROSS, + errorCovariance = diag(noise.std^2), + verbose = TRUE, + pvalue.hist = FALSE ); + +# Pull Matrix eQTL results - t-statistic and p-value +beta = me$all$eqtls$beta; +tstat = me$all$eqtls$statistic; +pvalue = me$all$eqtls$pvalue; +rez = c(beta = beta, tstat = tstat, pvalue = pvalue); +# And compare to those from the linear regression in R +{ + cat("\n\n Matrix eQTL: \n"); + print(rez); + cat("\n R summary(lm()) output: \n"); + lmdl = lm( gene.mat ~ snps.mat + cvrt.mat + snps.mat*cvrt.mat[,nc], + weights = 1/noise.std^2 ); + lmout = tail(summary(lmdl)$coefficients,1)[,c(1,3,4)]; + print( tail(lmout) ); +} + +# Results from Matrix eQTL and "lm" must agree +stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/demo/p.hist.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/demo/p.hist.r Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,43 @@ +library("MatrixEQTL") + +# Number of samples +n = 100; + +# Number of variables +ngs = 2000; + +# Common signal in all variables (population stratification) +pop = 0.2 * rnorm(n); + +# data matrices +snps.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop; +gene.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop + snps.mat*((1:ngs)/ngs)^9/2; + +# data objects for Matrix eQTL engine +snps1 = SlicedData$new( t( snps.mat ) ); +gene1 = SlicedData$new( t( gene.mat ) ); +cvrt1 = SlicedData$new(); +rm(snps.mat, gene.mat); + +# Slice data in blocks of 500 variables +snps1$ResliceCombined(500); +gene1$ResliceCombined(500); + +# Produce no output files +filename = NULL; # tempfile() + +# Perform analysis recording information for a histogram +me = Matrix_eQTL_main( + snps = snps1, + gene = gene1, + cvrt = cvrt1, + output_file_name = filename, + pvOutputThreshold = 1e-100, + useModel = modelLINEAR, + errorCovariance = numeric(), + verbose = TRUE, + pvalue.hist = 100); + +# png(filename = "histogram.png", width = 650, height = 650); +plot(me, col="grey"); +# dev.off(); diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/demo/q.qqplot.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/demo/q.qqplot.r Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,43 @@ +library("MatrixEQTL") + +# Number of samples +n = 100; + +# Number of variables +ngs = 2000; + +# Common signal in all variables (population stratification) +pop = 0.2 * rnorm(n); + +# data matrices +snps.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop; +gene.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop + snps.mat*((1:ngs)/ngs)^9/2; + +# data objects for Matrix eQTL engine +snps1 = SlicedData$new( t( snps.mat ) ); +gene1 = SlicedData$new( t( gene.mat ) ); +cvrt1 = SlicedData$new(); +rm(snps.mat, gene.mat); + +# Slice data in blocks of 500 variables +snps1$ResliceCombined(500); +gene1$ResliceCombined(500); + +# Produce no output files +filename = NULL; # tempfile() + +# Perform analysis recording information for a Q-Q plot +me = Matrix_eQTL_main( + snps = snps1, + gene = gene1, + cvrt = cvrt1, + output_file_name = filename, + pvOutputThreshold = 1e-6, + useModel = modelLINEAR, + errorCovariance = numeric(), + verbose = TRUE, + pvalue.hist = "qqplot"); + +# png(filename = "QQplot.png", width = 650, height = 650); +plot(me, pch = 16, cex = 0.7, ylim = c(0,11)); +# dev.off(); diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/demo/sample.all.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/demo/sample.all.r Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,96 @@ +# Matrix eQTL by Andrey A. Shabalin +# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ +# +# Be sure to use an up to date version of R and Matrix eQTL. + +# source("Matrix_eQTL_R/Matrix_eQTL_engine.r"); +library(MatrixEQTL) + +## Location of the package with the data files. +base.dir = find.package('MatrixEQTL'); +# base.dir = '.'; + +## Settings + +# Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS +useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS + +# Genotype file name +SNP_file_name = paste(base.dir, "/data/SNP.txt", sep=""); + +# Gene expression file name +expression_file_name = paste(base.dir, "/data/GE.txt", sep=""); + +# Covariates file name +# Set to character() for no covariates +covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep=""); + +# Output file name +output_file_name = tempfile(); + +# Only associations significant at this level will be saved +pvOutputThreshold = 1e-2; + +# Error covariance matrix +# Set to numeric() for identity. +errorCovariance = numeric(); +# errorCovariance = read.table("Sample_Data/errorCovariance.txt"); + + +## Load genotype data + +snps = SlicedData$new(); +snps$fileDelimiter = "\t"; # the TAB character +snps$fileOmitCharacters = "NA"; # denote missing values; +snps$fileSkipRows = 1; # one row of column labels +snps$fileSkipColumns = 1; # one column of row labels +snps$fileSliceSize = 2000; # read file in slices of 2,000 rows +snps$LoadFile(SNP_file_name); + +## Load gene expression data + +gene = SlicedData$new(); +gene$fileDelimiter = "\t"; # the TAB character +gene$fileOmitCharacters = "NA"; # denote missing values; +gene$fileSkipRows = 1; # one row of column labels +gene$fileSkipColumns = 1; # one column of row labels +gene$fileSliceSize = 2000; # read file in slices of 2,000 rows +gene$LoadFile(expression_file_name); + +## Load covariates + +cvrt = SlicedData$new(); +cvrt$fileDelimiter = "\t"; # the TAB character +cvrt$fileOmitCharacters = "NA"; # denote missing values; +cvrt$fileSkipRows = 1; # one row of column labels +cvrt$fileSkipColumns = 1; # one column of row labels +if(length(covariates_file_name)>0) { + cvrt$LoadFile(covariates_file_name); +} + +## Run the analysis + +me = Matrix_eQTL_engine( + snps = snps, + gene = gene, + cvrt = cvrt, + output_file_name = output_file_name, + pvOutputThreshold = pvOutputThreshold, + useModel = useModel, + errorCovariance = errorCovariance, + verbose = TRUE, + pvalue.hist = TRUE, + min.pv.by.genesnp = FALSE, + noFDRsaveMemory = FALSE); + +unlink(output_file_name); + +## Results: + +cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n'); +cat('Detected eQTLs:', '\n'); +show(me$all$eqtls) + +## Plot the histogram of all p-values + +plot(me) diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/demo/sample.cis.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/demo/sample.cis.r Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,112 @@ +# Matrix eQTL by Andrey A. Shabalin +# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ +# +# Be sure to use an up to date version of R and Matrix eQTL. + +# source("Matrix_eQTL_R/Matrix_eQTL_engine.r"); +library(MatrixEQTL) + +## Location of the package with the data files. +base.dir = find.package('MatrixEQTL'); +# base.dir = '.'; + +## Settings + +# Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS +useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS + +# Genotype file name +SNP_file_name = paste(base.dir, "/data/SNP.txt", sep=""); +snps_location_file_name = paste(base.dir, "/data/snpsloc.txt", sep=""); + +# Gene expression file name +expression_file_name = paste(base.dir, "/data/GE.txt", sep=""); +gene_location_file_name = paste(base.dir, "/data/geneloc.txt", sep=""); + +# Covariates file name +# Set to character() for no covariates +covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep=""); + +# Output file name +output_file_name_cis = tempfile(); +output_file_name_tra = tempfile(); + +# Only associations significant at this level will be saved +pvOutputThreshold_cis = 2e-2; +pvOutputThreshold_tra = 1e-2; + +# Error covariance matrix +# Set to numeric() for identity. +errorCovariance = numeric(); +# errorCovariance = read.table("Sample_Data/errorCovariance.txt"); + +# Distance for local gene-SNP pairs +cisDist = 1e6; + +## Load genotype data + +snps = SlicedData$new(); +snps$fileDelimiter = "\t"; # the TAB character +snps$fileOmitCharacters = "NA"; # denote missing values; +snps$fileSkipRows = 1; # one row of column labels +snps$fileSkipColumns = 1; # one column of row labels +snps$fileSliceSize = 2000; # read file in slices of 2,000 rows +snps$LoadFile(SNP_file_name); + +## Load gene expression data + +gene = SlicedData$new(); +gene$fileDelimiter = "\t"; # the TAB character +gene$fileOmitCharacters = "NA"; # denote missing values; +gene$fileSkipRows = 1; # one row of column labels +gene$fileSkipColumns = 1; # one column of row labels +gene$fileSliceSize = 2000; # read file in slices of 2,000 rows +gene$LoadFile(expression_file_name); + +## Load covariates + +cvrt = SlicedData$new(); +cvrt$fileDelimiter = "\t"; # the TAB character +cvrt$fileOmitCharacters = "NA"; # denote missing values; +cvrt$fileSkipRows = 1; # one row of column labels +cvrt$fileSkipColumns = 1; # one column of row labels +if(length(covariates_file_name)>0) { + cvrt$LoadFile(covariates_file_name); +} + +## Run the analysis +snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE); +genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE); + +me = Matrix_eQTL_main( + snps = snps, + gene = gene, + cvrt = cvrt, + output_file_name = output_file_name_tra, + pvOutputThreshold = pvOutputThreshold_tra, + useModel = useModel, + errorCovariance = errorCovariance, + verbose = TRUE, + output_file_name.cis = output_file_name_cis, + pvOutputThreshold.cis = pvOutputThreshold_cis, + snpspos = snpspos, + genepos = genepos, + cisDist = cisDist, + pvalue.hist = TRUE, + min.pv.by.genesnp = TRUE, + noFDRsaveMemory = FALSE); + +unlink(output_file_name_tra); +unlink(output_file_name_cis); + +## Results: + +cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n'); +cat('Detected local eQTLs:', '\n'); +show(me$cis$eqtls) +cat('Detected distant eQTLs:', '\n'); +show(me$trans$eqtls) + +## Plot the histogram of local and distant p-values + +plot(me) diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/inst/CITATION --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/inst/CITATION Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,15 @@ +citHeader("To cite the 'MatrixEQTL' package in publications use:") + +citEntry(entry="article", + title = "Matrix eQTL: ultra fast eQTL analysis via large matrix operations", + author = personList(as.person("A.A. Shabalin")), + address = "University of North Carolina at Chapel Hill", + year = "2012", + url = "URL http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/", + publisher= "Oxford Univ Press", + journal = "Bioinformatics", + volume = "28", + number = "10", + pages = "1353--1358", + textVersion = "Shabalin, Andrey A. Matrix eQTL: ultra fast eQTL analysis via large matrix operations. Bioinformatics 28, no. 10 (2012): 1353-1358." +) diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/man/Covariates.Rd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/man/Covariates.Rd Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,25 @@ +\name{Covariates} +\alias{Covariates} +\docType{data} +\title{ + Artificial data for Matrix eQTL sample code: Covariates. +} +\description{ + Artificial data set with 2 covariates across 15 samples. + Columns of the file must match to those of the expression and genotype data sets. +} +\format{ + \tabular{llll}{ + id \tab Sam_01 \tab Sam_02 \tab \ldots \cr + gender \tab 0 \tab 0 \tab \ldots \cr + age \tab 40 \tab 46 \tab \ldots \cr + + } +} +\references{ + The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} +} +\seealso{ + See \code{\link{Matrix_eQTL_engine}} for reference and sample code. +} +\keyword{datasets} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/man/GE.Rd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/man/GE.Rd Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,25 @@ +\name{GE} +\alias{GE} +\docType{data} +\title{ + Artificial data for Matrix eQTL sample code: Gene expression. +} +\description{ + Artificial data set with expression of 10 genes across 15 samples. + Columns of the file must match to those of the genotype and covariates data sets. +} +\format{ + \tabular{llll}{ + geneid \tab Sam_01 \tab Sam_02 \tab \ldots \cr + Gene_01 \tab 4.91 \tab 4.63 \tab \ldots \cr + Gene_02 \tab 13.78 \tab 13.14 \tab \ldots \cr + \ldots \tab \ldots \tab \ldots \tab \ldots \cr + } +} +\references{ + The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} +} +\seealso{ + See \code{\link{Matrix_eQTL_engine}} for reference and sample code. +} +\keyword{datasets} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/man/MatrixEQTL-package.Rd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/man/MatrixEQTL-package.Rd Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,40 @@ +\name{MatrixEQTL-package} +\alias{MatrixEQTL-package} +\alias{MatrixEQTL} +\docType{package} +\title{ + Matrix eQTL: Ultra fast eQTL analysis via large matrix operations +} +\description{ + Matrix eQTL is designed for fast eQTL analysis of large datasets. + Matrix eQTL can test for association between genotype and gene expression using linear regression + with either additive or ANOVA (additive and dominant) genotype effects. + The models can include covariates to account for such factors as population stratification, gender, clinical variables, and surrogate variables. + Matrix eQTL also supports models with heteroscedastic and/or correlated errors, + false discovery rate estimation and separate treatment of local (cis) and distant (trans) eQTLs. +} +\details{ + \tabular{ll}{ + Package: \tab MatrixEQTL\cr + Type: \tab Package\cr + Version: \tab 2.1.1\cr + Date: \tab 2014-02-24\cr + License: \tab LGPL-3 \cr + LazyLoad: \tab yes\cr + Depends: \tab methods\cr + } +} +\author{ + Andrey Shabalin \email{ashabalin@vcu.edu} + + Maintainer: Andrey Shabalin +} +\references{ + The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} +} +\keyword{ package } +\keyword{ MatrixEQTL } +\keyword{ Matrix eQTL } +\seealso{ + See \code{\link{Matrix_eQTL_engine}} for reference and sample code. +} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/man/MatrixEQTL_cis_code.Rd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/man/MatrixEQTL_cis_code.Rd Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,129 @@ +\name{MatrixEQTL_cis_code} +\alias{MatrixEQTL_cis_code} + +\title{Sample code for cis/trans-eQTL analysis with Matrix eQTL} +\description{ + The following code is the best starting point for those who want to perform cis-/trans-eQTL analysis with Matrix eQTL. +} +\references{ + The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} +} +\seealso{ + See \code{\link{Matrix_eQTL_engine}} for reference and other sample code. +} +\author{ + Andrey Shabalin \email{ashabalin@vcu.edu} +} +\examples{ +# Matrix eQTL by Andrey A. Shabalin +# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ +# +# Be sure to use an up to date version of R and Matrix eQTL. + +# source("Matrix_eQTL_R/Matrix_eQTL_engine.r"); +library(MatrixEQTL) + +## Location of the package with the data files. +base.dir = find.package('MatrixEQTL'); + +## Settings + +# Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS +useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS + +# Genotype file name +SNP_file_name = paste(base.dir, "/data/SNP.txt", sep=""); +snps_location_file_name = paste(base.dir, "/data/snpsloc.txt", sep=""); + +# Gene expression file name +expression_file_name = paste(base.dir, "/data/GE.txt", sep=""); +gene_location_file_name = paste(base.dir, "/data/geneloc.txt", sep=""); + +# Covariates file name +# Set to character() for no covariates +covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep=""); + +# Output file name +output_file_name_cis = tempfile(); +output_file_name_tra = tempfile(); + +# Only associations significant at this level will be saved +pvOutputThreshold_cis = 2e-2; +pvOutputThreshold_tra = 1e-2; + +# Error covariance matrix +# Set to numeric() for identity. +errorCovariance = numeric(); +# errorCovariance = read.table("Sample_Data/errorCovariance.txt"); + +# Distance for local gene-SNP pairs +cisDist = 1e6; + +## Load genotype data + +snps = SlicedData$new(); +snps$fileDelimiter = "\t"; # the TAB character +snps$fileOmitCharacters = "NA"; # denote missing values; +snps$fileSkipRows = 1; # one row of column labels +snps$fileSkipColumns = 1; # one column of row labels +snps$fileSliceSize = 2000; # read file in slices of 2,000 rows +snps$LoadFile(SNP_file_name); + +## Load gene expression data + +gene = SlicedData$new(); +gene$fileDelimiter = "\t"; # the TAB character +gene$fileOmitCharacters = "NA"; # denote missing values; +gene$fileSkipRows = 1; # one row of column labels +gene$fileSkipColumns = 1; # one column of row labels +gene$fileSliceSize = 2000; # read file in slices of 2,000 rows +gene$LoadFile(expression_file_name); + +## Load covariates + +cvrt = SlicedData$new(); +cvrt$fileDelimiter = "\t"; # the TAB character +cvrt$fileOmitCharacters = "NA"; # denote missing values; +cvrt$fileSkipRows = 1; # one row of column labels +cvrt$fileSkipColumns = 1; # one column of row labels +if(length(covariates_file_name)>0) { + cvrt$LoadFile(covariates_file_name); +} + +## Run the analysis +snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE); +genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE); + +me = Matrix_eQTL_main( + snps = snps, + gene = gene, + cvrt = cvrt, + output_file_name = output_file_name_tra, + pvOutputThreshold = pvOutputThreshold_tra, + useModel = useModel, + errorCovariance = errorCovariance, + verbose = TRUE, + output_file_name.cis = output_file_name_cis, + pvOutputThreshold.cis = pvOutputThreshold_cis, + snpspos = snpspos, + genepos = genepos, + cisDist = cisDist, + pvalue.hist = TRUE, + min.pv.by.genesnp = FALSE, + noFDRsaveMemory = FALSE); + +unlink(output_file_name_tra); +unlink(output_file_name_cis); + +## Results: + +cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n'); +cat('Detected local eQTLs:', '\n'); +show(me$cis$eqtls) +cat('Detected distant eQTLs:', '\n'); +show(me$trans$eqtls) + +## Make the histogram of local and distant p-values + +plot(me) +} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/man/Matrix_eQTL_main.Rd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/man/Matrix_eQTL_main.Rd Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,273 @@ +\name{Matrix_eQTL_main} +\alias{Matrix_eQTL_main} +\alias{Matrix_eQTL_engine} +\title{ + Main function for fast eQTL analysis in MatrixEQTL package +} +\description{ + \code{Matrix_eQTL_engine} function tests association of every row of the \code{snps} dataset with every row of the \code{gene} dataset using a linear regression model defined by the \code{useModel} parameter (see below). + + The testing procedure accounts for extra covariates in \code{cvrt} parameter. + + The \code{errorCovariance} parameter can be set to the error variance-covariance matrix to account for heteroskedastic and/or correlated errors. + + Associations significant at \code{pvOutputThreshold} (\code{pvOutputThreshold.cis}) levels are saved to \code{output_file_name} (\code{output_file_name.cis}), with corresponding estimates of effect size (slope coefficient), test statistics, p-values, and q-values (false discovery rate). + + Matrix eQTL can perform separate analysis for local (cis) and distant (trans) eQTLs. + For such analysis one has to set the cis-analysis specific parameters \code{pvOutputThreshold.cis > 0}, \code{cisDist}, \code{snpspos} and {genepos} in the call of \code{Matrix_eQTL_main} function. + A gene-SNP pair is considered local if the distance between them is less or equal to \code{cisDist}. + The genomic location of genes and SNPs is defined by data frames \code{snpspos} and {genepos}. + Depending on p-value thresholds \code{pvOutputThreshold} and \code{pvOutputThreshold.cis} Matrix eQTL runs in one of three different modes: + \itemize{ + \item Set \code{pvOutputThreshold > 0} and \code{pvOutputThreshold.cis = 0} (or use \code{Matrix_eQTL_engine}) to perform eQTL analysis without using gene/SNP locations. Associations significant at the \code{pvOutputThreshold} level are be recorded in \code{output_file_name} and in the returned object. + \item Set \code{pvOutputThreshold = 0} and \code{pvOutputThreshold.cis > 0} to perform eQTL analysis for local gene-SNP pairs only. Local associations significant at \code{pvOutputThreshold.cis} level will be recorded in \code{output_file_name.cis} and in the returned object. + \item Set \code{pvOutputThreshold > 0} and \code{pvOutputThreshold.cis > 0} to perform eQTL analysis with separate p-value thresholds for local and distant eQTLs. Distant and local associations significant at corresponding thresholds are recorded in \code{output_file_name} and \code{output_file_name.cis} respectively and in the returned object. In this case the false discovery rate is calculated separately for these two sets of eQTLs. + } + \code{Matrix_eQTL_engine} is a wrapper for \code{Matrix_eQTL_main} for eQTL analysis without regard to gene/SNP location and provided for compatibility with the previous versions of the package. + + The parameter \code{pvalue.hist} allows to record information sufficient to create a histogram or QQ-plot of all the p-values (see \code{\link[=plot.MatrixEQTL]{plot}}). +} +\usage{ +Matrix_eQTL_main( snps, + gene, + cvrt = SlicedData$new(), + output_file_name = "", + pvOutputThreshold = 1e-5, + useModel = modelLINEAR, + errorCovariance = numeric(), + verbose = TRUE, + output_file_name.cis = "", + pvOutputThreshold.cis = 0, + snpspos = NULL, + genepos = NULL, + cisDist = 1e6, + pvalue.hist = FALSE, + min.pv.by.genesnp = FALSE, + noFDRsaveMemory = FALSE) + +Matrix_eQTL_engine(snps, + gene, + cvrt = SlicedData$new(), + output_file_name, + pvOutputThreshold = 1e-5, + useModel = modelLINEAR, + errorCovariance = numeric(), + verbose = TRUE, + pvalue.hist = FALSE, + min.pv.by.genesnp = FALSE, + noFDRsaveMemory = FALSE) +} +\arguments{ + \item{snps}{ + \code{\linkS4class{SlicedData}} object with genotype information. + Can be real-valued for linear models and must take at most 3 distinct values for ANOVA unless the number of ANOVA categories is set to a higher number (see \code{useModel} parameter). + } + \item{gene}{ + \code{\linkS4class{SlicedData}} object with gene expression information. + Must have columns matching those of \code{snps}. + } + \item{cvrt}{ + \code{\linkS4class{SlicedData}} object with additional covariates. + Can be an empty \code{SlicedData} object in case of no covariates. + The constant is always included in the model and would cause an error if included in \code{cvrt}. + The order of columns must match those in \code{snps} and \code{gene}. + } + \item{output_file_name}{ + \code{character}, \code{connection}, or \code{NULL}. + If not \code{NULL}, significant associations are saved to this file (all significant associations if \code{pvOutputThreshold=0} or only distant if \code{pvOutputThreshold>0}). + If the file with this name exists, it is overwritten. + } + \item{output_file_name.cis}{ + \code{character}, \code{connection}, or \code{NULL}. + If not \code{NULL}, significant local associations are saved to this file. + If the file with this name exists, it is overwritten. + } + \item{pvOutputThreshold}{ + \code{numeric}. Significance threshold for all/distant tests. + } + \item{pvOutputThreshold.cis}{ + \code{numeric}. Same as \code{pvOutputThreshold}, but for local eQTLs. + } + \item{useModel}{ + \code{integer}. Eigher \code{modelLINEAR}, \code{modelANOVA}, or \code{modelLINEAR_CROSS}. + \enumerate{ + \item Set \code{useModel = \link{modelLINEAR}} to model the effect of the genotype as additive linear and test for its significance using t-statistic. + \item Set \code{useModel = \link{modelANOVA}} to treat genotype as a categorical variables and use ANOVA model and test for its significance using F-test. The default number of ANOVA categories is 3. Set otherwise like this: \code{options(MatrixEQTL.ANOVA.categories=4)}. + \item Set \code{useModel = \link{modelLINEAR_CROSS}} to add a new term to the model + equal to the product of genotype and the last covariate; the significance of this term is then tested using t-statistic. + } + + } + \item{errorCovariance}{ + \code{numeric}. The error covariance matrix. Use \code{numeric()} for homoskedastic independent errors. + } + \item{verbose}{ + \code{logical}. Set to \code{TRUE} to display more detailed report about the progress. + } + \item{snpspos}{ + \code{data.frame} object with information about SNP locations, must have 3 columns - SNP name, chromosome, and position, like this: + \tabular{ccc}{ + snpid \tab chr \tab pos \cr + Snp_01 \tab 1 \tab 721289 \cr + Snp_02 \tab 1 \tab 752565 \cr + \ldots \tab \ldots \tab \ldots \cr + } + } + \item{genepos}{ + \code{data.frame} with information about transcript locations, must have 4 columns - the name, chromosome, and positions of the left and right ends, like this: + \tabular{cccc}{ + geneid \tab chr \tab left \tab right \cr + Gene_01 \tab 1 \tab 721289 \tab 731289 \cr + Gene_02 \tab 1 \tab 752565 \tab 762565 \cr + \ldots \tab \ldots \tab \ldots \tab \ldots \cr + } + } + \item{cisDist}{ + \code{numeric}. SNP-gene pairs within this distance are considered local. The distance is measured from the nearest end of the gene. SNPs within a gene are always considered local. + } + \item{pvalue.hist}{ + \code{logical}, \code{numerical}, or \code{"qqplot"}. + Defines whether and how the distribution of p-values is recorded in the returned object. + If \code{pvalue.hist = FALSE}, the information is not recorded and the analysis is performed faster. + Alternatively, set \code{pvalue.hist = "qqplot"} to record information sufficient to create a QQ-plot of the p-values (use \code{\link[=plot.MatrixEQTL]{plot}} on the returned object to create the plot). + To record information for a histogram set \code{pvalue.hist} to the desired number of bins of equal size. Finally, \code{pvalue.hist} can also be set to a custom set of bin edges. + } + \item{min.pv.by.genesnp}{ + \code{logical}. Set \code{min.pv.by.genesnp = TRUE} to record the minimum p-value for each SNP and each gene in the returned object. The minimum p-values are recorded even if if they are above the corresponding thresholds of \code{pvOutputThreshold} and \code{pvOutputThreshold.cis}. The analysis runs faster when the parameter is set to \code{FALSE}. + } + \item{noFDRsaveMemory}{ + \code{logical}. Set \code{noFDRsaveMemory = TRUE} to save significant gene-SNP pairs directly to the output files, reduce memory footprint and skip FDR calculation. The eQTLs are not recorded in the returned object if \code{noFDRsaveMemory = TRUE}. + } +} +\details{ + Note that the columns of \code{gene}, \code{snps}, and \code{cvrt} must match. + If they do not match in the input files, use \code{ColumnSubsample} method to subset and/or reorder them. +} +\value{ + The detected eQTLs are saved in \code{output_file_name} and/or \code{output_file_name.cis} if they are not \code{NULL}. + The method also returns a list with a summary of the performed analysis. + \item{param}{Keeps all input parameters and also records the number of degrees of freedom for the full model.} + \item{time.in.sec}{Time difference between the start and the end of the analysis (in seconds).} + \item{all}{Information about all detected eQTLs.} + \item{cis}{Information about detected local eQTLs.} + \item{trans}{Information about detected distant eQTLs.} + The elements \code{all}, \code{cis}, and \code{trans} may contain the following components + \describe{ + \item{\code{ntests}}{Total number of tests performed. This is used for FDR calculation.} + \item{\code{eqtls}}{Data frame with recorded significant associations. Not available if \code{noFDRsaveMemory=FALSE}} + \item{\code{neqtls}}{Number of significant associations recorded.} + \item{\code{hist.bins}}{Histogram bins used for recording p-value distribution. See \code{pvalue.hist} parameter.} + \item{\code{hist.counts}}{Number of p-value that fell in each histogram bin. See \code{pvalue.hist} parameter.} + \item{\code{min.pv.snps}}{Vector with the best p-value for each SNP. See \code{min.pv.by.genesnp} parameter.} + \item{\code{min.pv.gene}}{Vector with the best p-value for each gene. See \code{min.pv.by.genesnp} parameter.} + } +} +\references{ + The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} +} +\author{ + Andrey Shabalin \email{ashabalin@vcu.edu} +} +\seealso{ + The code below is the sample code for eQTL analysis NOT using gene/SNP locations. + + See \code{\link{MatrixEQTL_cis_code}} for sample code for eQTL analysis that separates local and distant tests. +} +\examples{ +# Matrix eQTL by Andrey A. Shabalin +# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ +# +# Be sure to use an up to date version of R and Matrix eQTL. + +# source("Matrix_eQTL_R/Matrix_eQTL_engine.r"); +library(MatrixEQTL) + +## Location of the package with the data files. +base.dir = find.package('MatrixEQTL'); + +## Settings + +# Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS +useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS + +# Genotype file name +SNP_file_name = paste(base.dir, "/data/SNP.txt", sep=""); + +# Gene expression file name +expression_file_name = paste(base.dir, "/data/GE.txt", sep=""); + +# Covariates file name +# Set to character() for no covariates +covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep=""); + +# Output file name +output_file_name = tempfile(); + +# Only associations significant at this level will be saved +pvOutputThreshold = 1e-2; + +# Error covariance matrix +# Set to numeric() for identity. +errorCovariance = numeric(); +# errorCovariance = read.table("Sample_Data/errorCovariance.txt"); + + +## Load genotype data + +snps = SlicedData$new(); +snps$fileDelimiter = "\t"; # the TAB character +snps$fileOmitCharacters = "NA"; # denote missing values; +snps$fileSkipRows = 1; # one row of column labels +snps$fileSkipColumns = 1; # one column of row labels +snps$fileSliceSize = 2000; # read file in slices of 2,000 rows +snps$LoadFile(SNP_file_name); + +## Load gene expression data + +gene = SlicedData$new(); +gene$fileDelimiter = "\t"; # the TAB character +gene$fileOmitCharacters = "NA"; # denote missing values; +gene$fileSkipRows = 1; # one row of column labels +gene$fileSkipColumns = 1; # one column of row labels +gene$fileSliceSize = 2000; # read file in slices of 2,000 rows +gene$LoadFile(expression_file_name); + +## Load covariates + +cvrt = SlicedData$new(); +cvrt$fileDelimiter = "\t"; # the TAB character +cvrt$fileOmitCharacters = "NA"; # denote missing values; +cvrt$fileSkipRows = 1; # one row of column labels +cvrt$fileSkipColumns = 1; # one column of row labels +if(length(covariates_file_name)>0) { + cvrt$LoadFile(covariates_file_name); +} + +## Run the analysis + +me = Matrix_eQTL_engine( + snps = snps, + gene = gene, + cvrt = cvrt, + output_file_name = output_file_name, + pvOutputThreshold = pvOutputThreshold, + useModel = useModel, + errorCovariance = errorCovariance, + verbose = TRUE, + pvalue.hist = TRUE, + min.pv.by.genesnp = FALSE, + noFDRsaveMemory = FALSE); + +unlink(output_file_name); + +## Results: + +cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n'); +cat('Detected eQTLs:', '\n'); +show(me$all$eqtls) + +## Plot the histogram of all p-values + +plot(me) + +} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/man/SNP.Rd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/man/SNP.Rd Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,25 @@ +\name{SNP} +\alias{SNP} +\docType{data} +\title{ + Artificial data for Matrix eQTL sample code: Genotype. +} +\description{ + Artificial data set with genotype information for 15 markers across 15 samples. + Columns of the file must match to those of the gene expression and covariates data sets. +} +\format{ + \tabular{llll}{ + snpnid \tab Sam_01 \tab Sam_02 \tab \ldots \cr + Snp_01 \tab 2 \tab 0 \tab \ldots \cr + Snp_02 \tab 0 \tab 1 \tab \ldots \cr + \ldots \tab \ldots \tab \ldots \tab \ldots \cr + } +} +\references{ + The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} +} +\seealso{ + See \code{\link{Matrix_eQTL_engine}} for reference and sample code. +} +\keyword{datasets} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/man/SlicedData-class.Rd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/man/SlicedData-class.Rd Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,189 @@ +\name{SlicedData-class} +\Rdversion{1.1} +\docType{class} +\alias{SlicedData-class} +\alias{SlicedData} +\alias{SlicedData-class} +\alias{[[,SlicedData-method} +\alias{[[<-,SlicedData-method} +\alias{colnames,SlicedData-method} +\alias{colnames<-,SlicedData-method} +\alias{dim,SlicedData-method} +\alias{length,SlicedData-method} +\alias{ncol,SlicedData-method} +\alias{NCOL,SlicedData-method} +\alias{nrow,SlicedData-method} +\alias{NROW,SlicedData-method} +\alias{rownames,SlicedData-method} +\alias{rownames<-,SlicedData-method} +\alias{show,SlicedData-method} +\alias{as.matrix,SlicedData-method} + +\alias{rowMeans,SlicedData-method} +\alias{rowSums,SlicedData-method} +\alias{colMeans,SlicedData-method} +\alias{colSums,SlicedData-method} +\alias{summary.SlicedData} + +\title{Class \code{SlicedData} for storing large matrices} +\description{ + This class is created for fast and memory efficient manipulations with large datasets presented in matrix form. + It is used to load, store, and manipulate large datasets, e.g. genotype and gene expression matrices. + When a dataset is loaded, it is sliced in blocks of 1,000 rows (default size). + This allows imputing, standardizing, and performing other operations with the data with minimal memory overhead. +} +\section{Extends}{ + \code{SlicedData} is a reference classes (\code{\linkS4class{envRefClass}}). + Its methods can change the values of the fields of the class. +} +\references{ + The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} +} +\author{ + Andrey Shabalin \email{ashabalin@vcu.edu} +} +\seealso{ + This class is used to load data for eQTL analysis by \code{\link{Matrix_eQTL_engine}}. +} +\keyword{classes} +\section{Fields}{ + \describe{ + \item{\code{dataEnv}:}{\code{environment}. Stores the slices of the data matrix. The slices should be accessed via \code{getSlice()} and \code{setSlice()} methods. } + \item{\code{nSlices1}:}{\code{numeric}. Number of slices. For internal use. The value should be access via \code{nSlices()} method. } + \item{\code{rowNameSlices}:}{\code{list}. Slices of row names. } + \item{\code{columnNames}:}{\code{character}. Column names. } + \item{\code{fileDelimiter}:}{\code{character}. Delimiter separating values in the input file. } + \item{\code{fileSkipColumns}:}{\code{numeric}. Number of columns with row labels in the input file. } + \item{\code{fileSkipRows}:}{\code{numeric}. Number of rows with column labels in the input file. } + \item{\code{fileSliceSize}:}{\code{numeric}. Maximum number of rows in a slice. } + \item{\code{fileOmitCharacters}:}{\code{character}. Missing value (NaN) representation in the input file. } + } +} +\section{Methods}{ + \describe{ + \item{\code{initialize(mat)}:}{ Create the object from a matrix. } + \item{\code{nSlices()}:}{ Returns the number of slices. } + \item{\code{nCols()}:}{ Returns the number of columns in the matrix. } + \item{\code{nRows()}:}{ Returns the number of rows in the matrix. } + \item{\code{Clear()}:}{ Clears the object. Removes the data slices and row and column names. } + \item{\code{Clone()}:}{ Makes a copy of the object. Changes to the copy do not affect the source object. } + \item{\code{CreateFromMatrix(mat)}:}{ Creates \code{SlicedData} object from a \code{\link[base]{matrix}}.} + \item{\code{LoadFile(filename, skipRows = NULL, skipColumns = NULL,} \cr + \code{sliceSize = NULL, omitCharacters = NULL, delimiter = NULL, rowNamesColumn = 1)}:}{ Loads data matrix from a file. \code{filename} should be a character string. The remaining parameters specify the file format and have the same meaning as \code{file*} fields. Additional \code{rowNamesColumn} parameter specifies which of the columns of row labels to use as row names.} + \item{\code{SaveFile(filename)}:}{ Saves the data to a file. \code{filename} should be a character string.} + \item{\code{getSlice(sl)}:}{ Retrieves \code{sl}-th slice of the matrix. } + \item{\code{setSlice(sl, value)}:}{ Set \code{sl}-th slice of the matrix. } + \item{\code{ColumnSubsample(subset)}:}{ Reorders/subsets the columns according to \code{subset}. \cr +Acts as \code{M = M[ ,subset]} for a matrix \code{M}. } + \item{\code{RowReorder(ordr)}:}{ Reorders rows according to \code{ordr}. \cr +Acts as \code{M = M[ordr, ]} for a matrix \code{M}. } + \item{\code{RowMatrixMultiply(multiplier)}:}{ Multiply each row by the \code{multiplier}. \cr +Acts as \code{M = M \%*\% multiplier} for a matrix \code{M}. } + \item{\code{CombineInOneSlice()}:}{ Combines all slices into one. The whole matrix can then be obtained via \code{$getSlice(1)}. } + \item{\code{IsCombined()}:}{ Returns \code{TRUE} if the number of slices is 1 or 0. } + \item{\code{ResliceCombined(sliceSize = -1)}:}{ Cuts the data into slices of \code{sliceSize} rows. If \code{sliceSize} is not defined, the value of \code{fileSliceSize} field is used.} + \item{\code{GetAllRowNames()}:}{ Returns all row names in one vector. } + \item{\code{RowStandardizeCentered()}:}{ Set the mean of each row to zero and the sum of squares to one. } + \item{\code{SetNanRowMean()}:}{ Impute rows with row mean. Rows full of NaN values are imputed with zeros. } + \item{\code{RowRemoveZeroEps()}:}{ Removes rows of zeros and those that are nearly zero. } + \item{\code{FindRow(rowname)}:}{ Finds row by name. Returns a pair of slice number an row number within the slice. If no row is found, the function returns \code{NULL}. } + \item{\code{rowMeans(x, na.rm = FALSE, dims = 1L)}:}{Returns a vector of row means. Works as \link[base]{rowMeans} but requires \code{dims} to be equal to \code{1L}.} + \item{\code{rowSums(x, na.rm = FALSE, dims = 1L)}:}{Returns a vector of row sums. Works as \link[base]{rowSums} but requires \code{dims} to be equal to \code{1L}.} + \item{\code{colMeans(x, na.rm = FALSE, dims = 1L)}:}{Returns a vector of column means. Works as \link[base]{colMeans} but requires \code{dims} to be equal to \code{1L}.} + \item{\code{colSums(x, na.rm = FALSE, dims = 1L)}:}{Returns a vector of column sums. Works as \link[base]{colSums} but requires \code{dims} to be equal to \code{1L}.} + } +} + +\usage{ +# x[[i]] indexing allows easy access to individual slices. +# It is equivalent to x$GetSlice(i) and x$SetSlice(i,value) +\S4method{[[}{SlicedData}(x, i) +\S4method{[[}{SlicedData}(x, i) <- value + +# The following commands work as if x was a simple matrix object +\S4method{nrow}{SlicedData}(x) +\S4method{ncol}{SlicedData}(x) +\S4method{dim}{SlicedData}(x) +\S4method{rownames}{SlicedData}(x) +\S4method{colnames}{SlicedData}(x) +\S4method{rownames}{SlicedData}(x) <- value +\S4method{colnames}{SlicedData}(x) <- value + +# SlicedData object can be easily transformed into a matrix +# preserving row and column names +\S4method{as.matrix}{SlicedData}(x) + +# length(x) can be used in place of x$nSlices() +# to get the number of slices in the object +\S4method{length}{SlicedData}(x) +} +\arguments{ + \item{x}{ + \code{\linkS4class{SlicedData}} object. + } + \item{i}{ + Number of a slice. + } + \item{value}{ + New content for the slice / new row or column names. + } +} + + +\examples{ + +# Create a SlicedData variable +sd = SlicedData$new(); + +# Show the details of the empty object +show(sd); + +# Create a matrix of values and assign to sd +mat = matrix(1:12, 3, 4); +rownames(mat) = c("row1","row2","row3"); +colnames(mat) = c("col1","col2","col3","col4"); +sd$CreateFromMatrix( mat ); + +# Show the detail of the object (one slice) +show(sd); + +# Slice it in pieces of 2 rows +sd$ResliceCombined(sliceSize = 2L); + +# Show the number of slices (equivalent function calls) +sd$nSlices() +length(sd) + +# Is it all in one slice? (No) +sd$IsCombined() + +# Show the column names (equivalent function calls) +sd$columnNames +colnames(sd) + +# Show row name slices +sd$rowNameSlices + +# Show all row names (equivalent function calls) +sd$GetAllRowNames() +rownames(sd) + +# Print the second slice +print( sd[[2]] ) + +# Reorder and subset columns +sd$ColumnSubsample( c(1,3,4) ); + +# Reorder and subset rows +sd$RowReorder( c(3,1) ); + +# Show the detail of the object (one slice again) +show(sd); + +# Is it all in one slice? (Yes) +sd$IsCombined() + +# Find the row with name "row1" (it is second in the first slice) +sd$FindRow("row1"); + +} \ No newline at end of file diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/man/figures/QQplot.png Binary file MatrixEQTL/MatrixEQTL/man/figures/QQplot.png has changed diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/man/figures/histogram.png Binary file MatrixEQTL/MatrixEQTL/man/figures/histogram.png has changed diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/man/geneloc.Rd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/man/geneloc.Rd Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,31 @@ +\name{geneloc} +\alias{geneloc} +\docType{data} +\title{ + Artificial data for Matrix eQTL sample code: Gene location file. +} +\description{ + Artificial Gene location file for 10 genes. + \tabular{llll}{ + geneid \tab chr \tab left \tab right \cr + Gene_01 \tab chr1 \tab 721289 \tab 731289 \cr + Gene_02 \tab chr1 \tab 752565 \tab 762565 \cr + \ldots \tab \ldots \tab \ldots \tab \ldots \cr + } +} +\format{ + A data frame with 4 columns. + \describe{ + \item{\code{geneid}}{A column with gene names. The order does not have to match the gene expression data set.} + \item{\code{chr}}{Chromosome number, i.e. chr1.} + \item{\code{left}}{Lower (smaller) coordinate of the gene begining/end.} + \item{\code{right}}{Upper (larger) coordinate of the gene begining/end.} + } +} +\references{ + The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} +} +\seealso{ + See \code{\link{Matrix_eQTL_engine}} for reference and sample code. +} +\keyword{datasets} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/man/modelANOVA.Rd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/man/modelANOVA.Rd Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,88 @@ +\name{modelANOVA} +\alias{modelANOVA} +\docType{data} +\title{ + Constant for \code{\link{Matrix_eQTL_engine}}. +} +\description{ + Set parameter \code{useModel = modelANOVA} in the call of \code{\link{Matrix_eQTL_main}} to indicate that the genotype should be treated as a categorical variable. +} +\note{ +By default, the number of ANOVA categories is fixed to be 3. + +To set it to a different number (say, 4) use the following command: + +\code{options(MatrixEQTL.ANOVA.categories=4)} + +To check the current settings run: + +\code{getOption('MatrixEQTL.ANOVA.categories', 3);} +} +\examples{ +library('MatrixEQTL') + +# Number of columns (samples) +n = 100; + +# Number of covariates +nc = 10; + +# Generate the standard deviation of the noise +noise.std = 0.1 + rnorm(n)^2; + +# Generate the covariates +cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); + +# Generate the vectors with single genotype and expression variables +snps.mat = floor(runif(n, min = 0, max = 3)); +gene.mat = 1 + (snps.mat==1) + cvrt.mat \%*\% rnorm(nc) + rnorm(n) * noise.std; + +# Create 3 SlicedData objects for the analysis +snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); +gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); +cvrt1 = SlicedData$new( t(cvrt.mat) ); + +# name of temporary output file +filename = tempfile(); + +snps1 +gene1 + +# Call the main analysis function +me = Matrix_eQTL_main( + snps = snps1, + gene = gene1, + cvrt = cvrt1, + output_file_name = filename, + pvOutputThreshold = 1, + useModel = modelANOVA, + errorCovariance = diag(noise.std^2), + verbose = TRUE, + pvalue.hist = FALSE ); +# remove the output file +unlink( filename ); + +# Pull Matrix eQTL results - t-statistic and p-value + +fstat = me$all$eqtls$statistic; +pvalue = me$all$eqtls$pvalue; +rez = c( Fstat = fstat, pvalue = pvalue) +# And compare to those from ANOVA in R +{ + cat('\n\n Matrix eQTL: \n'); + print(rez); + cat('\n R anova(lm()) output: \n') + lmodel = lm( gene.mat ~ cvrt.mat + factor(snps.mat), weights = 1/noise.std^2 ); + lmout = anova( lmodel )[2, 4:5]; + print( lmout ) +} + +# Results from Matrix eQTL and 'lm' must agree +stopifnot(all.equal(lmout, rez, check.attributes=FALSE)) +} +\references{ + The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} +} +\seealso{ + See \code{\link{Matrix_eQTL_engine}} for reference and sample code. +} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/man/modelLINEAR.Rd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/man/modelLINEAR.Rd Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,75 @@ +\name{modelLINEAR} +\alias{modelLINEAR} +\docType{data} +\title{ + Constant for \code{\link{Matrix_eQTL_engine}}. +} +\description{ + Set parameter \code{useModel = modelLINEAR} in the call of \code{\link{Matrix_eQTL_main}} to indicate that the effect of genotype on expression should be assumed to be additive linear. +} + +\examples{ +library('MatrixEQTL') + +# Number of columns (samples) +n = 100; + +# Number of covariates +nc = 10; + +# Generate the standard deviation of the noise +noise.std = 0.1 + rnorm(n)^2; + +# Generate the covariates +cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); + +# Generate the vectors with genotype and expression variables +snps.mat = cvrt.mat \%*\% rnorm(nc) + rnorm(n); +gene.mat = cvrt.mat \%*\% rnorm(nc) + rnorm(n) * noise.std + 0.5 * snps.mat + 1; + +# Create 3 SlicedData objects for the analysis +snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); +gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); +cvrt1 = SlicedData$new( t(cvrt.mat) ); + +# name of temporary output file +filename = tempfile(); + +# Call the main analysis function +me = Matrix_eQTL_main( + snps = snps1, + gene = gene1, + cvrt = cvrt1, + output_file_name = filename, + pvOutputThreshold = 1, + useModel = modelLINEAR, + errorCovariance = diag(noise.std^2), + verbose = TRUE, + pvalue.hist = FALSE ); +# remove the output file +unlink( filename ); + +# Pull Matrix eQTL results - t-statistic and p-value +beta = me$all$eqtls$beta; +tstat = me$all$eqtls$statistic; +pvalue = me$all$eqtls$pvalue; +rez = c(beta = beta, tstat = tstat, pvalue = pvalue) +# And compare to those from the linear regression in R +{ + cat('\n\n Matrix eQTL: \n'); + print(rez); + cat('\n R summary(lm()) output: \n'); + lmodel = lm( gene.mat ~ snps.mat + cvrt.mat, weights = 1/noise.std^2 ); + lmout = summary( lmodel )$coefficients[2, c(1,3,4)]; + print( lmout ) +} + +# Results from Matrix eQTL and 'lm' must agree +stopifnot(all.equal(lmout, rez, check.attributes=FALSE)) +} +\references{ + The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} +} +\seealso{ + See \code{\link{Matrix_eQTL_engine}} for reference and sample code. +} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/man/modelLINEAR_CROSS.Rd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/man/modelLINEAR_CROSS.Rd Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,76 @@ +\name{modelLINEAR_CROSS} +\alias{modelLINEAR_CROSS} +\docType{data} +\title{ + Constant for \code{\link{Matrix_eQTL_engine}}. +} +\description{ + Set parameter \code{useModel = modelLINEAR_CROSS} in the call of \code{\link{Matrix_eQTL_main}} to indicate that Matrix eQTL should include the interaction of SNP and last covariate in the model and test for its significance. +} +\examples{ +library('MatrixEQTL') + +# Number of columns (samples) +n = 25; + +# Number of covariates +nc = 10; + +# Generate the standard deviation of the noise +noise.std = 0.1 + rnorm(n)^2; + +# Generate the covariates +cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); + +# Generate the vectors with single genotype and expression variables +snps.mat = cvrt.mat \%*\% rnorm(nc) + rnorm(n); +gene.mat = cvrt.mat \%*\% rnorm(nc) + rnorm(n) * noise.std + + 1 + 0.5 * snps.mat + snps.mat * cvrt.mat[,nc]; + +# Create 3 SlicedData objects for the analysis +snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); +gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); +cvrt1 = SlicedData$new( t(cvrt.mat) ); + +# name of temporary output file +filename = tempfile(); + +# Call the main analysis function +me = Matrix_eQTL_main( + snps = snps1, + gene = gene1, + cvrt = cvrt1, + output_file_name = filename, + pvOutputThreshold = 1, + useModel = modelLINEAR_CROSS, + errorCovariance = diag(noise.std^2), + verbose = TRUE, + pvalue.hist = FALSE ); +# remove the output file +unlink( filename ); + +# Pull Matrix eQTL results - t-statistic and p-value +beta = me$all$eqtls$beta; +tstat = me$all$eqtls$statistic; +pvalue = me$all$eqtls$pvalue; +rez = c(beta = beta, tstat = tstat, pvalue = pvalue) +# And compare to those from the linear regression in R +{ + cat('\n\n Matrix eQTL: \n'); + print(rez); + cat('\n R summary(lm()) output: \n') + lmodel = lm( gene.mat ~ snps.mat + cvrt.mat + snps.mat*cvrt.mat[,nc], + weights = 1/noise.std^2 ); + lmout = tail(summary( lmodel )$coefficients,1)[,c(1,3,4)]; + print( tail(lmout) ); +} + +# Results from Matrix eQTL and 'lm' must agree +stopifnot(all.equal(lmout, rez, check.attributes=FALSE)) +} +\references{ + The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} +} +\seealso{ + See \code{\link{Matrix_eQTL_engine}} for reference and sample code. +} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/man/plot.MatrixEQTL.Rd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/man/plot.MatrixEQTL.Rd Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,112 @@ +\name{plot.MatrixEQTL} +\alias{plot.MatrixEQTL} +\title{Plot histogram or QQ-plot of all p-values +} +\description{ +This method plots a histogram or QQ-plot of p-values +for all tests performed by \code{\link{Matrix_eQTL_engine}}. +} +\usage{ +\method{plot}{MatrixEQTL}(x, cex = 0.5, pch = 19, xlim = NULL, ylim = NULL,...) +} +\arguments{ + \item{x}{ + An object returned by \code{\link{Matrix_eQTL_engine}}. + } + \item{cex}{ + A numerical value giving the amount by which plotting text and symbols should be magnified relative to the default. + } + \item{pch}{ + Plotting "character", i.e., symbol to use. See \code{\link[graphics]{points}}. + } + \item{xlim}{ + Set the range of the horisontal axis. + } + \item{ylim}{ + Set the range of the vertical axis. + } + \item{\dots}{ + further graphical parameters passed to \code{\link[graphics]{lines}} and \code{\link[graphics]{points}}. + } +} +\details{ + The plot type (histogram vs. QQ-plot) is determined by the \code{pvalue.hist} parameter in the call of \code{\link{Matrix_eQTL_engine}} function. +} +\value{ + The method does not return any value. +} +\references{ + The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} +} +\seealso{ + See \code{\link{Matrix_eQTL_engine}} for reference and sample code. +} +\note{ + The sample code below produces figures like these: + + Histogram: \figure{histogram.png} + + QQ-plot: \figure{QQplot.png} +} +\author{ + Andrey Shabalin \email{ashabalin@vcu.edu} +} +\examples{ +library(MatrixEQTL) + +# Number of samples +n = 100; + +# Number of variables +ngs = 2000; + +# Common signal in all variables +pop = 0.2*rnorm(n); + +# data matrices +snps.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop; +gene.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop + snps.mat*((1:ngs)/ngs)^9/2; + +# data objects for Matrix eQTL engine +snps1 = SlicedData$new( t( snps.mat ) ); +gene1 = SlicedData$new( t( gene.mat ) ); +cvrt1 = SlicedData$new( ); +rm(snps.mat, gene.mat) + +# Slice data in blocks of 500 variables +snps1$ResliceCombined(500); +gene1$ResliceCombined(500); + +# Produce no output files +filename = NULL; # tempfile() + +# Perform analysis recording information for a histogram +meh = Matrix_eQTL_engine( + snps = snps1, + gene = gene1, + cvrt = cvrt1, + output_file_name = filename, + pvOutputThreshold = 1e-100, + useModel = modelLINEAR, + errorCovariance = numeric(), + verbose = TRUE, + pvalue.hist = 100); + +plot(meh, col="grey") + +# Perform analysis recording information for a QQ-plot +meq = Matrix_eQTL_engine( + snps = snps1, + gene = gene1, + cvrt = cvrt1, + output_file_name = filename, + pvOutputThreshold = 1e-6, + useModel = modelLINEAR, + errorCovariance = numeric(), + verbose = TRUE, + pvalue.hist = "qqplot"); + +plot(meq) +} +\keyword{ QQ-plot } +\keyword{ histogram } diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/MatrixEQTL/man/snpsloc.Rd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/MatrixEQTL/man/snpsloc.Rd Fri Mar 12 08:14:04 2021 +0000 @@ -0,0 +1,30 @@ +\name{snpsloc} +\alias{snpsloc} +\docType{data} +\title{ + Artificial data for Matrix eQTL sample code: SNP location file. +} +\description{ + Artificial SNP location file for 15 markers. + \tabular{llll}{ + snpid \tab chr \tab pos \cr + Snp_01 \tab chr1 \tab 721289 \cr + Snp_02 \tab chr1 \tab 752565 \cr + \ldots \tab \ldots \tab \ldots \cr + } +} +\format{ + A data frame with 3 columns. + \describe{ + \item{\code{snpid}}{A column with SNP names. The order does not have to match the genotype data set.} + \item{\code{chr}}{Chromosome number, i.e. chr1.} + \item{\code{pos}}{Coordinate of the SNP.} + } +} +\references{ + The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} +} +\seealso{ + See \code{\link{Matrix_eQTL_engine}} for reference and sample code. +} +\keyword{datasets} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/NAMESPACE --- a/MatrixEQTL/NAMESPACE Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ -importFrom(methods, setRefClass) -exportPattern("^[[:alpha:]]+") -exportClasses( - "SlicedData" -) -S3method(plot, MatrixEQTL) -S3method(summary, SlicedData) -exportMethods( - "[[", - "[[<-", - "colnames", - "colnames<-", - "dim", - "length", - "ncol", - "NCOL", - "nrow", - "NROW", - "rownames", - "rownames<-", - "as.matrix", - "rowMeans", - "rowSums", - "colMeans", - "colSums" -) - diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/R/Matrix_eQTL_engine.R --- a/MatrixEQTL/R/Matrix_eQTL_engine.R Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1964 +0,0 @@ -# Matrix eQTL by Andrey A. Shabalin -# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ - -# http://cran.r-project.org/web/packages/policies.html - -library(methods) - -modelLINEAR = 117348L; -modelANOVA = 47074L; -modelLINEAR_CROSS = 1113461L; - -.seq = function(a,b){if(a<=b){a:b}else{integer(0)}}; - -SlicedData <- setRefClass( "SlicedData", - fields = list( - dataEnv = "environment", - nSlices1 = "numeric", - rowNameSlices = "list", - columnNames = "character", - fileDelimiter = "character", - fileSkipColumns = "numeric", - fileSkipRows = "numeric", - fileSliceSize = "numeric", - fileOmitCharacters = "character" - ), - methods = list( - initialize = function( mat = NULL ) { - dataEnv <<- new.env(hash = TRUE, size = 29L); - nSlices1 <<- 0L; - if(!is.null(mat)) { - CreateFromMatrix(mat); - } - fileSliceSize <<- 1000; - fileDelimiter <<- "\t"; - fileSkipColumns <<- 1L; - fileSkipRows <<- 1L; - fileOmitCharacters <<- "NA" - return(invisible(.self)); - }, - CreateFromMatrix = function( mat ) { - stopifnot( class(mat) == "matrix" ); - setSliceRaw( 1L ,mat ); - rns = rownames( mat, do.NULL = FALSE); - #if( is.null(rns) ) { - # rns = paste( "Row_",(1:nrow(mat)), sep="" ); - #} - rowNameSlices <<- list(rns); - cns = colnames( mat, do.NULL = FALSE ); - #if( is.null(cns) ){ - # cns = paste( "Col_",(1:ncol(mat)), sep="" ); - #} - columnNames <<- cns; - return(invisible(.self)); - }, - getSlice = function(sl) { - value = get(paste(sl), dataEnv); - if( is.raw(value) ) { - storage.mode(value) = "double"; - value[value == 255] = NA; - } - return( value ) - }, - getSliceRaw = function(sl) { - return( get(paste(sl), dataEnv) ) - }, - setSliceRaw = function(sl, value) { - assign( paste(sl), value, dataEnv ) - if( nSlices1 < sl ) { - nSlices1 <<- sl; - } - }, - setSlice = function(sl, value) { - if( length(value) > 0 ) { - if( all(as.integer(value) == value, na.rm = TRUE) ) { - if( (min(value, na.rm = TRUE) >= 0 ) && - (max(value, na.rm = TRUE) < 255) ) - { - nv = value; - suppressWarnings({storage.mode(nv) = "raw"}); - nv[ is.na(value)] = as.raw(255); - value = nv; - } else { - storage.mode(value) = "integer"; - } - } - } - setSliceRaw(sl, value); - }, - nSlices = function() { - return( nSlices1 ); - }, - LoadFile = function(filename, skipRows = NULL, skipColumns = NULL, sliceSize = NULL, omitCharacters = NULL, delimiter = NULL, rowNamesColumn = 1) { - if( !is.null(skipRows) ) { - fileSkipRows <<- skipRows; - } - if( !is.null(skipColumns) ) { - fileSkipColumns <<- skipColumns; - } - if( !is.null(omitCharacters) ) { - fileOmitCharacters <<- omitCharacters; - } - if( !is.null(sliceSize) ) { - fileSliceSize <<- sliceSize; - } - if( !is.null(delimiter) ) { - fileDelimiter <<- delimiter; - } - stopifnot( (fileSkipColumns == 0) || (rowNamesColumn <= fileSkipColumns) ) - stopifnot( (fileSkipColumns == 0) || (rowNamesColumn >= 1) ) - - fid = file(description = filename, open = "rt", blocking = FALSE, raw = FALSE) - # clean object if file is open - Clear(); - lines = readLines(con = fid, n = max(fileSkipRows,1L), ok = TRUE, warn = TRUE) - line1 = tail(lines,1); - splt = strsplit(line1, split = fileDelimiter, fixed = TRUE); - if( fileSkipRows > 0L ) { - columnNames <<- splt[[1]]; # [ -(1:fileSkipColumns) ]; - } else { - seek(fid, 0) - } - - rm( lines, line1, splt ); - - rowNameSlices <<- vector("list", 15); - - curSliceId = 0L; - repeat - { - # preallocate names and data - if(length(rowNameSlices) < curSliceId) { - rowNameSlices[[2L*curSliceId]] <<- NULL; - } - curSliceId = curSliceId + 1L; - - # read sliceSize rows - rowtag = vector("character",fileSliceSize); - rowvals = vector("list",fileSliceSize); - for(i in 1:fileSliceSize) { - temp = ""; - if( fileSkipColumns > 0L ) { - temp = scan(file = fid, what = character(), n = fileSkipColumns, quiet = TRUE,sep = fileDelimiter); - } - rowtag[i] = temp[rowNamesColumn];#paste(temp,collapse=" "); - rowvals[[i]] = scan(file = fid, what = double(), nlines = 1, quiet = TRUE, sep = fileDelimiter, na.strings = fileOmitCharacters); - if( length(rowvals[[i]]) == 0L ) { - if(i==1L) { - rowtag = matrix(0, 0, 0); - rowvals = character(0); - } else { - rowtag = rowtag[ 1:(i-1) ]; - rowvals = rowvals[ 1:(i-1) ]; - } - break; - } - } - if( length(rowtag) == 0L ) { - curSliceId = curSliceId - 1L; - break; - } - rowNameSlices[[curSliceId]] <<- rowtag; - data = c(rowvals, recursive = TRUE); - dim(data) = c(length(rowvals[[1]]), length(rowvals)); - data = t(data); - setSlice(curSliceId, data); - if( length(rowtag) < fileSliceSize ) { - break; - } - numtxt = formatC(curSliceId*fileSliceSize, big.mark=",", format = "f", digits = 0) - cat( "Rows read: ", numtxt, "\n"); - flush.console() - } - close(fid) - if( fileSkipRows == 0 ) { - columnNames <<- paste("Col_", (1:nCols()), sep=""); - } else { - columnNames <<- tail(columnNames, ncol(getSliceRaw(1))); - } - if( fileSkipColumns == 0 ) { - cnt = 0L; - for( sl in 1:nSlices() ) { - nr = length(getSliceRaw(sl)); - rowNameSlices[[sl]] <<- paste("Row_",cnt + (1:nr),sep=""); - cnt = cnt + nr; - } - } - rowNameSlices <<- rowNameSlices[1:curSliceId]; - cat("Rows read: ", nRows(), " done.\n"); - return(invisible(.self)); - }, - SaveFile = function(filename) { - if( nSlices() == 0 ) { - cat("No data to save"); - return(); - } - fid = file(filename,"wt"); - for( sl in 1:nSlices() ) { - z = getSlice(sl); - rownames(z) = rowNameSlices[[sl]]; - colnames(z) = columnNames; - write.table(z, file = fid, sep = "\t", - col.names = (if(sl == 1){NA}else{FALSE})); - } - close(fid); - }, - nRows = function() { - s = 0L; - for(sl in .seq(1,nSlices())) { - s = s + nrow(getSliceRaw(sl)); - } - return( s ) - }, - nCols = function() { - if( nSlices() == 0L ) { - return(0L); - } else { - return( ncol(getSliceRaw(1L)) ) - } - }, - Clear = function() { - for( sl in .seq(1,nSlices()) ) { - rm(list = paste(sl), envir = dataEnv) - } - nSlices1 <<- 0L; - rowNameSlices <<- list(); - columnNames <<- character(); - return(invisible(.self)); - }, - IsCombined = function() { - return( nSlices() <= 1L ); - }, - GetAllRowNames = function() { - return( c(rowNameSlices, recursive=TRUE) ); - }, - SetNanRowMean = function() { - if( (nCols() == 0L) ) { - return(invisible(.self)); - } - for( sl in .seq(1,nSlices()) ) { - slice = getSlice(sl); - if( any(is.na(slice)) ) { - rowmean = rowMeans(slice, na.rm = TRUE); - rowmean[is.na(rowmean)] = 0L; - for( j in which(!complete.cases(slice)) ) { - where1 = is.na(slice[j, ]); - slice[j, where1] = rowmean[j]; - } - setSlice(sl, slice); - } - } - return(invisible(.self)); - }, - RowStandardizeCentered = function() { - for(sl in .seq(1,nSlices()) ) { - slice = getSlice(sl); - div = sqrt( rowSums(slice^2) ); - div[ div == 0 ] = 1; - setSlice(sl, slice/div); - } - return(invisible(.self)); - }, - CombineInOneSlice = function() { - if( nSlices() <= 1L ) { - return(invisible(.self)); - } - nc = nCols(); - nr = nRows(); - datatypes = c("raw","integer","double"); - datafuns = c(as.raw, as.integer, as.double); - datatype = character(nSlices()); - for(sl in 1:nSlices()) { - datatype[sl] = typeof(getSliceRaw(sl)); - } - mch = max(match(datatype,datatypes,nomatch = length(datatypes))); - datafun = datafuns[[mch]]; - newData = matrix(datafun(0), nrow = nr, ncol = nc); - offset = 0; - for(sl in 1:nSlices()) { - if(mch==1) { - slice = getSliceRaw(sl); - } else { - slice = getSlice(sl); - } - newData[ offset + (1:nrow(slice)),] = datafun(slice); - setSlice(sl, numeric()); - offset = offset + nrow(slice); - } - - nSlices1 <<- 1L; - setSliceRaw(1L, newData); - rm(newData); - - newrowNameSlices = GetAllRowNames(); - rowNameSlices <<- list(newrowNameSlices) - return(invisible(.self)); - }, - ResliceCombined = function(sliceSize = -1) { - if( sliceSize > 0L ) { - fileSliceSize <<- sliceSize; - } - if( fileSliceSize <= 0 ) { - fileSliceSize <<- 1000; - } - if( IsCombined() ) { - nRows1 = nRows(); - if(nRows1 == 0L) { - return(invisible(.self)); - } - newNSlices = floor( (nRows1 + fileSliceSize - 1)/fileSliceSize ); - oldData = getSliceRaw(1L); - #oldNames = rowNameSlices[[1]]; - newNameslices = vector("list",newNSlices) - for( sl in 1:newNSlices ) { - range = (1+(sl-1)*fileSliceSize) : (min(nRows1,sl*fileSliceSize)); - newpart = oldData[range, ,drop = FALSE]; - if( is.raw(oldData) ) { - setSliceRaw( sl, newpart); - } else { - setSlice( sl, newpart); - } - newNameslices[[sl]] = rowNameSlices[[1]][range]; - } - rowNameSlices <<- newNameslices ; - } else { - stop("Reslice of a sliced matrix is not supported yet. Use CombineInOneSlice first."); - } - return(invisible(.self)); - }, - Clone = function() { - clone = SlicedData$new(); - for(sl in .seq(1,nSlices()) ) { - clone$setSliceRaw(sl,getSliceRaw(sl)); - } - clone$rowNameSlices = rowNameSlices; - clone$columnNames = columnNames; - clone$fileDelimiter = fileDelimiter; - clone$fileSkipColumns = fileSkipColumns; - clone$fileSkipRows = fileSkipRows; - clone$fileSliceSize = fileSliceSize; - clone$fileOmitCharacters = fileOmitCharacters; - return( clone ); - }, - RowMatrixMultiply = function(multiplier) { - for(sl in .seq(1,nSlices()) ) { - setSlice(sl, getSlice(sl) %*% multiplier); - } - return(invisible(.self)); - }, - ColumnSubsample = function(subset) { - for(sl in .seq(1,nSlices()) ) { - setSliceRaw(sl, getSliceRaw(sl)[ ,subset, drop = FALSE]); - } - columnNames <<- columnNames[subset]; - return(invisible(.self)); - }, - RowReorderSimple = function(ordr) { - # had to use an inefficient and dirty method - # due to horrible memory management in R - if( (typeof(ordr) == "logical") && all(ordr) ) { - return(invisible(.self)); - } - if( (length(ordr) == nRows()) && all(ordr == (1:length(ordr))) ) { - return(invisible(.self)); - } - CombineInOneSlice(); - gc(); - setSliceRaw( 1L, getSliceRaw(1L)[ordr, ] ); - rowNameSlices[[1]] <<- rowNameSlices[[1]][ordr]; - gc(); - ResliceCombined(); - gc(); - return(invisible(.self)); - }, - RowReorder = function(ordr) { - # transform logical into indices - if( typeof(ordr) == "logical" ) { - if( length(ordr) == nRows() ) { - ordr = which(ordr); - } else { - stop("Parameter \"ordr\" has wrong length") - } - } - ## first, check that anything has to be done at all - if( (length(ordr) == nRows()) && all(ordr == (1:length(ordr))) ) { - return(invisible(.self)); - } - ## check bounds - #if( (min(ordr) < 1) || (max(ordr) > nRows()) ) { - # stop("Parameter \"ordr\" is out of bounds"); - #} - ## slice the data into individual rows - all_rows = vector("list", nSlices()) - for( i in 1:nSlices() ) { - slice = getSliceRaw(i) - all_rows[[i]] = split(slice, 1:nrow(slice)) - setSliceRaw(i,numeric()) - } - gc(); - all_rows = unlist(all_rows, recursive=FALSE, use.names = FALSE); - ## Reorder the rows - all_rows = all_rows[ordr]; - ## get row names - all_names = GetAllRowNames(); - ## erase the set - rowNameSlices <<- list(); - ## sort names - all_names = all_names[ordr]; - ## - ## Make slices back - nrows = length(all_rows); - nSlices1 <<- as.integer((nrows+fileSliceSize-1)/fileSliceSize); - ##cat(nrows, " ", nSlices1); - rowNameSlices1 = vector("list", nSlices1); - for( i in 1:nSlices1 ) { - fr = 1 + fileSliceSize*(i-1); - to = min( fileSliceSize*i, nrows); - - subset = all_rows[fr:to]; - types = unlist(lapply(subset,typeof)); - israw = (types == "raw") - if(!all(israw == israw[1])) { - # some raw and some are not - subset = lapply(subset, function(x){if(is.raw(x)){x=as.integer(x);x[x==255] = NA;return(x)}else{return(x)}}); - } - subset = unlist(subset); - dim(subset) = c( length(all_rows[[fr]]) , to - fr + 1) - #subset = matrix(subset, ncol = (to-fr+1)); - if(is.raw(subset)) { - setSliceRaw(i, t(subset)); - } else { - setSlice(i, t(subset)); - } - rowNameSlices1[[i]] = all_names[fr:to]; - all_rows[fr:to] = 0; - all_names[fr:to] = 0; - } - rowNameSlices <<- rowNameSlices1; - gc(); - return(invisible(.self)); - }, - RowRemoveZeroEps = function(){ - for(sl in .seq(1,nSlices()) ) { - slice = getSlice(sl); - amean = rowMeans(abs(slice)); - remove = (amean < .Machine$double.eps*nCols()); - if(any(remove)) { - rowNameSlices[[sl]] <<- rowNameSlices[[sl]][!remove]; - setSlice(sl, slice[!remove, , drop = FALSE]); - } - } - return(invisible(.self)); - }, - FindRow = function(rowname) { - for(sl in .seq(1,nSlices()) ) { - mch = match(rowname,rowNameSlices[[sl]], nomatch = 0); - if( mch > 0 ) - { - row = getSlice(sl)[mch[1], , drop=FALSE]; - rownames(row) = rowname; - colnames(row) = columnNames; - return( list(slice = sl, item = mch, row = row) ); - } - } - return( NULL ); - }, - show = function() { - cat("SlicedData object. For more information type: ?SlicedData\n"); - cat("Number of columns:", nCols(), "\n"); - cat("Number of rows:", nRows(), "\n"); - cat("Data is stored in", nSlices(), "slices\n"); - if(nCols()>0) { - z = getSlice(1L); - if(nrow(z)>0) { - z = z[1:min(nrow(z),10L), 1:min(ncol(z),10L), drop = FALSE]; - rownames(z) = rowNameSlices[[1]][1:nrow(z)]; - colnames(z) = columnNames[1:ncol(z)]; - cat("Top left corner of the first slice (up to 10x10):\n"); - methods:::show(z) - } - } - } - )) - -setGeneric("nrow") -setMethod("nrow", "SlicedData", function(x) { - return( x$nRows() ); - }) -setGeneric("NROW") -setMethod("NROW", "SlicedData", function(x) { - return( x$nRows() ); - }) -setGeneric("ncol") -setMethod("ncol", "SlicedData", function(x) { - return( x$nCols() ); - }) -setGeneric("NCOL") -setMethod("NCOL", "SlicedData", function(x) { - return( x$nCols() ); - }) -setGeneric("dim") -setMethod("dim", "SlicedData", function(x) { - return( c(x$nRows(),x$nCols()) ); - }) -setGeneric("colnames") -setMethod("colnames", "SlicedData", function(x) { - return( x$columnNames ); - }) -setGeneric("rownames") -setMethod("rownames", "SlicedData", function(x) { - return( x$GetAllRowNames() ); - }) -setMethod("[[", "SlicedData", function(x,i) { - return( x$getSlice(i) ); - }) -setGeneric("length") -setMethod("length", "SlicedData", function(x) { - return( x$nSlices() ); - }) -setMethod("[[<-", "SlicedData", function(x,i,value) { - x$setSlice(i, value); - return(x); -}) -summary.SlicedData = function(object, ...) { - z = c(nCols = object$nCols(), nRows = object$nRows(), nSlices = object$nSlices()); - return(z); -} - -##### setGeneric("summary") ##### -#setMethod("summary", "SlicedData", function(object, ...) { -# z = c(nCols = object$nCols(), nRows = object$nRows(), nSlices = object$nSlices()); -# return(z); -# }) -#setGeneric("show", standardGeneric("show")) -# setMethod("show", "SlicedData", function(object) { -# cat("SlicedData object. For more information type: ?SlicedData\n"); -# cat("Number of columns:", object$nCols(), "\n"); -# cat("Number of rows:", object$nRows(), "\n"); -# cat("Data is stored in", object$nSlices(), "slices\n"); -# if(object$nSlices()>0) { -# z = object$getSlice(1); -# if(nrow(z)>0) { -# z = z[1:min(nrow(z),10), 1:min(ncol(z),10), drop = FALSE]; -# rownames(z) = object$rowNameSlices[[1]][1:nrow(z)]; -# colnames(z) = object$columnNames[1:ncol(z)]; -# cat("Top left corner of the first slice (up to 10x10):\n"); -# show(z) -# } -# } -# }) - -setGeneric("as.matrix") -setMethod("as.matrix", "SlicedData", function(x) { - if(x$nSlices() == 0) { - return( matrix(0,0,0) ); - } - if(x$nSlices() > 1) { - copy = x$Clone(); - copy$CombineInOneSlice(); - } else { - copy = x; - } - mat = copy$getSlice(1L); - rownames(mat) = rownames(copy); - colnames(mat) = colnames(copy); - return( mat ); - }) -setGeneric("colnames<-") -setMethod("colnames<-", "SlicedData", function(x,value) { - stopifnot( class(value) == "character" ); - stopifnot( length(value) == x$nCols() ); - x$columnNames = value; - return(x); - }) -setGeneric("rownames<-") -setMethod("rownames<-", "SlicedData", function(x,value) { - stopifnot( class(value) == "character" ); - stopifnot( length(value) == x$nRows() ); - start = 1; - newNameSlices = vector("list", x$nSlices()); - for( i in .seq(1,x$nSlices()) ) { - nr = nrow(x$getSliceRaw(i)); - newNameSlices[[i]] = value[ start:(start+nr-1) ]; - start = start + nr; - } - x$rowNameSlices = newNameSlices; - return(x); - }) -setGeneric("rowSums") -setMethod("rowSums", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { - if(x$nSlices() == 0) { - return( numeric() ); - } - stopifnot( dims == 1 ); - thesum = vector("list", x$nSlices()); - for( i in 1:x$nSlices() ) { - thesum[[i]] = rowSums(x$getSlice(i), na.rm) - } - return(unlist(thesum, recursive = FALSE, use.names = FALSE)); - }) -setGeneric("rowMeans") -setMethod("rowMeans", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { - if(x$nSlices() == 0) { - return( numeric() ); - } - stopifnot( dims == 1 ); - thesum = vector("list", x$nSlices()); - for( i in 1:x$nSlices() ) { - thesum[[i]] = rowMeans(x$getSlice(i), na.rm) - } - return(unlist(thesum, recursive = FALSE, use.names = FALSE)); - }) -setGeneric("colSums") -setMethod("colSums", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { - if(x$nCols() == 0) { - return( numeric() ); - } - stopifnot( dims == 1 ); - thesum = 0; - for( i in .seq(1,x$nSlices()) ) { - thesum = thesum + colSums(x$getSlice(i), na.rm) - } - return(thesum); - }) -setGeneric("colMeans") -setMethod("colMeans", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { - if(x$nCols() == 0) { - return( numeric() ); - } - stopifnot( dims == 1 ); - thesum = 0; - thecounts = x$nRows(); - for( i in .seq(1,x$nSlices()) ) { - slice = x$getSlice(i); - thesum = thesum + colSums(slice, na.rm) - if( na.rm ) { - thecounts = thecounts - colSums(is.na(slice)) - } - } - return(thesum/thecounts); - }) - -.listBuilder <- setRefClass(".listBuilder", - fields = list( - dataEnv = "environment", - n = "numeric" - ), - methods = list( - initialize = function() { - dataEnv <<- new.env(hash = TRUE); - n <<- 0; -# cumlength <<- 0; - return(.self); - }, - add = function(x) { - if(length(x) > 0) { - n <<- n + 1; -# cumlength <<- cumlength + length(x); - assign(paste(n), x, dataEnv ); - } - return(.self); - }, - set = function(i,x) { - if(length(x) > 0) { - if(i>n) - n <<- i; - assign(paste(i), x, dataEnv ); - } - return(.self); - }, - get = function(i) { - return(base::get(paste(i),dataEnv)); - }, - list = function() { - if(n==0) return(list()); - result = vector('list',n); - for( i in 1:n) { - result[[i]] = .self$get(i); - } - return(result); - }, - unlist = function() { - return(base::unlist(.self$list(), recursive=FALSE, use.names = FALSE)); - }, - show = function() { - cat(".listBuilder object.\nIternal object in MatrixEQTL package.\n"); - cat("Number of elements:", object$n, "\n"); - } - )) - -.histogrammer <- setRefClass(".histogrammer", - fields = list( - pvbins1 = "numeric", - statbins1 = "numeric", - hist.count = "numeric" - ), - methods = list( - initialize = function (pvbins, statbins) { - if(length(pvbins)) { - ord = order(statbins); - pvbins1 <<- pvbins[ord]; - statbins1 <<- statbins[ord]; - hist.count <<- double(length(pvbins)-1); - } - return(.self); - }, - update = function(stats.for.hist) { - h = hist(stats.for.hist, breaks = statbins1, include.lowest = TRUE, right = TRUE, plot = FALSE)$counts; - hist.count <<- hist.count + h; - }, - getResults = function() { - if(!is.unsorted(pvbins1)) { - return(list(hist.bins = pvbins1 , hist.counts = hist.count )); - } else { - return(list(hist.bins = rev(pvbins1), hist.counts = rev(hist.count))); - } - } - )) - - -.minpvalue <- setRefClass(".minpvalue", - fields = list( - sdata = ".listBuilder", - gdata = ".listBuilder" - ), - methods = list( - initialize = function(snps, gene) { - sdata <<- .listBuilder$new(); - for( ss in 1:snps$nSlices() ) { - sdata$set( ss, double(nrow(snps$getSliceRaw(ss)))); - } - gdata <<- .listBuilder$new(); - for( gg in 1:gene$nSlices() ) { - gdata$set( gg, double(nrow(gene$getSliceRaw(gg)))); - } - return(.self); - }, - update = function(ss, gg, astat) { - gmax = gdata$get(gg) - z1 = max.col(astat,ties.method='first'); - z11 = astat[1:nrow(astat) + nrow(astat) * (z1 - 1)]; - gmax = pmax(gmax, z11); - gdata$set(gg, gmax); - - smax = sdata$get(ss) - z22 = apply(astat,2,max); - smax = pmax(smax, z22); - sdata$set(ss, smax); - return(.self); - }, - updatecis = function(ss, gg, select.cis, astat) { - if(length(astat)>0) - { - byrows = aggregate(x=astat, by=list(row=select.cis[,1]), FUN=max); - bycols = aggregate(x=astat, by=list(col=select.cis[,2]), FUN=max); - - gmax = gdata$get(gg); - gmax[byrows$row] = pmax(gmax[byrows$row], byrows$x) - gdata$set(gg, gmax); - - smax = sdata$get(ss) - smax[bycols$col] = pmax(smax[bycols$col], bycols$x) - sdata$set(ss, smax); - } - return(.self); - }, - getResults = function(snps, gene, pvfun) { - min.pv.snps = pvfun(sdata$unlist()); - names(min.pv.snps) = rownames(snps); - min.pv.gene = pvfun(gdata$unlist()); - names(min.pv.gene) = rownames(gene); - return(list(min.pv.snps = min.pv.snps, min.pv.gene = min.pv.gene)); - } - )) - -.OutputSaver_FRD <- setRefClass(".OutputSaver_FRD", - fields = list( - sdata = ".listBuilder", - gdata = ".listBuilder", - cdata = ".listBuilder", - bdata = ".listBuilder", - fid = "list", - testfun1 = "list", - pvfun1 = "list" - ), - methods = list( - initialize = function () { - sdata <<- .listBuilder$new(); - gdata <<- .listBuilder$new(); - cdata <<- .listBuilder$new(); - bdata <<- .listBuilder$new(); - fid <<- list(0); - testfun1 <<- list(0); - pvfun1 <<- list(0); - return(.self); - }, - start = function(filename, statistic_name, unused1, unused2, testfun, pvfun) { - testfun1 <<- list(testfun); - pvfun1 <<- list(pvfun); - if(length(filename) > 0) { - if(class(filename) == "character") { - fid <<- list(file(description = filename, open = "wt", blocking = FALSE, raw = FALSE), TRUE); - } else { - fid <<- list(filename, FALSE) - } - writeLines( paste("SNP\tgene\t",statistic_name,"\tp-value\tFDR", sep = ""), fid[[1]]); - } else { - fid <<- list(); - } - }, - update = function(spos, gpos, sta, beta = NULL) { - if(length(sta)>0) { - sdata$add(spos); - gdata$add(gpos); - cdata$add(sta ); - if(!is.null(beta )) - bdata$add(beta ); - } - return(.self); - }, - getResults = function( gene, snps, FDR_total_count) { - pvalues = NULL; - if(cdata$n > 0) { - tests = testfun1[[1]](cdata$unlist()); - cdata <<- .listBuilder$new(); - - pvalues = pvfun1[[1]](tests); - ord = order(pvalues); - - tests = tests[ord]; - pvalues = pvalues[ord]; - - FDR = pvalues * FDR_total_count / (1:length(pvalues)); - FDR[length(FDR)] = min(FDR[length(FDR)], 1); - FDR = rev(cummin(rev(FDR))); - - snps_names = snps$GetAllRowNames()[sdata$unlist()[ord]]; - sdata <<- .listBuilder$new(); - gene_names = gene$GetAllRowNames()[gdata$unlist()[ord]]; - gdata <<- .listBuilder$new(); - - beta = NULL; - if(bdata$n > 0) - beta = bdata$unlist()[ord]; - - if(length(fid)>0) { - step = 1000; ########### 100000 - for( part in 1:ceiling(length(FDR)/step) ) { - fr = (part-1)*step + 1; - to = min(part*step, length(FDR)); - dump = data.frame(snps_names[fr:to], - gene_names[fr:to], - if(is.null(beta)) tests[fr:to] else list(beta[fr:to],tests[fr:to]), - pvalues[fr:to], - FDR[fr:to], - row.names = NULL, - check.rows = FALSE, - check.names = FALSE, - stringsAsFactors = FALSE); - write.table(dump, file = fid[[1]], quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE); - } - } - } else { - cat("No significant associations were found.\n", file = if(length(fid)>0){fid[[1]]}else{""}); - } - if(length(fid)>0) { - if(fid[[2]]) { - close(fid[[1]]); - } - fid <<- list(); - } - - if(!is.null(pvalues)) { - eqtls = list( snps = snps_names, - gene = gene_names, - statistic = tests, - pvalue = pvalues, - FDR = FDR); - if(!is.null(beta)) - eqtls$beta = beta; - } else { - eqtls = list( snps = character(), - gene = character(), - beta = numeric(), - statistic = numeric(), - pvalue = numeric(), - FDR = numeric()); - } - return(list(eqtls = data.frame(eqtls))); - } - ) -) - - -.OutputSaver_direct <- setRefClass(".OutputSaver_direct", - fields = list( - gene_names = "character", - snps_names = "character", - fid = "list", - testfun1 = "list", - pvfun1 = "list" - ), - methods = list( - initialize = function() { - gene_names <<- character(0); - snps_names <<- character(0); - fid <<- list(0); - testfun1 <<- list(0); - pvfun1 <<- list(0); - return(.self); - }, - start = function(filename, statistic_name, snps, gene, testfun, pvfun) { - # I hope the program stops if it fails to open the file - if(class(filename) == "character") { - fid <<- list(file(description = filename, open = "wt", blocking = FALSE, raw = FALSE), TRUE); - } else { - fid <<- list(filename, FALSE) - } - writeLines(paste("SNP\tgene\t", statistic_name, "\tp-value", sep = ""), fid[[1]]); - gene_names <<- gene$GetAllRowNames(); - snps_names <<- snps$GetAllRowNames(); - testfun1 <<- list(testfun); - pvfun1 <<- list(pvfun); - }, - update = function(spos, gpos, sta, beta = NULL) { - if( length(sta) == 0 ) - return(); - sta = testfun1[[1]](sta); - lst = list(snps = snps_names[spos], gene = gene_names[gpos], beta = beta, statistic = sta, pvalue = pvfun1[[1]](sta)); - lst$beta = lst$beta; - - dump2 = data.frame(lst, row.names = NULL, check.rows = FALSE, check.names = FALSE, stringsAsFactors = FALSE); - write.table(dump2, file = fid[[1]], quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE); - }, - getResults = function(...) { - if(length(fid)>0) { - if(fid[[2]]) { - close(fid[[1]]); - } - fid <<- list(); - } - gene_names <<- character(0); - snps_names <<- character(0); - return(list()); - } - ) -) - -.my.pmin = function(x, val) { - # minimum "pmin" function that can handle empty array - if(length(x) == 0) { - return(x) - } else { - return(pmin.int(x,val)); - } -} - -.my.pmax = function(x, val) { - # minimum "pmin" function that can handle empty array - if(length(x) == 0) { - return(x) - } else { - return(pmax.int(x,val)); - } -} - -.pv.nz = function(x){return( .my.pmax(x,.Machine$double.xmin) )} - -.SetNanRowMean = function(x) { - if( any(is.na(x)) ) { - rowmean = rowMeans(x, na.rm = TRUE); - rowmean[ is.na(rowmean) ] = 0; - for( j in which(!complete.cases(x)) ) { - where1 = is.na( x[j, ] ); - x[j,where1] = rowmean[j]; - } - } - return(x); -} - -# .SNP_process_split_for_ANOVA = function(x) { -# # split into 2 dummy variables -# -# uniq = unique(c(x)); -# uniq = uniq[!is.na(uniq)]; -# -# if( length(uniq) > 3 ) { -# stop("More than three genotype categories is not handled by ANOVA"); -# } else if ( length(uniq) < 3 ) { -# uniq = c(uniq, min(uniq)-(1:(3-length(uniq)))); -# } -# -# freq = matrix(0, nrow(x), length(uniq)); -# for(i in 1:length(uniq)) { -# freq[ ,i] = rowSums(x==uniq[i], na.rm = TRUE); -# } -# -# md = apply(freq, 1, which.max); -# freq[ cbind(1:nrow(x),md) ] = -1; -# -# md = apply(freq, 1, which.max); # min(freq[cbind(1:nrow(slice),md)] - rowSums(select,na.rm = TRUE )) -# new_slice1 = (x == uniq[md]); -# new_slice1[is.na(new_slice1)] = 0; -# freq[ cbind(1:nrow(x),md) ] = -1; -# -# md = apply(freq,1,which.max); -# new_slice2 = (x == uniq[md]); -# new_slice2[ is.na(new_slice2) ] = 0; -# rez = vector("list", 2); -# rez[[1]] = new_slice1; -# rez[[2]] = new_slice2; -# return( rez ); -# } - -.SNP_process_split_for_ANOVA = function(x,n.groups) { - # split into 2 dummy variables (or more) - -# # Get the number of ANOVA groups -# n.groups = options('MatrixEQTL.ANOVA.categories')[[1]]; -# if( is.null(n.groups)) -# n.groups = 3; - - # Unique values in x (make sure it has length of n.groups); - uniq = unique(as.vector(x)); - uniq = uniq[!is.na(uniq)]; - if( length(uniq) > n.groups ) { - stop("More than declared number of genotype categories is detected by ANOVA"); - } else if ( length(uniq) < n.groups ) { - uniq = c(uniq, rep(min(uniq)-1, n.groups-length(uniq))); - } - - # Table of frequencies for each variable (row) - freq = matrix(0, nrow(x), n.groups); - for(i in 1:n.groups) { - freq[ ,i] = rowSums(x==uniq[i], na.rm = TRUE); - } - # remove NA's from x for convenience - x[is.na(x)] = min(uniq)-2; - - # Output list of matrices - rez = vector('list',n.groups-1); - - # Skip the most frequent value - md = apply(freq, 1, which.max); # most frequent value for each variable - freq[ cbind(1:nrow(x),md) ] = -1; - - # The rest form dumm - for(j in 1:(n.groups-1)){ - md = apply(freq, 1, which.max); - freq[ cbind(1:nrow(x),md) ] = -1; - rez[[j]] = (x == uniq[md]); - } - return( rez ); -} - -Matrix_eQTL_engine = function( - snps, - gene, - cvrt = SlicedData$new(), - output_file_name, - pvOutputThreshold = 1e-5, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - pvalue.hist = FALSE, - min.pv.by.genesnp = FALSE, - noFDRsaveMemory = FALSE) { - rez = Matrix_eQTL_main( - snps = snps, - gene = gene, - cvrt = cvrt, - output_file_name = output_file_name, - pvOutputThreshold = pvOutputThreshold, - useModel = useModel, - errorCovariance = errorCovariance, - verbose = verbose, - pvalue.hist = pvalue.hist, - min.pv.by.genesnp = min.pv.by.genesnp, - noFDRsaveMemory = noFDRsaveMemory); - return( rez ); -} - -Matrix_eQTL_main = function( - snps, - gene, - cvrt = SlicedData$new(), - output_file_name = "", - pvOutputThreshold = 1e-5, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - output_file_name.cis = "", - pvOutputThreshold.cis = 0, - snpspos = NULL, - genepos = NULL, - cisDist = 1e6, - pvalue.hist = FALSE, - min.pv.by.genesnp = FALSE, - noFDRsaveMemory = FALSE) { - ################################# Basic variable checks ################################# - { - # status("Performing basic checks of the input variables"); - stopifnot( "SlicedData" %in% class(gene) ); - stopifnot( "SlicedData" %in% class(snps) ); - stopifnot( "SlicedData" %in% class(cvrt) ); - - # Check dimensions - if( min(snps$nRows(),snps$nCols()) == 0 ) - stop("Empty genotype dataset"); - if( min(gene$nRows(),gene$nCols()) == 0 ) - stop("Empty expression dataset"); - if( snps$nCols() != gene$nCols() ) - stop("Different number of samples in the genotype and gene expression files"); - if( cvrt$nRows()>0 ) { - if( snps$nCols() != cvrt$nCols() ) - stop("Wrong number of samples in the matrix of covariates"); - } - - stopifnot( class(pvOutputThreshold) == "numeric" ); - stopifnot( length(pvOutputThreshold) == 1 ); - stopifnot( pvOutputThreshold >= 0 ); - stopifnot( pvOutputThreshold <= 1 ); - - stopifnot( class(noFDRsaveMemory) == "logical" ); - stopifnot( length(noFDRsaveMemory) == 1 ); - - if( pvOutputThreshold > 0 ) { - stopifnot( !((length(output_file_name) == 0) && noFDRsaveMemory) ) - stopifnot( length(output_file_name) <= 1 ); - if( length(output_file_name) == 1 ) { - stopifnot( class(output_file_name) %in% c("character","connection") ); - } - } - - stopifnot( class(pvOutputThreshold.cis) == "numeric" ); - stopifnot( length(pvOutputThreshold.cis) == 1 ); - stopifnot( pvOutputThreshold.cis >= 0 ); - stopifnot( pvOutputThreshold.cis <= 1 ); - stopifnot( !((pvOutputThreshold > 0) & (pvOutputThreshold.cis > 0) & (pvOutputThreshold > pvOutputThreshold.cis)) ); - stopifnot( (pvOutputThreshold > 0) | (pvOutputThreshold.cis > 0) ); - - stopifnot( class(useModel) == class(modelLINEAR) ); - stopifnot( length(useModel) == 1 ); - stopifnot( useModel %in% c(modelLINEAR, modelANOVA, modelLINEAR_CROSS) ); - if( useModel %in% c(modelLINEAR, modelLINEAR_CROSS) ) { - if( snps$nCols() <= cvrt$nRows() + 1 + 1) { - stop('The number of covariates exceeds the number of samples.\nLinear regression can not be fit.') - } - } - if( useModel == modelLINEAR_CROSS ) { - if( cvrt$nRows() == 0 ) { - stop( "Model \"modelLINEAR_CROSS\" requires at least one covariate" ); - } - } - if( useModel == modelANOVA ) { - n.anova.groups = getOption('MatrixEQTL.ANOVA.categories', 3); - stopifnot( n.anova.groups == floor(n.anova.groups) ); - stopifnot( n.anova.groups >= 3 ); -# stopifnot( n.anova.groups < snps$nCols() - cvrt$nRows() - 2 ); - if( snps$nCols() <= cvrt$nRows() + n.anova.groups) { - stop('The number of covariates exceeds the number of samples.\nLinear regression (ANOVA) can not be fit.') - } - } - - stopifnot( class(verbose) == "logical" ); - stopifnot( length(verbose) == 1 ); - - stopifnot( class(min.pv.by.genesnp) == "logical" ); - stopifnot( length(min.pv.by.genesnp) == 1 ); - - if( pvOutputThreshold.cis > 0 ) { - stopifnot( !((length(output_file_name.cis) == 0) && noFDRsaveMemory) ) - stopifnot( length(output_file_name.cis) <= 1 ); - if( length(output_file_name.cis) == 1 ) { - stopifnot( class(output_file_name.cis) %in% c("character","connection") ); - } - -# stopifnot( class(output_file_name.cis) == "character" ); -# stopifnot( length(output_file_name.cis) == 1 ); - stopifnot( class(snpspos) == "data.frame" ); - stopifnot( ncol(snpspos) == 3 ); - stopifnot( nrow(snpspos) > 0 ); - stopifnot( class(snpspos[1,3]) %in% c("integer", "numeric") ) - stopifnot( !any(is.na(snpspos[,3])) ) - stopifnot( class(genepos) == "data.frame" ); - stopifnot( ncol(genepos) == 4 ); - stopifnot( nrow(genepos) > 0 ); - stopifnot( class(genepos[1,3]) %in% c("integer", "numeric") ) - stopifnot( class(genepos[1,4]) %in% c("integer", "numeric") ) - stopifnot( !any(is.na(genepos[,3])) ) - stopifnot( !any(is.na(genepos[,4])) ) - stopifnot( nzchar(output_file_name.cis) ) - } - - if( pvOutputThreshold > 0 ) { - stopifnot( nzchar(output_file_name) ) - } - - stopifnot( class(errorCovariance) %in% c("numeric", "matrix") ); - errorCovariance = as.matrix(errorCovariance); - if(length(errorCovariance)>0) { - if( nrow(errorCovariance) != ncol(errorCovariance) ) { - stop("The covariance matrix is not square"); - } - if( nrow(errorCovariance) != snps$nCols() ) { - stop("The covariance matrix size does not match the number of samples"); - } - if( !all(errorCovariance == t(errorCovariance)) ) { - stop("The covariance matrix is not symmetric"); - } - } - } - ################################# Initial setup ######################################### - { - gene.std = .listBuilder$new(); - snps.std = .listBuilder$new(); - - dont.clone.gene = getOption('MatrixEQTL.dont.preserve.gene.object', FALSE) - if(is.null(dont.clone.gene)) - dont.clone.gene = FALSE; - - if( !dont.clone.gene ) - gene = gene$Clone(); - # snps = snps$Clone(); # snps is read only - cvrt = cvrt$Clone(); - - params = list( - output_file_name = output_file_name, - pvOutputThreshold = pvOutputThreshold, - useModel = useModel, - errorCovariance = errorCovariance , - verbose = verbose, - output_file_name.cis = output_file_name.cis, - pvOutputThreshold.cis = pvOutputThreshold.cis, - cisDist = cisDist , - pvalue.hist = pvalue.hist, - min.pv.by.genesnp = min.pv.by.genesnp); - - if( verbose ) { - lastTime = 0; - status <- function(text) { - # gc(); - newTime = proc.time()[3]; - if(lastTime != 0) { - cat("Task finished in ", newTime-lastTime, " seconds\n"); - } - cat(text,"\n"); - lastTime <<- newTime; - unused = flush.console(); - } - } else { - status = function(text){} - } - start.time = proc.time()[3]; - } - ################################# Error covariance matrix processing #################### - { - if( length(errorCovariance) > 0 ) { - status("Processing the error covariance matrix"); - eig = eigen(errorCovariance, symmetric = TRUE) - d = eig$values; - v = eig$vectors; - # errorCovariance == v %*% diag(d) %*% t(v) - # errorCovariance^0.5 == v*sqrt(d)*v" (no single quotes anymore) - # errorCovariance^(-0.5) == v*diag(1./sqrt(diag(d)))*v" - if( any(d<=0) ) { - stop("The covariance matrix is not positive definite"); - } - correctionMatrix = v %*% diag(1./sqrt(d)) %*% t(v); - rm( eig, v, d, errorCovariance ) - } else { - rm( errorCovariance ); - correctionMatrix = numeric(); - } - } - ################################# Matching gene and SNPs locations ###################### - if( pvOutputThreshold.cis > 0 ) { - status("Matching data files and location files") - - # names in the input data - gene_names = gene$GetAllRowNames(); - snps_names = snps$GetAllRowNames(); - - # gene range, set: left genepos[,4])) { - temp3 = genepos[,3]; - temp4 = genepos[,4]; - genepos[,3] = pmin(temp3,temp4); - genepos[,4] = pmax(temp3,temp4); - rm(temp3, temp4); - } - - # match with the location data - genematch = match( gene_names, genepos[ ,1], nomatch = 0L); - usedgene = matrix(FALSE, nrow(genepos), 1); # genes in "genepos" that are matching "gene_names" - usedgene[ genematch ] = TRUE; - if( !any(genematch) ) { - stop("Gene names do not match those in the gene location file."); - } - cat( sum(genematch>0), "of", length(gene_names), " genes matched\n"); - - - snpsmatch = match( snps_names, snpspos[ ,1], nomatch = 0L); - usedsnps = matrix(FALSE, nrow(snpspos),1); - usedsnps[ snpsmatch ] = TRUE; - if( !any(snpsmatch) ) { - stop("SNP names do not match those in the SNP location file."); - } - cat( sum(snpsmatch>0), "of", length(snps_names), " SNPs matched\n"); - - # list used chr names - chrNames = unique(c( as.character(unique(snpspos[usedsnps,2])), - as.character(unique(genepos[usedgene,2])) )) - chrNames = chrNames[ sort.list( suppressWarnings(as.integer(chrNames)), - method = "radix", na.last = TRUE ) ]; - # match chr names - genechr = match(genepos[,2],chrNames); - snpschr = match(snpspos[,2],chrNames); - - # max length of a chromosome - chrMax = max( snpspos[usedsnps, 3], genepos[usedgene, 4], na.rm = TRUE) + cisDist; - - # Single number location for all rows in "genepos" and "snpspos" - genepos2 = as.matrix(genepos[ ,3:4, drop = FALSE] + (genechr-1)*chrMax); - snpspos2 = as.matrix(snpspos[ ,3 , drop = FALSE] + (snpschr-1)*chrMax); - - # the final location arrays; - snps_pos = matrix(0,length(snps_names),1); - snps_pos[snpsmatch>0, ] = snpspos2[snpsmatch, , drop = FALSE]; - snps_pos[rowSums(is.na(snps_pos))>0, ] = 0; - snps_pos[snps_pos==0] = (length(chrNames)+1) * (chrMax+cisDist); - rm(snps_names, snpsmatch, usedsnps, snpschr, snpspos2) - - gene_pos = matrix(0,length(gene_names),2); - gene_pos[genematch>0, ] = genepos2[genematch, , drop = FALSE]; - gene_pos[rowSums(is.na(gene_pos))>0, ] = 0; - gene_pos[gene_pos==0] = (length(chrNames)+2) * (chrMax+cisDist); - rm(gene_names, genematch, usedgene, genechr, genepos2) - rm(chrNames, chrMax); - - if( is.unsorted(snps_pos) ) { - status("Reordering SNPs\n"); - ordr = sort.list(snps_pos); - snps$RowReorder(ordr); - snps_pos = snps_pos[ordr, , drop = FALSE]; - rm(ordr); - } - if( is.unsorted(rowSums(gene_pos)) ) { - status("Reordering genes\n"); - ordr = sort.list(rowSums(gene_pos)); - gene$RowReorder(ordr); - gene_pos = gene_pos[ordr, , drop = FALSE]; - rm(ordr); - } - - # Slice it back. - geneloc = vector("list", gene$nSlices()) - gene_offset = 0; - for(gc in 1:gene$nSlices()) { - nr = length(gene$rowNameSlices[[gc]]); - geneloc[[gc]] = gene_pos[gene_offset + (1:nr), , drop = FALSE]; - gene_offset = gene_offset + nr; - } - rm(gc, gene_offset, gene_pos); - - snpsloc = vector("list", snps$nSlices()) - snps_offset = 0; - for(sc in 1:snps$nSlices()) { - nr = length(snps$rowNameSlices[[sc]]); - snpsloc[[sc]] = snps_pos[snps_offset + (1:nr), , drop = FALSE]; - snps_offset = snps_offset + nr; - } - rm(nr, sc, snps_offset, snps_pos); - } - ################################# Covariates processing ################################# - { - status("Processing covariates"); - if( useModel == modelLINEAR_CROSS ) { - last.covariate = as.vector(tail( cvrt$getSlice(cvrt$nSlices()), n = 1)); - } - if( cvrt$nRows()>0 ) { - cvrt$SetNanRowMean(); - cvrt$CombineInOneSlice(); - cvrt = rbind(matrix(1,1,snps$nCols()),cvrt$getSlice(1)); - } else { - cvrt = matrix(1,1,snps$nCols()); - } - # Correct for the error covariance structure - if( length(correctionMatrix)>0 ) { - cvrt = cvrt %*% correctionMatrix; - } - # Orthonormalize covariates - # status("Orthonormalizing covariates"); - q = qr(t(cvrt)); - if( min(abs(diag(qr.R(q)))) < .Machine$double.eps * snps$nCols() ) { - stop("Colinear or zero covariates detected"); - } - cvrt = t( qr.Q(q) ); - rm( q ); - } - ################################# Gene expression processing ############################ - { - status("Processing gene expression data (imputation, residualization, etc.)"); - # Impute gene expression - gene$SetNanRowMean(); - # Correct for the error covariance structure - if( length(correctionMatrix)>0 ) { - gene$RowMatrixMultiply(correctionMatrix); - } - # Orthogonolize expression w.r.t. covariates - # status("Orthogonolizing expression w.r.t. covariates"); - gene_offsets = double(gene$nSlices()+1); - for( sl in 1:gene$nSlices() ) { - slice = gene$getSlice(sl); - gene_offsets[sl+1] = gene_offsets[sl] + nrow(slice); - rowsq1 = rowSums(slice^2); - slice = slice - tcrossprod(slice,cvrt) %*% cvrt; - rowsq2 = rowSums(slice^2); - # kill rows colinear with the covariates - delete.rows = (rowsq2 <= rowsq1 * .Machine$double.eps ); - slice[delete.rows] = 0; - div = sqrt( rowSums(slice^2) ); - div[ div == 0 ] = 1; - gene.std$set(sl, div); - gene$setSlice(sl, slice / div); - } - rm(rowsq1, rowsq2, delete.rows, div); - rm( sl, slice ); - #gene$RowRemoveZeroEps(); - } - ################################# snps_process, testfun, pvfun, threshfun, afun ######## - { - # snps_process - preprocess SNPs slice - # - # afun --- abs for signed stats, identity for non-negative - # threshfun --- internal stat threshold for given p-value - # testfun --- t or F statistic from the internal one - # pvfun --- p-value from the t or F statistic - - nSamples = snps$nCols(); - nGenes = gene$nRows(); - nSnps = snps$nRows(); - nCov = nrow(cvrt); - # nVarTested = length(snps_list); # set in case(useModel) - # dfNull = nSamples - nCov; - # d.f. of the full model - betafun = NULL; - - if( useModel == modelLINEAR ) { - snps_process = function(x) { - return( list(.SetNanRowMean(x)) ); - }; - nVarTested = 1; - dfFull = nSamples - nCov - nVarTested; - statistic.fun = function(mat_list) { - return( mat_list[[1]] ); - } - afun = function(x) {return(abs(x))}; - threshfun = function(pv) { - thr = qt(pv/2, dfFull, lower.tail = FALSE); - thr = thr^2; - thr = sqrt( thr / (dfFull + thr) ); - thr[pv >= 1] = 0; - thr[pv <= 0] = 1; - return( thr ); - } - testfun = function(x) { return( x * sqrt( dfFull / (1 - .my.pmin(x^2,1)))); } - pvfun = function(x) { return( .pv.nz(pt(-abs(x),dfFull)*2)); } - thresh.cis = threshfun(pvOutputThreshold.cis); - thresh = threshfun(pvOutputThreshold); - betafun = function(stat, ss, gg, select) { - return(stat * gene.std$get(gg)[select[,1]] / snps.std$get(ss)[select[,2]]); - } - } else - if( useModel == modelANOVA ) { - snps_process = function(x).SNP_process_split_for_ANOVA(x,n.anova.groups); - nVarTested = n.anova.groups - 1; - dfFull = nSamples - nCov - nVarTested; -# statistic.fun = function(mat_list) { -# return( mat_list[[1]]^2 + mat_list[[2]]^2 ); -# } - statistic.fun = function(mat_list) { - x = mat_list[[1]]^2; - for( j in 2:length(mat_list) ) - x = x + mat_list[[j]]^2; - return( x ); - } - afun = identity; - threshfun = function(pv) { - thr = qf(pv, nVarTested, dfFull, lower.tail = FALSE); - thr = thr / (dfFull/nVarTested + thr); - thr[pv >= 1] = 0; - thr[pv <= 0] = 1; - return( thr ); - } - testfun = function(x) { return( x / (1 - .my.pmin(x,1)) * (dfFull/nVarTested) ); } - pvfun = function(x) { return( .pv.nz(pf(x, nVarTested, dfFull, lower.tail = FALSE)) ); } - thresh.cis = threshfun(pvOutputThreshold.cis); - thresh = threshfun(pvOutputThreshold); - } else - if( useModel == modelLINEAR_CROSS ) { - last.covariate = as.vector( last.covariate ); - snps_process = .SNP_process_split_for_LINEAR_CROSS = function(x) { - out = vector("list", 2); - out[[1]] = .SetNanRowMean(x); - out[[2]] = t( t(out[[1]]) * last.covariate ); - return( out ); - }; - nVarTested = 1; - dfFull = nSamples - nCov - nVarTested - 1; - statistic.fun = function(mat_list) { - return( mat_list[[2]] / sqrt(1 - mat_list[[1]]^2) ); - } - afun = function(x) {return(abs(x))}; - threshfun = function(pv) { - thr = qt(pv/2, dfFull, lower.tail = FALSE); - thr = thr^2; - thr = sqrt( thr / (dfFull + thr) ); - thr[pv >= 1] = 0; - thr[pv <= 0] = 1; - return( thr ); - } - testfun = function(x) { return( x * sqrt( dfFull / (1 - .my.pmin(x^2,1)))); } - pvfun = function(x) { return( .pv.nz(pt(-abs(x),dfFull)*2 )); } - thresh.cis = threshfun(pvOutputThreshold.cis); - thresh = threshfun(pvOutputThreshold); - betafun = function(stat, ss, gg, select) { - return(stat * gene.std$get(gg)[select[,1]] / snps.std$get(ss)[select[,2]]); - } - } - params$dfFull = dfFull; - } - ################################# Saver class(es) creation ############################## - { - status("Creating output file(s)"); - if(noFDRsaveMemory) { - if( pvOutputThreshold > 0 ) { - saver.tra = .OutputSaver_direct$new(); - } - if( pvOutputThreshold.cis > 0 ) { - saver.cis = .OutputSaver_direct$new(); - } - } else { - if( pvOutputThreshold > 0 ) { - saver.tra = .OutputSaver_FRD$new(); - } - if( pvOutputThreshold.cis > 0 ) { - saver.cis = .OutputSaver_FRD$new(); - } - } - if( pvOutputThreshold > 0 ) - if( pvOutputThreshold * gene$nRows() * snps$nRows() > 1000000 ) - if(!noFDRsaveMemory) - cat('Warning: pvOutputThreshold may be too large.\nExpected number of findings > ', - pvOutputThreshold * gene$nRows() * snps$nRows(),'\n'); - if( (useModel == modelLINEAR) || (useModel == modelLINEAR_CROSS) ) { - statistic_name = "t-stat"; - } else if( useModel == modelANOVA ) { - statistic_name = "F-test"; - } - if(!is.null(betafun)) - statistic_name = paste('beta\t',statistic_name, sep=''); - if( pvOutputThreshold > 0 ) - saver.tra$start(output_file_name, statistic_name, snps, gene, testfun, pvfun); - if( pvOutputThreshold.cis > 0 ) - saver.cis$start(output_file_name.cis, statistic_name, snps, gene, testfun, pvfun); - rm( statistic_name ); - } - ################################# Some useful functions ################################# - { - orthonormalize.snps = function(cursnps, ss) { - for(p in 1:length(cursnps)) { - if(length(correctionMatrix)>0) { - cursnps[[p]] = cursnps[[p]] %*% correctionMatrix; - } - cursnps[[p]] = cursnps[[p]] - tcrossprod(cursnps[[p]],cvrt) %*% cvrt; - for(w in .seq(1,p-1L)) - cursnps[[p]] = cursnps[[p]] - rowSums(cursnps[[p]]*cursnps[[w]]) * cursnps[[w]]; - div = sqrt( rowSums(cursnps[[p]]^2) ); - div[ div == 0 ] = 1; - cursnps[[p]] = cursnps[[p]]/div; - } - snps.std$set(ss, div); - return(cursnps); - } -# if( pvOutputThreshold.cis > 0 ) { -# is.cis.pair = function(gg,ss) { -# return(!( ( snpsloc[[ss]][1, 1] - tail( geneloc[[gg]][ , 2], n = 1L) > cisDist) | -# ( geneloc[[gg]][1, 1] - tail( snpsloc[[ss]] , n = 1L) > cisDist) ) ); -# } -# } - if( pvOutputThreshold.cis > 0 ) { -# sn.l = sapply(snpsloc, function(x)x[1] ); -# sn.r = sapply(snpsloc, function(x)tail(x,1) ); -# ge.l = sapply(geneloc, function(x)x[1,1] ); -# ge.r = sapply(geneloc, function(x)x[nrow(x) , 2] ); - sn.l = sapply(snpsloc, '[', 1 ); - sn.r = sapply(snpsloc, tail, 1 ); - ge.l = sapply(geneloc, '[', 1, 1 ); - ge.r = sapply( lapply(geneloc, tail.matrix, 1 ), '[', 2); - gg.1 = findInterval( sn.l , ge.r + cisDist +1) + 1; -# cat(gg.1,'\n') - gg.2 = findInterval( sn.r , ge.l - cisDist ); -# cat(gg.2,'\n') - rm(sn.l, sn.r, ge.l, ge.r); - } - - } - ################################# Prepare counters and histogram bins ################### - { - pvbins = NULL; # bin edges for p-values - statbins = 0; # bin edges for the test statistic (|t| or F) - do.hist = FALSE; - if( length(pvalue.hist) == 1 ) { - if(pvalue.hist == "qqplot") { - pvbins = c(0, 10^rev(seq(0, log10(.Machine$double.xmin)-1, -0.05))); - } else - if( is.numeric(pvalue.hist) ) { - pvbins = seq(from = 0, to = 1, length.out = pvalue.hist+1); - } else - if( pvalue.hist == TRUE ) { - pvbins = seq(from = 0, to = 1, length.out = 100+1); - } - } else - if( is.numeric(pvalue.hist) && (length(pvalue.hist) > 1) ) { - pvbins = pvalue.hist; - } - if( is.null(pvbins) && (pvalue.hist != FALSE) ) { - stop("Wrong value of pvalue.hist. Must be FALSE, TRUE, \"qqplot\", or numerical"); - } - do.hist = !is.null(pvbins); - if( do.hist ) { - pvbins = sort(pvbins); - statbins = threshfun(pvbins); - if( pvOutputThreshold > 0) { - hist.all = .histogrammer$new(pvbins, statbins); - } - if( pvOutputThreshold.cis > 0) { - hist.cis = .histogrammer$new(pvbins, statbins); - } - } - rm( pvbins, statbins); - if(min.pv.by.genesnp) { - if( pvOutputThreshold > 0) { - minpv.tra = .minpvalue$new(snps,gene); - } - if( pvOutputThreshold.cis > 0) { - minpv.cis = .minpvalue$new(snps,gene); - } - } - } - ################################# Main loop ############################################# - { - beta = NULL; - n.tests.all = 0; - n.tests.cis = 0; - n.eqtls.tra = 0; - n.eqtls.cis = 0; - - status("Performing eQTL analysis"); - # ss = 1; gg = 1; - # ss = snps$nSlices(); gg = gene$nSlices(); - - snps_offset = 0; - for(ss in 1:snps$nSlices()) { -# for(ss in 1:min(2,snps$nSlices())) { #for debug - cursnps = NULL; - nrcs = nrow(snps$getSliceRaw(ss)); - - # loop only through the useful stuff - for(gg in if(pvOutputThreshold>0){1:gene$nSlices()}else{.seq(gg.1[ss],gg.2[ss])} ) { - gene_offset = gene_offsets[gg]; - curgene = gene$getSlice(gg); - nrcg = nrow(curgene); - if(nrcg == 0) next; - - rp = ""; - - statistic = NULL; - select.cis.raw = NULL; - ## do cis analysis -# if( (pvOutputThreshold.cis > 0) && ( is.cis.pair(gg, ss) ) ) { - if( (pvOutputThreshold.cis > 0) && (gg >= gg.1[ss]) && (gg <= gg.2[ss]) ) { - - if( is.null( statistic ) ) { - if( is.null( cursnps ) ) { - cursnps = orthonormalize.snps( snps_process( snps$getSlice(ss) ), ss ); - } - mat = vector("list", length(cursnps)); - for(d in 1:length(cursnps)) { - mat[[d]] = tcrossprod(curgene, cursnps[[d]]); - } - statistic = statistic.fun( mat ); - astatistic = afun(statistic); -# rm(mat); - } - -# sn.l = findInterval(geneloc[[gg]][ ,1] - cisDist-1 +1 , snpsloc[[ss]]); -# sn.r = findInterval(geneloc[[gg]][ ,2] + cisDist -1 , snpsloc[[ss]]); - sn.l = findInterval(geneloc[[gg]][ ,1] - cisDist-1, snpsloc[[ss]]); - sn.r = findInterval(geneloc[[gg]][ ,2] + cisDist, snpsloc[[ss]]); - xx = unlist(lapply(which(sn.r>sn.l),FUN=function(x){(sn.l[x]:(sn.r[x]-1))*nrow(statistic)+x})) - select.cis.raw = xx[ astatistic[xx] >= thresh.cis ]; - select.cis = arrayInd(select.cis.raw, dim(statistic)) - - n.tests.cis = n.tests.cis + length(xx); - n.eqtls.cis = n.eqtls.cis + length(select.cis.raw); - - if( do.hist ) - hist.cis$update(astatistic[xx]); - - if( min.pv.by.genesnp ) { - # minpv.cis$updatecis(ss, gg, arrayInd(xx, dim(statistic)), astatistic[xx]) - temp = double(length(astatistic)); - dim(temp) = dim(astatistic); - temp[xx] = astatistic[xx]; - minpv.cis$update(ss, gg, temp); - } - - if(!is.null(betafun)) - beta = betafun(mat[[length(mat)]][select.cis.raw], ss, gg, select.cis); - - saver.cis$update( snps_offset + select.cis[ , 2], - gene_offset + select.cis[ , 1], - statistic[select.cis.raw], - beta); - - # statistic.select.cis = statistic[ select.cis ]; - # test = testfun( statistic.select.cis ); - # pv = pvfun(test); - # Saver.cis$WriteBlock( cbind(snps_offset + select.cis[ , 2], gene_offset + select.cis[ , 1], test, pv) ); - # counter.cis$Update(gg, ss, select.cis, pv, n.tests = length(xx), if(do.hist) afun(statistic[xx]) ) - rp = paste(rp, ", ", formatC(n.eqtls.cis, big.mark=",", format = "f", digits = 0), " cis-eQTLs", sep = ""); - } - ## do trans/all analysis - if(pvOutputThreshold>0) { - if( is.null( statistic ) ) { - if( is.null( cursnps ) ) { - cursnps = orthonormalize.snps( snps_process( snps$getSlice(ss) ), ss ); - } - mat = vector("list", length(cursnps)); - for(d in 1:length(cursnps)) { - mat[[d]] = tcrossprod(curgene, cursnps[[d]]); - } - statistic = statistic.fun( mat ); - astatistic = afun(statistic); -# rm(mat); - } - - if( do.hist ) - hist.all$update(astatistic); - - if(!is.null(select.cis.raw)) - astatistic[xx] = -1; - # select.tra.raw = select.tra.raw[!(select.tra.raw %in% select.cis.raw)]; - - select.tra.raw = which( astatistic >= thresh); - select.tra = arrayInd(select.tra.raw, dim(statistic)) - - n.eqtls.tra = n.eqtls.tra + length(select.tra.raw); - n.tests.all = n.tests.all + length(statistic); - - if(!is.null(betafun)) - beta = betafun(mat[[length(mat)]][select.tra.raw], ss, gg, select.tra); - - saver.tra$update( snps_offset + select.tra[ , 2], - gene_offset + select.tra[ , 1], - statistic[select.tra.raw], - beta); - - if( min.pv.by.genesnp ) - minpv.tra$update(ss, gg, astatistic) - - # statistic.select.tra = statistic[ select.tra ]; - # test = testfun( statistic.select.tra ); - # pv = pvfun( test ); - # Saver$WriteBlock( cbind( snps_offset + select.tra[ , 2], gene_offset + select.tra[ , 1], test, pv) ); - # counter$Update(gg, ss, select.tra, pv, n.tests = nrcs*nrcg, if(do.hist) afun(statistic) ) - rp = paste(rp, ", ", formatC(n.eqtls.tra, big.mark=",", format = "f", digits = 0), if(pvOutputThreshold.cis > 0)" trans-"else" ","eQTLs", sep = "") - } - - #gene_offset = gene_offset + nrcg; - if( !is.null(statistic) ) { - per = 100*(gg/gene$nSlices() + ss-1) / snps$nSlices(); - cat( formatC(floor(per*100)/100, format = "f", width = 5, digits = 2), "% done" , rp, "\n", sep = ""); - flush.console(); - } - } # gg in 1:gene$nSlices() - snps_offset = snps_offset + nrow(snps$getSliceRaw(ss)); - } # ss in 1:snps$nSlices() - } - ################################# Results collection #################################### - { - rez = list(time.in.sec = proc.time()[3] - start.time); - rez$param = params; - - if(pvOutputThreshold.cis > 0) { - rez.cis = list(ntests = n.tests.cis, neqtls = n.eqtls.cis); - rez.cis = c(rez.cis, saver.cis$getResults( gene, snps, n.tests.cis) ); - if(do.hist) - rez.cis = c(rez.cis, hist.cis$getResults() ); - if(min.pv.by.genesnp) - rez.cis = c(rez.cis, minpv.cis$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); - } - - if(pvOutputThreshold>0) { - rez.all = list(ntests = n.tests.all, neqtls = n.eqtls.tra + n.eqtls.cis); - if(pvOutputThreshold.cis > 0) { - rez.tra = list(ntests = n.tests.all - n.tests.cis, neqtls = n.eqtls.tra); - rez.tra = c(rez.tra, saver.tra$getResults( gene, snps, n.tests.all - n.tests.cis) ); - } else { - rez.all = c(rez.all, saver.tra$getResults( gene, snps, n.tests.all ) ); - } - if(do.hist) { - rez.all = c(rez.all, hist.all$getResults() ); - if(pvOutputThreshold.cis > 0) { - rez.tra$hist.bins = rez.all$hist.bins; - rez.tra$hist.counts = rez.all$hist.counts - rez.cis$hist.counts; - } - } - if(min.pv.by.genesnp) { - if(pvOutputThreshold.cis > 0) { - rez.tra = c(rez.tra, minpv.tra$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); - } else { - rez.all = c(rez.all, minpv.tra$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); - } - } - } - - if(exists('rez.all')>0) - rez$all = rez.all; - if(exists('rez.tra')>0) - rez$trans = rez.tra; - if(exists('rez.cis')>0) - rez$cis = rez.cis; - - class(rez) = c(class(rez),"MatrixEQTL"); - status(""); - } -# cat('s std ',snps.std$get(1),'\n'); -# cat('g std ',gene.std$get(1),'\n'); - ################################# Results collection #################################### - return(rez); -} - -.histme = function(m, name1, name2, ...) { - cnts = m$hist.counts; - bins = m$hist.bins; - ntst = m$ntests; - centers = 0.5 * (bins[-1L] + bins[-length(bins)]); - density = 0.5 / (bins[-1L] - centers) * cnts / ntst; - ntext = paste("P-value distribution for ", name1, formatC(ntst, big.mark=",", format = "f", digits = 0), name2, " gene-SNP pairs ",sep=""); - r = structure(list(breaks = bins, counts = cnts, density = density, - mids = centers, equidist = FALSE), class = "histogram"); - plot(r, main = ntext, ylab = "Density", xlab = "P-values", ...) - abline( h = 1, col = "blue"); - return(invisible()); -} - -.qqme = function(m, lcol, cex, pch, ...) { - cnts = m$hist.counts; - bins = m$hist.bins; - ntst = m$ntests; - - cusu = cumsum(cnts) / ntst; - ypos = bins[-1][is.finite(cusu)]; - xpos = cusu[is.finite(cusu)]; - lines(-log10(xpos), -log10(ypos), col = lcol, ...); -# lines(xpos, ypos, col = lcol, ...); - if(length(m$eqtls$pvalue)==0) - return(); - ypvs = -log10(m$eqtls$pvalue); - xpvs = -log10(1:length(ypvs) / ntst); - if(length(ypvs) > 1000) { - # need to filter a bit, make the plotting faster - levels = as.integer( xpvs/xpvs[1] * 1e3); - keep = c(TRUE, diff(levels)!=0); - levels = as.integer( ypvs/ypvs[1] * 1e3); - keep = keep | c(TRUE, diff(levels)!=0); - ypvs = ypvs[keep]; - xpvs = xpvs[keep]; - rm(keep) - } - points(xpvs, ypvs, col = lcol, pch = pch, cex = cex, ...); -} - -plot.MatrixEQTL = function(x, cex = 0.5, pch = 19, xlim = NULL, ylim = NULL, ...) { - if( x$param$pvalue.hist == FALSE ) { - warning("Cannot plot p-value distribution: the information was not recorded.\nUse pvalue.hist!=FALSE."); - return(invisible()); - } - if( x$param$pvalue.hist == "qqplot" ) { - xmin = 1/max(x$cis$ntests, x$all$ntests); - ymax = NULL; - if(!is.null(ylim)) { - ymax = ylim[2]; - } else { - ymax = -log10(min( - x$cis$eqtls$pvalue[1], x$cis$hist.bins[ c(FALSE,x$cis$hist.counts>0)][1], - x$all$eqtls$pvalue[1], x$all$hist.bins[ c(FALSE,x$all$hist.counts>0)][1], - x$trans$eqtls$pvalue[1], x$trans$hist.bins[c(FALSE,x$trans$hist.counts>0)][1], - na.rm = TRUE))+0.1; - } - if(ymax == 0) { - ymax = -log10(.Machine$double.xmin) - } - if(!is.null(ymax)) - ylim = c(0,ymax); - - if(is.null(xlim)) - xlim = c(0, -log10(xmin/1.5)); - - plot(numeric(),numeric(), xlab = "-Log10(p-value), theoretical", - ylab = "-Log10(p-value), observed", - xlim = c(0, -log10(xmin/1.5)), - ylim = ylim, - xaxs="i", yaxs="i", ...); - lines(c(0,1e3), c(0,1e3), col = "gray"); - if((x$param$pvOutputThreshold > 0) && (x$param$pvOutputThreshold.cis > 0)) { - .qqme( x$cis, "red", cex, pch, ...); - .qqme( x$trans, "blue", cex, pch, ...); - title(paste("QQ-plot for", - formatC(x$cis$ntests, big.mark=",", format = "f", digits = 0), - "local and", - formatC(x$trans$ntests, big.mark=",", format = "f", digits = 0), - "distant gene-SNP p-values")); - lset = c(1,2,4); - } else - if(x$param$pvOutputThreshold.cis > 0) { - .qqme(x$cis, "red", cex, pch, ...); - title(paste("QQ-plot for", - formatC(x$cis$ntests, big.mark=",", format = "f", digits = 0), - "local gene-SNP p-values")); - lset = c(1,4); - } else { - .qqme(x$all, "blue", cex, pch, ...); - title(paste("QQ-plot for all", - formatC(x$all$ntests, big.mark=",", format = "f", digits = 0), - "gene-SNP p-values")); - lset = c(3,4); - } - legend("topleft", - c("Local p-values","Distant p-values","All p-values","diagonal")[lset], - col = c("red","blue","blue","gray")[lset], - text.col = c("red","blue","blue","gray")[lset], - pch = 20, lwd = 1, pt.cex = c(1,1,1,0)[lset]) - } else { - if((x$param$pvOutputThreshold > 0) && (x$param$pvOutputThreshold.cis > 0)) { - par(mfrow=c(2,1)); - .histme(x$cis, "", " local", ...); - tran = list(hist.counts = x$all$hist.counts - x$cis$hist.counts, - hist.bins = x$all$hist.bins, - ntests = x$all$ntests - x$cis$ntests); - .histme(x$trans,""," distant", ...); - par(mfrow=c(1,1)); - } else - if(x$param$pvOutputThreshold.cis > 0) { - .histme(x$cis, "", " local", ...); - } else { - .histme(x$all, "all ", "" , ...); - } - } - return(invisible()); -} - diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/data/Covariates.txt --- a/MatrixEQTL/data/Covariates.txt Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -id Sam_01 Sam_02 Sam_03 Sam_04 Sam_05 Sam_06 Sam_07 Sam_08 Sam_09 Sam_10 Sam_11 Sam_12 Sam_13 Sam_14 Sam_15 Sam_16 -gender 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 -age 36 40 46 65 69 43 40 54 44 70 24 34 55 62 48 40 diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/data/GE.txt --- a/MatrixEQTL/data/GE.txt Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ -geneid Sam_01 Sam_02 Sam_03 Sam_04 Sam_05 Sam_06 Sam_07 Sam_08 Sam_09 Sam_10 Sam_11 Sam_12 Sam_13 Sam_14 Sam_15 Sam_16 -Gene_01 4.91 4.63 5.18 5.07 5.74 5.09 5.31 5.29 4.73 5.72 4.75 4.54 5.01 5.03 4.84 4.44 -Gene_02 13.78 13.14 13.18 13.04 12.85 13.07 13.09 12.83 14.94 13.86 15.26 14.73 14.08 14.33 14.72 14.97 -Gene_03 12.06 12.29 13.07 13.65 13.86 12.84 12.29 13.03 13.13 14.93 12.40 13.38 13.70 14.49 14.14 13.35 -Gene_04 11.63 11.88 12.74 12.66 13.16 11.99 11.97 12.81 11.74 13.29 10.88 11.37 12.09 12.50 11.40 11.45 -Gene_05 14.72 14.66 14.63 15.91 15.46 14.74 15.17 15.01 14.05 14.90 12.96 13.56 14.06 14.44 14.12 13.76 -Gene_06 12.29 12.23 12.47 13.21 12.63 12.18 12.44 12.45 13.22 12.70 12.93 12.84 13.20 13.19 12.64 12.80 -Gene_07 12.56 12.71 12.49 13.41 13.60 12.35 12.27 13.42 14.73 15.47 13.85 14.31 15.25 15.55 14.90 14.04 -Gene_08 12.27 12.58 12.61 13.02 12.86 12.32 12.30 13.19 15.21 14.60 14.72 14.56 14.98 14.92 14.53 14.72 -Gene_09 9.82 9.29 8.95 8.18 8.11 9.43 9.17 9.25 10.95 9.82 12.60 11.99 11.20 10.31 11.13 11.69 -Gene_10 14.24 14.52 14.62 13.65 13.46 14.04 13.97 13.73 12.29 12.01 13.47 13.30 12.34 12.23 12.80 12.47 diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/data/SNP.txt --- a/MatrixEQTL/data/SNP.txt Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,16 +0,0 @@ -snpid Sam_01 Sam_02 Sam_03 Sam_04 Sam_05 Sam_06 Sam_07 Sam_08 Sam_09 Sam_10 Sam_11 Sam_12 Sam_13 Sam_14 Sam_15 Sam_16 -Snp_01 2 0 2 0 2 1 2 1 1 1 2 2 1 2 2 1 -Snp_02 0 1 1 2 2 1 0 0 0 1 1 1 1 0 1 1 -Snp_03 1 0 1 0 1 1 1 1 0 1 1 0 1 1 1 2 -Snp_04 0 1 2 2 2 1 1 0 0 0 1 2 1 1 1 0 -Snp_05 1 1 2 1 1 2 1 1 0 1 1 2 0 1 2 1 -Snp_06 2 2 2 1 1 0 1 0 2 1 1 1 2 0 2 1 -Snp_07 1 1 2 2 0 1 1 1 1 0 2 2 0 1 1 1 -Snp_08 1 0 1 0 1 0 0 1 1 1 0 2 0 1 1 1 -Snp_09 2 1 2 2 0 1 1 0 2 1 1 0 1 1 0 0 -Snp_10 1 1 0 0 0 2 2 1 1 2 1 1 1 1 1 0 -Snp_11 2 2 2 0 2 1 1 2 1 2 0 1 0 1 1 2 -Snp_12 1 1 2 2 2 1 1 1 1 0 2 0 1 1 0 2 -Snp_13 0 1 1 1 1 1 2 1 2 2 0 0 0 1 1 1 -Snp_14 0 0 1 0 1 2 2 2 1 1 1 0 1 0 1 0 -Snp_15 1 2 1 2 2 1 2 1 1 2 1 1 1 2 0 2 diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/data/geneloc.txt --- a/MatrixEQTL/data/geneloc.txt Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ -geneid chr left right -Gene_01 chr1 721289 731289 -Gene_02 chr1 752565 762565 -Gene_03 chr1 777121 787121 -Gene_04 chr1 785988 795988 -Gene_05 chr1 792479 802479 -Gene_06 chr1 798958 808958 -Gene_07 chr1 888658 898658 -Gene_08 chr1 918572 928572 -Gene_09 chr1 926430 936430 -Gene_10 chr1 1000000 1010000 diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/data/snpsloc.txt --- a/MatrixEQTL/data/snpsloc.txt Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,16 +0,0 @@ -snpid chr pos -Snp_01 chr1 721289 -Snp_02 chr1 752565 -Snp_03 chr1 777121 -Snp_04 chr1 785988 -Snp_05 chr1 792479 -Snp_06 chr1 798958 -Snp_07 chr1 888658 -Snp_08 chr1 918572 -Snp_09 chr1 926430 -Snp_10 chr1 947033 -Snp_11 chr2 721289 -Snp_12 chr2 752565 -Snp_13 chr2 777121 -Snp_14 chr2 785988 -Snp_15 chr2 792479 diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/demo/00Index --- a/MatrixEQTL/demo/00Index Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -a.nocvrt Test that Matrix eQTL gives correct estimates for additive linear model. Homoscedastic case without covariates. -b.cvrt Test that Matrix eQTL gives correct estimates for additive linear model. Homoscedastic case with covariates. -c.weights Test that Matrix eQTL gives correct estimates for additive linear model. Heteroscedastic case with covariates. -d.ANOVA Test that Matrix eQTL gives correct estimates for ANOVA linear model with 3 groups. Heteroscedastic case with covariates. -d.ANOVA5 Test that Matrix eQTL gives correct estimates for ANOVA linear model with 5 groups. Heteroscedastic case with covariates. -e.interaction Test that Matrix eQTL gives correct estimates for linear model with genotype-covariate interaction. Heteroscedastic case with covariates. -p.hist Sample code for plotting histogram of p-values -q.qqplot Sample code for plotting QQ-plot of p-values -sample.all Sample code for eQTL analysis of the toy data set. -sample.cis Sample code for eQTL analysis of the toy data set with separate p-value thresholds and FDR estimation for cis- and trans-eQTLs. diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/demo/a.nocvrt.r --- a/MatrixEQTL/demo/a.nocvrt.r Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -library("MatrixEQTL"); - -# Number of columns (samples) -n = 100; - - - - - - - - - - -# Generate the vectors with genotype and expression variables -snps.mat = rnorm(n); -gene.mat = rnorm(n) + 0.5 * snps.mat; - -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new(); - -# Produce no output files -filename = NULL; # tempfile() - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - pvalue.hist = FALSE ); - -# Pull Matrix eQTL results - t-statistic and p-value -beta = me$all$eqtls$beta; -tstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c(beta = beta, tstat = tstat, pvalue = pvalue); -# And compare to those from the linear regression in R -{ - cat("\n\n Matrix eQTL: \n"); - print(rez); - cat("\n R summary(lm()) output: \n"); - lmdl = lm( gene.mat ~ snps.mat ); - - lmout = summary(lmdl)$coefficients[2,c("Estimate","t value","Pr(>|t|)")]; - print( lmout ); -} - -# Results from Matrix eQTL and "lm" must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/demo/b.cvrt.r --- a/MatrixEQTL/demo/b.cvrt.r Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -library("MatrixEQTL"); - -# Number of columns (samples) -n = 100; - -# Number of covariates -nc = 10; - - - - -# Generate the covariates -cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); - -# Generate the vectors with genotype and expression variables -snps.mat = cvrt.mat %*% rnorm(nc) + rnorm(n); -gene.mat = cvrt.mat %*% rnorm(nc) + rnorm(n) + 0.5 * snps.mat + 1; - -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new( t(cvrt.mat) ); - -# Produce no output files -filename = NULL; # tempfile() - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - pvalue.hist = FALSE ); - -# Pull Matrix eQTL results - t-statistic and p-value -beta = me$all$eqtls$beta; -tstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c(beta = beta, tstat = tstat, pvalue = pvalue); -# And compare to those from the linear regression in R -{ - cat("\n\n Matrix eQTL: \n"); - print(rez); - cat("\n R summary(lm()) output: \n"); - lmdl = lm( gene.mat ~ snps.mat + cvrt.mat ); - - lmout = summary(lmdl)$coefficients[2,c("Estimate","t value","Pr(>|t|)")]; - print( lmout ); -} - -# Results from Matrix eQTL and "lm" must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/demo/c.weights.r --- a/MatrixEQTL/demo/c.weights.r Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -library("MatrixEQTL"); - -# Number of columns (samples) -n = 100; - -# Number of covariates -nc = 10; - -# Generate the standard deviation of the noise -noise.std = 0.1 + rnorm(n)^2; - -# Generate the covariates -cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); - -# Generate the vectors with genotype and expression variables -snps.mat = cvrt.mat %*% rnorm(nc) + rnorm(n); -gene.mat = cvrt.mat %*% rnorm(nc) + rnorm(n) * noise.std + 0.5 * snps.mat + 1; - -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new( t(cvrt.mat) ); - -# Produce no output files -filename = NULL; # tempfile() - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelLINEAR, - errorCovariance = diag(noise.std^2), - verbose = TRUE, - pvalue.hist = FALSE ); - -# Pull Matrix eQTL results - t-statistic and p-value -beta = me$all$eqtls$beta; -tstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c(beta = beta, tstat = tstat, pvalue = pvalue); -# And compare to those from the linear regression in R -{ - cat("\n\n Matrix eQTL: \n"); - print(rez); - cat("\n R summary(lm()) output: \n"); - lmdl = lm( gene.mat ~ snps.mat + cvrt.mat, - weights = 1/noise.std^2 ); - lmout = summary(lmdl)$coefficients[2,c("Estimate","t value","Pr(>|t|)")]; - print( lmout ); -} - -# Results from Matrix eQTL and "lm" must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/demo/d.ANOVA.r --- a/MatrixEQTL/demo/d.ANOVA.r Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -library("MatrixEQTL"); - -# Number of columns (samples) -n = 100; - -# Number of covariates -nc = 10; - -# Generate the standard deviation of the noise -noise.std = 0.1 + rnorm(n)^2; - -# Generate the covariates -cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); - -# Generate the vectors with single genotype and expression variables -snps.mat = floor(runif(n, min = 0, max = 3)); -gene.mat = 1 + (snps.mat==1) + cvrt.mat %*% rnorm(nc) + rnorm(n) * noise.std; - -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new( t(cvrt.mat) ); - -# Produce no output files -filename = NULL; # tempfile() - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelANOVA, - errorCovariance = diag(noise.std^2), - verbose = TRUE, - pvalue.hist = FALSE ); - -# Pull Matrix eQTL results - t-statistic and p-value - -fstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c( Fstat = fstat, pvalue = pvalue); -# And compare to those from ANOVA in R -{ - cat("\n\n Matrix eQTL: \n"); - print(rez); - cat("\n R anova(lm()) output: \n"); - lmdl = lm( gene.mat ~ cvrt.mat + factor(snps.mat), - weights = 1/noise.std^2 ); - lmout = anova(lmdl)[2, c("F value","Pr(>F)")]; - print( lmout ); -} - -# Results from Matrix eQTL and "lm" must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/demo/d.ANOVA5.r --- a/MatrixEQTL/demo/d.ANOVA5.r Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -library("MatrixEQTL"); - -anova.groups = 5; -options(MatrixEQTL.ANOVA.categories = anova.groups); - -# Number of columns (samples) -n = 100; -# Number of covariates -nc = 10; -# Generate the standard deviation of the noise -noise.std = 0.1 + rnorm(n)^2; -# Generate the covariates -cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); - -# Generate the vectors with single genotype and expression variables -snps.mat = floor(runif(n, min = 0, max = anova.groups)); -gene.mat = 1 + (snps.mat==1) + cvrt.mat %*% rnorm(nc) + rnorm(n) * noise.std; - -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new( t(cvrt.mat) ); - -# Produce no output files -filename = NULL; # tempfile() - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelANOVA, - errorCovariance = diag(noise.std^2), - verbose = TRUE, - pvalue.hist = FALSE ); - -# Pull Matrix eQTL results - t-statistic and p-value - -fstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c( Fstat = fstat, pvalue = pvalue); -# And compare to those from ANOVA in R -{ - cat("\n\n Matrix eQTL: \n"); - print(rez); - cat("\n R anova(lm()) output: \n"); - lmdl = lm( gene.mat ~ cvrt.mat + factor(snps.mat), - weights = 1/noise.std^2 ); - lmout = anova(lmdl)[2, c("F value","Pr(>F)")]; - print( lmout ); -} - -# Results from Matrix eQTL and "lm" must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/demo/e.interaction.r --- a/MatrixEQTL/demo/e.interaction.r Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -library("MatrixEQTL"); - -# Number of columns (samples) -n = 25; - -# Number of covariates -nc = 10; - -# Generate the standard deviation of the noise -noise.std = 0.1 + rnorm(n)^2; - -# Generate the covariates -cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); - -# Generate the vectors with single genotype and expression variables -snps.mat = cvrt.mat %*% rnorm(nc) + rnorm(n); -gene.mat = cvrt.mat %*% rnorm(nc) + rnorm(n) * noise.std + - 1 + 0.5 * snps.mat + snps.mat * cvrt.mat[,nc]; -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new( t(cvrt.mat) ); - -# Produce no output files -filename = NULL; # tempfile() - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelLINEAR_CROSS, - errorCovariance = diag(noise.std^2), - verbose = TRUE, - pvalue.hist = FALSE ); - -# Pull Matrix eQTL results - t-statistic and p-value -beta = me$all$eqtls$beta; -tstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c(beta = beta, tstat = tstat, pvalue = pvalue); -# And compare to those from the linear regression in R -{ - cat("\n\n Matrix eQTL: \n"); - print(rez); - cat("\n R summary(lm()) output: \n"); - lmdl = lm( gene.mat ~ snps.mat + cvrt.mat + snps.mat*cvrt.mat[,nc], - weights = 1/noise.std^2 ); - lmout = tail(summary(lmdl)$coefficients,1)[,c(1,3,4)]; - print( tail(lmout) ); -} - -# Results from Matrix eQTL and "lm" must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/demo/p.hist.r --- a/MatrixEQTL/demo/p.hist.r Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -library("MatrixEQTL") - -# Number of samples -n = 100; - -# Number of variables -ngs = 2000; - -# Common signal in all variables (population stratification) -pop = 0.2 * rnorm(n); - -# data matrices -snps.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop; -gene.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop + snps.mat*((1:ngs)/ngs)^9/2; - -# data objects for Matrix eQTL engine -snps1 = SlicedData$new( t( snps.mat ) ); -gene1 = SlicedData$new( t( gene.mat ) ); -cvrt1 = SlicedData$new(); -rm(snps.mat, gene.mat); - -# Slice data in blocks of 500 variables -snps1$ResliceCombined(500); -gene1$ResliceCombined(500); - -# Produce no output files -filename = NULL; # tempfile() - -# Perform analysis recording information for a histogram -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1e-100, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - pvalue.hist = 100); - -# png(filename = "histogram.png", width = 650, height = 650); -plot(me, col="grey"); -# dev.off(); diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/demo/q.qqplot.r --- a/MatrixEQTL/demo/q.qqplot.r Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -library("MatrixEQTL") - -# Number of samples -n = 100; - -# Number of variables -ngs = 2000; - -# Common signal in all variables (population stratification) -pop = 0.2 * rnorm(n); - -# data matrices -snps.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop; -gene.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop + snps.mat*((1:ngs)/ngs)^9/2; - -# data objects for Matrix eQTL engine -snps1 = SlicedData$new( t( snps.mat ) ); -gene1 = SlicedData$new( t( gene.mat ) ); -cvrt1 = SlicedData$new(); -rm(snps.mat, gene.mat); - -# Slice data in blocks of 500 variables -snps1$ResliceCombined(500); -gene1$ResliceCombined(500); - -# Produce no output files -filename = NULL; # tempfile() - -# Perform analysis recording information for a Q-Q plot -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1e-6, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - pvalue.hist = "qqplot"); - -# png(filename = "QQplot.png", width = 650, height = 650); -plot(me, pch = 16, cex = 0.7, ylim = c(0,11)); -# dev.off(); diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/demo/sample.all.r --- a/MatrixEQTL/demo/sample.all.r Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,96 +0,0 @@ -# Matrix eQTL by Andrey A. Shabalin -# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ -# -# Be sure to use an up to date version of R and Matrix eQTL. - -# source("Matrix_eQTL_R/Matrix_eQTL_engine.r"); -library(MatrixEQTL) - -## Location of the package with the data files. -base.dir = find.package('MatrixEQTL'); -# base.dir = '.'; - -## Settings - -# Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS -useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS - -# Genotype file name -SNP_file_name = paste(base.dir, "/data/SNP.txt", sep=""); - -# Gene expression file name -expression_file_name = paste(base.dir, "/data/GE.txt", sep=""); - -# Covariates file name -# Set to character() for no covariates -covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep=""); - -# Output file name -output_file_name = tempfile(); - -# Only associations significant at this level will be saved -pvOutputThreshold = 1e-2; - -# Error covariance matrix -# Set to numeric() for identity. -errorCovariance = numeric(); -# errorCovariance = read.table("Sample_Data/errorCovariance.txt"); - - -## Load genotype data - -snps = SlicedData$new(); -snps$fileDelimiter = "\t"; # the TAB character -snps$fileOmitCharacters = "NA"; # denote missing values; -snps$fileSkipRows = 1; # one row of column labels -snps$fileSkipColumns = 1; # one column of row labels -snps$fileSliceSize = 2000; # read file in slices of 2,000 rows -snps$LoadFile(SNP_file_name); - -## Load gene expression data - -gene = SlicedData$new(); -gene$fileDelimiter = "\t"; # the TAB character -gene$fileOmitCharacters = "NA"; # denote missing values; -gene$fileSkipRows = 1; # one row of column labels -gene$fileSkipColumns = 1; # one column of row labels -gene$fileSliceSize = 2000; # read file in slices of 2,000 rows -gene$LoadFile(expression_file_name); - -## Load covariates - -cvrt = SlicedData$new(); -cvrt$fileDelimiter = "\t"; # the TAB character -cvrt$fileOmitCharacters = "NA"; # denote missing values; -cvrt$fileSkipRows = 1; # one row of column labels -cvrt$fileSkipColumns = 1; # one column of row labels -if(length(covariates_file_name)>0) { - cvrt$LoadFile(covariates_file_name); -} - -## Run the analysis - -me = Matrix_eQTL_engine( - snps = snps, - gene = gene, - cvrt = cvrt, - output_file_name = output_file_name, - pvOutputThreshold = pvOutputThreshold, - useModel = useModel, - errorCovariance = errorCovariance, - verbose = TRUE, - pvalue.hist = TRUE, - min.pv.by.genesnp = FALSE, - noFDRsaveMemory = FALSE); - -unlink(output_file_name); - -## Results: - -cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n'); -cat('Detected eQTLs:', '\n'); -show(me$all$eqtls) - -## Plot the histogram of all p-values - -plot(me) diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/demo/sample.cis.r --- a/MatrixEQTL/demo/sample.cis.r Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,112 +0,0 @@ -# Matrix eQTL by Andrey A. Shabalin -# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ -# -# Be sure to use an up to date version of R and Matrix eQTL. - -# source("Matrix_eQTL_R/Matrix_eQTL_engine.r"); -library(MatrixEQTL) - -## Location of the package with the data files. -base.dir = find.package('MatrixEQTL'); -# base.dir = '.'; - -## Settings - -# Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS -useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS - -# Genotype file name -SNP_file_name = paste(base.dir, "/data/SNP.txt", sep=""); -snps_location_file_name = paste(base.dir, "/data/snpsloc.txt", sep=""); - -# Gene expression file name -expression_file_name = paste(base.dir, "/data/GE.txt", sep=""); -gene_location_file_name = paste(base.dir, "/data/geneloc.txt", sep=""); - -# Covariates file name -# Set to character() for no covariates -covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep=""); - -# Output file name -output_file_name_cis = tempfile(); -output_file_name_tra = tempfile(); - -# Only associations significant at this level will be saved -pvOutputThreshold_cis = 2e-2; -pvOutputThreshold_tra = 1e-2; - -# Error covariance matrix -# Set to numeric() for identity. -errorCovariance = numeric(); -# errorCovariance = read.table("Sample_Data/errorCovariance.txt"); - -# Distance for local gene-SNP pairs -cisDist = 1e6; - -## Load genotype data - -snps = SlicedData$new(); -snps$fileDelimiter = "\t"; # the TAB character -snps$fileOmitCharacters = "NA"; # denote missing values; -snps$fileSkipRows = 1; # one row of column labels -snps$fileSkipColumns = 1; # one column of row labels -snps$fileSliceSize = 2000; # read file in slices of 2,000 rows -snps$LoadFile(SNP_file_name); - -## Load gene expression data - -gene = SlicedData$new(); -gene$fileDelimiter = "\t"; # the TAB character -gene$fileOmitCharacters = "NA"; # denote missing values; -gene$fileSkipRows = 1; # one row of column labels -gene$fileSkipColumns = 1; # one column of row labels -gene$fileSliceSize = 2000; # read file in slices of 2,000 rows -gene$LoadFile(expression_file_name); - -## Load covariates - -cvrt = SlicedData$new(); -cvrt$fileDelimiter = "\t"; # the TAB character -cvrt$fileOmitCharacters = "NA"; # denote missing values; -cvrt$fileSkipRows = 1; # one row of column labels -cvrt$fileSkipColumns = 1; # one column of row labels -if(length(covariates_file_name)>0) { - cvrt$LoadFile(covariates_file_name); -} - -## Run the analysis -snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE); -genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE); - -me = Matrix_eQTL_main( - snps = snps, - gene = gene, - cvrt = cvrt, - output_file_name = output_file_name_tra, - pvOutputThreshold = pvOutputThreshold_tra, - useModel = useModel, - errorCovariance = errorCovariance, - verbose = TRUE, - output_file_name.cis = output_file_name_cis, - pvOutputThreshold.cis = pvOutputThreshold_cis, - snpspos = snpspos, - genepos = genepos, - cisDist = cisDist, - pvalue.hist = TRUE, - min.pv.by.genesnp = TRUE, - noFDRsaveMemory = FALSE); - -unlink(output_file_name_tra); -unlink(output_file_name_cis); - -## Results: - -cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n'); -cat('Detected local eQTLs:', '\n'); -show(me$cis$eqtls) -cat('Detected distant eQTLs:', '\n'); -show(me$trans$eqtls) - -## Plot the histogram of local and distant p-values - -plot(me) diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/inst/CITATION --- a/MatrixEQTL/inst/CITATION Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,15 +0,0 @@ -citHeader("To cite the 'MatrixEQTL' package in publications use:") - -citEntry(entry="article", - title = "Matrix eQTL: ultra fast eQTL analysis via large matrix operations", - author = personList(as.person("A.A. Shabalin")), - address = "University of North Carolina at Chapel Hill", - year = "2012", - url = "URL http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/", - publisher= "Oxford Univ Press", - journal = "Bioinformatics", - volume = "28", - number = "10", - pages = "1353--1358", - textVersion = "Shabalin, Andrey A. Matrix eQTL: ultra fast eQTL analysis via large matrix operations. Bioinformatics 28, no. 10 (2012): 1353-1358." -) diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/man/Covariates.Rd --- a/MatrixEQTL/man/Covariates.Rd Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -\name{Covariates} -\alias{Covariates} -\docType{data} -\title{ - Artificial data for Matrix eQTL sample code: Covariates. -} -\description{ - Artificial data set with 2 covariates across 15 samples. - Columns of the file must match to those of the expression and genotype data sets. -} -\format{ - \tabular{llll}{ - id \tab Sam_01 \tab Sam_02 \tab \ldots \cr - gender \tab 0 \tab 0 \tab \ldots \cr - age \tab 40 \tab 46 \tab \ldots \cr - - } -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} -\keyword{datasets} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/man/GE.Rd --- a/MatrixEQTL/man/GE.Rd Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -\name{GE} -\alias{GE} -\docType{data} -\title{ - Artificial data for Matrix eQTL sample code: Gene expression. -} -\description{ - Artificial data set with expression of 10 genes across 15 samples. - Columns of the file must match to those of the genotype and covariates data sets. -} -\format{ - \tabular{llll}{ - geneid \tab Sam_01 \tab Sam_02 \tab \ldots \cr - Gene_01 \tab 4.91 \tab 4.63 \tab \ldots \cr - Gene_02 \tab 13.78 \tab 13.14 \tab \ldots \cr - \ldots \tab \ldots \tab \ldots \tab \ldots \cr - } -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} -\keyword{datasets} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/man/MatrixEQTL-package.Rd --- a/MatrixEQTL/man/MatrixEQTL-package.Rd Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ -\name{MatrixEQTL-package} -\alias{MatrixEQTL-package} -\alias{MatrixEQTL} -\docType{package} -\title{ - Matrix eQTL: Ultra fast eQTL analysis via large matrix operations -} -\description{ - Matrix eQTL is designed for fast eQTL analysis of large datasets. - Matrix eQTL can test for association between genotype and gene expression using linear regression - with either additive or ANOVA (additive and dominant) genotype effects. - The models can include covariates to account for such factors as population stratification, gender, clinical variables, and surrogate variables. - Matrix eQTL also supports models with heteroscedastic and/or correlated errors, - false discovery rate estimation and separate treatment of local (cis) and distant (trans) eQTLs. -} -\details{ - \tabular{ll}{ - Package: \tab MatrixEQTL\cr - Type: \tab Package\cr - Version: \tab 2.1.1\cr - Date: \tab 2014-02-24\cr - License: \tab LGPL-3 \cr - LazyLoad: \tab yes\cr - Depends: \tab methods\cr - } -} -\author{ - Andrey Shabalin \email{ashabalin@vcu.edu} - - Maintainer: Andrey Shabalin -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\keyword{ package } -\keyword{ MatrixEQTL } -\keyword{ Matrix eQTL } -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/man/MatrixEQTL_cis_code.Rd --- a/MatrixEQTL/man/MatrixEQTL_cis_code.Rd Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,129 +0,0 @@ -\name{MatrixEQTL_cis_code} -\alias{MatrixEQTL_cis_code} - -\title{Sample code for cis/trans-eQTL analysis with Matrix eQTL} -\description{ - The following code is the best starting point for those who want to perform cis-/trans-eQTL analysis with Matrix eQTL. -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and other sample code. -} -\author{ - Andrey Shabalin \email{ashabalin@vcu.edu} -} -\examples{ -# Matrix eQTL by Andrey A. Shabalin -# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ -# -# Be sure to use an up to date version of R and Matrix eQTL. - -# source("Matrix_eQTL_R/Matrix_eQTL_engine.r"); -library(MatrixEQTL) - -## Location of the package with the data files. -base.dir = find.package('MatrixEQTL'); - -## Settings - -# Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS -useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS - -# Genotype file name -SNP_file_name = paste(base.dir, "/data/SNP.txt", sep=""); -snps_location_file_name = paste(base.dir, "/data/snpsloc.txt", sep=""); - -# Gene expression file name -expression_file_name = paste(base.dir, "/data/GE.txt", sep=""); -gene_location_file_name = paste(base.dir, "/data/geneloc.txt", sep=""); - -# Covariates file name -# Set to character() for no covariates -covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep=""); - -# Output file name -output_file_name_cis = tempfile(); -output_file_name_tra = tempfile(); - -# Only associations significant at this level will be saved -pvOutputThreshold_cis = 2e-2; -pvOutputThreshold_tra = 1e-2; - -# Error covariance matrix -# Set to numeric() for identity. -errorCovariance = numeric(); -# errorCovariance = read.table("Sample_Data/errorCovariance.txt"); - -# Distance for local gene-SNP pairs -cisDist = 1e6; - -## Load genotype data - -snps = SlicedData$new(); -snps$fileDelimiter = "\t"; # the TAB character -snps$fileOmitCharacters = "NA"; # denote missing values; -snps$fileSkipRows = 1; # one row of column labels -snps$fileSkipColumns = 1; # one column of row labels -snps$fileSliceSize = 2000; # read file in slices of 2,000 rows -snps$LoadFile(SNP_file_name); - -## Load gene expression data - -gene = SlicedData$new(); -gene$fileDelimiter = "\t"; # the TAB character -gene$fileOmitCharacters = "NA"; # denote missing values; -gene$fileSkipRows = 1; # one row of column labels -gene$fileSkipColumns = 1; # one column of row labels -gene$fileSliceSize = 2000; # read file in slices of 2,000 rows -gene$LoadFile(expression_file_name); - -## Load covariates - -cvrt = SlicedData$new(); -cvrt$fileDelimiter = "\t"; # the TAB character -cvrt$fileOmitCharacters = "NA"; # denote missing values; -cvrt$fileSkipRows = 1; # one row of column labels -cvrt$fileSkipColumns = 1; # one column of row labels -if(length(covariates_file_name)>0) { - cvrt$LoadFile(covariates_file_name); -} - -## Run the analysis -snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE); -genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE); - -me = Matrix_eQTL_main( - snps = snps, - gene = gene, - cvrt = cvrt, - output_file_name = output_file_name_tra, - pvOutputThreshold = pvOutputThreshold_tra, - useModel = useModel, - errorCovariance = errorCovariance, - verbose = TRUE, - output_file_name.cis = output_file_name_cis, - pvOutputThreshold.cis = pvOutputThreshold_cis, - snpspos = snpspos, - genepos = genepos, - cisDist = cisDist, - pvalue.hist = TRUE, - min.pv.by.genesnp = FALSE, - noFDRsaveMemory = FALSE); - -unlink(output_file_name_tra); -unlink(output_file_name_cis); - -## Results: - -cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n'); -cat('Detected local eQTLs:', '\n'); -show(me$cis$eqtls) -cat('Detected distant eQTLs:', '\n'); -show(me$trans$eqtls) - -## Make the histogram of local and distant p-values - -plot(me) -} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/man/Matrix_eQTL_main.Rd --- a/MatrixEQTL/man/Matrix_eQTL_main.Rd Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,273 +0,0 @@ -\name{Matrix_eQTL_main} -\alias{Matrix_eQTL_main} -\alias{Matrix_eQTL_engine} -\title{ - Main function for fast eQTL analysis in MatrixEQTL package -} -\description{ - \code{Matrix_eQTL_engine} function tests association of every row of the \code{snps} dataset with every row of the \code{gene} dataset using a linear regression model defined by the \code{useModel} parameter (see below). - - The testing procedure accounts for extra covariates in \code{cvrt} parameter. - - The \code{errorCovariance} parameter can be set to the error variance-covariance matrix to account for heteroskedastic and/or correlated errors. - - Associations significant at \code{pvOutputThreshold} (\code{pvOutputThreshold.cis}) levels are saved to \code{output_file_name} (\code{output_file_name.cis}), with corresponding estimates of effect size (slope coefficient), test statistics, p-values, and q-values (false discovery rate). - - Matrix eQTL can perform separate analysis for local (cis) and distant (trans) eQTLs. - For such analysis one has to set the cis-analysis specific parameters \code{pvOutputThreshold.cis > 0}, \code{cisDist}, \code{snpspos} and {genepos} in the call of \code{Matrix_eQTL_main} function. - A gene-SNP pair is considered local if the distance between them is less or equal to \code{cisDist}. - The genomic location of genes and SNPs is defined by data frames \code{snpspos} and {genepos}. - Depending on p-value thresholds \code{pvOutputThreshold} and \code{pvOutputThreshold.cis} Matrix eQTL runs in one of three different modes: - \itemize{ - \item Set \code{pvOutputThreshold > 0} and \code{pvOutputThreshold.cis = 0} (or use \code{Matrix_eQTL_engine}) to perform eQTL analysis without using gene/SNP locations. Associations significant at the \code{pvOutputThreshold} level are be recorded in \code{output_file_name} and in the returned object. - \item Set \code{pvOutputThreshold = 0} and \code{pvOutputThreshold.cis > 0} to perform eQTL analysis for local gene-SNP pairs only. Local associations significant at \code{pvOutputThreshold.cis} level will be recorded in \code{output_file_name.cis} and in the returned object. - \item Set \code{pvOutputThreshold > 0} and \code{pvOutputThreshold.cis > 0} to perform eQTL analysis with separate p-value thresholds for local and distant eQTLs. Distant and local associations significant at corresponding thresholds are recorded in \code{output_file_name} and \code{output_file_name.cis} respectively and in the returned object. In this case the false discovery rate is calculated separately for these two sets of eQTLs. - } - \code{Matrix_eQTL_engine} is a wrapper for \code{Matrix_eQTL_main} for eQTL analysis without regard to gene/SNP location and provided for compatibility with the previous versions of the package. - - The parameter \code{pvalue.hist} allows to record information sufficient to create a histogram or QQ-plot of all the p-values (see \code{\link[=plot.MatrixEQTL]{plot}}). -} -\usage{ -Matrix_eQTL_main( snps, - gene, - cvrt = SlicedData$new(), - output_file_name = "", - pvOutputThreshold = 1e-5, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - output_file_name.cis = "", - pvOutputThreshold.cis = 0, - snpspos = NULL, - genepos = NULL, - cisDist = 1e6, - pvalue.hist = FALSE, - min.pv.by.genesnp = FALSE, - noFDRsaveMemory = FALSE) - -Matrix_eQTL_engine(snps, - gene, - cvrt = SlicedData$new(), - output_file_name, - pvOutputThreshold = 1e-5, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - pvalue.hist = FALSE, - min.pv.by.genesnp = FALSE, - noFDRsaveMemory = FALSE) -} -\arguments{ - \item{snps}{ - \code{\linkS4class{SlicedData}} object with genotype information. - Can be real-valued for linear models and must take at most 3 distinct values for ANOVA unless the number of ANOVA categories is set to a higher number (see \code{useModel} parameter). - } - \item{gene}{ - \code{\linkS4class{SlicedData}} object with gene expression information. - Must have columns matching those of \code{snps}. - } - \item{cvrt}{ - \code{\linkS4class{SlicedData}} object with additional covariates. - Can be an empty \code{SlicedData} object in case of no covariates. - The constant is always included in the model and would cause an error if included in \code{cvrt}. - The order of columns must match those in \code{snps} and \code{gene}. - } - \item{output_file_name}{ - \code{character}, \code{connection}, or \code{NULL}. - If not \code{NULL}, significant associations are saved to this file (all significant associations if \code{pvOutputThreshold=0} or only distant if \code{pvOutputThreshold>0}). - If the file with this name exists, it is overwritten. - } - \item{output_file_name.cis}{ - \code{character}, \code{connection}, or \code{NULL}. - If not \code{NULL}, significant local associations are saved to this file. - If the file with this name exists, it is overwritten. - } - \item{pvOutputThreshold}{ - \code{numeric}. Significance threshold for all/distant tests. - } - \item{pvOutputThreshold.cis}{ - \code{numeric}. Same as \code{pvOutputThreshold}, but for local eQTLs. - } - \item{useModel}{ - \code{integer}. Eigher \code{modelLINEAR}, \code{modelANOVA}, or \code{modelLINEAR_CROSS}. - \enumerate{ - \item Set \code{useModel = \link{modelLINEAR}} to model the effect of the genotype as additive linear and test for its significance using t-statistic. - \item Set \code{useModel = \link{modelANOVA}} to treat genotype as a categorical variables and use ANOVA model and test for its significance using F-test. The default number of ANOVA categories is 3. Set otherwise like this: \code{options(MatrixEQTL.ANOVA.categories=4)}. - \item Set \code{useModel = \link{modelLINEAR_CROSS}} to add a new term to the model - equal to the product of genotype and the last covariate; the significance of this term is then tested using t-statistic. - } - - } - \item{errorCovariance}{ - \code{numeric}. The error covariance matrix. Use \code{numeric()} for homoskedastic independent errors. - } - \item{verbose}{ - \code{logical}. Set to \code{TRUE} to display more detailed report about the progress. - } - \item{snpspos}{ - \code{data.frame} object with information about SNP locations, must have 3 columns - SNP name, chromosome, and position, like this: - \tabular{ccc}{ - snpid \tab chr \tab pos \cr - Snp_01 \tab 1 \tab 721289 \cr - Snp_02 \tab 1 \tab 752565 \cr - \ldots \tab \ldots \tab \ldots \cr - } - } - \item{genepos}{ - \code{data.frame} with information about transcript locations, must have 4 columns - the name, chromosome, and positions of the left and right ends, like this: - \tabular{cccc}{ - geneid \tab chr \tab left \tab right \cr - Gene_01 \tab 1 \tab 721289 \tab 731289 \cr - Gene_02 \tab 1 \tab 752565 \tab 762565 \cr - \ldots \tab \ldots \tab \ldots \tab \ldots \cr - } - } - \item{cisDist}{ - \code{numeric}. SNP-gene pairs within this distance are considered local. The distance is measured from the nearest end of the gene. SNPs within a gene are always considered local. - } - \item{pvalue.hist}{ - \code{logical}, \code{numerical}, or \code{"qqplot"}. - Defines whether and how the distribution of p-values is recorded in the returned object. - If \code{pvalue.hist = FALSE}, the information is not recorded and the analysis is performed faster. - Alternatively, set \code{pvalue.hist = "qqplot"} to record information sufficient to create a QQ-plot of the p-values (use \code{\link[=plot.MatrixEQTL]{plot}} on the returned object to create the plot). - To record information for a histogram set \code{pvalue.hist} to the desired number of bins of equal size. Finally, \code{pvalue.hist} can also be set to a custom set of bin edges. - } - \item{min.pv.by.genesnp}{ - \code{logical}. Set \code{min.pv.by.genesnp = TRUE} to record the minimum p-value for each SNP and each gene in the returned object. The minimum p-values are recorded even if if they are above the corresponding thresholds of \code{pvOutputThreshold} and \code{pvOutputThreshold.cis}. The analysis runs faster when the parameter is set to \code{FALSE}. - } - \item{noFDRsaveMemory}{ - \code{logical}. Set \code{noFDRsaveMemory = TRUE} to save significant gene-SNP pairs directly to the output files, reduce memory footprint and skip FDR calculation. The eQTLs are not recorded in the returned object if \code{noFDRsaveMemory = TRUE}. - } -} -\details{ - Note that the columns of \code{gene}, \code{snps}, and \code{cvrt} must match. - If they do not match in the input files, use \code{ColumnSubsample} method to subset and/or reorder them. -} -\value{ - The detected eQTLs are saved in \code{output_file_name} and/or \code{output_file_name.cis} if they are not \code{NULL}. - The method also returns a list with a summary of the performed analysis. - \item{param}{Keeps all input parameters and also records the number of degrees of freedom for the full model.} - \item{time.in.sec}{Time difference between the start and the end of the analysis (in seconds).} - \item{all}{Information about all detected eQTLs.} - \item{cis}{Information about detected local eQTLs.} - \item{trans}{Information about detected distant eQTLs.} - The elements \code{all}, \code{cis}, and \code{trans} may contain the following components - \describe{ - \item{\code{ntests}}{Total number of tests performed. This is used for FDR calculation.} - \item{\code{eqtls}}{Data frame with recorded significant associations. Not available if \code{noFDRsaveMemory=FALSE}} - \item{\code{neqtls}}{Number of significant associations recorded.} - \item{\code{hist.bins}}{Histogram bins used for recording p-value distribution. See \code{pvalue.hist} parameter.} - \item{\code{hist.counts}}{Number of p-value that fell in each histogram bin. See \code{pvalue.hist} parameter.} - \item{\code{min.pv.snps}}{Vector with the best p-value for each SNP. See \code{min.pv.by.genesnp} parameter.} - \item{\code{min.pv.gene}}{Vector with the best p-value for each gene. See \code{min.pv.by.genesnp} parameter.} - } -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\author{ - Andrey Shabalin \email{ashabalin@vcu.edu} -} -\seealso{ - The code below is the sample code for eQTL analysis NOT using gene/SNP locations. - - See \code{\link{MatrixEQTL_cis_code}} for sample code for eQTL analysis that separates local and distant tests. -} -\examples{ -# Matrix eQTL by Andrey A. Shabalin -# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ -# -# Be sure to use an up to date version of R and Matrix eQTL. - -# source("Matrix_eQTL_R/Matrix_eQTL_engine.r"); -library(MatrixEQTL) - -## Location of the package with the data files. -base.dir = find.package('MatrixEQTL'); - -## Settings - -# Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS -useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS - -# Genotype file name -SNP_file_name = paste(base.dir, "/data/SNP.txt", sep=""); - -# Gene expression file name -expression_file_name = paste(base.dir, "/data/GE.txt", sep=""); - -# Covariates file name -# Set to character() for no covariates -covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep=""); - -# Output file name -output_file_name = tempfile(); - -# Only associations significant at this level will be saved -pvOutputThreshold = 1e-2; - -# Error covariance matrix -# Set to numeric() for identity. -errorCovariance = numeric(); -# errorCovariance = read.table("Sample_Data/errorCovariance.txt"); - - -## Load genotype data - -snps = SlicedData$new(); -snps$fileDelimiter = "\t"; # the TAB character -snps$fileOmitCharacters = "NA"; # denote missing values; -snps$fileSkipRows = 1; # one row of column labels -snps$fileSkipColumns = 1; # one column of row labels -snps$fileSliceSize = 2000; # read file in slices of 2,000 rows -snps$LoadFile(SNP_file_name); - -## Load gene expression data - -gene = SlicedData$new(); -gene$fileDelimiter = "\t"; # the TAB character -gene$fileOmitCharacters = "NA"; # denote missing values; -gene$fileSkipRows = 1; # one row of column labels -gene$fileSkipColumns = 1; # one column of row labels -gene$fileSliceSize = 2000; # read file in slices of 2,000 rows -gene$LoadFile(expression_file_name); - -## Load covariates - -cvrt = SlicedData$new(); -cvrt$fileDelimiter = "\t"; # the TAB character -cvrt$fileOmitCharacters = "NA"; # denote missing values; -cvrt$fileSkipRows = 1; # one row of column labels -cvrt$fileSkipColumns = 1; # one column of row labels -if(length(covariates_file_name)>0) { - cvrt$LoadFile(covariates_file_name); -} - -## Run the analysis - -me = Matrix_eQTL_engine( - snps = snps, - gene = gene, - cvrt = cvrt, - output_file_name = output_file_name, - pvOutputThreshold = pvOutputThreshold, - useModel = useModel, - errorCovariance = errorCovariance, - verbose = TRUE, - pvalue.hist = TRUE, - min.pv.by.genesnp = FALSE, - noFDRsaveMemory = FALSE); - -unlink(output_file_name); - -## Results: - -cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n'); -cat('Detected eQTLs:', '\n'); -show(me$all$eqtls) - -## Plot the histogram of all p-values - -plot(me) - -} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/man/SNP.Rd --- a/MatrixEQTL/man/SNP.Rd Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -\name{SNP} -\alias{SNP} -\docType{data} -\title{ - Artificial data for Matrix eQTL sample code: Genotype. -} -\description{ - Artificial data set with genotype information for 15 markers across 15 samples. - Columns of the file must match to those of the gene expression and covariates data sets. -} -\format{ - \tabular{llll}{ - snpnid \tab Sam_01 \tab Sam_02 \tab \ldots \cr - Snp_01 \tab 2 \tab 0 \tab \ldots \cr - Snp_02 \tab 0 \tab 1 \tab \ldots \cr - \ldots \tab \ldots \tab \ldots \tab \ldots \cr - } -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} -\keyword{datasets} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/man/SlicedData-class.Rd --- a/MatrixEQTL/man/SlicedData-class.Rd Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,189 +0,0 @@ -\name{SlicedData-class} -\Rdversion{1.1} -\docType{class} -\alias{SlicedData-class} -\alias{SlicedData} -\alias{SlicedData-class} -\alias{[[,SlicedData-method} -\alias{[[<-,SlicedData-method} -\alias{colnames,SlicedData-method} -\alias{colnames<-,SlicedData-method} -\alias{dim,SlicedData-method} -\alias{length,SlicedData-method} -\alias{ncol,SlicedData-method} -\alias{NCOL,SlicedData-method} -\alias{nrow,SlicedData-method} -\alias{NROW,SlicedData-method} -\alias{rownames,SlicedData-method} -\alias{rownames<-,SlicedData-method} -\alias{show,SlicedData-method} -\alias{as.matrix,SlicedData-method} - -\alias{rowMeans,SlicedData-method} -\alias{rowSums,SlicedData-method} -\alias{colMeans,SlicedData-method} -\alias{colSums,SlicedData-method} -\alias{summary.SlicedData} - -\title{Class \code{SlicedData} for storing large matrices} -\description{ - This class is created for fast and memory efficient manipulations with large datasets presented in matrix form. - It is used to load, store, and manipulate large datasets, e.g. genotype and gene expression matrices. - When a dataset is loaded, it is sliced in blocks of 1,000 rows (default size). - This allows imputing, standardizing, and performing other operations with the data with minimal memory overhead. -} -\section{Extends}{ - \code{SlicedData} is a reference classes (\code{\linkS4class{envRefClass}}). - Its methods can change the values of the fields of the class. -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\author{ - Andrey Shabalin \email{ashabalin@vcu.edu} -} -\seealso{ - This class is used to load data for eQTL analysis by \code{\link{Matrix_eQTL_engine}}. -} -\keyword{classes} -\section{Fields}{ - \describe{ - \item{\code{dataEnv}:}{\code{environment}. Stores the slices of the data matrix. The slices should be accessed via \code{getSlice()} and \code{setSlice()} methods. } - \item{\code{nSlices1}:}{\code{numeric}. Number of slices. For internal use. The value should be access via \code{nSlices()} method. } - \item{\code{rowNameSlices}:}{\code{list}. Slices of row names. } - \item{\code{columnNames}:}{\code{character}. Column names. } - \item{\code{fileDelimiter}:}{\code{character}. Delimiter separating values in the input file. } - \item{\code{fileSkipColumns}:}{\code{numeric}. Number of columns with row labels in the input file. } - \item{\code{fileSkipRows}:}{\code{numeric}. Number of rows with column labels in the input file. } - \item{\code{fileSliceSize}:}{\code{numeric}. Maximum number of rows in a slice. } - \item{\code{fileOmitCharacters}:}{\code{character}. Missing value (NaN) representation in the input file. } - } -} -\section{Methods}{ - \describe{ - \item{\code{initialize(mat)}:}{ Create the object from a matrix. } - \item{\code{nSlices()}:}{ Returns the number of slices. } - \item{\code{nCols()}:}{ Returns the number of columns in the matrix. } - \item{\code{nRows()}:}{ Returns the number of rows in the matrix. } - \item{\code{Clear()}:}{ Clears the object. Removes the data slices and row and column names. } - \item{\code{Clone()}:}{ Makes a copy of the object. Changes to the copy do not affect the source object. } - \item{\code{CreateFromMatrix(mat)}:}{ Creates \code{SlicedData} object from a \code{\link[base]{matrix}}.} - \item{\code{LoadFile(filename, skipRows = NULL, skipColumns = NULL,} \cr - \code{sliceSize = NULL, omitCharacters = NULL, delimiter = NULL, rowNamesColumn = 1)}:}{ Loads data matrix from a file. \code{filename} should be a character string. The remaining parameters specify the file format and have the same meaning as \code{file*} fields. Additional \code{rowNamesColumn} parameter specifies which of the columns of row labels to use as row names.} - \item{\code{SaveFile(filename)}:}{ Saves the data to a file. \code{filename} should be a character string.} - \item{\code{getSlice(sl)}:}{ Retrieves \code{sl}-th slice of the matrix. } - \item{\code{setSlice(sl, value)}:}{ Set \code{sl}-th slice of the matrix. } - \item{\code{ColumnSubsample(subset)}:}{ Reorders/subsets the columns according to \code{subset}. \cr -Acts as \code{M = M[ ,subset]} for a matrix \code{M}. } - \item{\code{RowReorder(ordr)}:}{ Reorders rows according to \code{ordr}. \cr -Acts as \code{M = M[ordr, ]} for a matrix \code{M}. } - \item{\code{RowMatrixMultiply(multiplier)}:}{ Multiply each row by the \code{multiplier}. \cr -Acts as \code{M = M \%*\% multiplier} for a matrix \code{M}. } - \item{\code{CombineInOneSlice()}:}{ Combines all slices into one. The whole matrix can then be obtained via \code{$getSlice(1)}. } - \item{\code{IsCombined()}:}{ Returns \code{TRUE} if the number of slices is 1 or 0. } - \item{\code{ResliceCombined(sliceSize = -1)}:}{ Cuts the data into slices of \code{sliceSize} rows. If \code{sliceSize} is not defined, the value of \code{fileSliceSize} field is used.} - \item{\code{GetAllRowNames()}:}{ Returns all row names in one vector. } - \item{\code{RowStandardizeCentered()}:}{ Set the mean of each row to zero and the sum of squares to one. } - \item{\code{SetNanRowMean()}:}{ Impute rows with row mean. Rows full of NaN values are imputed with zeros. } - \item{\code{RowRemoveZeroEps()}:}{ Removes rows of zeros and those that are nearly zero. } - \item{\code{FindRow(rowname)}:}{ Finds row by name. Returns a pair of slice number an row number within the slice. If no row is found, the function returns \code{NULL}. } - \item{\code{rowMeans(x, na.rm = FALSE, dims = 1L)}:}{Returns a vector of row means. Works as \link[base]{rowMeans} but requires \code{dims} to be equal to \code{1L}.} - \item{\code{rowSums(x, na.rm = FALSE, dims = 1L)}:}{Returns a vector of row sums. Works as \link[base]{rowSums} but requires \code{dims} to be equal to \code{1L}.} - \item{\code{colMeans(x, na.rm = FALSE, dims = 1L)}:}{Returns a vector of column means. Works as \link[base]{colMeans} but requires \code{dims} to be equal to \code{1L}.} - \item{\code{colSums(x, na.rm = FALSE, dims = 1L)}:}{Returns a vector of column sums. Works as \link[base]{colSums} but requires \code{dims} to be equal to \code{1L}.} - } -} - -\usage{ -# x[[i]] indexing allows easy access to individual slices. -# It is equivalent to x$GetSlice(i) and x$SetSlice(i,value) -\S4method{[[}{SlicedData}(x, i) -\S4method{[[}{SlicedData}(x, i) <- value - -# The following commands work as if x was a simple matrix object -\S4method{nrow}{SlicedData}(x) -\S4method{ncol}{SlicedData}(x) -\S4method{dim}{SlicedData}(x) -\S4method{rownames}{SlicedData}(x) -\S4method{colnames}{SlicedData}(x) -\S4method{rownames}{SlicedData}(x) <- value -\S4method{colnames}{SlicedData}(x) <- value - -# SlicedData object can be easily transformed into a matrix -# preserving row and column names -\S4method{as.matrix}{SlicedData}(x) - -# length(x) can be used in place of x$nSlices() -# to get the number of slices in the object -\S4method{length}{SlicedData}(x) -} -\arguments{ - \item{x}{ - \code{\linkS4class{SlicedData}} object. - } - \item{i}{ - Number of a slice. - } - \item{value}{ - New content for the slice / new row or column names. - } -} - - -\examples{ - -# Create a SlicedData variable -sd = SlicedData$new(); - -# Show the details of the empty object -show(sd); - -# Create a matrix of values and assign to sd -mat = matrix(1:12, 3, 4); -rownames(mat) = c("row1","row2","row3"); -colnames(mat) = c("col1","col2","col3","col4"); -sd$CreateFromMatrix( mat ); - -# Show the detail of the object (one slice) -show(sd); - -# Slice it in pieces of 2 rows -sd$ResliceCombined(sliceSize = 2L); - -# Show the number of slices (equivalent function calls) -sd$nSlices() -length(sd) - -# Is it all in one slice? (No) -sd$IsCombined() - -# Show the column names (equivalent function calls) -sd$columnNames -colnames(sd) - -# Show row name slices -sd$rowNameSlices - -# Show all row names (equivalent function calls) -sd$GetAllRowNames() -rownames(sd) - -# Print the second slice -print( sd[[2]] ) - -# Reorder and subset columns -sd$ColumnSubsample( c(1,3,4) ); - -# Reorder and subset rows -sd$RowReorder( c(3,1) ); - -# Show the detail of the object (one slice again) -show(sd); - -# Is it all in one slice? (Yes) -sd$IsCombined() - -# Find the row with name "row1" (it is second in the first slice) -sd$FindRow("row1"); - -} \ No newline at end of file diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/man/figures/QQplot.png Binary file MatrixEQTL/man/figures/QQplot.png has changed diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/man/figures/histogram.png Binary file MatrixEQTL/man/figures/histogram.png has changed diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/man/geneloc.Rd --- a/MatrixEQTL/man/geneloc.Rd Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ -\name{geneloc} -\alias{geneloc} -\docType{data} -\title{ - Artificial data for Matrix eQTL sample code: Gene location file. -} -\description{ - Artificial Gene location file for 10 genes. - \tabular{llll}{ - geneid \tab chr \tab left \tab right \cr - Gene_01 \tab chr1 \tab 721289 \tab 731289 \cr - Gene_02 \tab chr1 \tab 752565 \tab 762565 \cr - \ldots \tab \ldots \tab \ldots \tab \ldots \cr - } -} -\format{ - A data frame with 4 columns. - \describe{ - \item{\code{geneid}}{A column with gene names. The order does not have to match the gene expression data set.} - \item{\code{chr}}{Chromosome number, i.e. chr1.} - \item{\code{left}}{Lower (smaller) coordinate of the gene begining/end.} - \item{\code{right}}{Upper (larger) coordinate of the gene begining/end.} - } -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} -\keyword{datasets} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/man/modelANOVA.Rd --- a/MatrixEQTL/man/modelANOVA.Rd Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,88 +0,0 @@ -\name{modelANOVA} -\alias{modelANOVA} -\docType{data} -\title{ - Constant for \code{\link{Matrix_eQTL_engine}}. -} -\description{ - Set parameter \code{useModel = modelANOVA} in the call of \code{\link{Matrix_eQTL_main}} to indicate that the genotype should be treated as a categorical variable. -} -\note{ -By default, the number of ANOVA categories is fixed to be 3. - -To set it to a different number (say, 4) use the following command: - -\code{options(MatrixEQTL.ANOVA.categories=4)} - -To check the current settings run: - -\code{getOption('MatrixEQTL.ANOVA.categories', 3);} -} -\examples{ -library('MatrixEQTL') - -# Number of columns (samples) -n = 100; - -# Number of covariates -nc = 10; - -# Generate the standard deviation of the noise -noise.std = 0.1 + rnorm(n)^2; - -# Generate the covariates -cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); - -# Generate the vectors with single genotype and expression variables -snps.mat = floor(runif(n, min = 0, max = 3)); -gene.mat = 1 + (snps.mat==1) + cvrt.mat \%*\% rnorm(nc) + rnorm(n) * noise.std; - -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new( t(cvrt.mat) ); - -# name of temporary output file -filename = tempfile(); - -snps1 -gene1 - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelANOVA, - errorCovariance = diag(noise.std^2), - verbose = TRUE, - pvalue.hist = FALSE ); -# remove the output file -unlink( filename ); - -# Pull Matrix eQTL results - t-statistic and p-value - -fstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c( Fstat = fstat, pvalue = pvalue) -# And compare to those from ANOVA in R -{ - cat('\n\n Matrix eQTL: \n'); - print(rez); - cat('\n R anova(lm()) output: \n') - lmodel = lm( gene.mat ~ cvrt.mat + factor(snps.mat), weights = 1/noise.std^2 ); - lmout = anova( lmodel )[2, 4:5]; - print( lmout ) -} - -# Results from Matrix eQTL and 'lm' must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)) -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/man/modelLINEAR.Rd --- a/MatrixEQTL/man/modelLINEAR.Rd Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ -\name{modelLINEAR} -\alias{modelLINEAR} -\docType{data} -\title{ - Constant for \code{\link{Matrix_eQTL_engine}}. -} -\description{ - Set parameter \code{useModel = modelLINEAR} in the call of \code{\link{Matrix_eQTL_main}} to indicate that the effect of genotype on expression should be assumed to be additive linear. -} - -\examples{ -library('MatrixEQTL') - -# Number of columns (samples) -n = 100; - -# Number of covariates -nc = 10; - -# Generate the standard deviation of the noise -noise.std = 0.1 + rnorm(n)^2; - -# Generate the covariates -cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); - -# Generate the vectors with genotype and expression variables -snps.mat = cvrt.mat \%*\% rnorm(nc) + rnorm(n); -gene.mat = cvrt.mat \%*\% rnorm(nc) + rnorm(n) * noise.std + 0.5 * snps.mat + 1; - -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new( t(cvrt.mat) ); - -# name of temporary output file -filename = tempfile(); - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelLINEAR, - errorCovariance = diag(noise.std^2), - verbose = TRUE, - pvalue.hist = FALSE ); -# remove the output file -unlink( filename ); - -# Pull Matrix eQTL results - t-statistic and p-value -beta = me$all$eqtls$beta; -tstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c(beta = beta, tstat = tstat, pvalue = pvalue) -# And compare to those from the linear regression in R -{ - cat('\n\n Matrix eQTL: \n'); - print(rez); - cat('\n R summary(lm()) output: \n'); - lmodel = lm( gene.mat ~ snps.mat + cvrt.mat, weights = 1/noise.std^2 ); - lmout = summary( lmodel )$coefficients[2, c(1,3,4)]; - print( lmout ) -} - -# Results from Matrix eQTL and 'lm' must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)) -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/man/modelLINEAR_CROSS.Rd --- a/MatrixEQTL/man/modelLINEAR_CROSS.Rd Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ -\name{modelLINEAR_CROSS} -\alias{modelLINEAR_CROSS} -\docType{data} -\title{ - Constant for \code{\link{Matrix_eQTL_engine}}. -} -\description{ - Set parameter \code{useModel = modelLINEAR_CROSS} in the call of \code{\link{Matrix_eQTL_main}} to indicate that Matrix eQTL should include the interaction of SNP and last covariate in the model and test for its significance. -} -\examples{ -library('MatrixEQTL') - -# Number of columns (samples) -n = 25; - -# Number of covariates -nc = 10; - -# Generate the standard deviation of the noise -noise.std = 0.1 + rnorm(n)^2; - -# Generate the covariates -cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); - -# Generate the vectors with single genotype and expression variables -snps.mat = cvrt.mat \%*\% rnorm(nc) + rnorm(n); -gene.mat = cvrt.mat \%*\% rnorm(nc) + rnorm(n) * noise.std + - 1 + 0.5 * snps.mat + snps.mat * cvrt.mat[,nc]; - -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new( t(cvrt.mat) ); - -# name of temporary output file -filename = tempfile(); - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelLINEAR_CROSS, - errorCovariance = diag(noise.std^2), - verbose = TRUE, - pvalue.hist = FALSE ); -# remove the output file -unlink( filename ); - -# Pull Matrix eQTL results - t-statistic and p-value -beta = me$all$eqtls$beta; -tstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c(beta = beta, tstat = tstat, pvalue = pvalue) -# And compare to those from the linear regression in R -{ - cat('\n\n Matrix eQTL: \n'); - print(rez); - cat('\n R summary(lm()) output: \n') - lmodel = lm( gene.mat ~ snps.mat + cvrt.mat + snps.mat*cvrt.mat[,nc], - weights = 1/noise.std^2 ); - lmout = tail(summary( lmodel )$coefficients,1)[,c(1,3,4)]; - print( tail(lmout) ); -} - -# Results from Matrix eQTL and 'lm' must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)) -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/man/plot.MatrixEQTL.Rd --- a/MatrixEQTL/man/plot.MatrixEQTL.Rd Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,112 +0,0 @@ -\name{plot.MatrixEQTL} -\alias{plot.MatrixEQTL} -\title{Plot histogram or QQ-plot of all p-values -} -\description{ -This method plots a histogram or QQ-plot of p-values -for all tests performed by \code{\link{Matrix_eQTL_engine}}. -} -\usage{ -\method{plot}{MatrixEQTL}(x, cex = 0.5, pch = 19, xlim = NULL, ylim = NULL,...) -} -\arguments{ - \item{x}{ - An object returned by \code{\link{Matrix_eQTL_engine}}. - } - \item{cex}{ - A numerical value giving the amount by which plotting text and symbols should be magnified relative to the default. - } - \item{pch}{ - Plotting "character", i.e., symbol to use. See \code{\link[graphics]{points}}. - } - \item{xlim}{ - Set the range of the horisontal axis. - } - \item{ylim}{ - Set the range of the vertical axis. - } - \item{\dots}{ - further graphical parameters passed to \code{\link[graphics]{lines}} and \code{\link[graphics]{points}}. - } -} -\details{ - The plot type (histogram vs. QQ-plot) is determined by the \code{pvalue.hist} parameter in the call of \code{\link{Matrix_eQTL_engine}} function. -} -\value{ - The method does not return any value. -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} -\note{ - The sample code below produces figures like these: - - Histogram: \figure{histogram.png} - - QQ-plot: \figure{QQplot.png} -} -\author{ - Andrey Shabalin \email{ashabalin@vcu.edu} -} -\examples{ -library(MatrixEQTL) - -# Number of samples -n = 100; - -# Number of variables -ngs = 2000; - -# Common signal in all variables -pop = 0.2*rnorm(n); - -# data matrices -snps.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop; -gene.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop + snps.mat*((1:ngs)/ngs)^9/2; - -# data objects for Matrix eQTL engine -snps1 = SlicedData$new( t( snps.mat ) ); -gene1 = SlicedData$new( t( gene.mat ) ); -cvrt1 = SlicedData$new( ); -rm(snps.mat, gene.mat) - -# Slice data in blocks of 500 variables -snps1$ResliceCombined(500); -gene1$ResliceCombined(500); - -# Produce no output files -filename = NULL; # tempfile() - -# Perform analysis recording information for a histogram -meh = Matrix_eQTL_engine( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1e-100, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - pvalue.hist = 100); - -plot(meh, col="grey") - -# Perform analysis recording information for a QQ-plot -meq = Matrix_eQTL_engine( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1e-6, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - pvalue.hist = "qqplot"); - -plot(meq) -} -\keyword{ QQ-plot } -\keyword{ histogram } diff -r cd4c8e4a4b5b -r 2d1118ddc31d MatrixEQTL/man/snpsloc.Rd --- a/MatrixEQTL/man/snpsloc.Rd Fri Mar 12 08:12:46 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ -\name{snpsloc} -\alias{snpsloc} -\docType{data} -\title{ - Artificial data for Matrix eQTL sample code: SNP location file. -} -\description{ - Artificial SNP location file for 15 markers. - \tabular{llll}{ - snpid \tab chr \tab pos \cr - Snp_01 \tab chr1 \tab 721289 \cr - Snp_02 \tab chr1 \tab 752565 \cr - \ldots \tab \ldots \tab \ldots \cr - } -} -\format{ - A data frame with 3 columns. - \describe{ - \item{\code{snpid}}{A column with SNP names. The order does not have to match the genotype data set.} - \item{\code{chr}}{Chromosome number, i.e. chr1.} - \item{\code{pos}}{Coordinate of the SNP.} - } -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} -\keyword{datasets}