# HG changeset patch # User jasonxu # Date 1615537188 0 # Node ID 5fafba5359eb15608f467660ac7049230434fae5 # Parent 2d1118ddc31d25c1492988098f85af3d41b0b501 Deleted selected files diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/DESCRIPTION --- a/MatrixEQTL/MatrixEQTL/DESCRIPTION Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,20 +0,0 @@ -Package: MatrixEQTL -Type: Package -Title: Matrix eQTL: Ultra fast eQTL analysis via large matrix - operations -Version: 2.1.1 -Date: 2014-02-24 -Author: Andrey Shabalin -Maintainer: Andrey Shabalin -Description: Matrix eQTL is designed for fast eQTL analysis on large datasets. - Matrix eQTL can test for association between genotype and gene expression using linear regression - with either additive or ANOVA genotype effects. - The models can include covariates to account for factors - as population stratification, gender, and clinical variables. - It also supports models with heteroscedastic and/or correlated errors, - false discovery rate estimation and separate treatment of local (cis) and distant (trans) eQTLs. -License: LGPL-3 -LazyLoad: yes -URL: http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ -Depends: R (>= 2.12.0), methods -Packaged: 2014-03-01 10:58:21 UTC; Andrey diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/NAMESPACE --- a/MatrixEQTL/MatrixEQTL/NAMESPACE Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ -importFrom(methods, setRefClass) -exportPattern("^[[:alpha:]]+") -exportClasses( - "SlicedData" -) -S3method(plot, MatrixEQTL) -S3method(summary, SlicedData) -exportMethods( - "[[", - "[[<-", - "colnames", - "colnames<-", - "dim", - "length", - "ncol", - "NCOL", - "nrow", - "NROW", - "rownames", - "rownames<-", - "as.matrix", - "rowMeans", - "rowSums", - "colMeans", - "colSums" -) - diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/R/Matrix_eQTL_engine.R --- a/MatrixEQTL/MatrixEQTL/R/Matrix_eQTL_engine.R Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1964 +0,0 @@ -# Matrix eQTL by Andrey A. Shabalin -# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ - -# http://cran.r-project.org/web/packages/policies.html - -library(methods) - -modelLINEAR = 117348L; -modelANOVA = 47074L; -modelLINEAR_CROSS = 1113461L; - -.seq = function(a,b){if(a<=b){a:b}else{integer(0)}}; - -SlicedData <- setRefClass( "SlicedData", - fields = list( - dataEnv = "environment", - nSlices1 = "numeric", - rowNameSlices = "list", - columnNames = "character", - fileDelimiter = "character", - fileSkipColumns = "numeric", - fileSkipRows = "numeric", - fileSliceSize = "numeric", - fileOmitCharacters = "character" - ), - methods = list( - initialize = function( mat = NULL ) { - dataEnv <<- new.env(hash = TRUE, size = 29L); - nSlices1 <<- 0L; - if(!is.null(mat)) { - CreateFromMatrix(mat); - } - fileSliceSize <<- 1000; - fileDelimiter <<- "\t"; - fileSkipColumns <<- 1L; - fileSkipRows <<- 1L; - fileOmitCharacters <<- "NA" - return(invisible(.self)); - }, - CreateFromMatrix = function( mat ) { - stopifnot( class(mat) == "matrix" ); - setSliceRaw( 1L ,mat ); - rns = rownames( mat, do.NULL = FALSE); - #if( is.null(rns) ) { - # rns = paste( "Row_",(1:nrow(mat)), sep="" ); - #} - rowNameSlices <<- list(rns); - cns = colnames( mat, do.NULL = FALSE ); - #if( is.null(cns) ){ - # cns = paste( "Col_",(1:ncol(mat)), sep="" ); - #} - columnNames <<- cns; - return(invisible(.self)); - }, - getSlice = function(sl) { - value = get(paste(sl), dataEnv); - if( is.raw(value) ) { - storage.mode(value) = "double"; - value[value == 255] = NA; - } - return( value ) - }, - getSliceRaw = function(sl) { - return( get(paste(sl), dataEnv) ) - }, - setSliceRaw = function(sl, value) { - assign( paste(sl), value, dataEnv ) - if( nSlices1 < sl ) { - nSlices1 <<- sl; - } - }, - setSlice = function(sl, value) { - if( length(value) > 0 ) { - if( all(as.integer(value) == value, na.rm = TRUE) ) { - if( (min(value, na.rm = TRUE) >= 0 ) && - (max(value, na.rm = TRUE) < 255) ) - { - nv = value; - suppressWarnings({storage.mode(nv) = "raw"}); - nv[ is.na(value)] = as.raw(255); - value = nv; - } else { - storage.mode(value) = "integer"; - } - } - } - setSliceRaw(sl, value); - }, - nSlices = function() { - return( nSlices1 ); - }, - LoadFile = function(filename, skipRows = NULL, skipColumns = NULL, sliceSize = NULL, omitCharacters = NULL, delimiter = NULL, rowNamesColumn = 1) { - if( !is.null(skipRows) ) { - fileSkipRows <<- skipRows; - } - if( !is.null(skipColumns) ) { - fileSkipColumns <<- skipColumns; - } - if( !is.null(omitCharacters) ) { - fileOmitCharacters <<- omitCharacters; - } - if( !is.null(sliceSize) ) { - fileSliceSize <<- sliceSize; - } - if( !is.null(delimiter) ) { - fileDelimiter <<- delimiter; - } - stopifnot( (fileSkipColumns == 0) || (rowNamesColumn <= fileSkipColumns) ) - stopifnot( (fileSkipColumns == 0) || (rowNamesColumn >= 1) ) - - fid = file(description = filename, open = "rt", blocking = FALSE, raw = FALSE) - # clean object if file is open - Clear(); - lines = readLines(con = fid, n = max(fileSkipRows,1L), ok = TRUE, warn = TRUE) - line1 = tail(lines,1); - splt = strsplit(line1, split = fileDelimiter, fixed = TRUE); - if( fileSkipRows > 0L ) { - columnNames <<- splt[[1]]; # [ -(1:fileSkipColumns) ]; - } else { - seek(fid, 0) - } - - rm( lines, line1, splt ); - - rowNameSlices <<- vector("list", 15); - - curSliceId = 0L; - repeat - { - # preallocate names and data - if(length(rowNameSlices) < curSliceId) { - rowNameSlices[[2L*curSliceId]] <<- NULL; - } - curSliceId = curSliceId + 1L; - - # read sliceSize rows - rowtag = vector("character",fileSliceSize); - rowvals = vector("list",fileSliceSize); - for(i in 1:fileSliceSize) { - temp = ""; - if( fileSkipColumns > 0L ) { - temp = scan(file = fid, what = character(), n = fileSkipColumns, quiet = TRUE,sep = fileDelimiter); - } - rowtag[i] = temp[rowNamesColumn];#paste(temp,collapse=" "); - rowvals[[i]] = scan(file = fid, what = double(), nlines = 1, quiet = TRUE, sep = fileDelimiter, na.strings = fileOmitCharacters); - if( length(rowvals[[i]]) == 0L ) { - if(i==1L) { - rowtag = matrix(0, 0, 0); - rowvals = character(0); - } else { - rowtag = rowtag[ 1:(i-1) ]; - rowvals = rowvals[ 1:(i-1) ]; - } - break; - } - } - if( length(rowtag) == 0L ) { - curSliceId = curSliceId - 1L; - break; - } - rowNameSlices[[curSliceId]] <<- rowtag; - data = c(rowvals, recursive = TRUE); - dim(data) = c(length(rowvals[[1]]), length(rowvals)); - data = t(data); - setSlice(curSliceId, data); - if( length(rowtag) < fileSliceSize ) { - break; - } - numtxt = formatC(curSliceId*fileSliceSize, big.mark=",", format = "f", digits = 0) - cat( "Rows read: ", numtxt, "\n"); - flush.console() - } - close(fid) - if( fileSkipRows == 0 ) { - columnNames <<- paste("Col_", (1:nCols()), sep=""); - } else { - columnNames <<- tail(columnNames, ncol(getSliceRaw(1))); - } - if( fileSkipColumns == 0 ) { - cnt = 0L; - for( sl in 1:nSlices() ) { - nr = length(getSliceRaw(sl)); - rowNameSlices[[sl]] <<- paste("Row_",cnt + (1:nr),sep=""); - cnt = cnt + nr; - } - } - rowNameSlices <<- rowNameSlices[1:curSliceId]; - cat("Rows read: ", nRows(), " done.\n"); - return(invisible(.self)); - }, - SaveFile = function(filename) { - if( nSlices() == 0 ) { - cat("No data to save"); - return(); - } - fid = file(filename,"wt"); - for( sl in 1:nSlices() ) { - z = getSlice(sl); - rownames(z) = rowNameSlices[[sl]]; - colnames(z) = columnNames; - write.table(z, file = fid, sep = "\t", - col.names = (if(sl == 1){NA}else{FALSE})); - } - close(fid); - }, - nRows = function() { - s = 0L; - for(sl in .seq(1,nSlices())) { - s = s + nrow(getSliceRaw(sl)); - } - return( s ) - }, - nCols = function() { - if( nSlices() == 0L ) { - return(0L); - } else { - return( ncol(getSliceRaw(1L)) ) - } - }, - Clear = function() { - for( sl in .seq(1,nSlices()) ) { - rm(list = paste(sl), envir = dataEnv) - } - nSlices1 <<- 0L; - rowNameSlices <<- list(); - columnNames <<- character(); - return(invisible(.self)); - }, - IsCombined = function() { - return( nSlices() <= 1L ); - }, - GetAllRowNames = function() { - return( c(rowNameSlices, recursive=TRUE) ); - }, - SetNanRowMean = function() { - if( (nCols() == 0L) ) { - return(invisible(.self)); - } - for( sl in .seq(1,nSlices()) ) { - slice = getSlice(sl); - if( any(is.na(slice)) ) { - rowmean = rowMeans(slice, na.rm = TRUE); - rowmean[is.na(rowmean)] = 0L; - for( j in which(!complete.cases(slice)) ) { - where1 = is.na(slice[j, ]); - slice[j, where1] = rowmean[j]; - } - setSlice(sl, slice); - } - } - return(invisible(.self)); - }, - RowStandardizeCentered = function() { - for(sl in .seq(1,nSlices()) ) { - slice = getSlice(sl); - div = sqrt( rowSums(slice^2) ); - div[ div == 0 ] = 1; - setSlice(sl, slice/div); - } - return(invisible(.self)); - }, - CombineInOneSlice = function() { - if( nSlices() <= 1L ) { - return(invisible(.self)); - } - nc = nCols(); - nr = nRows(); - datatypes = c("raw","integer","double"); - datafuns = c(as.raw, as.integer, as.double); - datatype = character(nSlices()); - for(sl in 1:nSlices()) { - datatype[sl] = typeof(getSliceRaw(sl)); - } - mch = max(match(datatype,datatypes,nomatch = length(datatypes))); - datafun = datafuns[[mch]]; - newData = matrix(datafun(0), nrow = nr, ncol = nc); - offset = 0; - for(sl in 1:nSlices()) { - if(mch==1) { - slice = getSliceRaw(sl); - } else { - slice = getSlice(sl); - } - newData[ offset + (1:nrow(slice)),] = datafun(slice); - setSlice(sl, numeric()); - offset = offset + nrow(slice); - } - - nSlices1 <<- 1L; - setSliceRaw(1L, newData); - rm(newData); - - newrowNameSlices = GetAllRowNames(); - rowNameSlices <<- list(newrowNameSlices) - return(invisible(.self)); - }, - ResliceCombined = function(sliceSize = -1) { - if( sliceSize > 0L ) { - fileSliceSize <<- sliceSize; - } - if( fileSliceSize <= 0 ) { - fileSliceSize <<- 1000; - } - if( IsCombined() ) { - nRows1 = nRows(); - if(nRows1 == 0L) { - return(invisible(.self)); - } - newNSlices = floor( (nRows1 + fileSliceSize - 1)/fileSliceSize ); - oldData = getSliceRaw(1L); - #oldNames = rowNameSlices[[1]]; - newNameslices = vector("list",newNSlices) - for( sl in 1:newNSlices ) { - range = (1+(sl-1)*fileSliceSize) : (min(nRows1,sl*fileSliceSize)); - newpart = oldData[range, ,drop = FALSE]; - if( is.raw(oldData) ) { - setSliceRaw( sl, newpart); - } else { - setSlice( sl, newpart); - } - newNameslices[[sl]] = rowNameSlices[[1]][range]; - } - rowNameSlices <<- newNameslices ; - } else { - stop("Reslice of a sliced matrix is not supported yet. Use CombineInOneSlice first."); - } - return(invisible(.self)); - }, - Clone = function() { - clone = SlicedData$new(); - for(sl in .seq(1,nSlices()) ) { - clone$setSliceRaw(sl,getSliceRaw(sl)); - } - clone$rowNameSlices = rowNameSlices; - clone$columnNames = columnNames; - clone$fileDelimiter = fileDelimiter; - clone$fileSkipColumns = fileSkipColumns; - clone$fileSkipRows = fileSkipRows; - clone$fileSliceSize = fileSliceSize; - clone$fileOmitCharacters = fileOmitCharacters; - return( clone ); - }, - RowMatrixMultiply = function(multiplier) { - for(sl in .seq(1,nSlices()) ) { - setSlice(sl, getSlice(sl) %*% multiplier); - } - return(invisible(.self)); - }, - ColumnSubsample = function(subset) { - for(sl in .seq(1,nSlices()) ) { - setSliceRaw(sl, getSliceRaw(sl)[ ,subset, drop = FALSE]); - } - columnNames <<- columnNames[subset]; - return(invisible(.self)); - }, - RowReorderSimple = function(ordr) { - # had to use an inefficient and dirty method - # due to horrible memory management in R - if( (typeof(ordr) == "logical") && all(ordr) ) { - return(invisible(.self)); - } - if( (length(ordr) == nRows()) && all(ordr == (1:length(ordr))) ) { - return(invisible(.self)); - } - CombineInOneSlice(); - gc(); - setSliceRaw( 1L, getSliceRaw(1L)[ordr, ] ); - rowNameSlices[[1]] <<- rowNameSlices[[1]][ordr]; - gc(); - ResliceCombined(); - gc(); - return(invisible(.self)); - }, - RowReorder = function(ordr) { - # transform logical into indices - if( typeof(ordr) == "logical" ) { - if( length(ordr) == nRows() ) { - ordr = which(ordr); - } else { - stop("Parameter \"ordr\" has wrong length") - } - } - ## first, check that anything has to be done at all - if( (length(ordr) == nRows()) && all(ordr == (1:length(ordr))) ) { - return(invisible(.self)); - } - ## check bounds - #if( (min(ordr) < 1) || (max(ordr) > nRows()) ) { - # stop("Parameter \"ordr\" is out of bounds"); - #} - ## slice the data into individual rows - all_rows = vector("list", nSlices()) - for( i in 1:nSlices() ) { - slice = getSliceRaw(i) - all_rows[[i]] = split(slice, 1:nrow(slice)) - setSliceRaw(i,numeric()) - } - gc(); - all_rows = unlist(all_rows, recursive=FALSE, use.names = FALSE); - ## Reorder the rows - all_rows = all_rows[ordr]; - ## get row names - all_names = GetAllRowNames(); - ## erase the set - rowNameSlices <<- list(); - ## sort names - all_names = all_names[ordr]; - ## - ## Make slices back - nrows = length(all_rows); - nSlices1 <<- as.integer((nrows+fileSliceSize-1)/fileSliceSize); - ##cat(nrows, " ", nSlices1); - rowNameSlices1 = vector("list", nSlices1); - for( i in 1:nSlices1 ) { - fr = 1 + fileSliceSize*(i-1); - to = min( fileSliceSize*i, nrows); - - subset = all_rows[fr:to]; - types = unlist(lapply(subset,typeof)); - israw = (types == "raw") - if(!all(israw == israw[1])) { - # some raw and some are not - subset = lapply(subset, function(x){if(is.raw(x)){x=as.integer(x);x[x==255] = NA;return(x)}else{return(x)}}); - } - subset = unlist(subset); - dim(subset) = c( length(all_rows[[fr]]) , to - fr + 1) - #subset = matrix(subset, ncol = (to-fr+1)); - if(is.raw(subset)) { - setSliceRaw(i, t(subset)); - } else { - setSlice(i, t(subset)); - } - rowNameSlices1[[i]] = all_names[fr:to]; - all_rows[fr:to] = 0; - all_names[fr:to] = 0; - } - rowNameSlices <<- rowNameSlices1; - gc(); - return(invisible(.self)); - }, - RowRemoveZeroEps = function(){ - for(sl in .seq(1,nSlices()) ) { - slice = getSlice(sl); - amean = rowMeans(abs(slice)); - remove = (amean < .Machine$double.eps*nCols()); - if(any(remove)) { - rowNameSlices[[sl]] <<- rowNameSlices[[sl]][!remove]; - setSlice(sl, slice[!remove, , drop = FALSE]); - } - } - return(invisible(.self)); - }, - FindRow = function(rowname) { - for(sl in .seq(1,nSlices()) ) { - mch = match(rowname,rowNameSlices[[sl]], nomatch = 0); - if( mch > 0 ) - { - row = getSlice(sl)[mch[1], , drop=FALSE]; - rownames(row) = rowname; - colnames(row) = columnNames; - return( list(slice = sl, item = mch, row = row) ); - } - } - return( NULL ); - }, - show = function() { - cat("SlicedData object. For more information type: ?SlicedData\n"); - cat("Number of columns:", nCols(), "\n"); - cat("Number of rows:", nRows(), "\n"); - cat("Data is stored in", nSlices(), "slices\n"); - if(nCols()>0) { - z = getSlice(1L); - if(nrow(z)>0) { - z = z[1:min(nrow(z),10L), 1:min(ncol(z),10L), drop = FALSE]; - rownames(z) = rowNameSlices[[1]][1:nrow(z)]; - colnames(z) = columnNames[1:ncol(z)]; - cat("Top left corner of the first slice (up to 10x10):\n"); - methods:::show(z) - } - } - } - )) - -setGeneric("nrow") -setMethod("nrow", "SlicedData", function(x) { - return( x$nRows() ); - }) -setGeneric("NROW") -setMethod("NROW", "SlicedData", function(x) { - return( x$nRows() ); - }) -setGeneric("ncol") -setMethod("ncol", "SlicedData", function(x) { - return( x$nCols() ); - }) -setGeneric("NCOL") -setMethod("NCOL", "SlicedData", function(x) { - return( x$nCols() ); - }) -setGeneric("dim") -setMethod("dim", "SlicedData", function(x) { - return( c(x$nRows(),x$nCols()) ); - }) -setGeneric("colnames") -setMethod("colnames", "SlicedData", function(x) { - return( x$columnNames ); - }) -setGeneric("rownames") -setMethod("rownames", "SlicedData", function(x) { - return( x$GetAllRowNames() ); - }) -setMethod("[[", "SlicedData", function(x,i) { - return( x$getSlice(i) ); - }) -setGeneric("length") -setMethod("length", "SlicedData", function(x) { - return( x$nSlices() ); - }) -setMethod("[[<-", "SlicedData", function(x,i,value) { - x$setSlice(i, value); - return(x); -}) -summary.SlicedData = function(object, ...) { - z = c(nCols = object$nCols(), nRows = object$nRows(), nSlices = object$nSlices()); - return(z); -} - -##### setGeneric("summary") ##### -#setMethod("summary", "SlicedData", function(object, ...) { -# z = c(nCols = object$nCols(), nRows = object$nRows(), nSlices = object$nSlices()); -# return(z); -# }) -#setGeneric("show", standardGeneric("show")) -# setMethod("show", "SlicedData", function(object) { -# cat("SlicedData object. For more information type: ?SlicedData\n"); -# cat("Number of columns:", object$nCols(), "\n"); -# cat("Number of rows:", object$nRows(), "\n"); -# cat("Data is stored in", object$nSlices(), "slices\n"); -# if(object$nSlices()>0) { -# z = object$getSlice(1); -# if(nrow(z)>0) { -# z = z[1:min(nrow(z),10), 1:min(ncol(z),10), drop = FALSE]; -# rownames(z) = object$rowNameSlices[[1]][1:nrow(z)]; -# colnames(z) = object$columnNames[1:ncol(z)]; -# cat("Top left corner of the first slice (up to 10x10):\n"); -# show(z) -# } -# } -# }) - -setGeneric("as.matrix") -setMethod("as.matrix", "SlicedData", function(x) { - if(x$nSlices() == 0) { - return( matrix(0,0,0) ); - } - if(x$nSlices() > 1) { - copy = x$Clone(); - copy$CombineInOneSlice(); - } else { - copy = x; - } - mat = copy$getSlice(1L); - rownames(mat) = rownames(copy); - colnames(mat) = colnames(copy); - return( mat ); - }) -setGeneric("colnames<-") -setMethod("colnames<-", "SlicedData", function(x,value) { - stopifnot( class(value) == "character" ); - stopifnot( length(value) == x$nCols() ); - x$columnNames = value; - return(x); - }) -setGeneric("rownames<-") -setMethod("rownames<-", "SlicedData", function(x,value) { - stopifnot( class(value) == "character" ); - stopifnot( length(value) == x$nRows() ); - start = 1; - newNameSlices = vector("list", x$nSlices()); - for( i in .seq(1,x$nSlices()) ) { - nr = nrow(x$getSliceRaw(i)); - newNameSlices[[i]] = value[ start:(start+nr-1) ]; - start = start + nr; - } - x$rowNameSlices = newNameSlices; - return(x); - }) -setGeneric("rowSums") -setMethod("rowSums", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { - if(x$nSlices() == 0) { - return( numeric() ); - } - stopifnot( dims == 1 ); - thesum = vector("list", x$nSlices()); - for( i in 1:x$nSlices() ) { - thesum[[i]] = rowSums(x$getSlice(i), na.rm) - } - return(unlist(thesum, recursive = FALSE, use.names = FALSE)); - }) -setGeneric("rowMeans") -setMethod("rowMeans", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { - if(x$nSlices() == 0) { - return( numeric() ); - } - stopifnot( dims == 1 ); - thesum = vector("list", x$nSlices()); - for( i in 1:x$nSlices() ) { - thesum[[i]] = rowMeans(x$getSlice(i), na.rm) - } - return(unlist(thesum, recursive = FALSE, use.names = FALSE)); - }) -setGeneric("colSums") -setMethod("colSums", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { - if(x$nCols() == 0) { - return( numeric() ); - } - stopifnot( dims == 1 ); - thesum = 0; - for( i in .seq(1,x$nSlices()) ) { - thesum = thesum + colSums(x$getSlice(i), na.rm) - } - return(thesum); - }) -setGeneric("colMeans") -setMethod("colMeans", "SlicedData", function(x, na.rm = FALSE, dims = 1L) { - if(x$nCols() == 0) { - return( numeric() ); - } - stopifnot( dims == 1 ); - thesum = 0; - thecounts = x$nRows(); - for( i in .seq(1,x$nSlices()) ) { - slice = x$getSlice(i); - thesum = thesum + colSums(slice, na.rm) - if( na.rm ) { - thecounts = thecounts - colSums(is.na(slice)) - } - } - return(thesum/thecounts); - }) - -.listBuilder <- setRefClass(".listBuilder", - fields = list( - dataEnv = "environment", - n = "numeric" - ), - methods = list( - initialize = function() { - dataEnv <<- new.env(hash = TRUE); - n <<- 0; -# cumlength <<- 0; - return(.self); - }, - add = function(x) { - if(length(x) > 0) { - n <<- n + 1; -# cumlength <<- cumlength + length(x); - assign(paste(n), x, dataEnv ); - } - return(.self); - }, - set = function(i,x) { - if(length(x) > 0) { - if(i>n) - n <<- i; - assign(paste(i), x, dataEnv ); - } - return(.self); - }, - get = function(i) { - return(base::get(paste(i),dataEnv)); - }, - list = function() { - if(n==0) return(list()); - result = vector('list',n); - for( i in 1:n) { - result[[i]] = .self$get(i); - } - return(result); - }, - unlist = function() { - return(base::unlist(.self$list(), recursive=FALSE, use.names = FALSE)); - }, - show = function() { - cat(".listBuilder object.\nIternal object in MatrixEQTL package.\n"); - cat("Number of elements:", object$n, "\n"); - } - )) - -.histogrammer <- setRefClass(".histogrammer", - fields = list( - pvbins1 = "numeric", - statbins1 = "numeric", - hist.count = "numeric" - ), - methods = list( - initialize = function (pvbins, statbins) { - if(length(pvbins)) { - ord = order(statbins); - pvbins1 <<- pvbins[ord]; - statbins1 <<- statbins[ord]; - hist.count <<- double(length(pvbins)-1); - } - return(.self); - }, - update = function(stats.for.hist) { - h = hist(stats.for.hist, breaks = statbins1, include.lowest = TRUE, right = TRUE, plot = FALSE)$counts; - hist.count <<- hist.count + h; - }, - getResults = function() { - if(!is.unsorted(pvbins1)) { - return(list(hist.bins = pvbins1 , hist.counts = hist.count )); - } else { - return(list(hist.bins = rev(pvbins1), hist.counts = rev(hist.count))); - } - } - )) - - -.minpvalue <- setRefClass(".minpvalue", - fields = list( - sdata = ".listBuilder", - gdata = ".listBuilder" - ), - methods = list( - initialize = function(snps, gene) { - sdata <<- .listBuilder$new(); - for( ss in 1:snps$nSlices() ) { - sdata$set( ss, double(nrow(snps$getSliceRaw(ss)))); - } - gdata <<- .listBuilder$new(); - for( gg in 1:gene$nSlices() ) { - gdata$set( gg, double(nrow(gene$getSliceRaw(gg)))); - } - return(.self); - }, - update = function(ss, gg, astat) { - gmax = gdata$get(gg) - z1 = max.col(astat,ties.method='first'); - z11 = astat[1:nrow(astat) + nrow(astat) * (z1 - 1)]; - gmax = pmax(gmax, z11); - gdata$set(gg, gmax); - - smax = sdata$get(ss) - z22 = apply(astat,2,max); - smax = pmax(smax, z22); - sdata$set(ss, smax); - return(.self); - }, - updatecis = function(ss, gg, select.cis, astat) { - if(length(astat)>0) - { - byrows = aggregate(x=astat, by=list(row=select.cis[,1]), FUN=max); - bycols = aggregate(x=astat, by=list(col=select.cis[,2]), FUN=max); - - gmax = gdata$get(gg); - gmax[byrows$row] = pmax(gmax[byrows$row], byrows$x) - gdata$set(gg, gmax); - - smax = sdata$get(ss) - smax[bycols$col] = pmax(smax[bycols$col], bycols$x) - sdata$set(ss, smax); - } - return(.self); - }, - getResults = function(snps, gene, pvfun) { - min.pv.snps = pvfun(sdata$unlist()); - names(min.pv.snps) = rownames(snps); - min.pv.gene = pvfun(gdata$unlist()); - names(min.pv.gene) = rownames(gene); - return(list(min.pv.snps = min.pv.snps, min.pv.gene = min.pv.gene)); - } - )) - -.OutputSaver_FRD <- setRefClass(".OutputSaver_FRD", - fields = list( - sdata = ".listBuilder", - gdata = ".listBuilder", - cdata = ".listBuilder", - bdata = ".listBuilder", - fid = "list", - testfun1 = "list", - pvfun1 = "list" - ), - methods = list( - initialize = function () { - sdata <<- .listBuilder$new(); - gdata <<- .listBuilder$new(); - cdata <<- .listBuilder$new(); - bdata <<- .listBuilder$new(); - fid <<- list(0); - testfun1 <<- list(0); - pvfun1 <<- list(0); - return(.self); - }, - start = function(filename, statistic_name, unused1, unused2, testfun, pvfun) { - testfun1 <<- list(testfun); - pvfun1 <<- list(pvfun); - if(length(filename) > 0) { - if(class(filename) == "character") { - fid <<- list(file(description = filename, open = "wt", blocking = FALSE, raw = FALSE), TRUE); - } else { - fid <<- list(filename, FALSE) - } - writeLines( paste("SNP\tgene\t",statistic_name,"\tp-value\tFDR", sep = ""), fid[[1]]); - } else { - fid <<- list(); - } - }, - update = function(spos, gpos, sta, beta = NULL) { - if(length(sta)>0) { - sdata$add(spos); - gdata$add(gpos); - cdata$add(sta ); - if(!is.null(beta )) - bdata$add(beta ); - } - return(.self); - }, - getResults = function( gene, snps, FDR_total_count) { - pvalues = NULL; - if(cdata$n > 0) { - tests = testfun1[[1]](cdata$unlist()); - cdata <<- .listBuilder$new(); - - pvalues = pvfun1[[1]](tests); - ord = order(pvalues); - - tests = tests[ord]; - pvalues = pvalues[ord]; - - FDR = pvalues * FDR_total_count / (1:length(pvalues)); - FDR[length(FDR)] = min(FDR[length(FDR)], 1); - FDR = rev(cummin(rev(FDR))); - - snps_names = snps$GetAllRowNames()[sdata$unlist()[ord]]; - sdata <<- .listBuilder$new(); - gene_names = gene$GetAllRowNames()[gdata$unlist()[ord]]; - gdata <<- .listBuilder$new(); - - beta = NULL; - if(bdata$n > 0) - beta = bdata$unlist()[ord]; - - if(length(fid)>0) { - step = 1000; ########### 100000 - for( part in 1:ceiling(length(FDR)/step) ) { - fr = (part-1)*step + 1; - to = min(part*step, length(FDR)); - dump = data.frame(snps_names[fr:to], - gene_names[fr:to], - if(is.null(beta)) tests[fr:to] else list(beta[fr:to],tests[fr:to]), - pvalues[fr:to], - FDR[fr:to], - row.names = NULL, - check.rows = FALSE, - check.names = FALSE, - stringsAsFactors = FALSE); - write.table(dump, file = fid[[1]], quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE); - } - } - } else { - cat("No significant associations were found.\n", file = if(length(fid)>0){fid[[1]]}else{""}); - } - if(length(fid)>0) { - if(fid[[2]]) { - close(fid[[1]]); - } - fid <<- list(); - } - - if(!is.null(pvalues)) { - eqtls = list( snps = snps_names, - gene = gene_names, - statistic = tests, - pvalue = pvalues, - FDR = FDR); - if(!is.null(beta)) - eqtls$beta = beta; - } else { - eqtls = list( snps = character(), - gene = character(), - beta = numeric(), - statistic = numeric(), - pvalue = numeric(), - FDR = numeric()); - } - return(list(eqtls = data.frame(eqtls))); - } - ) -) - - -.OutputSaver_direct <- setRefClass(".OutputSaver_direct", - fields = list( - gene_names = "character", - snps_names = "character", - fid = "list", - testfun1 = "list", - pvfun1 = "list" - ), - methods = list( - initialize = function() { - gene_names <<- character(0); - snps_names <<- character(0); - fid <<- list(0); - testfun1 <<- list(0); - pvfun1 <<- list(0); - return(.self); - }, - start = function(filename, statistic_name, snps, gene, testfun, pvfun) { - # I hope the program stops if it fails to open the file - if(class(filename) == "character") { - fid <<- list(file(description = filename, open = "wt", blocking = FALSE, raw = FALSE), TRUE); - } else { - fid <<- list(filename, FALSE) - } - writeLines(paste("SNP\tgene\t", statistic_name, "\tp-value", sep = ""), fid[[1]]); - gene_names <<- gene$GetAllRowNames(); - snps_names <<- snps$GetAllRowNames(); - testfun1 <<- list(testfun); - pvfun1 <<- list(pvfun); - }, - update = function(spos, gpos, sta, beta = NULL) { - if( length(sta) == 0 ) - return(); - sta = testfun1[[1]](sta); - lst = list(snps = snps_names[spos], gene = gene_names[gpos], beta = beta, statistic = sta, pvalue = pvfun1[[1]](sta)); - lst$beta = lst$beta; - - dump2 = data.frame(lst, row.names = NULL, check.rows = FALSE, check.names = FALSE, stringsAsFactors = FALSE); - write.table(dump2, file = fid[[1]], quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE); - }, - getResults = function(...) { - if(length(fid)>0) { - if(fid[[2]]) { - close(fid[[1]]); - } - fid <<- list(); - } - gene_names <<- character(0); - snps_names <<- character(0); - return(list()); - } - ) -) - -.my.pmin = function(x, val) { - # minimum "pmin" function that can handle empty array - if(length(x) == 0) { - return(x) - } else { - return(pmin.int(x,val)); - } -} - -.my.pmax = function(x, val) { - # minimum "pmin" function that can handle empty array - if(length(x) == 0) { - return(x) - } else { - return(pmax.int(x,val)); - } -} - -.pv.nz = function(x){return( .my.pmax(x,.Machine$double.xmin) )} - -.SetNanRowMean = function(x) { - if( any(is.na(x)) ) { - rowmean = rowMeans(x, na.rm = TRUE); - rowmean[ is.na(rowmean) ] = 0; - for( j in which(!complete.cases(x)) ) { - where1 = is.na( x[j, ] ); - x[j,where1] = rowmean[j]; - } - } - return(x); -} - -# .SNP_process_split_for_ANOVA = function(x) { -# # split into 2 dummy variables -# -# uniq = unique(c(x)); -# uniq = uniq[!is.na(uniq)]; -# -# if( length(uniq) > 3 ) { -# stop("More than three genotype categories is not handled by ANOVA"); -# } else if ( length(uniq) < 3 ) { -# uniq = c(uniq, min(uniq)-(1:(3-length(uniq)))); -# } -# -# freq = matrix(0, nrow(x), length(uniq)); -# for(i in 1:length(uniq)) { -# freq[ ,i] = rowSums(x==uniq[i], na.rm = TRUE); -# } -# -# md = apply(freq, 1, which.max); -# freq[ cbind(1:nrow(x),md) ] = -1; -# -# md = apply(freq, 1, which.max); # min(freq[cbind(1:nrow(slice),md)] - rowSums(select,na.rm = TRUE )) -# new_slice1 = (x == uniq[md]); -# new_slice1[is.na(new_slice1)] = 0; -# freq[ cbind(1:nrow(x),md) ] = -1; -# -# md = apply(freq,1,which.max); -# new_slice2 = (x == uniq[md]); -# new_slice2[ is.na(new_slice2) ] = 0; -# rez = vector("list", 2); -# rez[[1]] = new_slice1; -# rez[[2]] = new_slice2; -# return( rez ); -# } - -.SNP_process_split_for_ANOVA = function(x,n.groups) { - # split into 2 dummy variables (or more) - -# # Get the number of ANOVA groups -# n.groups = options('MatrixEQTL.ANOVA.categories')[[1]]; -# if( is.null(n.groups)) -# n.groups = 3; - - # Unique values in x (make sure it has length of n.groups); - uniq = unique(as.vector(x)); - uniq = uniq[!is.na(uniq)]; - if( length(uniq) > n.groups ) { - stop("More than declared number of genotype categories is detected by ANOVA"); - } else if ( length(uniq) < n.groups ) { - uniq = c(uniq, rep(min(uniq)-1, n.groups-length(uniq))); - } - - # Table of frequencies for each variable (row) - freq = matrix(0, nrow(x), n.groups); - for(i in 1:n.groups) { - freq[ ,i] = rowSums(x==uniq[i], na.rm = TRUE); - } - # remove NA's from x for convenience - x[is.na(x)] = min(uniq)-2; - - # Output list of matrices - rez = vector('list',n.groups-1); - - # Skip the most frequent value - md = apply(freq, 1, which.max); # most frequent value for each variable - freq[ cbind(1:nrow(x),md) ] = -1; - - # The rest form dumm - for(j in 1:(n.groups-1)){ - md = apply(freq, 1, which.max); - freq[ cbind(1:nrow(x),md) ] = -1; - rez[[j]] = (x == uniq[md]); - } - return( rez ); -} - -Matrix_eQTL_engine = function( - snps, - gene, - cvrt = SlicedData$new(), - output_file_name, - pvOutputThreshold = 1e-5, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - pvalue.hist = FALSE, - min.pv.by.genesnp = FALSE, - noFDRsaveMemory = FALSE) { - rez = Matrix_eQTL_main( - snps = snps, - gene = gene, - cvrt = cvrt, - output_file_name = output_file_name, - pvOutputThreshold = pvOutputThreshold, - useModel = useModel, - errorCovariance = errorCovariance, - verbose = verbose, - pvalue.hist = pvalue.hist, - min.pv.by.genesnp = min.pv.by.genesnp, - noFDRsaveMemory = noFDRsaveMemory); - return( rez ); -} - -Matrix_eQTL_main = function( - snps, - gene, - cvrt = SlicedData$new(), - output_file_name = "", - pvOutputThreshold = 1e-5, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - output_file_name.cis = "", - pvOutputThreshold.cis = 0, - snpspos = NULL, - genepos = NULL, - cisDist = 1e6, - pvalue.hist = FALSE, - min.pv.by.genesnp = FALSE, - noFDRsaveMemory = FALSE) { - ################################# Basic variable checks ################################# - { - # status("Performing basic checks of the input variables"); - stopifnot( "SlicedData" %in% class(gene) ); - stopifnot( "SlicedData" %in% class(snps) ); - stopifnot( "SlicedData" %in% class(cvrt) ); - - # Check dimensions - if( min(snps$nRows(),snps$nCols()) == 0 ) - stop("Empty genotype dataset"); - if( min(gene$nRows(),gene$nCols()) == 0 ) - stop("Empty expression dataset"); - if( snps$nCols() != gene$nCols() ) - stop("Different number of samples in the genotype and gene expression files"); - if( cvrt$nRows()>0 ) { - if( snps$nCols() != cvrt$nCols() ) - stop("Wrong number of samples in the matrix of covariates"); - } - - stopifnot( class(pvOutputThreshold) == "numeric" ); - stopifnot( length(pvOutputThreshold) == 1 ); - stopifnot( pvOutputThreshold >= 0 ); - stopifnot( pvOutputThreshold <= 1 ); - - stopifnot( class(noFDRsaveMemory) == "logical" ); - stopifnot( length(noFDRsaveMemory) == 1 ); - - if( pvOutputThreshold > 0 ) { - stopifnot( !((length(output_file_name) == 0) && noFDRsaveMemory) ) - stopifnot( length(output_file_name) <= 1 ); - if( length(output_file_name) == 1 ) { - stopifnot( class(output_file_name) %in% c("character","connection") ); - } - } - - stopifnot( class(pvOutputThreshold.cis) == "numeric" ); - stopifnot( length(pvOutputThreshold.cis) == 1 ); - stopifnot( pvOutputThreshold.cis >= 0 ); - stopifnot( pvOutputThreshold.cis <= 1 ); - stopifnot( !((pvOutputThreshold > 0) & (pvOutputThreshold.cis > 0) & (pvOutputThreshold > pvOutputThreshold.cis)) ); - stopifnot( (pvOutputThreshold > 0) | (pvOutputThreshold.cis > 0) ); - - stopifnot( class(useModel) == class(modelLINEAR) ); - stopifnot( length(useModel) == 1 ); - stopifnot( useModel %in% c(modelLINEAR, modelANOVA, modelLINEAR_CROSS) ); - if( useModel %in% c(modelLINEAR, modelLINEAR_CROSS) ) { - if( snps$nCols() <= cvrt$nRows() + 1 + 1) { - stop('The number of covariates exceeds the number of samples.\nLinear regression can not be fit.') - } - } - if( useModel == modelLINEAR_CROSS ) { - if( cvrt$nRows() == 0 ) { - stop( "Model \"modelLINEAR_CROSS\" requires at least one covariate" ); - } - } - if( useModel == modelANOVA ) { - n.anova.groups = getOption('MatrixEQTL.ANOVA.categories', 3); - stopifnot( n.anova.groups == floor(n.anova.groups) ); - stopifnot( n.anova.groups >= 3 ); -# stopifnot( n.anova.groups < snps$nCols() - cvrt$nRows() - 2 ); - if( snps$nCols() <= cvrt$nRows() + n.anova.groups) { - stop('The number of covariates exceeds the number of samples.\nLinear regression (ANOVA) can not be fit.') - } - } - - stopifnot( class(verbose) == "logical" ); - stopifnot( length(verbose) == 1 ); - - stopifnot( class(min.pv.by.genesnp) == "logical" ); - stopifnot( length(min.pv.by.genesnp) == 1 ); - - if( pvOutputThreshold.cis > 0 ) { - stopifnot( !((length(output_file_name.cis) == 0) && noFDRsaveMemory) ) - stopifnot( length(output_file_name.cis) <= 1 ); - if( length(output_file_name.cis) == 1 ) { - stopifnot( class(output_file_name.cis) %in% c("character","connection") ); - } - -# stopifnot( class(output_file_name.cis) == "character" ); -# stopifnot( length(output_file_name.cis) == 1 ); - stopifnot( class(snpspos) == "data.frame" ); - stopifnot( ncol(snpspos) == 3 ); - stopifnot( nrow(snpspos) > 0 ); - stopifnot( class(snpspos[1,3]) %in% c("integer", "numeric") ) - stopifnot( !any(is.na(snpspos[,3])) ) - stopifnot( class(genepos) == "data.frame" ); - stopifnot( ncol(genepos) == 4 ); - stopifnot( nrow(genepos) > 0 ); - stopifnot( class(genepos[1,3]) %in% c("integer", "numeric") ) - stopifnot( class(genepos[1,4]) %in% c("integer", "numeric") ) - stopifnot( !any(is.na(genepos[,3])) ) - stopifnot( !any(is.na(genepos[,4])) ) - stopifnot( nzchar(output_file_name.cis) ) - } - - if( pvOutputThreshold > 0 ) { - stopifnot( nzchar(output_file_name) ) - } - - stopifnot( class(errorCovariance) %in% c("numeric", "matrix") ); - errorCovariance = as.matrix(errorCovariance); - if(length(errorCovariance)>0) { - if( nrow(errorCovariance) != ncol(errorCovariance) ) { - stop("The covariance matrix is not square"); - } - if( nrow(errorCovariance) != snps$nCols() ) { - stop("The covariance matrix size does not match the number of samples"); - } - if( !all(errorCovariance == t(errorCovariance)) ) { - stop("The covariance matrix is not symmetric"); - } - } - } - ################################# Initial setup ######################################### - { - gene.std = .listBuilder$new(); - snps.std = .listBuilder$new(); - - dont.clone.gene = getOption('MatrixEQTL.dont.preserve.gene.object', FALSE) - if(is.null(dont.clone.gene)) - dont.clone.gene = FALSE; - - if( !dont.clone.gene ) - gene = gene$Clone(); - # snps = snps$Clone(); # snps is read only - cvrt = cvrt$Clone(); - - params = list( - output_file_name = output_file_name, - pvOutputThreshold = pvOutputThreshold, - useModel = useModel, - errorCovariance = errorCovariance , - verbose = verbose, - output_file_name.cis = output_file_name.cis, - pvOutputThreshold.cis = pvOutputThreshold.cis, - cisDist = cisDist , - pvalue.hist = pvalue.hist, - min.pv.by.genesnp = min.pv.by.genesnp); - - if( verbose ) { - lastTime = 0; - status <- function(text) { - # gc(); - newTime = proc.time()[3]; - if(lastTime != 0) { - cat("Task finished in ", newTime-lastTime, " seconds\n"); - } - cat(text,"\n"); - lastTime <<- newTime; - unused = flush.console(); - } - } else { - status = function(text){} - } - start.time = proc.time()[3]; - } - ################################# Error covariance matrix processing #################### - { - if( length(errorCovariance) > 0 ) { - status("Processing the error covariance matrix"); - eig = eigen(errorCovariance, symmetric = TRUE) - d = eig$values; - v = eig$vectors; - # errorCovariance == v %*% diag(d) %*% t(v) - # errorCovariance^0.5 == v*sqrt(d)*v" (no single quotes anymore) - # errorCovariance^(-0.5) == v*diag(1./sqrt(diag(d)))*v" - if( any(d<=0) ) { - stop("The covariance matrix is not positive definite"); - } - correctionMatrix = v %*% diag(1./sqrt(d)) %*% t(v); - rm( eig, v, d, errorCovariance ) - } else { - rm( errorCovariance ); - correctionMatrix = numeric(); - } - } - ################################# Matching gene and SNPs locations ###################### - if( pvOutputThreshold.cis > 0 ) { - status("Matching data files and location files") - - # names in the input data - gene_names = gene$GetAllRowNames(); - snps_names = snps$GetAllRowNames(); - - # gene range, set: left genepos[,4])) { - temp3 = genepos[,3]; - temp4 = genepos[,4]; - genepos[,3] = pmin(temp3,temp4); - genepos[,4] = pmax(temp3,temp4); - rm(temp3, temp4); - } - - # match with the location data - genematch = match( gene_names, genepos[ ,1], nomatch = 0L); - usedgene = matrix(FALSE, nrow(genepos), 1); # genes in "genepos" that are matching "gene_names" - usedgene[ genematch ] = TRUE; - if( !any(genematch) ) { - stop("Gene names do not match those in the gene location file."); - } - cat( sum(genematch>0), "of", length(gene_names), " genes matched\n"); - - - snpsmatch = match( snps_names, snpspos[ ,1], nomatch = 0L); - usedsnps = matrix(FALSE, nrow(snpspos),1); - usedsnps[ snpsmatch ] = TRUE; - if( !any(snpsmatch) ) { - stop("SNP names do not match those in the SNP location file."); - } - cat( sum(snpsmatch>0), "of", length(snps_names), " SNPs matched\n"); - - # list used chr names - chrNames = unique(c( as.character(unique(snpspos[usedsnps,2])), - as.character(unique(genepos[usedgene,2])) )) - chrNames = chrNames[ sort.list( suppressWarnings(as.integer(chrNames)), - method = "radix", na.last = TRUE ) ]; - # match chr names - genechr = match(genepos[,2],chrNames); - snpschr = match(snpspos[,2],chrNames); - - # max length of a chromosome - chrMax = max( snpspos[usedsnps, 3], genepos[usedgene, 4], na.rm = TRUE) + cisDist; - - # Single number location for all rows in "genepos" and "snpspos" - genepos2 = as.matrix(genepos[ ,3:4, drop = FALSE] + (genechr-1)*chrMax); - snpspos2 = as.matrix(snpspos[ ,3 , drop = FALSE] + (snpschr-1)*chrMax); - - # the final location arrays; - snps_pos = matrix(0,length(snps_names),1); - snps_pos[snpsmatch>0, ] = snpspos2[snpsmatch, , drop = FALSE]; - snps_pos[rowSums(is.na(snps_pos))>0, ] = 0; - snps_pos[snps_pos==0] = (length(chrNames)+1) * (chrMax+cisDist); - rm(snps_names, snpsmatch, usedsnps, snpschr, snpspos2) - - gene_pos = matrix(0,length(gene_names),2); - gene_pos[genematch>0, ] = genepos2[genematch, , drop = FALSE]; - gene_pos[rowSums(is.na(gene_pos))>0, ] = 0; - gene_pos[gene_pos==0] = (length(chrNames)+2) * (chrMax+cisDist); - rm(gene_names, genematch, usedgene, genechr, genepos2) - rm(chrNames, chrMax); - - if( is.unsorted(snps_pos) ) { - status("Reordering SNPs\n"); - ordr = sort.list(snps_pos); - snps$RowReorder(ordr); - snps_pos = snps_pos[ordr, , drop = FALSE]; - rm(ordr); - } - if( is.unsorted(rowSums(gene_pos)) ) { - status("Reordering genes\n"); - ordr = sort.list(rowSums(gene_pos)); - gene$RowReorder(ordr); - gene_pos = gene_pos[ordr, , drop = FALSE]; - rm(ordr); - } - - # Slice it back. - geneloc = vector("list", gene$nSlices()) - gene_offset = 0; - for(gc in 1:gene$nSlices()) { - nr = length(gene$rowNameSlices[[gc]]); - geneloc[[gc]] = gene_pos[gene_offset + (1:nr), , drop = FALSE]; - gene_offset = gene_offset + nr; - } - rm(gc, gene_offset, gene_pos); - - snpsloc = vector("list", snps$nSlices()) - snps_offset = 0; - for(sc in 1:snps$nSlices()) { - nr = length(snps$rowNameSlices[[sc]]); - snpsloc[[sc]] = snps_pos[snps_offset + (1:nr), , drop = FALSE]; - snps_offset = snps_offset + nr; - } - rm(nr, sc, snps_offset, snps_pos); - } - ################################# Covariates processing ################################# - { - status("Processing covariates"); - if( useModel == modelLINEAR_CROSS ) { - last.covariate = as.vector(tail( cvrt$getSlice(cvrt$nSlices()), n = 1)); - } - if( cvrt$nRows()>0 ) { - cvrt$SetNanRowMean(); - cvrt$CombineInOneSlice(); - cvrt = rbind(matrix(1,1,snps$nCols()),cvrt$getSlice(1)); - } else { - cvrt = matrix(1,1,snps$nCols()); - } - # Correct for the error covariance structure - if( length(correctionMatrix)>0 ) { - cvrt = cvrt %*% correctionMatrix; - } - # Orthonormalize covariates - # status("Orthonormalizing covariates"); - q = qr(t(cvrt)); - if( min(abs(diag(qr.R(q)))) < .Machine$double.eps * snps$nCols() ) { - stop("Colinear or zero covariates detected"); - } - cvrt = t( qr.Q(q) ); - rm( q ); - } - ################################# Gene expression processing ############################ - { - status("Processing gene expression data (imputation, residualization, etc.)"); - # Impute gene expression - gene$SetNanRowMean(); - # Correct for the error covariance structure - if( length(correctionMatrix)>0 ) { - gene$RowMatrixMultiply(correctionMatrix); - } - # Orthogonolize expression w.r.t. covariates - # status("Orthogonolizing expression w.r.t. covariates"); - gene_offsets = double(gene$nSlices()+1); - for( sl in 1:gene$nSlices() ) { - slice = gene$getSlice(sl); - gene_offsets[sl+1] = gene_offsets[sl] + nrow(slice); - rowsq1 = rowSums(slice^2); - slice = slice - tcrossprod(slice,cvrt) %*% cvrt; - rowsq2 = rowSums(slice^2); - # kill rows colinear with the covariates - delete.rows = (rowsq2 <= rowsq1 * .Machine$double.eps ); - slice[delete.rows] = 0; - div = sqrt( rowSums(slice^2) ); - div[ div == 0 ] = 1; - gene.std$set(sl, div); - gene$setSlice(sl, slice / div); - } - rm(rowsq1, rowsq2, delete.rows, div); - rm( sl, slice ); - #gene$RowRemoveZeroEps(); - } - ################################# snps_process, testfun, pvfun, threshfun, afun ######## - { - # snps_process - preprocess SNPs slice - # - # afun --- abs for signed stats, identity for non-negative - # threshfun --- internal stat threshold for given p-value - # testfun --- t or F statistic from the internal one - # pvfun --- p-value from the t or F statistic - - nSamples = snps$nCols(); - nGenes = gene$nRows(); - nSnps = snps$nRows(); - nCov = nrow(cvrt); - # nVarTested = length(snps_list); # set in case(useModel) - # dfNull = nSamples - nCov; - # d.f. of the full model - betafun = NULL; - - if( useModel == modelLINEAR ) { - snps_process = function(x) { - return( list(.SetNanRowMean(x)) ); - }; - nVarTested = 1; - dfFull = nSamples - nCov - nVarTested; - statistic.fun = function(mat_list) { - return( mat_list[[1]] ); - } - afun = function(x) {return(abs(x))}; - threshfun = function(pv) { - thr = qt(pv/2, dfFull, lower.tail = FALSE); - thr = thr^2; - thr = sqrt( thr / (dfFull + thr) ); - thr[pv >= 1] = 0; - thr[pv <= 0] = 1; - return( thr ); - } - testfun = function(x) { return( x * sqrt( dfFull / (1 - .my.pmin(x^2,1)))); } - pvfun = function(x) { return( .pv.nz(pt(-abs(x),dfFull)*2)); } - thresh.cis = threshfun(pvOutputThreshold.cis); - thresh = threshfun(pvOutputThreshold); - betafun = function(stat, ss, gg, select) { - return(stat * gene.std$get(gg)[select[,1]] / snps.std$get(ss)[select[,2]]); - } - } else - if( useModel == modelANOVA ) { - snps_process = function(x).SNP_process_split_for_ANOVA(x,n.anova.groups); - nVarTested = n.anova.groups - 1; - dfFull = nSamples - nCov - nVarTested; -# statistic.fun = function(mat_list) { -# return( mat_list[[1]]^2 + mat_list[[2]]^2 ); -# } - statistic.fun = function(mat_list) { - x = mat_list[[1]]^2; - for( j in 2:length(mat_list) ) - x = x + mat_list[[j]]^2; - return( x ); - } - afun = identity; - threshfun = function(pv) { - thr = qf(pv, nVarTested, dfFull, lower.tail = FALSE); - thr = thr / (dfFull/nVarTested + thr); - thr[pv >= 1] = 0; - thr[pv <= 0] = 1; - return( thr ); - } - testfun = function(x) { return( x / (1 - .my.pmin(x,1)) * (dfFull/nVarTested) ); } - pvfun = function(x) { return( .pv.nz(pf(x, nVarTested, dfFull, lower.tail = FALSE)) ); } - thresh.cis = threshfun(pvOutputThreshold.cis); - thresh = threshfun(pvOutputThreshold); - } else - if( useModel == modelLINEAR_CROSS ) { - last.covariate = as.vector( last.covariate ); - snps_process = .SNP_process_split_for_LINEAR_CROSS = function(x) { - out = vector("list", 2); - out[[1]] = .SetNanRowMean(x); - out[[2]] = t( t(out[[1]]) * last.covariate ); - return( out ); - }; - nVarTested = 1; - dfFull = nSamples - nCov - nVarTested - 1; - statistic.fun = function(mat_list) { - return( mat_list[[2]] / sqrt(1 - mat_list[[1]]^2) ); - } - afun = function(x) {return(abs(x))}; - threshfun = function(pv) { - thr = qt(pv/2, dfFull, lower.tail = FALSE); - thr = thr^2; - thr = sqrt( thr / (dfFull + thr) ); - thr[pv >= 1] = 0; - thr[pv <= 0] = 1; - return( thr ); - } - testfun = function(x) { return( x * sqrt( dfFull / (1 - .my.pmin(x^2,1)))); } - pvfun = function(x) { return( .pv.nz(pt(-abs(x),dfFull)*2 )); } - thresh.cis = threshfun(pvOutputThreshold.cis); - thresh = threshfun(pvOutputThreshold); - betafun = function(stat, ss, gg, select) { - return(stat * gene.std$get(gg)[select[,1]] / snps.std$get(ss)[select[,2]]); - } - } - params$dfFull = dfFull; - } - ################################# Saver class(es) creation ############################## - { - status("Creating output file(s)"); - if(noFDRsaveMemory) { - if( pvOutputThreshold > 0 ) { - saver.tra = .OutputSaver_direct$new(); - } - if( pvOutputThreshold.cis > 0 ) { - saver.cis = .OutputSaver_direct$new(); - } - } else { - if( pvOutputThreshold > 0 ) { - saver.tra = .OutputSaver_FRD$new(); - } - if( pvOutputThreshold.cis > 0 ) { - saver.cis = .OutputSaver_FRD$new(); - } - } - if( pvOutputThreshold > 0 ) - if( pvOutputThreshold * gene$nRows() * snps$nRows() > 1000000 ) - if(!noFDRsaveMemory) - cat('Warning: pvOutputThreshold may be too large.\nExpected number of findings > ', - pvOutputThreshold * gene$nRows() * snps$nRows(),'\n'); - if( (useModel == modelLINEAR) || (useModel == modelLINEAR_CROSS) ) { - statistic_name = "t-stat"; - } else if( useModel == modelANOVA ) { - statistic_name = "F-test"; - } - if(!is.null(betafun)) - statistic_name = paste('beta\t',statistic_name, sep=''); - if( pvOutputThreshold > 0 ) - saver.tra$start(output_file_name, statistic_name, snps, gene, testfun, pvfun); - if( pvOutputThreshold.cis > 0 ) - saver.cis$start(output_file_name.cis, statistic_name, snps, gene, testfun, pvfun); - rm( statistic_name ); - } - ################################# Some useful functions ################################# - { - orthonormalize.snps = function(cursnps, ss) { - for(p in 1:length(cursnps)) { - if(length(correctionMatrix)>0) { - cursnps[[p]] = cursnps[[p]] %*% correctionMatrix; - } - cursnps[[p]] = cursnps[[p]] - tcrossprod(cursnps[[p]],cvrt) %*% cvrt; - for(w in .seq(1,p-1L)) - cursnps[[p]] = cursnps[[p]] - rowSums(cursnps[[p]]*cursnps[[w]]) * cursnps[[w]]; - div = sqrt( rowSums(cursnps[[p]]^2) ); - div[ div == 0 ] = 1; - cursnps[[p]] = cursnps[[p]]/div; - } - snps.std$set(ss, div); - return(cursnps); - } -# if( pvOutputThreshold.cis > 0 ) { -# is.cis.pair = function(gg,ss) { -# return(!( ( snpsloc[[ss]][1, 1] - tail( geneloc[[gg]][ , 2], n = 1L) > cisDist) | -# ( geneloc[[gg]][1, 1] - tail( snpsloc[[ss]] , n = 1L) > cisDist) ) ); -# } -# } - if( pvOutputThreshold.cis > 0 ) { -# sn.l = sapply(snpsloc, function(x)x[1] ); -# sn.r = sapply(snpsloc, function(x)tail(x,1) ); -# ge.l = sapply(geneloc, function(x)x[1,1] ); -# ge.r = sapply(geneloc, function(x)x[nrow(x) , 2] ); - sn.l = sapply(snpsloc, '[', 1 ); - sn.r = sapply(snpsloc, tail, 1 ); - ge.l = sapply(geneloc, '[', 1, 1 ); - ge.r = sapply( lapply(geneloc, tail.matrix, 1 ), '[', 2); - gg.1 = findInterval( sn.l , ge.r + cisDist +1) + 1; -# cat(gg.1,'\n') - gg.2 = findInterval( sn.r , ge.l - cisDist ); -# cat(gg.2,'\n') - rm(sn.l, sn.r, ge.l, ge.r); - } - - } - ################################# Prepare counters and histogram bins ################### - { - pvbins = NULL; # bin edges for p-values - statbins = 0; # bin edges for the test statistic (|t| or F) - do.hist = FALSE; - if( length(pvalue.hist) == 1 ) { - if(pvalue.hist == "qqplot") { - pvbins = c(0, 10^rev(seq(0, log10(.Machine$double.xmin)-1, -0.05))); - } else - if( is.numeric(pvalue.hist) ) { - pvbins = seq(from = 0, to = 1, length.out = pvalue.hist+1); - } else - if( pvalue.hist == TRUE ) { - pvbins = seq(from = 0, to = 1, length.out = 100+1); - } - } else - if( is.numeric(pvalue.hist) && (length(pvalue.hist) > 1) ) { - pvbins = pvalue.hist; - } - if( is.null(pvbins) && (pvalue.hist != FALSE) ) { - stop("Wrong value of pvalue.hist. Must be FALSE, TRUE, \"qqplot\", or numerical"); - } - do.hist = !is.null(pvbins); - if( do.hist ) { - pvbins = sort(pvbins); - statbins = threshfun(pvbins); - if( pvOutputThreshold > 0) { - hist.all = .histogrammer$new(pvbins, statbins); - } - if( pvOutputThreshold.cis > 0) { - hist.cis = .histogrammer$new(pvbins, statbins); - } - } - rm( pvbins, statbins); - if(min.pv.by.genesnp) { - if( pvOutputThreshold > 0) { - minpv.tra = .minpvalue$new(snps,gene); - } - if( pvOutputThreshold.cis > 0) { - minpv.cis = .minpvalue$new(snps,gene); - } - } - } - ################################# Main loop ############################################# - { - beta = NULL; - n.tests.all = 0; - n.tests.cis = 0; - n.eqtls.tra = 0; - n.eqtls.cis = 0; - - status("Performing eQTL analysis"); - # ss = 1; gg = 1; - # ss = snps$nSlices(); gg = gene$nSlices(); - - snps_offset = 0; - for(ss in 1:snps$nSlices()) { -# for(ss in 1:min(2,snps$nSlices())) { #for debug - cursnps = NULL; - nrcs = nrow(snps$getSliceRaw(ss)); - - # loop only through the useful stuff - for(gg in if(pvOutputThreshold>0){1:gene$nSlices()}else{.seq(gg.1[ss],gg.2[ss])} ) { - gene_offset = gene_offsets[gg]; - curgene = gene$getSlice(gg); - nrcg = nrow(curgene); - if(nrcg == 0) next; - - rp = ""; - - statistic = NULL; - select.cis.raw = NULL; - ## do cis analysis -# if( (pvOutputThreshold.cis > 0) && ( is.cis.pair(gg, ss) ) ) { - if( (pvOutputThreshold.cis > 0) && (gg >= gg.1[ss]) && (gg <= gg.2[ss]) ) { - - if( is.null( statistic ) ) { - if( is.null( cursnps ) ) { - cursnps = orthonormalize.snps( snps_process( snps$getSlice(ss) ), ss ); - } - mat = vector("list", length(cursnps)); - for(d in 1:length(cursnps)) { - mat[[d]] = tcrossprod(curgene, cursnps[[d]]); - } - statistic = statistic.fun( mat ); - astatistic = afun(statistic); -# rm(mat); - } - -# sn.l = findInterval(geneloc[[gg]][ ,1] - cisDist-1 +1 , snpsloc[[ss]]); -# sn.r = findInterval(geneloc[[gg]][ ,2] + cisDist -1 , snpsloc[[ss]]); - sn.l = findInterval(geneloc[[gg]][ ,1] - cisDist-1, snpsloc[[ss]]); - sn.r = findInterval(geneloc[[gg]][ ,2] + cisDist, snpsloc[[ss]]); - xx = unlist(lapply(which(sn.r>sn.l),FUN=function(x){(sn.l[x]:(sn.r[x]-1))*nrow(statistic)+x})) - select.cis.raw = xx[ astatistic[xx] >= thresh.cis ]; - select.cis = arrayInd(select.cis.raw, dim(statistic)) - - n.tests.cis = n.tests.cis + length(xx); - n.eqtls.cis = n.eqtls.cis + length(select.cis.raw); - - if( do.hist ) - hist.cis$update(astatistic[xx]); - - if( min.pv.by.genesnp ) { - # minpv.cis$updatecis(ss, gg, arrayInd(xx, dim(statistic)), astatistic[xx]) - temp = double(length(astatistic)); - dim(temp) = dim(astatistic); - temp[xx] = astatistic[xx]; - minpv.cis$update(ss, gg, temp); - } - - if(!is.null(betafun)) - beta = betafun(mat[[length(mat)]][select.cis.raw], ss, gg, select.cis); - - saver.cis$update( snps_offset + select.cis[ , 2], - gene_offset + select.cis[ , 1], - statistic[select.cis.raw], - beta); - - # statistic.select.cis = statistic[ select.cis ]; - # test = testfun( statistic.select.cis ); - # pv = pvfun(test); - # Saver.cis$WriteBlock( cbind(snps_offset + select.cis[ , 2], gene_offset + select.cis[ , 1], test, pv) ); - # counter.cis$Update(gg, ss, select.cis, pv, n.tests = length(xx), if(do.hist) afun(statistic[xx]) ) - rp = paste(rp, ", ", formatC(n.eqtls.cis, big.mark=",", format = "f", digits = 0), " cis-eQTLs", sep = ""); - } - ## do trans/all analysis - if(pvOutputThreshold>0) { - if( is.null( statistic ) ) { - if( is.null( cursnps ) ) { - cursnps = orthonormalize.snps( snps_process( snps$getSlice(ss) ), ss ); - } - mat = vector("list", length(cursnps)); - for(d in 1:length(cursnps)) { - mat[[d]] = tcrossprod(curgene, cursnps[[d]]); - } - statistic = statistic.fun( mat ); - astatistic = afun(statistic); -# rm(mat); - } - - if( do.hist ) - hist.all$update(astatistic); - - if(!is.null(select.cis.raw)) - astatistic[xx] = -1; - # select.tra.raw = select.tra.raw[!(select.tra.raw %in% select.cis.raw)]; - - select.tra.raw = which( astatistic >= thresh); - select.tra = arrayInd(select.tra.raw, dim(statistic)) - - n.eqtls.tra = n.eqtls.tra + length(select.tra.raw); - n.tests.all = n.tests.all + length(statistic); - - if(!is.null(betafun)) - beta = betafun(mat[[length(mat)]][select.tra.raw], ss, gg, select.tra); - - saver.tra$update( snps_offset + select.tra[ , 2], - gene_offset + select.tra[ , 1], - statistic[select.tra.raw], - beta); - - if( min.pv.by.genesnp ) - minpv.tra$update(ss, gg, astatistic) - - # statistic.select.tra = statistic[ select.tra ]; - # test = testfun( statistic.select.tra ); - # pv = pvfun( test ); - # Saver$WriteBlock( cbind( snps_offset + select.tra[ , 2], gene_offset + select.tra[ , 1], test, pv) ); - # counter$Update(gg, ss, select.tra, pv, n.tests = nrcs*nrcg, if(do.hist) afun(statistic) ) - rp = paste(rp, ", ", formatC(n.eqtls.tra, big.mark=",", format = "f", digits = 0), if(pvOutputThreshold.cis > 0)" trans-"else" ","eQTLs", sep = "") - } - - #gene_offset = gene_offset + nrcg; - if( !is.null(statistic) ) { - per = 100*(gg/gene$nSlices() + ss-1) / snps$nSlices(); - cat( formatC(floor(per*100)/100, format = "f", width = 5, digits = 2), "% done" , rp, "\n", sep = ""); - flush.console(); - } - } # gg in 1:gene$nSlices() - snps_offset = snps_offset + nrow(snps$getSliceRaw(ss)); - } # ss in 1:snps$nSlices() - } - ################################# Results collection #################################### - { - rez = list(time.in.sec = proc.time()[3] - start.time); - rez$param = params; - - if(pvOutputThreshold.cis > 0) { - rez.cis = list(ntests = n.tests.cis, neqtls = n.eqtls.cis); - rez.cis = c(rez.cis, saver.cis$getResults( gene, snps, n.tests.cis) ); - if(do.hist) - rez.cis = c(rez.cis, hist.cis$getResults() ); - if(min.pv.by.genesnp) - rez.cis = c(rez.cis, minpv.cis$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); - } - - if(pvOutputThreshold>0) { - rez.all = list(ntests = n.tests.all, neqtls = n.eqtls.tra + n.eqtls.cis); - if(pvOutputThreshold.cis > 0) { - rez.tra = list(ntests = n.tests.all - n.tests.cis, neqtls = n.eqtls.tra); - rez.tra = c(rez.tra, saver.tra$getResults( gene, snps, n.tests.all - n.tests.cis) ); - } else { - rez.all = c(rez.all, saver.tra$getResults( gene, snps, n.tests.all ) ); - } - if(do.hist) { - rez.all = c(rez.all, hist.all$getResults() ); - if(pvOutputThreshold.cis > 0) { - rez.tra$hist.bins = rez.all$hist.bins; - rez.tra$hist.counts = rez.all$hist.counts - rez.cis$hist.counts; - } - } - if(min.pv.by.genesnp) { - if(pvOutputThreshold.cis > 0) { - rez.tra = c(rez.tra, minpv.tra$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); - } else { - rez.all = c(rez.all, minpv.tra$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) ); - } - } - } - - if(exists('rez.all')>0) - rez$all = rez.all; - if(exists('rez.tra')>0) - rez$trans = rez.tra; - if(exists('rez.cis')>0) - rez$cis = rez.cis; - - class(rez) = c(class(rez),"MatrixEQTL"); - status(""); - } -# cat('s std ',snps.std$get(1),'\n'); -# cat('g std ',gene.std$get(1),'\n'); - ################################# Results collection #################################### - return(rez); -} - -.histme = function(m, name1, name2, ...) { - cnts = m$hist.counts; - bins = m$hist.bins; - ntst = m$ntests; - centers = 0.5 * (bins[-1L] + bins[-length(bins)]); - density = 0.5 / (bins[-1L] - centers) * cnts / ntst; - ntext = paste("P-value distribution for ", name1, formatC(ntst, big.mark=",", format = "f", digits = 0), name2, " gene-SNP pairs ",sep=""); - r = structure(list(breaks = bins, counts = cnts, density = density, - mids = centers, equidist = FALSE), class = "histogram"); - plot(r, main = ntext, ylab = "Density", xlab = "P-values", ...) - abline( h = 1, col = "blue"); - return(invisible()); -} - -.qqme = function(m, lcol, cex, pch, ...) { - cnts = m$hist.counts; - bins = m$hist.bins; - ntst = m$ntests; - - cusu = cumsum(cnts) / ntst; - ypos = bins[-1][is.finite(cusu)]; - xpos = cusu[is.finite(cusu)]; - lines(-log10(xpos), -log10(ypos), col = lcol, ...); -# lines(xpos, ypos, col = lcol, ...); - if(length(m$eqtls$pvalue)==0) - return(); - ypvs = -log10(m$eqtls$pvalue); - xpvs = -log10(1:length(ypvs) / ntst); - if(length(ypvs) > 1000) { - # need to filter a bit, make the plotting faster - levels = as.integer( xpvs/xpvs[1] * 1e3); - keep = c(TRUE, diff(levels)!=0); - levels = as.integer( ypvs/ypvs[1] * 1e3); - keep = keep | c(TRUE, diff(levels)!=0); - ypvs = ypvs[keep]; - xpvs = xpvs[keep]; - rm(keep) - } - points(xpvs, ypvs, col = lcol, pch = pch, cex = cex, ...); -} - -plot.MatrixEQTL = function(x, cex = 0.5, pch = 19, xlim = NULL, ylim = NULL, ...) { - if( x$param$pvalue.hist == FALSE ) { - warning("Cannot plot p-value distribution: the information was not recorded.\nUse pvalue.hist!=FALSE."); - return(invisible()); - } - if( x$param$pvalue.hist == "qqplot" ) { - xmin = 1/max(x$cis$ntests, x$all$ntests); - ymax = NULL; - if(!is.null(ylim)) { - ymax = ylim[2]; - } else { - ymax = -log10(min( - x$cis$eqtls$pvalue[1], x$cis$hist.bins[ c(FALSE,x$cis$hist.counts>0)][1], - x$all$eqtls$pvalue[1], x$all$hist.bins[ c(FALSE,x$all$hist.counts>0)][1], - x$trans$eqtls$pvalue[1], x$trans$hist.bins[c(FALSE,x$trans$hist.counts>0)][1], - na.rm = TRUE))+0.1; - } - if(ymax == 0) { - ymax = -log10(.Machine$double.xmin) - } - if(!is.null(ymax)) - ylim = c(0,ymax); - - if(is.null(xlim)) - xlim = c(0, -log10(xmin/1.5)); - - plot(numeric(),numeric(), xlab = "-Log10(p-value), theoretical", - ylab = "-Log10(p-value), observed", - xlim = c(0, -log10(xmin/1.5)), - ylim = ylim, - xaxs="i", yaxs="i", ...); - lines(c(0,1e3), c(0,1e3), col = "gray"); - if((x$param$pvOutputThreshold > 0) && (x$param$pvOutputThreshold.cis > 0)) { - .qqme( x$cis, "red", cex, pch, ...); - .qqme( x$trans, "blue", cex, pch, ...); - title(paste("QQ-plot for", - formatC(x$cis$ntests, big.mark=",", format = "f", digits = 0), - "local and", - formatC(x$trans$ntests, big.mark=",", format = "f", digits = 0), - "distant gene-SNP p-values")); - lset = c(1,2,4); - } else - if(x$param$pvOutputThreshold.cis > 0) { - .qqme(x$cis, "red", cex, pch, ...); - title(paste("QQ-plot for", - formatC(x$cis$ntests, big.mark=",", format = "f", digits = 0), - "local gene-SNP p-values")); - lset = c(1,4); - } else { - .qqme(x$all, "blue", cex, pch, ...); - title(paste("QQ-plot for all", - formatC(x$all$ntests, big.mark=",", format = "f", digits = 0), - "gene-SNP p-values")); - lset = c(3,4); - } - legend("topleft", - c("Local p-values","Distant p-values","All p-values","diagonal")[lset], - col = c("red","blue","blue","gray")[lset], - text.col = c("red","blue","blue","gray")[lset], - pch = 20, lwd = 1, pt.cex = c(1,1,1,0)[lset]) - } else { - if((x$param$pvOutputThreshold > 0) && (x$param$pvOutputThreshold.cis > 0)) { - par(mfrow=c(2,1)); - .histme(x$cis, "", " local", ...); - tran = list(hist.counts = x$all$hist.counts - x$cis$hist.counts, - hist.bins = x$all$hist.bins, - ntests = x$all$ntests - x$cis$ntests); - .histme(x$trans,""," distant", ...); - par(mfrow=c(1,1)); - } else - if(x$param$pvOutputThreshold.cis > 0) { - .histme(x$cis, "", " local", ...); - } else { - .histme(x$all, "all ", "" , ...); - } - } - return(invisible()); -} - diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/data/Covariates.txt --- a/MatrixEQTL/MatrixEQTL/data/Covariates.txt Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -id Sam_01 Sam_02 Sam_03 Sam_04 Sam_05 Sam_06 Sam_07 Sam_08 Sam_09 Sam_10 Sam_11 Sam_12 Sam_13 Sam_14 Sam_15 Sam_16 -gender 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 -age 36 40 46 65 69 43 40 54 44 70 24 34 55 62 48 40 diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/data/GE.txt --- a/MatrixEQTL/MatrixEQTL/data/GE.txt Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ -geneid Sam_01 Sam_02 Sam_03 Sam_04 Sam_05 Sam_06 Sam_07 Sam_08 Sam_09 Sam_10 Sam_11 Sam_12 Sam_13 Sam_14 Sam_15 Sam_16 -Gene_01 4.91 4.63 5.18 5.07 5.74 5.09 5.31 5.29 4.73 5.72 4.75 4.54 5.01 5.03 4.84 4.44 -Gene_02 13.78 13.14 13.18 13.04 12.85 13.07 13.09 12.83 14.94 13.86 15.26 14.73 14.08 14.33 14.72 14.97 -Gene_03 12.06 12.29 13.07 13.65 13.86 12.84 12.29 13.03 13.13 14.93 12.40 13.38 13.70 14.49 14.14 13.35 -Gene_04 11.63 11.88 12.74 12.66 13.16 11.99 11.97 12.81 11.74 13.29 10.88 11.37 12.09 12.50 11.40 11.45 -Gene_05 14.72 14.66 14.63 15.91 15.46 14.74 15.17 15.01 14.05 14.90 12.96 13.56 14.06 14.44 14.12 13.76 -Gene_06 12.29 12.23 12.47 13.21 12.63 12.18 12.44 12.45 13.22 12.70 12.93 12.84 13.20 13.19 12.64 12.80 -Gene_07 12.56 12.71 12.49 13.41 13.60 12.35 12.27 13.42 14.73 15.47 13.85 14.31 15.25 15.55 14.90 14.04 -Gene_08 12.27 12.58 12.61 13.02 12.86 12.32 12.30 13.19 15.21 14.60 14.72 14.56 14.98 14.92 14.53 14.72 -Gene_09 9.82 9.29 8.95 8.18 8.11 9.43 9.17 9.25 10.95 9.82 12.60 11.99 11.20 10.31 11.13 11.69 -Gene_10 14.24 14.52 14.62 13.65 13.46 14.04 13.97 13.73 12.29 12.01 13.47 13.30 12.34 12.23 12.80 12.47 diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/data/SNP.txt --- a/MatrixEQTL/MatrixEQTL/data/SNP.txt Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,16 +0,0 @@ -snpid Sam_01 Sam_02 Sam_03 Sam_04 Sam_05 Sam_06 Sam_07 Sam_08 Sam_09 Sam_10 Sam_11 Sam_12 Sam_13 Sam_14 Sam_15 Sam_16 -Snp_01 2 0 2 0 2 1 2 1 1 1 2 2 1 2 2 1 -Snp_02 0 1 1 2 2 1 0 0 0 1 1 1 1 0 1 1 -Snp_03 1 0 1 0 1 1 1 1 0 1 1 0 1 1 1 2 -Snp_04 0 1 2 2 2 1 1 0 0 0 1 2 1 1 1 0 -Snp_05 1 1 2 1 1 2 1 1 0 1 1 2 0 1 2 1 -Snp_06 2 2 2 1 1 0 1 0 2 1 1 1 2 0 2 1 -Snp_07 1 1 2 2 0 1 1 1 1 0 2 2 0 1 1 1 -Snp_08 1 0 1 0 1 0 0 1 1 1 0 2 0 1 1 1 -Snp_09 2 1 2 2 0 1 1 0 2 1 1 0 1 1 0 0 -Snp_10 1 1 0 0 0 2 2 1 1 2 1 1 1 1 1 0 -Snp_11 2 2 2 0 2 1 1 2 1 2 0 1 0 1 1 2 -Snp_12 1 1 2 2 2 1 1 1 1 0 2 0 1 1 0 2 -Snp_13 0 1 1 1 1 1 2 1 2 2 0 0 0 1 1 1 -Snp_14 0 0 1 0 1 2 2 2 1 1 1 0 1 0 1 0 -Snp_15 1 2 1 2 2 1 2 1 1 2 1 1 1 2 0 2 diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/data/geneloc.txt --- a/MatrixEQTL/MatrixEQTL/data/geneloc.txt Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ -geneid chr left right -Gene_01 chr1 721289 731289 -Gene_02 chr1 752565 762565 -Gene_03 chr1 777121 787121 -Gene_04 chr1 785988 795988 -Gene_05 chr1 792479 802479 -Gene_06 chr1 798958 808958 -Gene_07 chr1 888658 898658 -Gene_08 chr1 918572 928572 -Gene_09 chr1 926430 936430 -Gene_10 chr1 1000000 1010000 diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/data/snpsloc.txt --- a/MatrixEQTL/MatrixEQTL/data/snpsloc.txt Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,16 +0,0 @@ -snpid chr pos -Snp_01 chr1 721289 -Snp_02 chr1 752565 -Snp_03 chr1 777121 -Snp_04 chr1 785988 -Snp_05 chr1 792479 -Snp_06 chr1 798958 -Snp_07 chr1 888658 -Snp_08 chr1 918572 -Snp_09 chr1 926430 -Snp_10 chr1 947033 -Snp_11 chr2 721289 -Snp_12 chr2 752565 -Snp_13 chr2 777121 -Snp_14 chr2 785988 -Snp_15 chr2 792479 diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/demo/00Index --- a/MatrixEQTL/MatrixEQTL/demo/00Index Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -a.nocvrt Test that Matrix eQTL gives correct estimates for additive linear model. Homoscedastic case without covariates. -b.cvrt Test that Matrix eQTL gives correct estimates for additive linear model. Homoscedastic case with covariates. -c.weights Test that Matrix eQTL gives correct estimates for additive linear model. Heteroscedastic case with covariates. -d.ANOVA Test that Matrix eQTL gives correct estimates for ANOVA linear model with 3 groups. Heteroscedastic case with covariates. -d.ANOVA5 Test that Matrix eQTL gives correct estimates for ANOVA linear model with 5 groups. Heteroscedastic case with covariates. -e.interaction Test that Matrix eQTL gives correct estimates for linear model with genotype-covariate interaction. Heteroscedastic case with covariates. -p.hist Sample code for plotting histogram of p-values -q.qqplot Sample code for plotting QQ-plot of p-values -sample.all Sample code for eQTL analysis of the toy data set. -sample.cis Sample code for eQTL analysis of the toy data set with separate p-value thresholds and FDR estimation for cis- and trans-eQTLs. diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/demo/a.nocvrt.r --- a/MatrixEQTL/MatrixEQTL/demo/a.nocvrt.r Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -library("MatrixEQTL"); - -# Number of columns (samples) -n = 100; - - - - - - - - - - -# Generate the vectors with genotype and expression variables -snps.mat = rnorm(n); -gene.mat = rnorm(n) + 0.5 * snps.mat; - -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new(); - -# Produce no output files -filename = NULL; # tempfile() - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - pvalue.hist = FALSE ); - -# Pull Matrix eQTL results - t-statistic and p-value -beta = me$all$eqtls$beta; -tstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c(beta = beta, tstat = tstat, pvalue = pvalue); -# And compare to those from the linear regression in R -{ - cat("\n\n Matrix eQTL: \n"); - print(rez); - cat("\n R summary(lm()) output: \n"); - lmdl = lm( gene.mat ~ snps.mat ); - - lmout = summary(lmdl)$coefficients[2,c("Estimate","t value","Pr(>|t|)")]; - print( lmout ); -} - -# Results from Matrix eQTL and "lm" must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/demo/b.cvrt.r --- a/MatrixEQTL/MatrixEQTL/demo/b.cvrt.r Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -library("MatrixEQTL"); - -# Number of columns (samples) -n = 100; - -# Number of covariates -nc = 10; - - - - -# Generate the covariates -cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); - -# Generate the vectors with genotype and expression variables -snps.mat = cvrt.mat %*% rnorm(nc) + rnorm(n); -gene.mat = cvrt.mat %*% rnorm(nc) + rnorm(n) + 0.5 * snps.mat + 1; - -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new( t(cvrt.mat) ); - -# Produce no output files -filename = NULL; # tempfile() - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - pvalue.hist = FALSE ); - -# Pull Matrix eQTL results - t-statistic and p-value -beta = me$all$eqtls$beta; -tstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c(beta = beta, tstat = tstat, pvalue = pvalue); -# And compare to those from the linear regression in R -{ - cat("\n\n Matrix eQTL: \n"); - print(rez); - cat("\n R summary(lm()) output: \n"); - lmdl = lm( gene.mat ~ snps.mat + cvrt.mat ); - - lmout = summary(lmdl)$coefficients[2,c("Estimate","t value","Pr(>|t|)")]; - print( lmout ); -} - -# Results from Matrix eQTL and "lm" must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/demo/c.weights.r --- a/MatrixEQTL/MatrixEQTL/demo/c.weights.r Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -library("MatrixEQTL"); - -# Number of columns (samples) -n = 100; - -# Number of covariates -nc = 10; - -# Generate the standard deviation of the noise -noise.std = 0.1 + rnorm(n)^2; - -# Generate the covariates -cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); - -# Generate the vectors with genotype and expression variables -snps.mat = cvrt.mat %*% rnorm(nc) + rnorm(n); -gene.mat = cvrt.mat %*% rnorm(nc) + rnorm(n) * noise.std + 0.5 * snps.mat + 1; - -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new( t(cvrt.mat) ); - -# Produce no output files -filename = NULL; # tempfile() - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelLINEAR, - errorCovariance = diag(noise.std^2), - verbose = TRUE, - pvalue.hist = FALSE ); - -# Pull Matrix eQTL results - t-statistic and p-value -beta = me$all$eqtls$beta; -tstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c(beta = beta, tstat = tstat, pvalue = pvalue); -# And compare to those from the linear regression in R -{ - cat("\n\n Matrix eQTL: \n"); - print(rez); - cat("\n R summary(lm()) output: \n"); - lmdl = lm( gene.mat ~ snps.mat + cvrt.mat, - weights = 1/noise.std^2 ); - lmout = summary(lmdl)$coefficients[2,c("Estimate","t value","Pr(>|t|)")]; - print( lmout ); -} - -# Results from Matrix eQTL and "lm" must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/demo/d.ANOVA.r --- a/MatrixEQTL/MatrixEQTL/demo/d.ANOVA.r Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -library("MatrixEQTL"); - -# Number of columns (samples) -n = 100; - -# Number of covariates -nc = 10; - -# Generate the standard deviation of the noise -noise.std = 0.1 + rnorm(n)^2; - -# Generate the covariates -cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); - -# Generate the vectors with single genotype and expression variables -snps.mat = floor(runif(n, min = 0, max = 3)); -gene.mat = 1 + (snps.mat==1) + cvrt.mat %*% rnorm(nc) + rnorm(n) * noise.std; - -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new( t(cvrt.mat) ); - -# Produce no output files -filename = NULL; # tempfile() - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelANOVA, - errorCovariance = diag(noise.std^2), - verbose = TRUE, - pvalue.hist = FALSE ); - -# Pull Matrix eQTL results - t-statistic and p-value - -fstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c( Fstat = fstat, pvalue = pvalue); -# And compare to those from ANOVA in R -{ - cat("\n\n Matrix eQTL: \n"); - print(rez); - cat("\n R anova(lm()) output: \n"); - lmdl = lm( gene.mat ~ cvrt.mat + factor(snps.mat), - weights = 1/noise.std^2 ); - lmout = anova(lmdl)[2, c("F value","Pr(>F)")]; - print( lmout ); -} - -# Results from Matrix eQTL and "lm" must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/demo/d.ANOVA5.r --- a/MatrixEQTL/MatrixEQTL/demo/d.ANOVA5.r Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -library("MatrixEQTL"); - -anova.groups = 5; -options(MatrixEQTL.ANOVA.categories = anova.groups); - -# Number of columns (samples) -n = 100; -# Number of covariates -nc = 10; -# Generate the standard deviation of the noise -noise.std = 0.1 + rnorm(n)^2; -# Generate the covariates -cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); - -# Generate the vectors with single genotype and expression variables -snps.mat = floor(runif(n, min = 0, max = anova.groups)); -gene.mat = 1 + (snps.mat==1) + cvrt.mat %*% rnorm(nc) + rnorm(n) * noise.std; - -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new( t(cvrt.mat) ); - -# Produce no output files -filename = NULL; # tempfile() - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelANOVA, - errorCovariance = diag(noise.std^2), - verbose = TRUE, - pvalue.hist = FALSE ); - -# Pull Matrix eQTL results - t-statistic and p-value - -fstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c( Fstat = fstat, pvalue = pvalue); -# And compare to those from ANOVA in R -{ - cat("\n\n Matrix eQTL: \n"); - print(rez); - cat("\n R anova(lm()) output: \n"); - lmdl = lm( gene.mat ~ cvrt.mat + factor(snps.mat), - weights = 1/noise.std^2 ); - lmout = anova(lmdl)[2, c("F value","Pr(>F)")]; - print( lmout ); -} - -# Results from Matrix eQTL and "lm" must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/demo/e.interaction.r --- a/MatrixEQTL/MatrixEQTL/demo/e.interaction.r Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -library("MatrixEQTL"); - -# Number of columns (samples) -n = 25; - -# Number of covariates -nc = 10; - -# Generate the standard deviation of the noise -noise.std = 0.1 + rnorm(n)^2; - -# Generate the covariates -cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); - -# Generate the vectors with single genotype and expression variables -snps.mat = cvrt.mat %*% rnorm(nc) + rnorm(n); -gene.mat = cvrt.mat %*% rnorm(nc) + rnorm(n) * noise.std + - 1 + 0.5 * snps.mat + snps.mat * cvrt.mat[,nc]; -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new( t(cvrt.mat) ); - -# Produce no output files -filename = NULL; # tempfile() - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelLINEAR_CROSS, - errorCovariance = diag(noise.std^2), - verbose = TRUE, - pvalue.hist = FALSE ); - -# Pull Matrix eQTL results - t-statistic and p-value -beta = me$all$eqtls$beta; -tstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c(beta = beta, tstat = tstat, pvalue = pvalue); -# And compare to those from the linear regression in R -{ - cat("\n\n Matrix eQTL: \n"); - print(rez); - cat("\n R summary(lm()) output: \n"); - lmdl = lm( gene.mat ~ snps.mat + cvrt.mat + snps.mat*cvrt.mat[,nc], - weights = 1/noise.std^2 ); - lmout = tail(summary(lmdl)$coefficients,1)[,c(1,3,4)]; - print( tail(lmout) ); -} - -# Results from Matrix eQTL and "lm" must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)); diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/demo/p.hist.r --- a/MatrixEQTL/MatrixEQTL/demo/p.hist.r Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -library("MatrixEQTL") - -# Number of samples -n = 100; - -# Number of variables -ngs = 2000; - -# Common signal in all variables (population stratification) -pop = 0.2 * rnorm(n); - -# data matrices -snps.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop; -gene.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop + snps.mat*((1:ngs)/ngs)^9/2; - -# data objects for Matrix eQTL engine -snps1 = SlicedData$new( t( snps.mat ) ); -gene1 = SlicedData$new( t( gene.mat ) ); -cvrt1 = SlicedData$new(); -rm(snps.mat, gene.mat); - -# Slice data in blocks of 500 variables -snps1$ResliceCombined(500); -gene1$ResliceCombined(500); - -# Produce no output files -filename = NULL; # tempfile() - -# Perform analysis recording information for a histogram -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1e-100, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - pvalue.hist = 100); - -# png(filename = "histogram.png", width = 650, height = 650); -plot(me, col="grey"); -# dev.off(); diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/demo/q.qqplot.r --- a/MatrixEQTL/MatrixEQTL/demo/q.qqplot.r Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -library("MatrixEQTL") - -# Number of samples -n = 100; - -# Number of variables -ngs = 2000; - -# Common signal in all variables (population stratification) -pop = 0.2 * rnorm(n); - -# data matrices -snps.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop; -gene.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop + snps.mat*((1:ngs)/ngs)^9/2; - -# data objects for Matrix eQTL engine -snps1 = SlicedData$new( t( snps.mat ) ); -gene1 = SlicedData$new( t( gene.mat ) ); -cvrt1 = SlicedData$new(); -rm(snps.mat, gene.mat); - -# Slice data in blocks of 500 variables -snps1$ResliceCombined(500); -gene1$ResliceCombined(500); - -# Produce no output files -filename = NULL; # tempfile() - -# Perform analysis recording information for a Q-Q plot -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1e-6, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - pvalue.hist = "qqplot"); - -# png(filename = "QQplot.png", width = 650, height = 650); -plot(me, pch = 16, cex = 0.7, ylim = c(0,11)); -# dev.off(); diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/demo/sample.all.r --- a/MatrixEQTL/MatrixEQTL/demo/sample.all.r Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,96 +0,0 @@ -# Matrix eQTL by Andrey A. Shabalin -# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ -# -# Be sure to use an up to date version of R and Matrix eQTL. - -# source("Matrix_eQTL_R/Matrix_eQTL_engine.r"); -library(MatrixEQTL) - -## Location of the package with the data files. -base.dir = find.package('MatrixEQTL'); -# base.dir = '.'; - -## Settings - -# Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS -useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS - -# Genotype file name -SNP_file_name = paste(base.dir, "/data/SNP.txt", sep=""); - -# Gene expression file name -expression_file_name = paste(base.dir, "/data/GE.txt", sep=""); - -# Covariates file name -# Set to character() for no covariates -covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep=""); - -# Output file name -output_file_name = tempfile(); - -# Only associations significant at this level will be saved -pvOutputThreshold = 1e-2; - -# Error covariance matrix -# Set to numeric() for identity. -errorCovariance = numeric(); -# errorCovariance = read.table("Sample_Data/errorCovariance.txt"); - - -## Load genotype data - -snps = SlicedData$new(); -snps$fileDelimiter = "\t"; # the TAB character -snps$fileOmitCharacters = "NA"; # denote missing values; -snps$fileSkipRows = 1; # one row of column labels -snps$fileSkipColumns = 1; # one column of row labels -snps$fileSliceSize = 2000; # read file in slices of 2,000 rows -snps$LoadFile(SNP_file_name); - -## Load gene expression data - -gene = SlicedData$new(); -gene$fileDelimiter = "\t"; # the TAB character -gene$fileOmitCharacters = "NA"; # denote missing values; -gene$fileSkipRows = 1; # one row of column labels -gene$fileSkipColumns = 1; # one column of row labels -gene$fileSliceSize = 2000; # read file in slices of 2,000 rows -gene$LoadFile(expression_file_name); - -## Load covariates - -cvrt = SlicedData$new(); -cvrt$fileDelimiter = "\t"; # the TAB character -cvrt$fileOmitCharacters = "NA"; # denote missing values; -cvrt$fileSkipRows = 1; # one row of column labels -cvrt$fileSkipColumns = 1; # one column of row labels -if(length(covariates_file_name)>0) { - cvrt$LoadFile(covariates_file_name); -} - -## Run the analysis - -me = Matrix_eQTL_engine( - snps = snps, - gene = gene, - cvrt = cvrt, - output_file_name = output_file_name, - pvOutputThreshold = pvOutputThreshold, - useModel = useModel, - errorCovariance = errorCovariance, - verbose = TRUE, - pvalue.hist = TRUE, - min.pv.by.genesnp = FALSE, - noFDRsaveMemory = FALSE); - -unlink(output_file_name); - -## Results: - -cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n'); -cat('Detected eQTLs:', '\n'); -show(me$all$eqtls) - -## Plot the histogram of all p-values - -plot(me) diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/demo/sample.cis.r --- a/MatrixEQTL/MatrixEQTL/demo/sample.cis.r Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,112 +0,0 @@ -# Matrix eQTL by Andrey A. Shabalin -# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ -# -# Be sure to use an up to date version of R and Matrix eQTL. - -# source("Matrix_eQTL_R/Matrix_eQTL_engine.r"); -library(MatrixEQTL) - -## Location of the package with the data files. -base.dir = find.package('MatrixEQTL'); -# base.dir = '.'; - -## Settings - -# Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS -useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS - -# Genotype file name -SNP_file_name = paste(base.dir, "/data/SNP.txt", sep=""); -snps_location_file_name = paste(base.dir, "/data/snpsloc.txt", sep=""); - -# Gene expression file name -expression_file_name = paste(base.dir, "/data/GE.txt", sep=""); -gene_location_file_name = paste(base.dir, "/data/geneloc.txt", sep=""); - -# Covariates file name -# Set to character() for no covariates -covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep=""); - -# Output file name -output_file_name_cis = tempfile(); -output_file_name_tra = tempfile(); - -# Only associations significant at this level will be saved -pvOutputThreshold_cis = 2e-2; -pvOutputThreshold_tra = 1e-2; - -# Error covariance matrix -# Set to numeric() for identity. -errorCovariance = numeric(); -# errorCovariance = read.table("Sample_Data/errorCovariance.txt"); - -# Distance for local gene-SNP pairs -cisDist = 1e6; - -## Load genotype data - -snps = SlicedData$new(); -snps$fileDelimiter = "\t"; # the TAB character -snps$fileOmitCharacters = "NA"; # denote missing values; -snps$fileSkipRows = 1; # one row of column labels -snps$fileSkipColumns = 1; # one column of row labels -snps$fileSliceSize = 2000; # read file in slices of 2,000 rows -snps$LoadFile(SNP_file_name); - -## Load gene expression data - -gene = SlicedData$new(); -gene$fileDelimiter = "\t"; # the TAB character -gene$fileOmitCharacters = "NA"; # denote missing values; -gene$fileSkipRows = 1; # one row of column labels -gene$fileSkipColumns = 1; # one column of row labels -gene$fileSliceSize = 2000; # read file in slices of 2,000 rows -gene$LoadFile(expression_file_name); - -## Load covariates - -cvrt = SlicedData$new(); -cvrt$fileDelimiter = "\t"; # the TAB character -cvrt$fileOmitCharacters = "NA"; # denote missing values; -cvrt$fileSkipRows = 1; # one row of column labels -cvrt$fileSkipColumns = 1; # one column of row labels -if(length(covariates_file_name)>0) { - cvrt$LoadFile(covariates_file_name); -} - -## Run the analysis -snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE); -genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE); - -me = Matrix_eQTL_main( - snps = snps, - gene = gene, - cvrt = cvrt, - output_file_name = output_file_name_tra, - pvOutputThreshold = pvOutputThreshold_tra, - useModel = useModel, - errorCovariance = errorCovariance, - verbose = TRUE, - output_file_name.cis = output_file_name_cis, - pvOutputThreshold.cis = pvOutputThreshold_cis, - snpspos = snpspos, - genepos = genepos, - cisDist = cisDist, - pvalue.hist = TRUE, - min.pv.by.genesnp = TRUE, - noFDRsaveMemory = FALSE); - -unlink(output_file_name_tra); -unlink(output_file_name_cis); - -## Results: - -cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n'); -cat('Detected local eQTLs:', '\n'); -show(me$cis$eqtls) -cat('Detected distant eQTLs:', '\n'); -show(me$trans$eqtls) - -## Plot the histogram of local and distant p-values - -plot(me) diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/inst/CITATION --- a/MatrixEQTL/MatrixEQTL/inst/CITATION Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,15 +0,0 @@ -citHeader("To cite the 'MatrixEQTL' package in publications use:") - -citEntry(entry="article", - title = "Matrix eQTL: ultra fast eQTL analysis via large matrix operations", - author = personList(as.person("A.A. Shabalin")), - address = "University of North Carolina at Chapel Hill", - year = "2012", - url = "URL http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/", - publisher= "Oxford Univ Press", - journal = "Bioinformatics", - volume = "28", - number = "10", - pages = "1353--1358", - textVersion = "Shabalin, Andrey A. Matrix eQTL: ultra fast eQTL analysis via large matrix operations. Bioinformatics 28, no. 10 (2012): 1353-1358." -) diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/man/Covariates.Rd --- a/MatrixEQTL/MatrixEQTL/man/Covariates.Rd Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -\name{Covariates} -\alias{Covariates} -\docType{data} -\title{ - Artificial data for Matrix eQTL sample code: Covariates. -} -\description{ - Artificial data set with 2 covariates across 15 samples. - Columns of the file must match to those of the expression and genotype data sets. -} -\format{ - \tabular{llll}{ - id \tab Sam_01 \tab Sam_02 \tab \ldots \cr - gender \tab 0 \tab 0 \tab \ldots \cr - age \tab 40 \tab 46 \tab \ldots \cr - - } -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} -\keyword{datasets} diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/man/GE.Rd --- a/MatrixEQTL/MatrixEQTL/man/GE.Rd Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -\name{GE} -\alias{GE} -\docType{data} -\title{ - Artificial data for Matrix eQTL sample code: Gene expression. -} -\description{ - Artificial data set with expression of 10 genes across 15 samples. - Columns of the file must match to those of the genotype and covariates data sets. -} -\format{ - \tabular{llll}{ - geneid \tab Sam_01 \tab Sam_02 \tab \ldots \cr - Gene_01 \tab 4.91 \tab 4.63 \tab \ldots \cr - Gene_02 \tab 13.78 \tab 13.14 \tab \ldots \cr - \ldots \tab \ldots \tab \ldots \tab \ldots \cr - } -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} -\keyword{datasets} diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/man/MatrixEQTL-package.Rd --- a/MatrixEQTL/MatrixEQTL/man/MatrixEQTL-package.Rd Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ -\name{MatrixEQTL-package} -\alias{MatrixEQTL-package} -\alias{MatrixEQTL} -\docType{package} -\title{ - Matrix eQTL: Ultra fast eQTL analysis via large matrix operations -} -\description{ - Matrix eQTL is designed for fast eQTL analysis of large datasets. - Matrix eQTL can test for association between genotype and gene expression using linear regression - with either additive or ANOVA (additive and dominant) genotype effects. - The models can include covariates to account for such factors as population stratification, gender, clinical variables, and surrogate variables. - Matrix eQTL also supports models with heteroscedastic and/or correlated errors, - false discovery rate estimation and separate treatment of local (cis) and distant (trans) eQTLs. -} -\details{ - \tabular{ll}{ - Package: \tab MatrixEQTL\cr - Type: \tab Package\cr - Version: \tab 2.1.1\cr - Date: \tab 2014-02-24\cr - License: \tab LGPL-3 \cr - LazyLoad: \tab yes\cr - Depends: \tab methods\cr - } -} -\author{ - Andrey Shabalin \email{ashabalin@vcu.edu} - - Maintainer: Andrey Shabalin -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\keyword{ package } -\keyword{ MatrixEQTL } -\keyword{ Matrix eQTL } -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/man/MatrixEQTL_cis_code.Rd --- a/MatrixEQTL/MatrixEQTL/man/MatrixEQTL_cis_code.Rd Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,129 +0,0 @@ -\name{MatrixEQTL_cis_code} -\alias{MatrixEQTL_cis_code} - -\title{Sample code for cis/trans-eQTL analysis with Matrix eQTL} -\description{ - The following code is the best starting point for those who want to perform cis-/trans-eQTL analysis with Matrix eQTL. -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and other sample code. -} -\author{ - Andrey Shabalin \email{ashabalin@vcu.edu} -} -\examples{ -# Matrix eQTL by Andrey A. Shabalin -# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ -# -# Be sure to use an up to date version of R and Matrix eQTL. - -# source("Matrix_eQTL_R/Matrix_eQTL_engine.r"); -library(MatrixEQTL) - -## Location of the package with the data files. -base.dir = find.package('MatrixEQTL'); - -## Settings - -# Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS -useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS - -# Genotype file name -SNP_file_name = paste(base.dir, "/data/SNP.txt", sep=""); -snps_location_file_name = paste(base.dir, "/data/snpsloc.txt", sep=""); - -# Gene expression file name -expression_file_name = paste(base.dir, "/data/GE.txt", sep=""); -gene_location_file_name = paste(base.dir, "/data/geneloc.txt", sep=""); - -# Covariates file name -# Set to character() for no covariates -covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep=""); - -# Output file name -output_file_name_cis = tempfile(); -output_file_name_tra = tempfile(); - -# Only associations significant at this level will be saved -pvOutputThreshold_cis = 2e-2; -pvOutputThreshold_tra = 1e-2; - -# Error covariance matrix -# Set to numeric() for identity. -errorCovariance = numeric(); -# errorCovariance = read.table("Sample_Data/errorCovariance.txt"); - -# Distance for local gene-SNP pairs -cisDist = 1e6; - -## Load genotype data - -snps = SlicedData$new(); -snps$fileDelimiter = "\t"; # the TAB character -snps$fileOmitCharacters = "NA"; # denote missing values; -snps$fileSkipRows = 1; # one row of column labels -snps$fileSkipColumns = 1; # one column of row labels -snps$fileSliceSize = 2000; # read file in slices of 2,000 rows -snps$LoadFile(SNP_file_name); - -## Load gene expression data - -gene = SlicedData$new(); -gene$fileDelimiter = "\t"; # the TAB character -gene$fileOmitCharacters = "NA"; # denote missing values; -gene$fileSkipRows = 1; # one row of column labels -gene$fileSkipColumns = 1; # one column of row labels -gene$fileSliceSize = 2000; # read file in slices of 2,000 rows -gene$LoadFile(expression_file_name); - -## Load covariates - -cvrt = SlicedData$new(); -cvrt$fileDelimiter = "\t"; # the TAB character -cvrt$fileOmitCharacters = "NA"; # denote missing values; -cvrt$fileSkipRows = 1; # one row of column labels -cvrt$fileSkipColumns = 1; # one column of row labels -if(length(covariates_file_name)>0) { - cvrt$LoadFile(covariates_file_name); -} - -## Run the analysis -snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE); -genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE); - -me = Matrix_eQTL_main( - snps = snps, - gene = gene, - cvrt = cvrt, - output_file_name = output_file_name_tra, - pvOutputThreshold = pvOutputThreshold_tra, - useModel = useModel, - errorCovariance = errorCovariance, - verbose = TRUE, - output_file_name.cis = output_file_name_cis, - pvOutputThreshold.cis = pvOutputThreshold_cis, - snpspos = snpspos, - genepos = genepos, - cisDist = cisDist, - pvalue.hist = TRUE, - min.pv.by.genesnp = FALSE, - noFDRsaveMemory = FALSE); - -unlink(output_file_name_tra); -unlink(output_file_name_cis); - -## Results: - -cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n'); -cat('Detected local eQTLs:', '\n'); -show(me$cis$eqtls) -cat('Detected distant eQTLs:', '\n'); -show(me$trans$eqtls) - -## Make the histogram of local and distant p-values - -plot(me) -} diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/man/Matrix_eQTL_main.Rd --- a/MatrixEQTL/MatrixEQTL/man/Matrix_eQTL_main.Rd Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,273 +0,0 @@ -\name{Matrix_eQTL_main} -\alias{Matrix_eQTL_main} -\alias{Matrix_eQTL_engine} -\title{ - Main function for fast eQTL analysis in MatrixEQTL package -} -\description{ - \code{Matrix_eQTL_engine} function tests association of every row of the \code{snps} dataset with every row of the \code{gene} dataset using a linear regression model defined by the \code{useModel} parameter (see below). - - The testing procedure accounts for extra covariates in \code{cvrt} parameter. - - The \code{errorCovariance} parameter can be set to the error variance-covariance matrix to account for heteroskedastic and/or correlated errors. - - Associations significant at \code{pvOutputThreshold} (\code{pvOutputThreshold.cis}) levels are saved to \code{output_file_name} (\code{output_file_name.cis}), with corresponding estimates of effect size (slope coefficient), test statistics, p-values, and q-values (false discovery rate). - - Matrix eQTL can perform separate analysis for local (cis) and distant (trans) eQTLs. - For such analysis one has to set the cis-analysis specific parameters \code{pvOutputThreshold.cis > 0}, \code{cisDist}, \code{snpspos} and {genepos} in the call of \code{Matrix_eQTL_main} function. - A gene-SNP pair is considered local if the distance between them is less or equal to \code{cisDist}. - The genomic location of genes and SNPs is defined by data frames \code{snpspos} and {genepos}. - Depending on p-value thresholds \code{pvOutputThreshold} and \code{pvOutputThreshold.cis} Matrix eQTL runs in one of three different modes: - \itemize{ - \item Set \code{pvOutputThreshold > 0} and \code{pvOutputThreshold.cis = 0} (or use \code{Matrix_eQTL_engine}) to perform eQTL analysis without using gene/SNP locations. Associations significant at the \code{pvOutputThreshold} level are be recorded in \code{output_file_name} and in the returned object. - \item Set \code{pvOutputThreshold = 0} and \code{pvOutputThreshold.cis > 0} to perform eQTL analysis for local gene-SNP pairs only. Local associations significant at \code{pvOutputThreshold.cis} level will be recorded in \code{output_file_name.cis} and in the returned object. - \item Set \code{pvOutputThreshold > 0} and \code{pvOutputThreshold.cis > 0} to perform eQTL analysis with separate p-value thresholds for local and distant eQTLs. Distant and local associations significant at corresponding thresholds are recorded in \code{output_file_name} and \code{output_file_name.cis} respectively and in the returned object. In this case the false discovery rate is calculated separately for these two sets of eQTLs. - } - \code{Matrix_eQTL_engine} is a wrapper for \code{Matrix_eQTL_main} for eQTL analysis without regard to gene/SNP location and provided for compatibility with the previous versions of the package. - - The parameter \code{pvalue.hist} allows to record information sufficient to create a histogram or QQ-plot of all the p-values (see \code{\link[=plot.MatrixEQTL]{plot}}). -} -\usage{ -Matrix_eQTL_main( snps, - gene, - cvrt = SlicedData$new(), - output_file_name = "", - pvOutputThreshold = 1e-5, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - output_file_name.cis = "", - pvOutputThreshold.cis = 0, - snpspos = NULL, - genepos = NULL, - cisDist = 1e6, - pvalue.hist = FALSE, - min.pv.by.genesnp = FALSE, - noFDRsaveMemory = FALSE) - -Matrix_eQTL_engine(snps, - gene, - cvrt = SlicedData$new(), - output_file_name, - pvOutputThreshold = 1e-5, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - pvalue.hist = FALSE, - min.pv.by.genesnp = FALSE, - noFDRsaveMemory = FALSE) -} -\arguments{ - \item{snps}{ - \code{\linkS4class{SlicedData}} object with genotype information. - Can be real-valued for linear models and must take at most 3 distinct values for ANOVA unless the number of ANOVA categories is set to a higher number (see \code{useModel} parameter). - } - \item{gene}{ - \code{\linkS4class{SlicedData}} object with gene expression information. - Must have columns matching those of \code{snps}. - } - \item{cvrt}{ - \code{\linkS4class{SlicedData}} object with additional covariates. - Can be an empty \code{SlicedData} object in case of no covariates. - The constant is always included in the model and would cause an error if included in \code{cvrt}. - The order of columns must match those in \code{snps} and \code{gene}. - } - \item{output_file_name}{ - \code{character}, \code{connection}, or \code{NULL}. - If not \code{NULL}, significant associations are saved to this file (all significant associations if \code{pvOutputThreshold=0} or only distant if \code{pvOutputThreshold>0}). - If the file with this name exists, it is overwritten. - } - \item{output_file_name.cis}{ - \code{character}, \code{connection}, or \code{NULL}. - If not \code{NULL}, significant local associations are saved to this file. - If the file with this name exists, it is overwritten. - } - \item{pvOutputThreshold}{ - \code{numeric}. Significance threshold for all/distant tests. - } - \item{pvOutputThreshold.cis}{ - \code{numeric}. Same as \code{pvOutputThreshold}, but for local eQTLs. - } - \item{useModel}{ - \code{integer}. Eigher \code{modelLINEAR}, \code{modelANOVA}, or \code{modelLINEAR_CROSS}. - \enumerate{ - \item Set \code{useModel = \link{modelLINEAR}} to model the effect of the genotype as additive linear and test for its significance using t-statistic. - \item Set \code{useModel = \link{modelANOVA}} to treat genotype as a categorical variables and use ANOVA model and test for its significance using F-test. The default number of ANOVA categories is 3. Set otherwise like this: \code{options(MatrixEQTL.ANOVA.categories=4)}. - \item Set \code{useModel = \link{modelLINEAR_CROSS}} to add a new term to the model - equal to the product of genotype and the last covariate; the significance of this term is then tested using t-statistic. - } - - } - \item{errorCovariance}{ - \code{numeric}. The error covariance matrix. Use \code{numeric()} for homoskedastic independent errors. - } - \item{verbose}{ - \code{logical}. Set to \code{TRUE} to display more detailed report about the progress. - } - \item{snpspos}{ - \code{data.frame} object with information about SNP locations, must have 3 columns - SNP name, chromosome, and position, like this: - \tabular{ccc}{ - snpid \tab chr \tab pos \cr - Snp_01 \tab 1 \tab 721289 \cr - Snp_02 \tab 1 \tab 752565 \cr - \ldots \tab \ldots \tab \ldots \cr - } - } - \item{genepos}{ - \code{data.frame} with information about transcript locations, must have 4 columns - the name, chromosome, and positions of the left and right ends, like this: - \tabular{cccc}{ - geneid \tab chr \tab left \tab right \cr - Gene_01 \tab 1 \tab 721289 \tab 731289 \cr - Gene_02 \tab 1 \tab 752565 \tab 762565 \cr - \ldots \tab \ldots \tab \ldots \tab \ldots \cr - } - } - \item{cisDist}{ - \code{numeric}. SNP-gene pairs within this distance are considered local. The distance is measured from the nearest end of the gene. SNPs within a gene are always considered local. - } - \item{pvalue.hist}{ - \code{logical}, \code{numerical}, or \code{"qqplot"}. - Defines whether and how the distribution of p-values is recorded in the returned object. - If \code{pvalue.hist = FALSE}, the information is not recorded and the analysis is performed faster. - Alternatively, set \code{pvalue.hist = "qqplot"} to record information sufficient to create a QQ-plot of the p-values (use \code{\link[=plot.MatrixEQTL]{plot}} on the returned object to create the plot). - To record information for a histogram set \code{pvalue.hist} to the desired number of bins of equal size. Finally, \code{pvalue.hist} can also be set to a custom set of bin edges. - } - \item{min.pv.by.genesnp}{ - \code{logical}. Set \code{min.pv.by.genesnp = TRUE} to record the minimum p-value for each SNP and each gene in the returned object. The minimum p-values are recorded even if if they are above the corresponding thresholds of \code{pvOutputThreshold} and \code{pvOutputThreshold.cis}. The analysis runs faster when the parameter is set to \code{FALSE}. - } - \item{noFDRsaveMemory}{ - \code{logical}. Set \code{noFDRsaveMemory = TRUE} to save significant gene-SNP pairs directly to the output files, reduce memory footprint and skip FDR calculation. The eQTLs are not recorded in the returned object if \code{noFDRsaveMemory = TRUE}. - } -} -\details{ - Note that the columns of \code{gene}, \code{snps}, and \code{cvrt} must match. - If they do not match in the input files, use \code{ColumnSubsample} method to subset and/or reorder them. -} -\value{ - The detected eQTLs are saved in \code{output_file_name} and/or \code{output_file_name.cis} if they are not \code{NULL}. - The method also returns a list with a summary of the performed analysis. - \item{param}{Keeps all input parameters and also records the number of degrees of freedom for the full model.} - \item{time.in.sec}{Time difference between the start and the end of the analysis (in seconds).} - \item{all}{Information about all detected eQTLs.} - \item{cis}{Information about detected local eQTLs.} - \item{trans}{Information about detected distant eQTLs.} - The elements \code{all}, \code{cis}, and \code{trans} may contain the following components - \describe{ - \item{\code{ntests}}{Total number of tests performed. This is used for FDR calculation.} - \item{\code{eqtls}}{Data frame with recorded significant associations. Not available if \code{noFDRsaveMemory=FALSE}} - \item{\code{neqtls}}{Number of significant associations recorded.} - \item{\code{hist.bins}}{Histogram bins used for recording p-value distribution. See \code{pvalue.hist} parameter.} - \item{\code{hist.counts}}{Number of p-value that fell in each histogram bin. See \code{pvalue.hist} parameter.} - \item{\code{min.pv.snps}}{Vector with the best p-value for each SNP. See \code{min.pv.by.genesnp} parameter.} - \item{\code{min.pv.gene}}{Vector with the best p-value for each gene. See \code{min.pv.by.genesnp} parameter.} - } -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\author{ - Andrey Shabalin \email{ashabalin@vcu.edu} -} -\seealso{ - The code below is the sample code for eQTL analysis NOT using gene/SNP locations. - - See \code{\link{MatrixEQTL_cis_code}} for sample code for eQTL analysis that separates local and distant tests. -} -\examples{ -# Matrix eQTL by Andrey A. Shabalin -# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ -# -# Be sure to use an up to date version of R and Matrix eQTL. - -# source("Matrix_eQTL_R/Matrix_eQTL_engine.r"); -library(MatrixEQTL) - -## Location of the package with the data files. -base.dir = find.package('MatrixEQTL'); - -## Settings - -# Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS -useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS - -# Genotype file name -SNP_file_name = paste(base.dir, "/data/SNP.txt", sep=""); - -# Gene expression file name -expression_file_name = paste(base.dir, "/data/GE.txt", sep=""); - -# Covariates file name -# Set to character() for no covariates -covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep=""); - -# Output file name -output_file_name = tempfile(); - -# Only associations significant at this level will be saved -pvOutputThreshold = 1e-2; - -# Error covariance matrix -# Set to numeric() for identity. -errorCovariance = numeric(); -# errorCovariance = read.table("Sample_Data/errorCovariance.txt"); - - -## Load genotype data - -snps = SlicedData$new(); -snps$fileDelimiter = "\t"; # the TAB character -snps$fileOmitCharacters = "NA"; # denote missing values; -snps$fileSkipRows = 1; # one row of column labels -snps$fileSkipColumns = 1; # one column of row labels -snps$fileSliceSize = 2000; # read file in slices of 2,000 rows -snps$LoadFile(SNP_file_name); - -## Load gene expression data - -gene = SlicedData$new(); -gene$fileDelimiter = "\t"; # the TAB character -gene$fileOmitCharacters = "NA"; # denote missing values; -gene$fileSkipRows = 1; # one row of column labels -gene$fileSkipColumns = 1; # one column of row labels -gene$fileSliceSize = 2000; # read file in slices of 2,000 rows -gene$LoadFile(expression_file_name); - -## Load covariates - -cvrt = SlicedData$new(); -cvrt$fileDelimiter = "\t"; # the TAB character -cvrt$fileOmitCharacters = "NA"; # denote missing values; -cvrt$fileSkipRows = 1; # one row of column labels -cvrt$fileSkipColumns = 1; # one column of row labels -if(length(covariates_file_name)>0) { - cvrt$LoadFile(covariates_file_name); -} - -## Run the analysis - -me = Matrix_eQTL_engine( - snps = snps, - gene = gene, - cvrt = cvrt, - output_file_name = output_file_name, - pvOutputThreshold = pvOutputThreshold, - useModel = useModel, - errorCovariance = errorCovariance, - verbose = TRUE, - pvalue.hist = TRUE, - min.pv.by.genesnp = FALSE, - noFDRsaveMemory = FALSE); - -unlink(output_file_name); - -## Results: - -cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n'); -cat('Detected eQTLs:', '\n'); -show(me$all$eqtls) - -## Plot the histogram of all p-values - -plot(me) - -} diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/man/SNP.Rd --- a/MatrixEQTL/MatrixEQTL/man/SNP.Rd Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -\name{SNP} -\alias{SNP} -\docType{data} -\title{ - Artificial data for Matrix eQTL sample code: Genotype. -} -\description{ - Artificial data set with genotype information for 15 markers across 15 samples. - Columns of the file must match to those of the gene expression and covariates data sets. -} -\format{ - \tabular{llll}{ - snpnid \tab Sam_01 \tab Sam_02 \tab \ldots \cr - Snp_01 \tab 2 \tab 0 \tab \ldots \cr - Snp_02 \tab 0 \tab 1 \tab \ldots \cr - \ldots \tab \ldots \tab \ldots \tab \ldots \cr - } -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} -\keyword{datasets} diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/man/SlicedData-class.Rd --- a/MatrixEQTL/MatrixEQTL/man/SlicedData-class.Rd Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,189 +0,0 @@ -\name{SlicedData-class} -\Rdversion{1.1} -\docType{class} -\alias{SlicedData-class} -\alias{SlicedData} -\alias{SlicedData-class} -\alias{[[,SlicedData-method} -\alias{[[<-,SlicedData-method} -\alias{colnames,SlicedData-method} -\alias{colnames<-,SlicedData-method} -\alias{dim,SlicedData-method} -\alias{length,SlicedData-method} -\alias{ncol,SlicedData-method} -\alias{NCOL,SlicedData-method} -\alias{nrow,SlicedData-method} -\alias{NROW,SlicedData-method} -\alias{rownames,SlicedData-method} -\alias{rownames<-,SlicedData-method} -\alias{show,SlicedData-method} -\alias{as.matrix,SlicedData-method} - -\alias{rowMeans,SlicedData-method} -\alias{rowSums,SlicedData-method} -\alias{colMeans,SlicedData-method} -\alias{colSums,SlicedData-method} -\alias{summary.SlicedData} - -\title{Class \code{SlicedData} for storing large matrices} -\description{ - This class is created for fast and memory efficient manipulations with large datasets presented in matrix form. - It is used to load, store, and manipulate large datasets, e.g. genotype and gene expression matrices. - When a dataset is loaded, it is sliced in blocks of 1,000 rows (default size). - This allows imputing, standardizing, and performing other operations with the data with minimal memory overhead. -} -\section{Extends}{ - \code{SlicedData} is a reference classes (\code{\linkS4class{envRefClass}}). - Its methods can change the values of the fields of the class. -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\author{ - Andrey Shabalin \email{ashabalin@vcu.edu} -} -\seealso{ - This class is used to load data for eQTL analysis by \code{\link{Matrix_eQTL_engine}}. -} -\keyword{classes} -\section{Fields}{ - \describe{ - \item{\code{dataEnv}:}{\code{environment}. Stores the slices of the data matrix. The slices should be accessed via \code{getSlice()} and \code{setSlice()} methods. } - \item{\code{nSlices1}:}{\code{numeric}. Number of slices. For internal use. The value should be access via \code{nSlices()} method. } - \item{\code{rowNameSlices}:}{\code{list}. Slices of row names. } - \item{\code{columnNames}:}{\code{character}. Column names. } - \item{\code{fileDelimiter}:}{\code{character}. Delimiter separating values in the input file. } - \item{\code{fileSkipColumns}:}{\code{numeric}. Number of columns with row labels in the input file. } - \item{\code{fileSkipRows}:}{\code{numeric}. Number of rows with column labels in the input file. } - \item{\code{fileSliceSize}:}{\code{numeric}. Maximum number of rows in a slice. } - \item{\code{fileOmitCharacters}:}{\code{character}. Missing value (NaN) representation in the input file. } - } -} -\section{Methods}{ - \describe{ - \item{\code{initialize(mat)}:}{ Create the object from a matrix. } - \item{\code{nSlices()}:}{ Returns the number of slices. } - \item{\code{nCols()}:}{ Returns the number of columns in the matrix. } - \item{\code{nRows()}:}{ Returns the number of rows in the matrix. } - \item{\code{Clear()}:}{ Clears the object. Removes the data slices and row and column names. } - \item{\code{Clone()}:}{ Makes a copy of the object. Changes to the copy do not affect the source object. } - \item{\code{CreateFromMatrix(mat)}:}{ Creates \code{SlicedData} object from a \code{\link[base]{matrix}}.} - \item{\code{LoadFile(filename, skipRows = NULL, skipColumns = NULL,} \cr - \code{sliceSize = NULL, omitCharacters = NULL, delimiter = NULL, rowNamesColumn = 1)}:}{ Loads data matrix from a file. \code{filename} should be a character string. The remaining parameters specify the file format and have the same meaning as \code{file*} fields. Additional \code{rowNamesColumn} parameter specifies which of the columns of row labels to use as row names.} - \item{\code{SaveFile(filename)}:}{ Saves the data to a file. \code{filename} should be a character string.} - \item{\code{getSlice(sl)}:}{ Retrieves \code{sl}-th slice of the matrix. } - \item{\code{setSlice(sl, value)}:}{ Set \code{sl}-th slice of the matrix. } - \item{\code{ColumnSubsample(subset)}:}{ Reorders/subsets the columns according to \code{subset}. \cr -Acts as \code{M = M[ ,subset]} for a matrix \code{M}. } - \item{\code{RowReorder(ordr)}:}{ Reorders rows according to \code{ordr}. \cr -Acts as \code{M = M[ordr, ]} for a matrix \code{M}. } - \item{\code{RowMatrixMultiply(multiplier)}:}{ Multiply each row by the \code{multiplier}. \cr -Acts as \code{M = M \%*\% multiplier} for a matrix \code{M}. } - \item{\code{CombineInOneSlice()}:}{ Combines all slices into one. The whole matrix can then be obtained via \code{$getSlice(1)}. } - \item{\code{IsCombined()}:}{ Returns \code{TRUE} if the number of slices is 1 or 0. } - \item{\code{ResliceCombined(sliceSize = -1)}:}{ Cuts the data into slices of \code{sliceSize} rows. If \code{sliceSize} is not defined, the value of \code{fileSliceSize} field is used.} - \item{\code{GetAllRowNames()}:}{ Returns all row names in one vector. } - \item{\code{RowStandardizeCentered()}:}{ Set the mean of each row to zero and the sum of squares to one. } - \item{\code{SetNanRowMean()}:}{ Impute rows with row mean. Rows full of NaN values are imputed with zeros. } - \item{\code{RowRemoveZeroEps()}:}{ Removes rows of zeros and those that are nearly zero. } - \item{\code{FindRow(rowname)}:}{ Finds row by name. Returns a pair of slice number an row number within the slice. If no row is found, the function returns \code{NULL}. } - \item{\code{rowMeans(x, na.rm = FALSE, dims = 1L)}:}{Returns a vector of row means. Works as \link[base]{rowMeans} but requires \code{dims} to be equal to \code{1L}.} - \item{\code{rowSums(x, na.rm = FALSE, dims = 1L)}:}{Returns a vector of row sums. Works as \link[base]{rowSums} but requires \code{dims} to be equal to \code{1L}.} - \item{\code{colMeans(x, na.rm = FALSE, dims = 1L)}:}{Returns a vector of column means. Works as \link[base]{colMeans} but requires \code{dims} to be equal to \code{1L}.} - \item{\code{colSums(x, na.rm = FALSE, dims = 1L)}:}{Returns a vector of column sums. Works as \link[base]{colSums} but requires \code{dims} to be equal to \code{1L}.} - } -} - -\usage{ -# x[[i]] indexing allows easy access to individual slices. -# It is equivalent to x$GetSlice(i) and x$SetSlice(i,value) -\S4method{[[}{SlicedData}(x, i) -\S4method{[[}{SlicedData}(x, i) <- value - -# The following commands work as if x was a simple matrix object -\S4method{nrow}{SlicedData}(x) -\S4method{ncol}{SlicedData}(x) -\S4method{dim}{SlicedData}(x) -\S4method{rownames}{SlicedData}(x) -\S4method{colnames}{SlicedData}(x) -\S4method{rownames}{SlicedData}(x) <- value -\S4method{colnames}{SlicedData}(x) <- value - -# SlicedData object can be easily transformed into a matrix -# preserving row and column names -\S4method{as.matrix}{SlicedData}(x) - -# length(x) can be used in place of x$nSlices() -# to get the number of slices in the object -\S4method{length}{SlicedData}(x) -} -\arguments{ - \item{x}{ - \code{\linkS4class{SlicedData}} object. - } - \item{i}{ - Number of a slice. - } - \item{value}{ - New content for the slice / new row or column names. - } -} - - -\examples{ - -# Create a SlicedData variable -sd = SlicedData$new(); - -# Show the details of the empty object -show(sd); - -# Create a matrix of values and assign to sd -mat = matrix(1:12, 3, 4); -rownames(mat) = c("row1","row2","row3"); -colnames(mat) = c("col1","col2","col3","col4"); -sd$CreateFromMatrix( mat ); - -# Show the detail of the object (one slice) -show(sd); - -# Slice it in pieces of 2 rows -sd$ResliceCombined(sliceSize = 2L); - -# Show the number of slices (equivalent function calls) -sd$nSlices() -length(sd) - -# Is it all in one slice? (No) -sd$IsCombined() - -# Show the column names (equivalent function calls) -sd$columnNames -colnames(sd) - -# Show row name slices -sd$rowNameSlices - -# Show all row names (equivalent function calls) -sd$GetAllRowNames() -rownames(sd) - -# Print the second slice -print( sd[[2]] ) - -# Reorder and subset columns -sd$ColumnSubsample( c(1,3,4) ); - -# Reorder and subset rows -sd$RowReorder( c(3,1) ); - -# Show the detail of the object (one slice again) -show(sd); - -# Is it all in one slice? (Yes) -sd$IsCombined() - -# Find the row with name "row1" (it is second in the first slice) -sd$FindRow("row1"); - -} \ No newline at end of file diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/man/figures/QQplot.png Binary file MatrixEQTL/MatrixEQTL/man/figures/QQplot.png has changed diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/man/figures/histogram.png Binary file MatrixEQTL/MatrixEQTL/man/figures/histogram.png has changed diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/man/geneloc.Rd --- a/MatrixEQTL/MatrixEQTL/man/geneloc.Rd Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ -\name{geneloc} -\alias{geneloc} -\docType{data} -\title{ - Artificial data for Matrix eQTL sample code: Gene location file. -} -\description{ - Artificial Gene location file for 10 genes. - \tabular{llll}{ - geneid \tab chr \tab left \tab right \cr - Gene_01 \tab chr1 \tab 721289 \tab 731289 \cr - Gene_02 \tab chr1 \tab 752565 \tab 762565 \cr - \ldots \tab \ldots \tab \ldots \tab \ldots \cr - } -} -\format{ - A data frame with 4 columns. - \describe{ - \item{\code{geneid}}{A column with gene names. The order does not have to match the gene expression data set.} - \item{\code{chr}}{Chromosome number, i.e. chr1.} - \item{\code{left}}{Lower (smaller) coordinate of the gene begining/end.} - \item{\code{right}}{Upper (larger) coordinate of the gene begining/end.} - } -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} -\keyword{datasets} diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/man/modelANOVA.Rd --- a/MatrixEQTL/MatrixEQTL/man/modelANOVA.Rd Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,88 +0,0 @@ -\name{modelANOVA} -\alias{modelANOVA} -\docType{data} -\title{ - Constant for \code{\link{Matrix_eQTL_engine}}. -} -\description{ - Set parameter \code{useModel = modelANOVA} in the call of \code{\link{Matrix_eQTL_main}} to indicate that the genotype should be treated as a categorical variable. -} -\note{ -By default, the number of ANOVA categories is fixed to be 3. - -To set it to a different number (say, 4) use the following command: - -\code{options(MatrixEQTL.ANOVA.categories=4)} - -To check the current settings run: - -\code{getOption('MatrixEQTL.ANOVA.categories', 3);} -} -\examples{ -library('MatrixEQTL') - -# Number of columns (samples) -n = 100; - -# Number of covariates -nc = 10; - -# Generate the standard deviation of the noise -noise.std = 0.1 + rnorm(n)^2; - -# Generate the covariates -cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); - -# Generate the vectors with single genotype and expression variables -snps.mat = floor(runif(n, min = 0, max = 3)); -gene.mat = 1 + (snps.mat==1) + cvrt.mat \%*\% rnorm(nc) + rnorm(n) * noise.std; - -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new( t(cvrt.mat) ); - -# name of temporary output file -filename = tempfile(); - -snps1 -gene1 - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelANOVA, - errorCovariance = diag(noise.std^2), - verbose = TRUE, - pvalue.hist = FALSE ); -# remove the output file -unlink( filename ); - -# Pull Matrix eQTL results - t-statistic and p-value - -fstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c( Fstat = fstat, pvalue = pvalue) -# And compare to those from ANOVA in R -{ - cat('\n\n Matrix eQTL: \n'); - print(rez); - cat('\n R anova(lm()) output: \n') - lmodel = lm( gene.mat ~ cvrt.mat + factor(snps.mat), weights = 1/noise.std^2 ); - lmout = anova( lmodel )[2, 4:5]; - print( lmout ) -} - -# Results from Matrix eQTL and 'lm' must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)) -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/man/modelLINEAR.Rd --- a/MatrixEQTL/MatrixEQTL/man/modelLINEAR.Rd Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ -\name{modelLINEAR} -\alias{modelLINEAR} -\docType{data} -\title{ - Constant for \code{\link{Matrix_eQTL_engine}}. -} -\description{ - Set parameter \code{useModel = modelLINEAR} in the call of \code{\link{Matrix_eQTL_main}} to indicate that the effect of genotype on expression should be assumed to be additive linear. -} - -\examples{ -library('MatrixEQTL') - -# Number of columns (samples) -n = 100; - -# Number of covariates -nc = 10; - -# Generate the standard deviation of the noise -noise.std = 0.1 + rnorm(n)^2; - -# Generate the covariates -cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); - -# Generate the vectors with genotype and expression variables -snps.mat = cvrt.mat \%*\% rnorm(nc) + rnorm(n); -gene.mat = cvrt.mat \%*\% rnorm(nc) + rnorm(n) * noise.std + 0.5 * snps.mat + 1; - -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new( t(cvrt.mat) ); - -# name of temporary output file -filename = tempfile(); - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelLINEAR, - errorCovariance = diag(noise.std^2), - verbose = TRUE, - pvalue.hist = FALSE ); -# remove the output file -unlink( filename ); - -# Pull Matrix eQTL results - t-statistic and p-value -beta = me$all$eqtls$beta; -tstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c(beta = beta, tstat = tstat, pvalue = pvalue) -# And compare to those from the linear regression in R -{ - cat('\n\n Matrix eQTL: \n'); - print(rez); - cat('\n R summary(lm()) output: \n'); - lmodel = lm( gene.mat ~ snps.mat + cvrt.mat, weights = 1/noise.std^2 ); - lmout = summary( lmodel )$coefficients[2, c(1,3,4)]; - print( lmout ) -} - -# Results from Matrix eQTL and 'lm' must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)) -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/man/modelLINEAR_CROSS.Rd --- a/MatrixEQTL/MatrixEQTL/man/modelLINEAR_CROSS.Rd Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ -\name{modelLINEAR_CROSS} -\alias{modelLINEAR_CROSS} -\docType{data} -\title{ - Constant for \code{\link{Matrix_eQTL_engine}}. -} -\description{ - Set parameter \code{useModel = modelLINEAR_CROSS} in the call of \code{\link{Matrix_eQTL_main}} to indicate that Matrix eQTL should include the interaction of SNP and last covariate in the model and test for its significance. -} -\examples{ -library('MatrixEQTL') - -# Number of columns (samples) -n = 25; - -# Number of covariates -nc = 10; - -# Generate the standard deviation of the noise -noise.std = 0.1 + rnorm(n)^2; - -# Generate the covariates -cvrt.mat = 2 + matrix(rnorm(n*nc), ncol = nc); - -# Generate the vectors with single genotype and expression variables -snps.mat = cvrt.mat \%*\% rnorm(nc) + rnorm(n); -gene.mat = cvrt.mat \%*\% rnorm(nc) + rnorm(n) * noise.std + - 1 + 0.5 * snps.mat + snps.mat * cvrt.mat[,nc]; - -# Create 3 SlicedData objects for the analysis -snps1 = SlicedData$new( matrix( snps.mat, nrow = 1 ) ); -gene1 = SlicedData$new( matrix( gene.mat, nrow = 1 ) ); -cvrt1 = SlicedData$new( t(cvrt.mat) ); - -# name of temporary output file -filename = tempfile(); - -# Call the main analysis function -me = Matrix_eQTL_main( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1, - useModel = modelLINEAR_CROSS, - errorCovariance = diag(noise.std^2), - verbose = TRUE, - pvalue.hist = FALSE ); -# remove the output file -unlink( filename ); - -# Pull Matrix eQTL results - t-statistic and p-value -beta = me$all$eqtls$beta; -tstat = me$all$eqtls$statistic; -pvalue = me$all$eqtls$pvalue; -rez = c(beta = beta, tstat = tstat, pvalue = pvalue) -# And compare to those from the linear regression in R -{ - cat('\n\n Matrix eQTL: \n'); - print(rez); - cat('\n R summary(lm()) output: \n') - lmodel = lm( gene.mat ~ snps.mat + cvrt.mat + snps.mat*cvrt.mat[,nc], - weights = 1/noise.std^2 ); - lmout = tail(summary( lmodel )$coefficients,1)[,c(1,3,4)]; - print( tail(lmout) ); -} - -# Results from Matrix eQTL and 'lm' must agree -stopifnot(all.equal(lmout, rez, check.attributes=FALSE)) -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/man/plot.MatrixEQTL.Rd --- a/MatrixEQTL/MatrixEQTL/man/plot.MatrixEQTL.Rd Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,112 +0,0 @@ -\name{plot.MatrixEQTL} -\alias{plot.MatrixEQTL} -\title{Plot histogram or QQ-plot of all p-values -} -\description{ -This method plots a histogram or QQ-plot of p-values -for all tests performed by \code{\link{Matrix_eQTL_engine}}. -} -\usage{ -\method{plot}{MatrixEQTL}(x, cex = 0.5, pch = 19, xlim = NULL, ylim = NULL,...) -} -\arguments{ - \item{x}{ - An object returned by \code{\link{Matrix_eQTL_engine}}. - } - \item{cex}{ - A numerical value giving the amount by which plotting text and symbols should be magnified relative to the default. - } - \item{pch}{ - Plotting "character", i.e., symbol to use. See \code{\link[graphics]{points}}. - } - \item{xlim}{ - Set the range of the horisontal axis. - } - \item{ylim}{ - Set the range of the vertical axis. - } - \item{\dots}{ - further graphical parameters passed to \code{\link[graphics]{lines}} and \code{\link[graphics]{points}}. - } -} -\details{ - The plot type (histogram vs. QQ-plot) is determined by the \code{pvalue.hist} parameter in the call of \code{\link{Matrix_eQTL_engine}} function. -} -\value{ - The method does not return any value. -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} -\note{ - The sample code below produces figures like these: - - Histogram: \figure{histogram.png} - - QQ-plot: \figure{QQplot.png} -} -\author{ - Andrey Shabalin \email{ashabalin@vcu.edu} -} -\examples{ -library(MatrixEQTL) - -# Number of samples -n = 100; - -# Number of variables -ngs = 2000; - -# Common signal in all variables -pop = 0.2*rnorm(n); - -# data matrices -snps.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop; -gene.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop + snps.mat*((1:ngs)/ngs)^9/2; - -# data objects for Matrix eQTL engine -snps1 = SlicedData$new( t( snps.mat ) ); -gene1 = SlicedData$new( t( gene.mat ) ); -cvrt1 = SlicedData$new( ); -rm(snps.mat, gene.mat) - -# Slice data in blocks of 500 variables -snps1$ResliceCombined(500); -gene1$ResliceCombined(500); - -# Produce no output files -filename = NULL; # tempfile() - -# Perform analysis recording information for a histogram -meh = Matrix_eQTL_engine( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1e-100, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - pvalue.hist = 100); - -plot(meh, col="grey") - -# Perform analysis recording information for a QQ-plot -meq = Matrix_eQTL_engine( - snps = snps1, - gene = gene1, - cvrt = cvrt1, - output_file_name = filename, - pvOutputThreshold = 1e-6, - useModel = modelLINEAR, - errorCovariance = numeric(), - verbose = TRUE, - pvalue.hist = "qqplot"); - -plot(meq) -} -\keyword{ QQ-plot } -\keyword{ histogram } diff -r 2d1118ddc31d -r 5fafba5359eb MatrixEQTL/MatrixEQTL/man/snpsloc.Rd --- a/MatrixEQTL/MatrixEQTL/man/snpsloc.Rd Fri Mar 12 08:14:04 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ -\name{snpsloc} -\alias{snpsloc} -\docType{data} -\title{ - Artificial data for Matrix eQTL sample code: SNP location file. -} -\description{ - Artificial SNP location file for 15 markers. - \tabular{llll}{ - snpid \tab chr \tab pos \cr - Snp_01 \tab chr1 \tab 721289 \cr - Snp_02 \tab chr1 \tab 752565 \cr - \ldots \tab \ldots \tab \ldots \cr - } -} -\format{ - A data frame with 3 columns. - \describe{ - \item{\code{snpid}}{A column with SNP names. The order does not have to match the genotype data set.} - \item{\code{chr}}{Chromosome number, i.e. chr1.} - \item{\code{pos}}{Coordinate of the SNP.} - } -} -\references{ - The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} -} -\seealso{ - See \code{\link{Matrix_eQTL_engine}} for reference and sample code. -} -\keyword{datasets}