Mercurial > repos > jasonxu > matrixeqtl
diff MatrixEQTL/man/Matrix_eQTL_main.Rd @ 0:cd4c8e4a4b5b draft
Uploaded
author | jasonxu |
---|---|
date | Fri, 12 Mar 2021 08:12:46 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MatrixEQTL/man/Matrix_eQTL_main.Rd Fri Mar 12 08:12:46 2021 +0000 @@ -0,0 +1,273 @@ +\name{Matrix_eQTL_main} +\alias{Matrix_eQTL_main} +\alias{Matrix_eQTL_engine} +\title{ + Main function for fast eQTL analysis in MatrixEQTL package +} +\description{ + \code{Matrix_eQTL_engine} function tests association of every row of the \code{snps} dataset with every row of the \code{gene} dataset using a linear regression model defined by the \code{useModel} parameter (see below). + + The testing procedure accounts for extra covariates in \code{cvrt} parameter. + + The \code{errorCovariance} parameter can be set to the error variance-covariance matrix to account for heteroskedastic and/or correlated errors. + + Associations significant at \code{pvOutputThreshold} (\code{pvOutputThreshold.cis}) levels are saved to \code{output_file_name} (\code{output_file_name.cis}), with corresponding estimates of effect size (slope coefficient), test statistics, p-values, and q-values (false discovery rate). + + Matrix eQTL can perform separate analysis for local (cis) and distant (trans) eQTLs. + For such analysis one has to set the cis-analysis specific parameters \code{pvOutputThreshold.cis > 0}, \code{cisDist}, \code{snpspos} and {genepos} in the call of \code{Matrix_eQTL_main} function. + A gene-SNP pair is considered local if the distance between them is less or equal to \code{cisDist}. + The genomic location of genes and SNPs is defined by data frames \code{snpspos} and {genepos}. + Depending on p-value thresholds \code{pvOutputThreshold} and \code{pvOutputThreshold.cis} Matrix eQTL runs in one of three different modes: + \itemize{ + \item Set \code{pvOutputThreshold > 0} and \code{pvOutputThreshold.cis = 0} (or use \code{Matrix_eQTL_engine}) to perform eQTL analysis without using gene/SNP locations. Associations significant at the \code{pvOutputThreshold} level are be recorded in \code{output_file_name} and in the returned object. + \item Set \code{pvOutputThreshold = 0} and \code{pvOutputThreshold.cis > 0} to perform eQTL analysis for local gene-SNP pairs only. Local associations significant at \code{pvOutputThreshold.cis} level will be recorded in \code{output_file_name.cis} and in the returned object. + \item Set \code{pvOutputThreshold > 0} and \code{pvOutputThreshold.cis > 0} to perform eQTL analysis with separate p-value thresholds for local and distant eQTLs. Distant and local associations significant at corresponding thresholds are recorded in \code{output_file_name} and \code{output_file_name.cis} respectively and in the returned object. In this case the false discovery rate is calculated separately for these two sets of eQTLs. + } + \code{Matrix_eQTL_engine} is a wrapper for \code{Matrix_eQTL_main} for eQTL analysis without regard to gene/SNP location and provided for compatibility with the previous versions of the package. + + The parameter \code{pvalue.hist} allows to record information sufficient to create a histogram or QQ-plot of all the p-values (see \code{\link[=plot.MatrixEQTL]{plot}}). +} +\usage{ +Matrix_eQTL_main( snps, + gene, + cvrt = SlicedData$new(), + output_file_name = "", + pvOutputThreshold = 1e-5, + useModel = modelLINEAR, + errorCovariance = numeric(), + verbose = TRUE, + output_file_name.cis = "", + pvOutputThreshold.cis = 0, + snpspos = NULL, + genepos = NULL, + cisDist = 1e6, + pvalue.hist = FALSE, + min.pv.by.genesnp = FALSE, + noFDRsaveMemory = FALSE) + +Matrix_eQTL_engine(snps, + gene, + cvrt = SlicedData$new(), + output_file_name, + pvOutputThreshold = 1e-5, + useModel = modelLINEAR, + errorCovariance = numeric(), + verbose = TRUE, + pvalue.hist = FALSE, + min.pv.by.genesnp = FALSE, + noFDRsaveMemory = FALSE) +} +\arguments{ + \item{snps}{ + \code{\linkS4class{SlicedData}} object with genotype information. + Can be real-valued for linear models and must take at most 3 distinct values for ANOVA unless the number of ANOVA categories is set to a higher number (see \code{useModel} parameter). + } + \item{gene}{ + \code{\linkS4class{SlicedData}} object with gene expression information. + Must have columns matching those of \code{snps}. + } + \item{cvrt}{ + \code{\linkS4class{SlicedData}} object with additional covariates. + Can be an empty \code{SlicedData} object in case of no covariates. + The constant is always included in the model and would cause an error if included in \code{cvrt}. + The order of columns must match those in \code{snps} and \code{gene}. + } + \item{output_file_name}{ + \code{character}, \code{connection}, or \code{NULL}. + If not \code{NULL}, significant associations are saved to this file (all significant associations if \code{pvOutputThreshold=0} or only distant if \code{pvOutputThreshold>0}). + If the file with this name exists, it is overwritten. + } + \item{output_file_name.cis}{ + \code{character}, \code{connection}, or \code{NULL}. + If not \code{NULL}, significant local associations are saved to this file. + If the file with this name exists, it is overwritten. + } + \item{pvOutputThreshold}{ + \code{numeric}. Significance threshold for all/distant tests. + } + \item{pvOutputThreshold.cis}{ + \code{numeric}. Same as \code{pvOutputThreshold}, but for local eQTLs. + } + \item{useModel}{ + \code{integer}. Eigher \code{modelLINEAR}, \code{modelANOVA}, or \code{modelLINEAR_CROSS}. + \enumerate{ + \item Set \code{useModel = \link{modelLINEAR}} to model the effect of the genotype as additive linear and test for its significance using t-statistic. + \item Set \code{useModel = \link{modelANOVA}} to treat genotype as a categorical variables and use ANOVA model and test for its significance using F-test. The default number of ANOVA categories is 3. Set otherwise like this: \code{options(MatrixEQTL.ANOVA.categories=4)}. + \item Set \code{useModel = \link{modelLINEAR_CROSS}} to add a new term to the model + equal to the product of genotype and the last covariate; the significance of this term is then tested using t-statistic. + } + + } + \item{errorCovariance}{ + \code{numeric}. The error covariance matrix. Use \code{numeric()} for homoskedastic independent errors. + } + \item{verbose}{ + \code{logical}. Set to \code{TRUE} to display more detailed report about the progress. + } + \item{snpspos}{ + \code{data.frame} object with information about SNP locations, must have 3 columns - SNP name, chromosome, and position, like this: + \tabular{ccc}{ + snpid \tab chr \tab pos \cr + Snp_01 \tab 1 \tab 721289 \cr + Snp_02 \tab 1 \tab 752565 \cr + \ldots \tab \ldots \tab \ldots \cr + } + } + \item{genepos}{ + \code{data.frame} with information about transcript locations, must have 4 columns - the name, chromosome, and positions of the left and right ends, like this: + \tabular{cccc}{ + geneid \tab chr \tab left \tab right \cr + Gene_01 \tab 1 \tab 721289 \tab 731289 \cr + Gene_02 \tab 1 \tab 752565 \tab 762565 \cr + \ldots \tab \ldots \tab \ldots \tab \ldots \cr + } + } + \item{cisDist}{ + \code{numeric}. SNP-gene pairs within this distance are considered local. The distance is measured from the nearest end of the gene. SNPs within a gene are always considered local. + } + \item{pvalue.hist}{ + \code{logical}, \code{numerical}, or \code{"qqplot"}. + Defines whether and how the distribution of p-values is recorded in the returned object. + If \code{pvalue.hist = FALSE}, the information is not recorded and the analysis is performed faster. + Alternatively, set \code{pvalue.hist = "qqplot"} to record information sufficient to create a QQ-plot of the p-values (use \code{\link[=plot.MatrixEQTL]{plot}} on the returned object to create the plot). + To record information for a histogram set \code{pvalue.hist} to the desired number of bins of equal size. Finally, \code{pvalue.hist} can also be set to a custom set of bin edges. + } + \item{min.pv.by.genesnp}{ + \code{logical}. Set \code{min.pv.by.genesnp = TRUE} to record the minimum p-value for each SNP and each gene in the returned object. The minimum p-values are recorded even if if they are above the corresponding thresholds of \code{pvOutputThreshold} and \code{pvOutputThreshold.cis}. The analysis runs faster when the parameter is set to \code{FALSE}. + } + \item{noFDRsaveMemory}{ + \code{logical}. Set \code{noFDRsaveMemory = TRUE} to save significant gene-SNP pairs directly to the output files, reduce memory footprint and skip FDR calculation. The eQTLs are not recorded in the returned object if \code{noFDRsaveMemory = TRUE}. + } +} +\details{ + Note that the columns of \code{gene}, \code{snps}, and \code{cvrt} must match. + If they do not match in the input files, use \code{ColumnSubsample} method to subset and/or reorder them. +} +\value{ + The detected eQTLs are saved in \code{output_file_name} and/or \code{output_file_name.cis} if they are not \code{NULL}. + The method also returns a list with a summary of the performed analysis. + \item{param}{Keeps all input parameters and also records the number of degrees of freedom for the full model.} + \item{time.in.sec}{Time difference between the start and the end of the analysis (in seconds).} + \item{all}{Information about all detected eQTLs.} + \item{cis}{Information about detected local eQTLs.} + \item{trans}{Information about detected distant eQTLs.} + The elements \code{all}, \code{cis}, and \code{trans} may contain the following components + \describe{ + \item{\code{ntests}}{Total number of tests performed. This is used for FDR calculation.} + \item{\code{eqtls}}{Data frame with recorded significant associations. Not available if \code{noFDRsaveMemory=FALSE}} + \item{\code{neqtls}}{Number of significant associations recorded.} + \item{\code{hist.bins}}{Histogram bins used for recording p-value distribution. See \code{pvalue.hist} parameter.} + \item{\code{hist.counts}}{Number of p-value that fell in each histogram bin. See \code{pvalue.hist} parameter.} + \item{\code{min.pv.snps}}{Vector with the best p-value for each SNP. See \code{min.pv.by.genesnp} parameter.} + \item{\code{min.pv.gene}}{Vector with the best p-value for each gene. See \code{min.pv.by.genesnp} parameter.} + } +} +\references{ + The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} +} +\author{ + Andrey Shabalin \email{ashabalin@vcu.edu} +} +\seealso{ + The code below is the sample code for eQTL analysis NOT using gene/SNP locations. + + See \code{\link{MatrixEQTL_cis_code}} for sample code for eQTL analysis that separates local and distant tests. +} +\examples{ +# Matrix eQTL by Andrey A. Shabalin +# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ +# +# Be sure to use an up to date version of R and Matrix eQTL. + +# source("Matrix_eQTL_R/Matrix_eQTL_engine.r"); +library(MatrixEQTL) + +## Location of the package with the data files. +base.dir = find.package('MatrixEQTL'); + +## Settings + +# Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS +useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS + +# Genotype file name +SNP_file_name = paste(base.dir, "/data/SNP.txt", sep=""); + +# Gene expression file name +expression_file_name = paste(base.dir, "/data/GE.txt", sep=""); + +# Covariates file name +# Set to character() for no covariates +covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep=""); + +# Output file name +output_file_name = tempfile(); + +# Only associations significant at this level will be saved +pvOutputThreshold = 1e-2; + +# Error covariance matrix +# Set to numeric() for identity. +errorCovariance = numeric(); +# errorCovariance = read.table("Sample_Data/errorCovariance.txt"); + + +## Load genotype data + +snps = SlicedData$new(); +snps$fileDelimiter = "\t"; # the TAB character +snps$fileOmitCharacters = "NA"; # denote missing values; +snps$fileSkipRows = 1; # one row of column labels +snps$fileSkipColumns = 1; # one column of row labels +snps$fileSliceSize = 2000; # read file in slices of 2,000 rows +snps$LoadFile(SNP_file_name); + +## Load gene expression data + +gene = SlicedData$new(); +gene$fileDelimiter = "\t"; # the TAB character +gene$fileOmitCharacters = "NA"; # denote missing values; +gene$fileSkipRows = 1; # one row of column labels +gene$fileSkipColumns = 1; # one column of row labels +gene$fileSliceSize = 2000; # read file in slices of 2,000 rows +gene$LoadFile(expression_file_name); + +## Load covariates + +cvrt = SlicedData$new(); +cvrt$fileDelimiter = "\t"; # the TAB character +cvrt$fileOmitCharacters = "NA"; # denote missing values; +cvrt$fileSkipRows = 1; # one row of column labels +cvrt$fileSkipColumns = 1; # one column of row labels +if(length(covariates_file_name)>0) { + cvrt$LoadFile(covariates_file_name); +} + +## Run the analysis + +me = Matrix_eQTL_engine( + snps = snps, + gene = gene, + cvrt = cvrt, + output_file_name = output_file_name, + pvOutputThreshold = pvOutputThreshold, + useModel = useModel, + errorCovariance = errorCovariance, + verbose = TRUE, + pvalue.hist = TRUE, + min.pv.by.genesnp = FALSE, + noFDRsaveMemory = FALSE); + +unlink(output_file_name); + +## Results: + +cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n'); +cat('Detected eQTLs:', '\n'); +show(me$all$eqtls) + +## Plot the histogram of all p-values + +plot(me) + +}