Mercurial > repos > jasonxu > matrixeqtl
comparison MatrixEQTL/man/Matrix_eQTL_main.Rd @ 3:ae74f8fb3aef draft
Uploaded
| author | jasonxu |
|---|---|
| date | Fri, 12 Mar 2021 08:20:57 +0000 |
| parents | cd4c8e4a4b5b |
| children |
comparison
equal
deleted
inserted
replaced
| 2:5fafba5359eb | 3:ae74f8fb3aef |
|---|---|
| 1 \name{Matrix_eQTL_main} | |
| 2 \alias{Matrix_eQTL_main} | |
| 3 \alias{Matrix_eQTL_engine} | |
| 4 \title{ | |
| 5 Main function for fast eQTL analysis in MatrixEQTL package | |
| 6 } | |
| 7 \description{ | |
| 8 \code{Matrix_eQTL_engine} function tests association of every row of the \code{snps} dataset with every row of the \code{gene} dataset using a linear regression model defined by the \code{useModel} parameter (see below). | |
| 9 | |
| 10 The testing procedure accounts for extra covariates in \code{cvrt} parameter. | |
| 11 | |
| 12 The \code{errorCovariance} parameter can be set to the error variance-covariance matrix to account for heteroskedastic and/or correlated errors. | |
| 13 | |
| 14 Associations significant at \code{pvOutputThreshold} (\code{pvOutputThreshold.cis}) levels are saved to \code{output_file_name} (\code{output_file_name.cis}), with corresponding estimates of effect size (slope coefficient), test statistics, p-values, and q-values (false discovery rate). | |
| 15 | |
| 16 Matrix eQTL can perform separate analysis for local (cis) and distant (trans) eQTLs. | |
| 17 For such analysis one has to set the cis-analysis specific parameters \code{pvOutputThreshold.cis > 0}, \code{cisDist}, \code{snpspos} and {genepos} in the call of \code{Matrix_eQTL_main} function. | |
| 18 A gene-SNP pair is considered local if the distance between them is less or equal to \code{cisDist}. | |
| 19 The genomic location of genes and SNPs is defined by data frames \code{snpspos} and {genepos}. | |
| 20 Depending on p-value thresholds \code{pvOutputThreshold} and \code{pvOutputThreshold.cis} Matrix eQTL runs in one of three different modes: | |
| 21 \itemize{ | |
| 22 \item Set \code{pvOutputThreshold > 0} and \code{pvOutputThreshold.cis = 0} (or use \code{Matrix_eQTL_engine}) to perform eQTL analysis without using gene/SNP locations. Associations significant at the \code{pvOutputThreshold} level are be recorded in \code{output_file_name} and in the returned object. | |
| 23 \item Set \code{pvOutputThreshold = 0} and \code{pvOutputThreshold.cis > 0} to perform eQTL analysis for local gene-SNP pairs only. Local associations significant at \code{pvOutputThreshold.cis} level will be recorded in \code{output_file_name.cis} and in the returned object. | |
| 24 \item Set \code{pvOutputThreshold > 0} and \code{pvOutputThreshold.cis > 0} to perform eQTL analysis with separate p-value thresholds for local and distant eQTLs. Distant and local associations significant at corresponding thresholds are recorded in \code{output_file_name} and \code{output_file_name.cis} respectively and in the returned object. In this case the false discovery rate is calculated separately for these two sets of eQTLs. | |
| 25 } | |
| 26 \code{Matrix_eQTL_engine} is a wrapper for \code{Matrix_eQTL_main} for eQTL analysis without regard to gene/SNP location and provided for compatibility with the previous versions of the package. | |
| 27 | |
| 28 The parameter \code{pvalue.hist} allows to record information sufficient to create a histogram or QQ-plot of all the p-values (see \code{\link[=plot.MatrixEQTL]{plot}}). | |
| 29 } | |
| 30 \usage{ | |
| 31 Matrix_eQTL_main( snps, | |
| 32 gene, | |
| 33 cvrt = SlicedData$new(), | |
| 34 output_file_name = "", | |
| 35 pvOutputThreshold = 1e-5, | |
| 36 useModel = modelLINEAR, | |
| 37 errorCovariance = numeric(), | |
| 38 verbose = TRUE, | |
| 39 output_file_name.cis = "", | |
| 40 pvOutputThreshold.cis = 0, | |
| 41 snpspos = NULL, | |
| 42 genepos = NULL, | |
| 43 cisDist = 1e6, | |
| 44 pvalue.hist = FALSE, | |
| 45 min.pv.by.genesnp = FALSE, | |
| 46 noFDRsaveMemory = FALSE) | |
| 47 | |
| 48 Matrix_eQTL_engine(snps, | |
| 49 gene, | |
| 50 cvrt = SlicedData$new(), | |
| 51 output_file_name, | |
| 52 pvOutputThreshold = 1e-5, | |
| 53 useModel = modelLINEAR, | |
| 54 errorCovariance = numeric(), | |
| 55 verbose = TRUE, | |
| 56 pvalue.hist = FALSE, | |
| 57 min.pv.by.genesnp = FALSE, | |
| 58 noFDRsaveMemory = FALSE) | |
| 59 } | |
| 60 \arguments{ | |
| 61 \item{snps}{ | |
| 62 \code{\linkS4class{SlicedData}} object with genotype information. | |
| 63 Can be real-valued for linear models and must take at most 3 distinct values for ANOVA unless the number of ANOVA categories is set to a higher number (see \code{useModel} parameter). | |
| 64 } | |
| 65 \item{gene}{ | |
| 66 \code{\linkS4class{SlicedData}} object with gene expression information. | |
| 67 Must have columns matching those of \code{snps}. | |
| 68 } | |
| 69 \item{cvrt}{ | |
| 70 \code{\linkS4class{SlicedData}} object with additional covariates. | |
| 71 Can be an empty \code{SlicedData} object in case of no covariates. | |
| 72 The constant is always included in the model and would cause an error if included in \code{cvrt}. | |
| 73 The order of columns must match those in \code{snps} and \code{gene}. | |
| 74 } | |
| 75 \item{output_file_name}{ | |
| 76 \code{character}, \code{connection}, or \code{NULL}. | |
| 77 If not \code{NULL}, significant associations are saved to this file (all significant associations if \code{pvOutputThreshold=0} or only distant if \code{pvOutputThreshold>0}). | |
| 78 If the file with this name exists, it is overwritten. | |
| 79 } | |
| 80 \item{output_file_name.cis}{ | |
| 81 \code{character}, \code{connection}, or \code{NULL}. | |
| 82 If not \code{NULL}, significant local associations are saved to this file. | |
| 83 If the file with this name exists, it is overwritten. | |
| 84 } | |
| 85 \item{pvOutputThreshold}{ | |
| 86 \code{numeric}. Significance threshold for all/distant tests. | |
| 87 } | |
| 88 \item{pvOutputThreshold.cis}{ | |
| 89 \code{numeric}. Same as \code{pvOutputThreshold}, but for local eQTLs. | |
| 90 } | |
| 91 \item{useModel}{ | |
| 92 \code{integer}. Eigher \code{modelLINEAR}, \code{modelANOVA}, or \code{modelLINEAR_CROSS}. | |
| 93 \enumerate{ | |
| 94 \item Set \code{useModel = \link{modelLINEAR}} to model the effect of the genotype as additive linear and test for its significance using t-statistic. | |
| 95 \item Set \code{useModel = \link{modelANOVA}} to treat genotype as a categorical variables and use ANOVA model and test for its significance using F-test. The default number of ANOVA categories is 3. Set otherwise like this: \code{options(MatrixEQTL.ANOVA.categories=4)}. | |
| 96 \item Set \code{useModel = \link{modelLINEAR_CROSS}} to add a new term to the model | |
| 97 equal to the product of genotype and the last covariate; the significance of this term is then tested using t-statistic. | |
| 98 } | |
| 99 | |
| 100 } | |
| 101 \item{errorCovariance}{ | |
| 102 \code{numeric}. The error covariance matrix. Use \code{numeric()} for homoskedastic independent errors. | |
| 103 } | |
| 104 \item{verbose}{ | |
| 105 \code{logical}. Set to \code{TRUE} to display more detailed report about the progress. | |
| 106 } | |
| 107 \item{snpspos}{ | |
| 108 \code{data.frame} object with information about SNP locations, must have 3 columns - SNP name, chromosome, and position, like this: | |
| 109 \tabular{ccc}{ | |
| 110 snpid \tab chr \tab pos \cr | |
| 111 Snp_01 \tab 1 \tab 721289 \cr | |
| 112 Snp_02 \tab 1 \tab 752565 \cr | |
| 113 \ldots \tab \ldots \tab \ldots \cr | |
| 114 } | |
| 115 } | |
| 116 \item{genepos}{ | |
| 117 \code{data.frame} with information about transcript locations, must have 4 columns - the name, chromosome, and positions of the left and right ends, like this: | |
| 118 \tabular{cccc}{ | |
| 119 geneid \tab chr \tab left \tab right \cr | |
| 120 Gene_01 \tab 1 \tab 721289 \tab 731289 \cr | |
| 121 Gene_02 \tab 1 \tab 752565 \tab 762565 \cr | |
| 122 \ldots \tab \ldots \tab \ldots \tab \ldots \cr | |
| 123 } | |
| 124 } | |
| 125 \item{cisDist}{ | |
| 126 \code{numeric}. SNP-gene pairs within this distance are considered local. The distance is measured from the nearest end of the gene. SNPs within a gene are always considered local. | |
| 127 } | |
| 128 \item{pvalue.hist}{ | |
| 129 \code{logical}, \code{numerical}, or \code{"qqplot"}. | |
| 130 Defines whether and how the distribution of p-values is recorded in the returned object. | |
| 131 If \code{pvalue.hist = FALSE}, the information is not recorded and the analysis is performed faster. | |
| 132 Alternatively, set \code{pvalue.hist = "qqplot"} to record information sufficient to create a QQ-plot of the p-values (use \code{\link[=plot.MatrixEQTL]{plot}} on the returned object to create the plot). | |
| 133 To record information for a histogram set \code{pvalue.hist} to the desired number of bins of equal size. Finally, \code{pvalue.hist} can also be set to a custom set of bin edges. | |
| 134 } | |
| 135 \item{min.pv.by.genesnp}{ | |
| 136 \code{logical}. Set \code{min.pv.by.genesnp = TRUE} to record the minimum p-value for each SNP and each gene in the returned object. The minimum p-values are recorded even if if they are above the corresponding thresholds of \code{pvOutputThreshold} and \code{pvOutputThreshold.cis}. The analysis runs faster when the parameter is set to \code{FALSE}. | |
| 137 } | |
| 138 \item{noFDRsaveMemory}{ | |
| 139 \code{logical}. Set \code{noFDRsaveMemory = TRUE} to save significant gene-SNP pairs directly to the output files, reduce memory footprint and skip FDR calculation. The eQTLs are not recorded in the returned object if \code{noFDRsaveMemory = TRUE}. | |
| 140 } | |
| 141 } | |
| 142 \details{ | |
| 143 Note that the columns of \code{gene}, \code{snps}, and \code{cvrt} must match. | |
| 144 If they do not match in the input files, use \code{ColumnSubsample} method to subset and/or reorder them. | |
| 145 } | |
| 146 \value{ | |
| 147 The detected eQTLs are saved in \code{output_file_name} and/or \code{output_file_name.cis} if they are not \code{NULL}. | |
| 148 The method also returns a list with a summary of the performed analysis. | |
| 149 \item{param}{Keeps all input parameters and also records the number of degrees of freedom for the full model.} | |
| 150 \item{time.in.sec}{Time difference between the start and the end of the analysis (in seconds).} | |
| 151 \item{all}{Information about all detected eQTLs.} | |
| 152 \item{cis}{Information about detected local eQTLs.} | |
| 153 \item{trans}{Information about detected distant eQTLs.} | |
| 154 The elements \code{all}, \code{cis}, and \code{trans} may contain the following components | |
| 155 \describe{ | |
| 156 \item{\code{ntests}}{Total number of tests performed. This is used for FDR calculation.} | |
| 157 \item{\code{eqtls}}{Data frame with recorded significant associations. Not available if \code{noFDRsaveMemory=FALSE}} | |
| 158 \item{\code{neqtls}}{Number of significant associations recorded.} | |
| 159 \item{\code{hist.bins}}{Histogram bins used for recording p-value distribution. See \code{pvalue.hist} parameter.} | |
| 160 \item{\code{hist.counts}}{Number of p-value that fell in each histogram bin. See \code{pvalue.hist} parameter.} | |
| 161 \item{\code{min.pv.snps}}{Vector with the best p-value for each SNP. See \code{min.pv.by.genesnp} parameter.} | |
| 162 \item{\code{min.pv.gene}}{Vector with the best p-value for each gene. See \code{min.pv.by.genesnp} parameter.} | |
| 163 } | |
| 164 } | |
| 165 \references{ | |
| 166 The package website: \url{http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/} | |
| 167 } | |
| 168 \author{ | |
| 169 Andrey Shabalin \email{ashabalin@vcu.edu} | |
| 170 } | |
| 171 \seealso{ | |
| 172 The code below is the sample code for eQTL analysis NOT using gene/SNP locations. | |
| 173 | |
| 174 See \code{\link{MatrixEQTL_cis_code}} for sample code for eQTL analysis that separates local and distant tests. | |
| 175 } | |
| 176 \examples{ | |
| 177 # Matrix eQTL by Andrey A. Shabalin | |
| 178 # http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/ | |
| 179 # | |
| 180 # Be sure to use an up to date version of R and Matrix eQTL. | |
| 181 | |
| 182 # source("Matrix_eQTL_R/Matrix_eQTL_engine.r"); | |
| 183 library(MatrixEQTL) | |
| 184 | |
| 185 ## Location of the package with the data files. | |
| 186 base.dir = find.package('MatrixEQTL'); | |
| 187 | |
| 188 ## Settings | |
| 189 | |
| 190 # Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS | |
| 191 useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS | |
| 192 | |
| 193 # Genotype file name | |
| 194 SNP_file_name = paste(base.dir, "/data/SNP.txt", sep=""); | |
| 195 | |
| 196 # Gene expression file name | |
| 197 expression_file_name = paste(base.dir, "/data/GE.txt", sep=""); | |
| 198 | |
| 199 # Covariates file name | |
| 200 # Set to character() for no covariates | |
| 201 covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep=""); | |
| 202 | |
| 203 # Output file name | |
| 204 output_file_name = tempfile(); | |
| 205 | |
| 206 # Only associations significant at this level will be saved | |
| 207 pvOutputThreshold = 1e-2; | |
| 208 | |
| 209 # Error covariance matrix | |
| 210 # Set to numeric() for identity. | |
| 211 errorCovariance = numeric(); | |
| 212 # errorCovariance = read.table("Sample_Data/errorCovariance.txt"); | |
| 213 | |
| 214 | |
| 215 ## Load genotype data | |
| 216 | |
| 217 snps = SlicedData$new(); | |
| 218 snps$fileDelimiter = "\t"; # the TAB character | |
| 219 snps$fileOmitCharacters = "NA"; # denote missing values; | |
| 220 snps$fileSkipRows = 1; # one row of column labels | |
| 221 snps$fileSkipColumns = 1; # one column of row labels | |
| 222 snps$fileSliceSize = 2000; # read file in slices of 2,000 rows | |
| 223 snps$LoadFile(SNP_file_name); | |
| 224 | |
| 225 ## Load gene expression data | |
| 226 | |
| 227 gene = SlicedData$new(); | |
| 228 gene$fileDelimiter = "\t"; # the TAB character | |
| 229 gene$fileOmitCharacters = "NA"; # denote missing values; | |
| 230 gene$fileSkipRows = 1; # one row of column labels | |
| 231 gene$fileSkipColumns = 1; # one column of row labels | |
| 232 gene$fileSliceSize = 2000; # read file in slices of 2,000 rows | |
| 233 gene$LoadFile(expression_file_name); | |
| 234 | |
| 235 ## Load covariates | |
| 236 | |
| 237 cvrt = SlicedData$new(); | |
| 238 cvrt$fileDelimiter = "\t"; # the TAB character | |
| 239 cvrt$fileOmitCharacters = "NA"; # denote missing values; | |
| 240 cvrt$fileSkipRows = 1; # one row of column labels | |
| 241 cvrt$fileSkipColumns = 1; # one column of row labels | |
| 242 if(length(covariates_file_name)>0) { | |
| 243 cvrt$LoadFile(covariates_file_name); | |
| 244 } | |
| 245 | |
| 246 ## Run the analysis | |
| 247 | |
| 248 me = Matrix_eQTL_engine( | |
| 249 snps = snps, | |
| 250 gene = gene, | |
| 251 cvrt = cvrt, | |
| 252 output_file_name = output_file_name, | |
| 253 pvOutputThreshold = pvOutputThreshold, | |
| 254 useModel = useModel, | |
| 255 errorCovariance = errorCovariance, | |
| 256 verbose = TRUE, | |
| 257 pvalue.hist = TRUE, | |
| 258 min.pv.by.genesnp = FALSE, | |
| 259 noFDRsaveMemory = FALSE); | |
| 260 | |
| 261 unlink(output_file_name); | |
| 262 | |
| 263 ## Results: | |
| 264 | |
| 265 cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n'); | |
| 266 cat('Detected eQTLs:', '\n'); | |
| 267 show(me$all$eqtls) | |
| 268 | |
| 269 ## Plot the histogram of all p-values | |
| 270 | |
| 271 plot(me) | |
| 272 | |
| 273 } |
