Mercurial > repos > george-weingart > maaslin
diff maaslin-4450aa4ecc84/src/lib/AnalysisModules.R @ 1:a87d5a5f2776
Uploaded the version running on the prod server
author | george-weingart |
---|---|
date | Sun, 08 Feb 2015 23:08:38 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maaslin-4450aa4ecc84/src/lib/AnalysisModules.R Sun Feb 08 23:08:38 2015 -0500 @@ -0,0 +1,1237 @@ +##################################################################################### +#Copyright (C) <2012> +# +#Permission is hereby granted, free of charge, to any person obtaining a copy of +#this software and associated documentation files (the "Software"), to deal in the +#Software without restriction, including without limitation the rights to use, copy, +#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, +#and to permit persons to whom the Software is furnished to do so, subject to +#the following conditions: +# +#The above copyright notice and this permission notice shall be included in all copies +#or substantial portions of the Software. +# +#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# This file is a component of the MaAsLin (Multivariate Associations Using Linear Models), +# authored by the Huttenhower lab at the Harvard School of Public Health +# (contact Timothy Tickle, ttickle@hsph.harvard.edu). +##################################################################################### + +inlinedocs <- function( +##author<< Curtis Huttenhower <chuttenh@hsph.harvard.edu> and Timothy Tickle <ttickle@hsph.harvard.edu> +##description<< Allows one to plug in new modules to perform analysis (univariate or multivariate), regularization, and data (response) transformation. +) { return( pArgs ) } + +# Libraries +suppressMessages(library( agricolae, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) +# Needed for the pot-hoc Kruskal wallis comparisons +suppressMessages(library( penalized, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) +# Needed for stepAIC +suppressMessages(library( MASS, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) +# Needed for na action behavior +suppressMessages(library( gam, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) +# Needed for boosting +suppressMessages(library( gbm, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) +# Needed for LASSO +suppressMessages(library( glmnet, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) +# Needed for mixed models +#suppressMessages(library( lme4, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) +suppressMessages(library( nlme, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) + +# Needed for zero inflated models +#suppressMessages(library( MCMCglmm, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) +suppressMessages(library( pscl, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) +suppressMessages(library( gamlss, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) +# Do not use #suppressMessages(library( glmmADMB, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) + +fAddBack = TRUE +dUnevenMax = .9 + + +### Helper functions +# OK +funcMakeContrasts <- function +### Makes univariate contrasts of all predictors in the model formula with the response. +(strFormula, +### lm style string defining reponse and predictors +strRandomFormula, +### mixed model string defining the fixed covariates +frmeTmp, +### The data frame to find predictor data in +iTaxon, +### Taxon +functionContrast, +### functionContrast The univariate test to perform +lsQCCounts +### QC info +){ + #TODO are we updating the QCCounts? + lsSig = list() + ### Holds all the significance results from the tests + adP = c() + ### Holds the p-values + sCurDataName = names(frmeTmp)[iTaxon] + ### The name of the taxon (data row) that is being associated (always assumed to be numeric) + #Get test comparisons (predictor names from formula string) + asComparisons = unique(c(funcFormulaStrToList(strFormula),funcFormulaStrToList(strRandomFormula))) + + #Change metadata in formula to univariate comparisons + for(sComparison in asComparisons) + { + # Metadata values + vxTest = frmeTmp[[sComparison]] + + # Get the levels in the comparison + # Can ignore the first level because it is the reference level + asLevels = sComparison + if(is.factor(vxTest)){asLevels = levels(vxTest)[2:length(vxTest)]} + + lasComparisonResults = functionContrast(x=sComparison, adCur=frmeTmp[[sCurDataName]], dfData=frmeTmp) + for(asComparison in lasComparisonResults) + { + if( is.na( asComparison$p.value ) ) { next } + # Get pvalue + adP = c(adP, asComparison$p.value) + # Get SD, if not available, give SD of covariate + dSTD = asComparison$SD + # TODO Is using sd on factor and binary data correct? + if(is.na(dSTD) || is.null(dSTD)){dSTD = sd(vxTest)} + + lsSig[[length( lsSig ) + 1]] <- list( + #Current metadata name (string column name) ok + name = sComparison, + #Current metadatda name (string, as a factor level if existing as such) ok + orig = asComparison$name, + #Taxon feature name (string) ok + taxon = colnames( frmeTmp )[iTaxon], + #Taxon data / response (double vector) ok + data = frmeTmp[,iTaxon], + #Name of column ok + factors = sComparison, + #Metadata values (metadata as a factor or raw numeric) ok + metadata = vxTest, + #Current coefficient value (named coef value with level name (from coefs) ok + value = asComparison$coef, + #Standard deviation (numeric) ok + std = dSTD, + #Model coefficients (output from coefs with intercept) ok + allCoefs = asComparison$coef) + } + } + return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts)) + ### Returns a list of p-value, standard deviation, and comparison which produced the p-value +} + +#Ok +funcGetStepPredictors <- function +### Retrieve the predictors of the reduced model after stepwise model selection +(lmod, +### Linear model resulting from step-wise model selection +frmeTmp, +strLog +### File to document logging +){ + #Document + funcWrite( "#model", strLog ) + funcWrite( lmod$fit, strLog ) +#TODO funcWrite( lmod$train.error, strLog ) +#TODO funcWrite( lmod$Terms, strLog ) + funcWrite( "#summary-gbm", strLog ) + funcWrite( summary(lmod), strLog ) + + #Get Names from coefficients + asStepCoefsFactors = coefficients(lmod) + astrCoefNames = setdiff(names(asStepCoefsFactors[as.vector(!is.na(asStepCoefsFactors))==TRUE]),"(Intercept)") + asStepCoefsFactors = unique(as.vector(sapply(astrCoefNames,funcCoef2Col, frmeData=frmeTmp))) + + if(length(asStepCoefsFactors)<1){ return(NA) } + return( asStepCoefsFactors ) + ### Vector of string predictor names +} + +funcGetUnivariateResults <- function +### Reduce the list of list of results to the correct format +( llmod, +### The list of univariate models +frmeData, +### Data analysis is performed on +liTaxon, +### The response id +dSig, +### Significance level for q-values +adP, +### List of pvalues from all associations performed +lsSig, +### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients +strLog, +### File to which to document logging +lsQCCounts, +### Records of counts associated with quality control +lastrCols, +### Predictors used in the association +asSuppressCovariates=c() +### Vector of covariates to suppress and not give results for +){ + adP = c() + lsSig = list() + for(lmod in llmod) + { + adP = c(adP,lmod$adP) + lsSig = c(lsSig,lmod$lsSig) + } + return(list(adP=adP, lsSig=lsSig, lsQCCounts=llmod[length(llmod)]$lsQCCounts)) +} + +# OK +funcGetLMResults <- function +### Reduce the lm object return to just the data needed for further analysis +( llmod, +### The result from a linear model +frmeData, +### Data analysis is performed on +liTaxon, +### The response id +dSig, +### Significance level for q-values +adP, +### List of pvalues from all associations performed +lsSig, +### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients +strLog, +### File to which to document logging +lsQCCounts, +### Records of counts associated with quality control +lastrCols, +### Predictors used in the association +asSuppressCovariates=c() +### Vector of covariates to suppress and not give results for +){ + ilmodIndex = 0 + for( lmod in llmod ) + { + ilmodIndex = ilmodIndex + 1 + lmod = llmod[[ ilmodIndex ]] + iTaxon = liTaxon[[ ilmodIndex ]] + astrCols = lastrCols[[ ilmodIndex ]] + + #Exclude none and errors + if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) ) + { + #holds the location of the pvlaues if an lm, if lmm is detected this will be changed + iPValuePosition = 4 + + #Get the column name of the iTaxon index + #frmeTmp needs to be what? + strTaxon = colnames( frmeData )[iTaxon] + #Get summary information from the linear model + lsSum = try( summary( lmod ) ) + #The following can actually happen when the stranger regressors return broken results + if( class( lsSum ) == "try-error" ) + { + next + } + + #Write summary information to log file + funcWrite( "#model", strLog ) + funcWrite( lmod, strLog ) + funcWrite( "#summary", strLog ) + #Unbelievably, some of the more unusual regression methods crash out when _printing_ their results + try( funcWrite( lsSum, strLog ) ) + + #Get the coefficients + #This should work for linear models + frmeCoefs <- try( coefficients( lsSum ) ) + + if( ( class(frmeCoefs ) == "try-error" ) || is.null( frmeCoefs ) ) + { + adCoefs = try( coefficients( lmod )) + if(class( adCoefs ) == "try-error") + { + adCoefs = coef(lmod) + } + frmeCoefs <- NA + } else { + if( class( frmeCoefs ) == "list" ) + { + frmeCoefs <- frmeCoefs$count + } + adCoefs = frmeCoefs[,1] + } + + #Go through each coefficient + astrRows <- names( adCoefs ) + + ##lmm + if( is.null( astrRows ) ) + { + astrRows = rownames( lsSum$tTable ) + frmeCoefs = lsSum$tTable + iPValuePosition = 5 + adCoefs = frmeCoefs[,1] + } + + for( iMetadata in 1:length( astrRows ) ) + { + #Current coef which is being evaluated + strOrig = astrRows[ iMetadata ] + #Skip y interscept + if( strOrig %in% c("(Intercept)", "Intercept", "Log(theta)") ) { next } + #Skip suppressed covariates + if( funcCoef2Col( strOrig, frmeData ) %in% asSuppressCovariates){ next } + + #Extract pvalue and std in standard model + dP = frmeCoefs[ strOrig, iPValuePosition ] + dStd = frmeCoefs[ strOrig, 2 ] + + #Attempt to extract the pvalue and std in mixed effects summary + #Could not get the pvalue so skip result + if( is.nan( dP ) || is.na( dP ) || is.null( dP ) ) { next } + + dCoef = adCoefs[ iMetadata ] + + #Setting adMetadata + #Metadata values + strMetadata = funcCoef2Col( strOrig, frmeData, astrCols ) + if( is.na( strMetadata ) ) + { + if( substring( strOrig, nchar( strOrig ) - 1 ) == "NA" ) { next } + c_logrMaaslin$error( "Unknown coefficient: %s", strOrig ) + } + if( substring( strOrig, nchar( strMetadata ) + 1 ) == "NA" ) { next } + adMetadata <- frmeData[, strMetadata ] + + # Store (factor level modified) p-value + # Store general results for each coef + adP <- c( adP, dP ) + + # Append to the list of information about associations + lsSig[[ length( lsSig ) + 1 ]] <- list( + # Current metadata name + name = strMetadata, + # Current metadatda name (as a factor level if existing as such) + orig = strOrig, + # Taxon feature name + taxon = strTaxon, + # Taxon data / response + data = frmeData[, iTaxon ], + # All levels + factors = c( strMetadata ), + # Metadata values + metadata = adMetadata, + # Current coeficient value + value = dCoef, + # Standard deviation + std = dStd, + # Model coefficients + allCoefs = adCoefs ) + } + } + } + return( list( adP = adP, lsSig = lsSig, lsQCCounts = lsQCCounts ) ) + ### List containing a list of pvalues, a list of significant data per association, and a list of QC data +} + +funcGetZeroInflatedResults <- function +### Reduce the lm object return to just the data needed for further analysis +( llmod, +### The result from a linear model +frmeData, +### Data analysis is performed on +liTaxon, +### The response id +dSig, +### Significance level for q-values +adP, +### List of pvalues from all associations performed +lsSig, +### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients +strLog, +### File to which to document logging +lsQCCounts, +### Records of counts associated with quality control +lastrCols, +### Predictors used in the association +asSuppressCovariates=c() +### Vector of covariates to suppress and not give results for +){ + ilmodIndex = 0 + for(lmod in llmod) + { + ilmodIndex = ilmodIndex + 1 + lmod = llmod[[ilmodIndex]] + iTaxon = liTaxon[[ilmodIndex]] + astrCols = lastrCols[[ilmodIndex]] + + #Exclude none and errors + if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) ) + { + #holds the location of the pvlaues if an lm, if lmm is detected this will be changed + iPValuePosition = 4 + + #Get the column name of the iTaxon index + #frmeTmp needs to be what? + strTaxon = colnames( frmeData )[iTaxon] + + #Write summary information to log file + funcWrite( "#model", strLog ) + funcWrite( lmod, strLog ) + + #Get the coefficients + #This should work for linear models + frmeCoefs <- summary(lmod) + if(! is.null( frmeCoefs$coefficients$count ) ) # Needed for zeroinfl + { + frmeCoefs = frmeCoefs$coefficients$count + } + adCoefs = frmeCoefs[,1] + names(adCoefs) = row.names(frmeCoefs) + + funcWrite( "#Coefs", strLog ) + funcWrite( frmeCoefs, strLog ) + + #Go through each coefficient + astrRows <- row.names( frmeCoefs ) + + for( iMetadata in 1:length( astrRows ) ) + { + #Current coef which is being evaluated + strOrig = astrRows[iMetadata] + #Skip y interscept + if( strOrig %in% c("(Intercept)", "Intercept", "Log(theta)") ) { next } + #Skip suppressed covariates + if( funcCoef2Col(strOrig,frmeData) %in% asSuppressCovariates){next} + + #Extract pvalue and std in standard model + dP = frmeCoefs[strOrig, iPValuePosition] + if(is.nan(dP)){next} + dStd = frmeCoefs[strOrig,2] + + dCoef = adCoefs[iMetadata] + + #Setting adMetadata + #Metadata values + strMetadata = funcCoef2Col( strOrig, frmeData, astrCols ) + if( is.na( strMetadata ) ) + { + if( substring( strOrig, nchar( strOrig ) - 1 ) == "NA" ) { next } + c_logrMaaslin$error( "Unknown coefficient: %s", strOrig ) + } + if( substring( strOrig, nchar( strMetadata ) + 1 ) == "NA" ) { next } + adMetadata <- frmeData[,strMetadata] + + #Store (factor level modified) p-value + #Store general results for each coef + adP <- c(adP, dP) + lsSig[[length( lsSig ) + 1]] <- list( + #Current metadata name + name = strMetadata, + #Current metadatda name (as a factor level if existing as such) + orig = strOrig,# + #Taxon feature name + taxon = strTaxon, + #Taxon data / response + data = frmeData[,iTaxon], + #All levels + factors = c(strMetadata), + #Metadata values + metadata = adMetadata, + #Current coeficient value + value = dCoef, + #Standard deviation + std = dStd, + #Model coefficients + allCoefs = adCoefs) + } + } + } + return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts)) + ### List containing a list of pvalues, a list of significant data per association, and a list of QC data +} + +oldfuncGetZeroInflatedResults <- function +### Reduce the lm object return to just the data needed for further analysis +( llmod, +### The result from a linear model +frmeData, +### Data analysis is performed on +liTaxon, +### The response id +dSig, +### Significance level for q-values +adP, +### List of pvalues from all associations performed +lsSig, +### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients +strLog, +### File to which to document logging +lsQCCounts, +### Records of counts associated with quality control +lastrCols, +### Predictors used in the association +asSuppressCovariates=c() +### Vector of covariates to suppress and not give results for +){ + ilmodIndex = 0 + for(lmod in llmod) + { + ilmodIndex = ilmodIndex + 1 + lmod = llmod[[ilmodIndex]] + iTaxon = liTaxon[[ilmodIndex]] + astrCols = lastrCols[[ilmodIndex]] + + #Exclude none and errors + if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) ) + { + #holds the location of the pvlaues if an lm, if lmm is detected this will be changed + iPValuePosition = 4 + + #Get the column name of the iTaxon index + #frmeTmp needs to be what? + strTaxon = colnames( frmeData )[iTaxon] + + #Write summary information to log file + funcWrite( "#model", strLog ) + funcWrite( lmod, strLog ) + + #Get the coefficients + #This should work for linear models + frmeCoefs <- summary(lmod) + adCoefs = frmeCoefs[,1] + names(adCoefs) = row.names(frmeCoefs) + + #Go through each coefficient + astrRows <- row.names( frmeCoefs ) + + for( iMetadata in 1:length( astrRows ) ) + { + #Current coef which is being evaluated + strOrig = astrRows[iMetadata] + #Skip y interscept + if( strOrig %in% c("(Intercept)", "Intercept", "Log(theta)") ) { next } + #Skip suppressed covariates + if( funcCoef2Col(strOrig,frmeData) %in% asSuppressCovariates){next} + + #Extract pvalue and std in standard model + dP = frmeCoefs[strOrig, iPValuePosition] + dStd = frmeCoefs[strOrig,2] + + dCoef = adCoefs[iMetadata] + + #Setting adMetadata + #Metadata values + strMetadata = funcCoef2Col( strOrig, frmeData, astrCols ) + if( is.na( strMetadata ) ) + { + if( substring( strOrig, nchar( strOrig ) - 1 ) == "NA" ) { next } + c_logrMaaslin$error( "Unknown coefficient: %s", strOrig ) + } + if( substring( strOrig, nchar( strMetadata ) + 1 ) == "NA" ) { next } + adMetadata <- frmeData[,strMetadata] + + #Store (factor level modified) p-value + #Store general results for each coef + adP <- c(adP, dP) + lsSig[[length( lsSig ) + 1]] <- list( + #Current metadata name + name = strMetadata, + #Current metadatda name (as a factor level if existing as such) + orig = strOrig,# + #Taxon feature name + taxon = strTaxon, + #Taxon data / response + data = frmeData[,iTaxon], + #All levels + factors = c(strMetadata), + #Metadata values + metadata = adMetadata, + #Current coeficient value + value = dCoef, + #Standard deviation + std = dStd, + #Model coefficients + allCoefs = adCoefs) + } + } + } + return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts)) + ### List containing a list of pvalues, a list of significant data per association, and a list of QC data +} + + +notfuncGetZeroInflatedResults <- function +### Reduce the lm object return to just the data needed for further analysis +( llmod, +### The result from a linear model +frmeData, +### Data analysis is performed on +liTaxon, +### The response id +dSig, +### Significance level for q-values +adP, +### List of pvalues from all associations performed +lsSig, +### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients +strLog, +### File to which to document logging +lsQCCounts, +### Records of counts associated with quality control +lastrCols, +### Predictors used in the association +asSuppressCovariates=c() +### Vector of covariates to suppress and not give results for +){ + ilmodIndex = 0 + for(lmod in llmod) + { + ilmodIndex = ilmodIndex + 1 + lmod = llmod[[ilmodIndex]] + iTaxon = liTaxon[[ilmodIndex]] + astrCols = lastrCols[[ilmodIndex]] + + #Exclude none and errors + if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) ) + { + #holds the location of the pvlaues if an lm, if lmm is detected this will be changed + iPValuePosition = 4 + + #Get the column name of the iTaxon index + #frmeTmp needs to be what? + strTaxon = colnames( frmeData )[iTaxon] + + #Write summary information to log file + funcWrite( "#model", strLog ) + funcWrite( lmod, strLog ) + + #Get the coefficients + #This should work for linear models + frmeCoefs <- summary(lmod) + frmeCoefs = frmeCoefs$coefficients$count + funcWrite( "#Coefs", strLog ) + funcWrite( frmeCoefs, strLog ) + + adCoefs = frmeCoefs[,1] + names(adCoefs) = row.names(frmeCoefs) + + #Go through each coefficient + astrRows <- row.names( frmeCoefs ) + + for( iMetadata in 1:length( astrRows ) ) + { + #Current coef which is being evaluated + strOrig = astrRows[iMetadata] + + #Skip y interscept + if( strOrig %in% c("(Intercept)", "Intercept", "Log(theta)") ) { next } + #Skip suppressed covariates + if( funcCoef2Col(strOrig,frmeData) %in% asSuppressCovariates){next} + + #Extract pvalue and std in standard model + dP = frmeCoefs[strOrig, iPValuePosition] + if(is.nan(dP)){ next } + dStd = frmeCoefs[strOrig,2] + dCoef = adCoefs[iMetadata] + + #Setting adMetadata + #Metadata values + strMetadata = funcCoef2Col( strOrig, frmeData, astrCols ) + if( is.na( strMetadata ) ) + { + if( substring( strOrig, nchar( strOrig ) - 1 ) == "NA" ) { next } + c_logrMaaslin$error( "Unknown coefficient: %s", strOrig ) + } + if( substring( strOrig, nchar( strMetadata ) + 1 ) == "NA" ) { next } + adMetadata <- frmeData[,strMetadata] + + #Store (factor level modified) p-value + #Store general results for each coef + adP <- c(adP, dP) + lsSig[[length( lsSig ) + 1]] <- list( + #Current metadata name + name = strMetadata, + #Current metadatda name (as a factor level if existing as such) + orig = strOrig,# + #Taxon feature name + taxon = strTaxon, + #Taxon data / response + data = frmeData[,iTaxon], + #All levels + factors = c(strMetadata), + #Metadata values + metadata = adMetadata, + #Current coeficient value + value = dCoef, + #Standard deviation + std = dStd, + #Model coefficients + allCoefs = adCoefs) + } + } + } + return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts)) + ### List containing a list of pvalues, a list of significant data per association, and a list of QC data +} + + +### Options for variable selection +# OK +funcBoostModel <- function( +### Perform model selection / regularization with boosting +strFormula, +### The formula of the full model before boosting +frmeTmp, +### The data on which to perform analysis +adCur, +### The response data +lsParameters, +### User controlled parameters needed specific to boosting +lsForcedParameters = NULL, +### Force these predictors to be in the model +strLog +### File to which to document logging +){ + funcWrite( c("#Boost formula", strFormula), strLog ) + lmod = try( gbm( as.formula( strFormula ), data=frmeTmp, distribution="laplace", verbose=FALSE, n.minobsinnode=min(10, round(0.1 * nrow( frmeTmp ) ) ), n.trees=1000 ) ) +#TODO# lmod = try( gbm( as.formula( strFormula ), data=frmeTmp, distribution="gaussian", verbose=FALSE, n.minobsinnode=min(10, round(0.1 * nrow( frmeTmp ) ) ), n.trees=1000 ) ) + + astrTerms <- c() + if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) ) + { + #Get boosting summary results + lsSum <- summary( lmod, plotit = FALSE ) + + #Document + funcWrite( "#model-gbm", strLog ) + funcWrite( lmod$fit, strLog ) + funcWrite( lmod$train.error, strLog ) + funcWrite( lmod$Terms, strLog ) + funcWrite( "#summary-gbm", strLog ) + funcWrite( lsSum, strLog ) + + # Uneven metadata + vstrUneven = c() + # Kept metadata + vstrKeepMetadata = c() + + #Select model predictors + #Check the frequency of selection and skip if not selected more than set threshold dFreq + for( strMetadata in lmod$var.names ) + { + #Get the name of the metadata + strTerm <- funcCoef2Col( strMetadata, frmeTmp, c(astrMetadata, astrGenetics) ) + + #Add back in uneven metadata + if(fAddBack) + { + ldMetadata = frmeTmp[[strMetadata]] + if(length(which(table(ldMetadata)/length(ldMetadata)>dUnevenMax))>0) + { + astrTerms <- c(astrTerms, strTerm) + vstrUneven = c(vstrUneven,strMetadata) + next + } + } + + #If the selprob is less than a certain frequency, skip + dSel <- lsSum$rel.inf[which( lsSum$var == strMetadata )] / 100 + if( is.na(dSel) || ( dSel <= lsParameters$dFreq ) ){ next } +#TODO# if( is.na(dSel) || ( dSel < lsParameters$dFreq ) ){ next } + + #If you should ignore the metadata, continue + if( is.null( strTerm ) ) { next } + + #If you cant find the metadata name, write + if( is.na( strTerm ) ) + { + c_logrMaaslin$error( "Unknown coefficient: %s", strMetadata ) + next + } + + #Collect metadata names + astrTerms <- c(astrTerms, strTerm) + vstrKeepMetadata = c(vstrKeepMetadata,strTerm) + } + } else { astrTerms = lsForcedParameters } + +# funcBoostInfluencePlot(vdRelInf=lsSum$rel.inf, sFeature=lsParameters$sBugName, vsPredictorNames=lsSum$var, vstrKeepMetadata=vstrKeepMetadata, vstrUneven=vstrUneven) + + return(unique(c(astrTerms,lsForcedParameters))) + ### Return a vector of predictor names to use in a reduced model +} + +#Glmnet default is to standardize the variables. +#used as an example for implementation +#http://r.789695.n4.nabble.com/estimating-survival-times-with-glmnet-and-coxph-td4614225.html +funcPenalizedModel <- function( +### Perform penalized regularization for variable selection +strFormula, +### The formula of the full model before boosting +frmeTmp, +### The data on which to perform analysis +adCur, +### The response data +lsParameters, +### User controlled parameters needed specific to boosting +lsForcedParameters = NULL, +### Force these predictors to be in the model +strLog +### File to which to document logging +){ + #Convert the data frame to a model matrix + mtrxDesign = model.matrix(as.formula(strFormula), data=frmeTmp) + + #Cross validate the lambda + cvRet = cv.glmnet(x=mtrxDesign,y=adCur,alpha=lsParameters$dPAlpha) + + #Perform lasso + glmnetMod = glmnet(x=mtrxDesign,y=adCur,family=lsParameters$family,alpha=lsParameters$dPAlpha,lambda=cvRet$lambda.min) + + #Get non zero beta and return column names for covariate names. + ldBeta = glmnetMod$beta[,which.max(glmnetMod$dev.ratio)] + ldBeta = names(ldBeta[which(abs(ldBeta)>0)]) + return(sapply(ldBeta,funcCoef2Col,frmeData=frmeTmp)) +} + +# OK +funcForwardModel <- function( +### Perform model selection with forward stepwise selection +strFormula, +### lm style string defining reposne and predictors +frmeTmp, +### Data on which to perform analysis +adCur, +### Response data +lsParameters, +### User controlled parameters needed specific to boosting +lsForcedParameters = NULL, +### Force these predictors to be in the model +strLog +### File to which to document logging +){ + funcWrite( c("#Forward formula", strFormula), strLog ) + + strNullFormula = "adCur ~ 1" + if(!is.null(lsForcedParameters)) + { + strNullFormula = paste( "adCur ~", paste( sprintf( "`%s`", lsForcedParameters ), collapse = " + " )) + } + lmodNull <- try( lm(as.formula( strNullFormula ), data=frmeTmp)) + lmodFull <- try( lm(as.formula( strFormula ), data=frmeTmp )) + if(!("try-error" %in% c(class( lmodNull ),class( lmodFull )))) + { + lmod = stepAIC(lmodNull, scope=list(lower=lmodNull,upper=lmodFull), direction="forward", trace=0) + return(funcGetStepPredictors(lmod, frmeTmp, strLog)) + } + return( lsForcedParameters ) + ### Return a vector of predictor names to use in a reduced model or NA on error +} + +# OK +# Select model with backwards selection +funcBackwardsModel <- function( +### Perform model selection with backwards stepwise selection +strFormula, +### lm style string defining reponse and predictors +frmeTmp, +### Data on which to perform analysis +adCur, +### Response data +lsParameters, +### User controlled parameters needed specific to boosting +lsForcedParameters = NULL, +### Force these predictors to be in the model +strLog +### File to which to document logging +){ + funcWrite( c("#Backwards formula", strFormula), strLog ) + + strNullFormula = "adCur ~ 1" + if(!is.null(lsForcedParameters)) + { + strNullFormula = paste( "adCur ~", paste( sprintf( "`%s`", lsForcedParameters ), collapse = " + " )) + } + + lmodNull <- try( lm(as.formula( strNullFormula ), data=frmeTmp)) + lmodFull <- try( lm(as.formula( strFormula ), data=frmeTmp )) + + if(! class( lmodFull ) == "try-error" ) + { + lmod = stepAIC(lmodFull, scope=list(lower=lmodNull, upper=lmodFull), direction="backward") + return(funcGetStepPredictors(lmod, frmeTmp, strLog)) + } else { + return( lsForcedParameters ) } + ### Return a vector of predictor names to use in a reduced model or NA on error +} + +### Analysis methods +### Univariate options + +# Sparse Dir. Model +#TODO# Implement in sfle + +# Tested +# Correlation +# NOTE: Ignores the idea of random and fixed covariates +funcSpearman <- function( +### Perform multiple univariate comparisons producing spearman correlations for association +strFormula, +### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables +frmeTmp, +### Data on which to perform analysis +iTaxon, +### Index of the response data +lsQCCounts, +### List recording anything important to QC +strRandomFormula = NULL +### Has the formula for random covariates +){ + return(funcMakeContrasts(strFormula=strFormula, strRandomFormula=strRandomFormula, frmeTmp=frmeTmp, iTaxon=iTaxon, + functionContrast=function(x,adCur,dfData) + { + retList = list() + ret = cor.test(as.formula(paste("~",x,"+ adCur")), data=dfData, method="spearman", na.action=c_strNA_Action) + #Returning rho for the coef in a named vector + vdCoef = c() + vdCoef[[x]]=ret$estimate + retList[[1]]=list(p.value=ret$p.value,SD=sd(dfData[[x]]),name=x,coef=vdCoef) + return(retList) + }, lsQCCounts)) + ### List of contrast information, pvalue, contrast and std per univariate test +} + +# Tested +# Wilcoxon (T-Test) +# NOTE: Ignores the idea of random and fixed covariates +funcWilcoxon <- function( +### Perform multiple univariate comparisons performing wilcoxon tests on discontinuous data with 2 levels +strFormula, +### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables +frmeTmp, +### Data on which to perform analysis +iTaxon, +### Index of the response data +lsQCCounts, +### List recording anything important to QC +strRandomFormula = NULL +### Has the formula for random covariates +){ + return(funcMakeContrasts(strFormula=strFormula, strRandomFormula=strRandomFormula, frmeTmp=frmeTmp, iTaxon=iTaxon, + functionContrast=function(x,adCur,dfData) + { + retList = list() + ret = wilcox.test(as.formula(paste("adCur",x,sep=" ~ ")), data=dfData, na.action=c_strNA_Action) + #Returning NA for the coef in a named vector + vdCoef = c() + vdCoef[[x]]=ret$statistic + retList[[1]]=list(p.value=ret$p.value,SD=sd(dfData[[x]]),name=x,coef=vdCoef) + return(retList) + }, lsQCCounts)) + ### List of contrast information, pvalue, contrast and std per univariate test +} + +# Tested +# Kruskal.Wallis (Nonparameteric anova) +# NOTE: Ignores the idea of random and fixed covariates +funcKruskalWallis <- function( +### Perform multiple univariate comparisons performing Kruskal wallis rank sum tests on discontuous data with more than 2 levels +strFormula, +### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables +frmeTmp, +### Data on which to perform analysis +iTaxon, +### Index of the response data +lsQCCounts, +### List recording anything important to QC +strRandomFormula = NULL +### Has the formula for random covariates +){ + return(funcMakeContrasts(strFormula=strFormula, strRandomFormula=strRandomFormula, frmeTmp=frmeTmp, iTaxon=iTaxon, + functionContrast=function(x,adCur,dfData) + { + retList = list() + lmodKW = kruskal(adCur,dfData[[x]],group=FALSE,p.adj="holm") + + asLevels = levels(dfData[[x]]) + # The names of the generated comparisons, sometimes the control is first sometimes it is not so + # We will just check which is in the names and use that + asComparisons = row.names(lmodKW$comparisons) + #Get the comparison with the control + for(sLevel in asLevels[2:length(asLevels)]) + { + sComparison = intersect(c(paste(asLevels[1],sLevel,sep=" - "),paste(sLevel,asLevels[1],sep=" - ")),asComparisons) + #Returning NA for the coef in a named vector + vdCoef = c() + vdCoef[[paste(x,sLevel,sep="")]]=lmodKW$comparisons[sComparison,"Difference"] +# vdCoef[[paste(x,sLevel,sep="")]]=NA + retList[[length(retList)+1]]=list(p.value=lmodKW$comparisons[sComparison,"p.value"],SD=1.0,name=paste(x,sLevel,sep=""),coef=vdCoef) + } + return(retList) + }, lsQCCounts)) + ### List of contrast information, pvalue, contrast and std per univariate test +} + +# Tested +# NOTE: Ignores the idea of random and fixed covariates +funcDoUnivariate <- function( +### Perform multiple univariate comparisons producing spearman correlations for association +strFormula, +### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables +frmeTmp, +### Data on which to perform analysis +iTaxon, +### Index of the response data +lsHistory, +### List recording p-values, association information, and QC counts +strRandomFormula = NULL, +### Has the formula for random covariates +fZeroInflate = FALSE +){ + if(fZeroInflate) + { + throw("There are no zero-inflated univariate models to perform your analysis.") + } + + # Get covariates + astrCovariates = unique(c(funcFormulaStrToList(strFormula),funcFormulaStrToList(strRandomFormula))) + + # For each covariate + for(sCovariate in astrCovariates) + { + ## Check to see if it is discrete + axData = frmeTmp[[sCovariate]] + lsRet = NA + if(is.factor(axData) || is.logical(axData)) + { + ## If discrete check how many levels + lsDataLevels = levels(axData) + ## If 2 levels do wilcoxon test + if(length(lsDataLevels) < 3) + { + lsRet = funcWilcoxon(strFormula=paste("adCur",sCovariate,sep=" ~ "), frmeTmp=frmeTmp, iTaxon=iTaxon, lsQCCounts=lsHistory$lsQCCounts) + } else { + ## If 3 or more levels do kruskal wallis test + lsRet = funcKruskalWallis(strFormula=paste("adCur",sCovariate,sep=" ~ "), frmeTmp=frmeTmp, iTaxon=iTaxon, lsQCCounts=lsHistory$lsQCCounts) + } + } else { + ## If not discrete do spearman test (list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts)) + lsRet = funcSpearman(strFormula=paste("adCur",sCovariate,sep=" ~ "), frmeTmp=frmeTmp, iTaxon=iTaxon, lsQCCounts=lsHistory$lsQCCounts) + } + lsHistory[["adP"]] = c(lsHistory[["adP"]], lsRet[["adP"]]) + lsHistory[["lsSig"]] = c(lsHistory[["lsSig"]], lsRet[["lsSig"]]) + lsHistory[["lsQCCounts"]] = lsRet[["lsQCCounts"]] + } + return(lsHistory) +} + +### Multivariate + +# Tested +funcLM <- function( +### Perform vanilla linear regression +strFormula, +### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables +frmeTmp, +### Data on which to perform analysis +iTaxon, +### Index of the response data +lsHistory, +### List recording p-values, association information, and QC counts +strRandomFormula = NULL, +### Has the formula for random covariates +fZeroInflated = FALSE +### Turns on the zero inflated model +){ + adCur = frmeTmp[,iTaxon] + if(fZeroInflated) + { + return(try(gamlss(formula=as.formula(strFormula), family=BEZI, data=frmeTmp))) # gamlss + } else { + if(!is.null(strRandomFormula)) + { + return(try(glmmPQL(fixed=as.formula(strFormula), random=as.formula(strRandomFormula), family=gaussian(link="identity"), data=frmeTmp))) + #lme4 package but does not have pvalues for the fixed variables (have to use a mcmcsamp/pvals.fnc function which are currently disabled) + #return(try( lmer(as.formula(strFormula), data=frmeTmp, na.action=c_strNA_Action) )) + } else { + return(try( lm(as.formula(strFormula), data=frmeTmp, na.action=c_strNA_Action) )) + } + } + ### lmod result object from lm +} + +# Tested +funcBinomialMult <- function( +### Perform linear regression with negative binomial link +strFormula, +### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables +frmeTmp, +### Data on which to perform analysis +iTaxon, +### Index of the response data +lsHistory, +### List recording p-values, association information, and QC counts +strRandomFormula = NULL, +### Has the formula for random covariates +fZeroInflated = FALSE +### Turns on the zero inflated model +){ + adCur = frmeTmp[,iTaxon] + if(fZeroInflated) + { + return(try(zeroinfl(as.formula(strFormula), data=frmeTmp, dist="negbin"))) # pscl +# return(try(gamlss(as.formula(strFormula), family=ZINBI, data=frmeTmp))) # pscl + } else { + if(!is.null(strRandomFormula)) + { + throw("This analysis flow is not completely developed, please choose an option other than negative bionomial with random covariates") + #TODO need to estimate the theta + #return(try(glmmPQL(fixed=as.formula(strFormula), random=as.formula(strRandomFormula), family=negative.binomial(theta = 2, link=log), data=frmeTmp))) + #lme4 package but does not have pvalues for the fixed variables (have to use a mcmcsamp/pvals.fnc function which are currently disabled) + } else { + return(try( glm.nb(as.formula(strFormula), data=frmeTmp, na.action=c_strNA_Action ))) + } + } + ### lmod result object from lm +} + +# Tested +funcQuasiMult <- function( +### Perform linear regression with quasi-poisson link +strFormula, +### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables +frmeTmp, +### Data on which to perform analysis +iTaxon, +### Index of the response data +lsHistory, +### List recording p-values, association information, and QC counts +strRandomFormula = NULL, +### Has the formula for random covariates +fZeroInflated = FALSE +### Turns on a zero infalted model +){ + adCur = frmeTmp[,iTaxon] + if(fZeroInflated) + { +# return(try(gamlss(formula=as.formula(strFormula), family=ZIP, data=frmeTmp))) # gamlss + return(try(zeroinfl(as.formula(strFormula), data=frmeTmp, dist="poisson"))) # pscl + } else { + #Check to see if | is in the model, if so use a lmm otherwise the standard glm is ok + if(!is.null(strRandomFormula)) + { + return(try(glmmPQL(fixed=as.formula(strFormula), random=as.formula(strRandomFormula), family= quasipoisson, data=frmeTmp))) + #lme4 package but does not have pvalues for the fixed variables (have to use a mcmcsamp/pvals.fnc function which are currently disabled) + #return(try ( glmer(as.formula(strFormula), data=frmeTmp, family=quasipoisson, na.action=c_strNA_Action) )) + } else { + return(try( glm(as.formula(strFormula), family=quasipoisson, data=frmeTmp, na.action=c_strNA_Action) )) + } + } + ### lmod result object from lm +} + +### Transformations +# Tested +funcArcsinSqrt <- function( +# Transform data with arcsin sqrt transformation +aData +### The data on which to perform the transformation +){ + return(asin(sqrt(aData))) + ### Transformed data +} + +funcSquareSin <- function( +# Transform data with square sin transformation +# Opposite of the funcArcsinSqrt +aData +### The data on which to perform the transformation +){ + return(sin(aData)^2) + ### Transformed data +} + +# Tested +funcNoTransform <-function( +### Pass data without transform +aData +### The data on which to perform the transformation +### Only given here to preserve the pattern, not used. +){ + return(aData) + ### Transformed data +} + +funcGetAnalysisMethods <- function( +### Returns the appropriate functions for regularization, analysis, data transformation, and analysis object inspection. +### This allows modular customization per analysis step. +### To add a new method insert an entry in the switch for either the selection, transform, or method +### Insert them by using the pattern optparse_keyword_without_quotes = function_in_AnalysisModules +### Order in the return listy is currently set and expected to be selection, transforms/links, analysis method +### none returns null +sModelSelectionKey, +### Keyword defining the method of model selection +sTransformKey, +### Keyword defining the method of data transformation +sMethodKey, +### Keyword defining the method of analysis +fZeroInflated = FALSE +### Indicates if using zero inflated models +){ + lRetMethods = list() + #Insert selection methods here + lRetMethods[[c_iSelection]] = switch(sModelSelectionKey, + boost = funcBoostModel, + penalized = funcPenalizedModel, + forward = funcForwardModel, + backward = funcBackwardsModel, + none = NA) + + #Insert transforms + lRetMethods[[c_iTransform]] = switch(sTransformKey, + asinsqrt = funcArcsinSqrt, + none = funcNoTransform) + + #Insert untransform + lRetMethods[[c_iUnTransform]] = switch(sTransformKey, + asinsqrt = funcNoTransform, + none = funcNoTransform) + + #Insert analysis + lRetMethods[[c_iAnalysis]] = switch(sMethodKey, + neg_binomial = funcBinomialMult, + quasi = funcQuasiMult, + univariate = funcDoUnivariate, + lm = funcLM, + none = NA) + + # If a univariate method is used it is required to set this to true + # For correct handling. + lRetMethods[[c_iIsUnivariate]]=sMethodKey=="univariate" + + #Insert method to get results + if(fZeroInflated) + { + lRetMethods[[c_iResults]] = switch(sMethodKey, + neg_binomial = funcGetZeroInflatedResults, + quasi = funcGetZeroInflatedResults, + univariate = funcGetUnivariateResults, + lm = funcGetZeroInflatedResults, + none = NA) + } else { + lRetMethods[[c_iResults]] = switch(sMethodKey, + neg_binomial = funcGetLMResults, + quasi = funcGetLMResults, + univariate = funcGetUnivariateResults, + lm = funcGetLMResults, + none = NA) + } + + return(lRetMethods) + ### Returns a list of functions to be passed for regularization, data transformation, analysis, + ### and custom analysis results introspection functions to pull from return objects data of interest +}