Mercurial > repos > george-weingart > maaslin

diff src/lib/AnalysisModules.R @ 0:e0b5980139d9
maaslin
author: george-weingart
date: Tue, 13 May 2014 22:00:40 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/AnalysisModules.R	Tue May 13 22:00:40 2014 -0400
@@ -0,0 +1,1237 @@
+#####################################################################################
+#Copyright (C) <2012>
+#
+#Permission is hereby granted, free of charge, to any person obtaining a copy of
+#this software and associated documentation files (the "Software"), to deal in the
+#Software without restriction, including without limitation the rights to use, copy,
+#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+#and to permit persons to whom the Software is furnished to do so, subject to
+#the following conditions:
+#
+#The above copyright notice and this permission notice shall be included in all copies
+#or substantial portions of the Software.
+#
+#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# This file is a component of the MaAsLin (Multivariate Associations Using Linear Models), 
+# authored by the Huttenhower lab at the Harvard School of Public Health
+# (contact Timothy Tickle, ttickle@hsph.harvard.edu).
+#####################################################################################
+
+inlinedocs <- function(
+##author<< Curtis Huttenhower <chuttenh@hsph.harvard.edu> and Timothy Tickle <ttickle@hsph.harvard.edu>
+##description<< Allows one to plug in new modules to perform analysis (univariate or multivariate), regularization, and data (response) transformation.
+) { return( pArgs ) }
+
+# Libraries
+suppressMessages(library( agricolae, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+# Needed for the pot-hoc Kruskal wallis comparisons
+suppressMessages(library( penalized, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+# Needed for stepAIC
+suppressMessages(library( MASS, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+# Needed for na action behavior
+suppressMessages(library( gam, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+# Needed for boosting
+suppressMessages(library( gbm, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+# Needed for LASSO
+suppressMessages(library( glmnet, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+# Needed for mixed models
+#suppressMessages(library( lme4, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+suppressMessages(library( nlme, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+
+# Needed for zero inflated models
+#suppressMessages(library( MCMCglmm, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+suppressMessages(library( pscl, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+suppressMessages(library( gamlss, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+# Do not use #suppressMessages(library( glmmADMB, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+
+fAddBack = TRUE
+dUnevenMax = .9
+
+
+### Helper functions
+# OK
+funcMakeContrasts <- function
+### Makes univariate contrasts of all predictors in the model formula with the response.
+(strFormula, 
+### lm style string defining reponse and predictors
+strRandomFormula,
+### mixed model string defining the fixed covariates
+frmeTmp,
+### The data frame to find predictor data in
+iTaxon,
+### Taxon
+functionContrast,
+### functionContrast The univariate test to perform
+lsQCCounts
+### QC info
+){
+  #TODO are we updating the QCCounts?
+  lsSig = list()
+  ### Holds all the significance results from the tests
+  adP = c()
+  ### Holds the p-values
+  sCurDataName = names(frmeTmp)[iTaxon]
+  ### The name of the taxon (data row) that is being associated (always assumed to be numeric)
+  #Get test comparisons (predictor names from formula string)
+  asComparisons  = unique(c(funcFormulaStrToList(strFormula),funcFormulaStrToList(strRandomFormula)))
+
+  #Change metadata in formula to univariate comparisons
+  for(sComparison in asComparisons)
+  {
+    # Metadata values
+    vxTest = frmeTmp[[sComparison]]
+
+    # Get the levels in the comparison
+    # Can ignore the first level because it is the reference level
+    asLevels = sComparison
+    if(is.factor(vxTest)){asLevels = levels(vxTest)[2:length(vxTest)]}
+
+    lasComparisonResults = functionContrast(x=sComparison, adCur=frmeTmp[[sCurDataName]], dfData=frmeTmp)
+    for(asComparison in lasComparisonResults)
+    {
+      if( is.na( asComparison$p.value ) ) { next }
+      # Get pvalue
+      adP = c(adP, asComparison$p.value)
+      # Get SD, if not available, give SD of covariate
+      dSTD = asComparison$SD
+      # TODO Is using sd on factor and binary data correct?
+      if(is.na(dSTD) || is.null(dSTD)){dSTD = sd(vxTest)}
+
+      lsSig[[length( lsSig ) + 1]] <- list(
+        #Current metadata name (string column name) ok
+        name = sComparison,
+        #Current metadatda name (string, as a factor level if existing as such) ok
+        orig = asComparison$name,
+        #Taxon feature name (string) ok
+        taxon = colnames( frmeTmp )[iTaxon],
+        #Taxon data / response (double vector) ok
+        data = frmeTmp[,iTaxon],
+        #Name of column ok
+        factors = sComparison,
+        #Metadata values (metadata as a factor or raw numeric) ok
+        metadata = vxTest,
+        #Current coefficient value (named coef value with level name (from coefs) ok
+        value = asComparison$coef,
+        #Standard deviation (numeric) ok
+        std = dSTD,
+        #Model coefficients (output from coefs with intercept) ok
+        allCoefs = asComparison$coef)
+    }
+  }
+  return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts))
+  ### Returns a list of p-value, standard deviation, and comparison which produced the p-value
+}
+
+#Ok
+funcGetStepPredictors <- function
+### Retrieve the predictors of the reduced model after stepwise model selection
+(lmod,
+### Linear model resulting from step-wise model selection
+frmeTmp,
+strLog
+### File to document logging
+){
+  #Document
+  funcWrite( "#model", strLog )
+  funcWrite( lmod$fit, strLog )
+#TODO  funcWrite( lmod$train.error, strLog )
+#TODO  funcWrite( lmod$Terms, strLog )
+  funcWrite( "#summary-gbm", strLog )
+  funcWrite( summary(lmod), strLog )
+
+  #Get Names from coefficients
+  asStepCoefsFactors = coefficients(lmod)
+  astrCoefNames = setdiff(names(asStepCoefsFactors[as.vector(!is.na(asStepCoefsFactors))==TRUE]),"(Intercept)")
+  asStepCoefsFactors = unique(as.vector(sapply(astrCoefNames,funcCoef2Col, frmeData=frmeTmp)))
+
+  if(length(asStepCoefsFactors)<1){ return(NA) }
+  return( asStepCoefsFactors )
+  ### Vector of string predictor names
+}
+
+funcGetUnivariateResults <- function
+### Reduce the list of list of results to the correct format
+( llmod,
+### The list of univariate models
+frmeData,
+### Data analysis is performed on
+liTaxon,
+### The response id
+dSig,
+### Significance level for q-values
+adP,
+### List of pvalues from all associations performed
+lsSig,
+### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients
+strLog,
+### File to which to document logging
+lsQCCounts,
+### Records of counts associated with quality control
+lastrCols,
+### Predictors used in the association
+asSuppressCovariates=c()
+### Vector of covariates to suppress and not give results for
+){
+  adP = c()
+  lsSig = list()
+  for(lmod in llmod)
+  {
+    adP = c(adP,lmod$adP)
+    lsSig = c(lsSig,lmod$lsSig)
+  }
+  return(list(adP=adP, lsSig=lsSig, lsQCCounts=llmod[length(llmod)]$lsQCCounts))
+}
+
+# OK
+funcGetLMResults <- function
+### Reduce the lm object return to just the data needed for further analysis
+( llmod,
+### The result from a linear model
+frmeData,
+### Data analysis is performed on
+liTaxon,
+### The response id
+dSig,
+### Significance level for q-values
+adP,
+### List of pvalues from all associations performed
+lsSig,
+### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients
+strLog,
+### File to which to document logging
+lsQCCounts,
+### Records of counts associated with quality control
+lastrCols,
+### Predictors used in the association
+asSuppressCovariates=c()
+### Vector of covariates to suppress and not give results for
+){
+  ilmodIndex = 0
+  for( lmod in llmod )
+  {
+    ilmodIndex = ilmodIndex + 1
+    lmod = llmod[[ ilmodIndex ]]
+    iTaxon = liTaxon[[ ilmodIndex ]]
+    astrCols = lastrCols[[ ilmodIndex ]]
+
+    #Exclude none and errors
+    if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) )
+    {
+      #holds the location of the pvlaues if an lm, if lmm is detected this will be changed
+      iPValuePosition = 4
+
+      #Get the column name of the iTaxon index
+      #frmeTmp needs to be what?
+      strTaxon = colnames( frmeData )[iTaxon]
+      #Get summary information from the linear model
+      lsSum = try( summary( lmod ) )
+      #The following can actually happen when the stranger regressors return broken results
+      if( class( lsSum ) == "try-error" )
+      {
+        next
+      }
+
+      #Write summary information to log file
+      funcWrite( "#model", strLog )
+      funcWrite( lmod, strLog )
+      funcWrite( "#summary", strLog )
+      #Unbelievably, some of the more unusual regression methods crash out when _printing_ their results 
+      try( funcWrite( lsSum, strLog ) )
+
+      #Get the coefficients
+      #This should work for linear models
+      frmeCoefs <- try( coefficients( lsSum ) )
+      
+      if( ( class(frmeCoefs ) == "try-error" ) || is.null( frmeCoefs ) )
+      {
+        adCoefs = try( coefficients( lmod ))
+        if(class( adCoefs ) == "try-error")
+        {
+          adCoefs = coef(lmod)
+        }
+        frmeCoefs <- NA
+      } else {
+        if( class( frmeCoefs ) == "list" )
+        {
+          frmeCoefs <- frmeCoefs$count
+        }
+        adCoefs = frmeCoefs[,1]
+      }
+
+      #Go through each coefficient
+      astrRows <- names( adCoefs )
+
+      ##lmm
+      if( is.null( astrRows ) )
+      {
+        astrRows = rownames( lsSum$tTable )
+        frmeCoefs = lsSum$tTable
+        iPValuePosition = 5
+        adCoefs = frmeCoefs[,1]
+      }
+
+      for( iMetadata in 1:length( astrRows ) )
+      {
+        #Current coef which is being evaluated 
+        strOrig = astrRows[ iMetadata ]
+        #Skip y interscept
+        if( strOrig %in% c("(Intercept)", "Intercept", "Log(theta)") ) { next }
+        #Skip suppressed covariates
+        if( funcCoef2Col( strOrig, frmeData ) %in% asSuppressCovariates){ next }
+
+        #Extract pvalue and std in standard model
+        dP = frmeCoefs[ strOrig, iPValuePosition ]
+        dStd = frmeCoefs[ strOrig, 2 ]
+
+        #Attempt to extract the pvalue and std in mixed effects summary 
+        #Could not get the pvalue so skip result
+        if( is.nan( dP ) || is.na( dP ) || is.null( dP ) ) { next }
+
+        dCoef = adCoefs[ iMetadata ]
+
+        #Setting adMetadata
+        #Metadata values
+        strMetadata = funcCoef2Col( strOrig, frmeData, astrCols )
+        if( is.na( strMetadata ) )
+        {
+          if( substring( strOrig, nchar( strOrig ) - 1 ) == "NA" ) { next }
+          c_logrMaaslin$error( "Unknown coefficient: %s", strOrig )
+        }
+        if( substring( strOrig, nchar( strMetadata ) + 1 ) == "NA" ) { next }
+        adMetadata <- frmeData[, strMetadata ]
+
+        # Store (factor level modified) p-value
+        # Store general results for each coef
+        adP <- c( adP, dP )
+
+         # Append to the list of information about associations
+        lsSig[[ length( lsSig ) + 1 ]] <- list(
+          # Current metadata name
+          name		= strMetadata,
+          # Current metadatda name (as a factor level if existing as such)
+          orig		= strOrig,
+          # Taxon feature name
+          taxon		= strTaxon,
+          # Taxon data / response
+          data		= frmeData[, iTaxon ],
+          # All levels
+          factors	= c( strMetadata ),
+          # Metadata values
+          metadata	= adMetadata,
+          # Current coeficient value
+          value		= dCoef,
+          # Standard deviation
+          std		= dStd,
+          # Model coefficients
+          allCoefs	= adCoefs )
+      }
+    }
+  }
+  return( list( adP = adP, lsSig = lsSig, lsQCCounts = lsQCCounts ) )
+  ### List containing a list of pvalues, a list of significant data per association, and a list of QC data
+}
+
+funcGetZeroInflatedResults <- function
+### Reduce the lm object return to just the data needed for further analysis
+( llmod,
+### The result from a linear model
+frmeData,
+### Data analysis is performed on
+liTaxon,
+### The response id
+dSig,
+### Significance level for q-values
+adP,
+### List of pvalues from all associations performed
+lsSig,
+### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients
+strLog,
+### File to which to document logging
+lsQCCounts,
+### Records of counts associated with quality control
+lastrCols,
+### Predictors used in the association
+asSuppressCovariates=c()
+### Vector of covariates to suppress and not give results for
+){
+  ilmodIndex = 0
+  for(lmod in llmod)
+  {
+    ilmodIndex = ilmodIndex + 1
+    lmod = llmod[[ilmodIndex]]
+    iTaxon = liTaxon[[ilmodIndex]]
+    astrCols = lastrCols[[ilmodIndex]]
+
+    #Exclude none and errors
+    if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) )
+    {
+      #holds the location of the pvlaues if an lm, if lmm is detected this will be changed
+      iPValuePosition = 4
+
+      #Get the column name of the iTaxon index
+      #frmeTmp needs to be what?
+      strTaxon = colnames( frmeData )[iTaxon]
+
+      #Write summary information to log file
+      funcWrite( "#model", strLog )
+      funcWrite( lmod, strLog )
+
+      #Get the coefficients
+      #This should work for linear models
+      frmeCoefs <- summary(lmod)
+      if(! is.null( frmeCoefs$coefficients$count ) ) # Needed for zeroinfl
+      {
+        frmeCoefs = frmeCoefs$coefficients$count
+      }
+      adCoefs = frmeCoefs[,1]
+      names(adCoefs) = row.names(frmeCoefs)
+
+      funcWrite( "#Coefs", strLog )
+      funcWrite( frmeCoefs, strLog )
+
+      #Go through each coefficient
+      astrRows <- row.names( frmeCoefs )
+
+      for( iMetadata in 1:length( astrRows ) )
+      {
+        #Current coef which is being evaluated 
+        strOrig = astrRows[iMetadata]
+        #Skip y interscept
+        if( strOrig %in% c("(Intercept)", "Intercept", "Log(theta)") ) { next }
+        #Skip suppressed covariates
+        if( funcCoef2Col(strOrig,frmeData) %in% asSuppressCovariates){next}
+
+        #Extract pvalue and std in standard model
+        dP = frmeCoefs[strOrig, iPValuePosition]
+        if(is.nan(dP)){next}
+        dStd = frmeCoefs[strOrig,2]
+
+        dCoef = adCoefs[iMetadata]
+
+        #Setting adMetadata
+        #Metadata values
+        strMetadata = funcCoef2Col( strOrig, frmeData, astrCols )
+        if( is.na( strMetadata ) )
+        {
+          if( substring( strOrig, nchar( strOrig ) - 1 ) == "NA" ) { next }
+          c_logrMaaslin$error( "Unknown coefficient: %s", strOrig )
+        }
+        if( substring( strOrig, nchar( strMetadata ) + 1 ) == "NA" ) { next }
+        adMetadata <- frmeData[,strMetadata]
+
+        #Store (factor level modified) p-value
+        #Store general results for each coef
+        adP <- c(adP, dP)
+        lsSig[[length( lsSig ) + 1]] <- list(
+          #Current metadata name
+          name		= strMetadata,
+          #Current metadatda name (as a factor level if existing as such)
+          orig		= strOrig,#
+          #Taxon feature name
+          taxon		= strTaxon,
+          #Taxon data / response
+          data		= frmeData[,iTaxon],
+          #All levels
+          factors	= c(strMetadata),
+          #Metadata values
+          metadata	= adMetadata,
+          #Current coeficient value
+          value		= dCoef,
+          #Standard deviation
+          std		= dStd,
+          #Model coefficients
+          allCoefs	= adCoefs)
+      }
+    }
+  }
+  return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts))
+  ### List containing a list of pvalues, a list of significant data per association, and a list of QC data
+}
+
+oldfuncGetZeroInflatedResults <- function
+### Reduce the lm object return to just the data needed for further analysis
+( llmod,
+### The result from a linear model
+frmeData,
+### Data analysis is performed on
+liTaxon,
+### The response id
+dSig,
+### Significance level for q-values
+adP,
+### List of pvalues from all associations performed
+lsSig,
+### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients
+strLog,
+### File to which to document logging
+lsQCCounts,
+### Records of counts associated with quality control
+lastrCols,
+### Predictors used in the association
+asSuppressCovariates=c()
+### Vector of covariates to suppress and not give results for
+){
+  ilmodIndex = 0
+  for(lmod in llmod)
+  {
+    ilmodIndex = ilmodIndex + 1
+    lmod = llmod[[ilmodIndex]]
+    iTaxon = liTaxon[[ilmodIndex]]
+    astrCols = lastrCols[[ilmodIndex]]
+
+    #Exclude none and errors
+    if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) )
+    {
+      #holds the location of the pvlaues if an lm, if lmm is detected this will be changed
+      iPValuePosition = 4
+
+      #Get the column name of the iTaxon index
+      #frmeTmp needs to be what?
+      strTaxon = colnames( frmeData )[iTaxon]
+
+      #Write summary information to log file
+      funcWrite( "#model", strLog )
+      funcWrite( lmod, strLog )
+
+      #Get the coefficients
+      #This should work for linear models
+      frmeCoefs <- summary(lmod)
+      adCoefs = frmeCoefs[,1]
+      names(adCoefs) = row.names(frmeCoefs)
+
+      #Go through each coefficient
+      astrRows <- row.names( frmeCoefs )
+
+      for( iMetadata in 1:length( astrRows ) )
+      {
+        #Current coef which is being evaluated 
+        strOrig = astrRows[iMetadata]
+        #Skip y interscept
+        if( strOrig %in% c("(Intercept)", "Intercept", "Log(theta)") ) { next }
+        #Skip suppressed covariates
+        if( funcCoef2Col(strOrig,frmeData) %in% asSuppressCovariates){next}
+
+        #Extract pvalue and std in standard model
+        dP = frmeCoefs[strOrig, iPValuePosition]
+        dStd = frmeCoefs[strOrig,2]
+
+        dCoef = adCoefs[iMetadata]
+
+        #Setting adMetadata
+        #Metadata values
+        strMetadata = funcCoef2Col( strOrig, frmeData, astrCols )
+        if( is.na( strMetadata ) )
+        {
+          if( substring( strOrig, nchar( strOrig ) - 1 ) == "NA" ) { next }
+          c_logrMaaslin$error( "Unknown coefficient: %s", strOrig )
+        }
+        if( substring( strOrig, nchar( strMetadata ) + 1 ) == "NA" ) { next }
+        adMetadata <- frmeData[,strMetadata]
+
+        #Store (factor level modified) p-value
+        #Store general results for each coef
+        adP <- c(adP, dP)
+        lsSig[[length( lsSig ) + 1]] <- list(
+          #Current metadata name
+          name		= strMetadata,
+          #Current metadatda name (as a factor level if existing as such)
+          orig		= strOrig,#
+          #Taxon feature name
+          taxon		= strTaxon,
+          #Taxon data / response
+          data		= frmeData[,iTaxon],
+          #All levels
+          factors	= c(strMetadata),
+          #Metadata values
+          metadata	= adMetadata,
+          #Current coeficient value
+          value		= dCoef,
+          #Standard deviation
+          std		= dStd,
+          #Model coefficients
+          allCoefs	= adCoefs)
+      }
+    }
+  }
+  return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts))
+  ### List containing a list of pvalues, a list of significant data per association, and a list of QC data
+}
+
+
+notfuncGetZeroInflatedResults <- function
+### Reduce the lm object return to just the data needed for further analysis
+( llmod,
+### The result from a linear model
+frmeData,
+### Data analysis is performed on
+liTaxon,
+### The response id
+dSig,
+### Significance level for q-values
+adP,
+### List of pvalues from all associations performed
+lsSig,
+### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients
+strLog,
+### File to which to document logging
+lsQCCounts,
+### Records of counts associated with quality control
+lastrCols,
+### Predictors used in the association
+asSuppressCovariates=c()
+### Vector of covariates to suppress and not give results for
+){
+  ilmodIndex = 0
+  for(lmod in llmod)
+  {
+    ilmodIndex = ilmodIndex + 1
+    lmod = llmod[[ilmodIndex]]
+    iTaxon = liTaxon[[ilmodIndex]]
+    astrCols = lastrCols[[ilmodIndex]]
+
+    #Exclude none and errors
+    if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) )
+    {
+      #holds the location of the pvlaues if an lm, if lmm is detected this will be changed
+      iPValuePosition = 4
+
+      #Get the column name of the iTaxon index
+      #frmeTmp needs to be what?
+      strTaxon = colnames( frmeData )[iTaxon]
+
+      #Write summary information to log file
+      funcWrite( "#model", strLog )
+      funcWrite( lmod, strLog )
+
+      #Get the coefficients
+      #This should work for linear models
+      frmeCoefs <- summary(lmod)
+      frmeCoefs = frmeCoefs$coefficients$count
+      funcWrite( "#Coefs", strLog )
+      funcWrite( frmeCoefs, strLog )
+
+      adCoefs = frmeCoefs[,1]
+      names(adCoefs) = row.names(frmeCoefs)
+
+      #Go through each coefficient
+      astrRows <- row.names( frmeCoefs )
+
+      for( iMetadata in 1:length( astrRows ) )
+      {
+        #Current coef which is being evaluated 
+        strOrig = astrRows[iMetadata]
+
+        #Skip y interscept
+        if( strOrig %in% c("(Intercept)", "Intercept", "Log(theta)") ) { next }
+        #Skip suppressed covariates
+        if( funcCoef2Col(strOrig,frmeData) %in% asSuppressCovariates){next}
+
+        #Extract pvalue and std in standard model
+        dP = frmeCoefs[strOrig, iPValuePosition]
+        if(is.nan(dP)){ next }
+        dStd = frmeCoefs[strOrig,2]
+        dCoef = adCoefs[iMetadata]
+
+        #Setting adMetadata
+        #Metadata values
+        strMetadata = funcCoef2Col( strOrig, frmeData, astrCols )
+        if( is.na( strMetadata ) )
+        {
+          if( substring( strOrig, nchar( strOrig ) - 1 ) == "NA" ) { next }
+          c_logrMaaslin$error( "Unknown coefficient: %s", strOrig )
+        }
+        if( substring( strOrig, nchar( strMetadata ) + 1 ) == "NA" ) { next }
+        adMetadata <- frmeData[,strMetadata]
+
+        #Store (factor level modified) p-value
+        #Store general results for each coef
+        adP <- c(adP, dP)
+        lsSig[[length( lsSig ) + 1]] <- list(
+          #Current metadata name
+          name		= strMetadata,
+          #Current metadatda name (as a factor level if existing as such)
+          orig		= strOrig,#
+          #Taxon feature name
+          taxon		= strTaxon,
+          #Taxon data / response
+          data		= frmeData[,iTaxon],
+          #All levels
+          factors	= c(strMetadata),
+          #Metadata values
+          metadata	= adMetadata,
+          #Current coeficient value
+          value		= dCoef,
+          #Standard deviation
+          std		= dStd,
+          #Model coefficients
+          allCoefs	= adCoefs)
+      }
+    }
+  }
+  return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts))
+  ### List containing a list of pvalues, a list of significant data per association, and a list of QC data
+}
+
+
+### Options for variable selection 
+# OK
+funcBoostModel <- function(
+### Perform model selection / regularization with boosting
+strFormula,
+### The formula of the full model before boosting
+frmeTmp,
+### The data on which to perform analysis
+adCur,
+### The response data
+lsParameters,
+### User controlled parameters needed specific to boosting
+lsForcedParameters = NULL,
+### Force these predictors to be in the model
+strLog
+### File to which to document logging
+){
+  funcWrite( c("#Boost formula", strFormula), strLog )
+  lmod = try( gbm( as.formula( strFormula ), data=frmeTmp, distribution="laplace", verbose=FALSE, n.minobsinnode=min(10, round(0.1 * nrow( frmeTmp ) ) ), n.trees=1000 ) )
+#TODO#  lmod = try( gbm( as.formula( strFormula ), data=frmeTmp, distribution="gaussian", verbose=FALSE, n.minobsinnode=min(10, round(0.1 * nrow( frmeTmp ) ) ), n.trees=1000 ) )
+
+  astrTerms <- c()
+  if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) )
+  {
+    #Get boosting summary results
+    lsSum <- summary( lmod, plotit = FALSE )
+
+    #Document
+    funcWrite( "#model-gbm", strLog )
+    funcWrite( lmod$fit, strLog )
+    funcWrite( lmod$train.error, strLog )
+    funcWrite( lmod$Terms, strLog )
+    funcWrite( "#summary-gbm", strLog )
+    funcWrite( lsSum, strLog )
+
+    # Uneven metadata
+    vstrUneven = c()
+    # Kept metadata
+    vstrKeepMetadata = c()
+
+    #Select model predictors
+    #Check the frequency of selection and skip if not selected more than set threshold dFreq
+    for( strMetadata in lmod$var.names )
+    {
+      #Get the name of the metadata
+      strTerm <- funcCoef2Col( strMetadata, frmeTmp, c(astrMetadata, astrGenetics) )
+
+      #Add back in uneven metadata
+      if(fAddBack)
+      {
+        ldMetadata = frmeTmp[[strMetadata]]
+        if(length(which(table(ldMetadata)/length(ldMetadata)>dUnevenMax))>0)
+        {
+          astrTerms <- c(astrTerms, strTerm)
+          vstrUneven = c(vstrUneven,strMetadata)
+          next
+        }
+      }
+
+      #If the selprob is less than a certain frequency, skip
+      dSel <- lsSum$rel.inf[which( lsSum$var == strMetadata )] / 100
+      if( is.na(dSel) || ( dSel <= lsParameters$dFreq ) ){ next }
+#TODO#      if( is.na(dSel) || ( dSel < lsParameters$dFreq ) ){ next }
+
+      #If you should ignore the metadata, continue
+      if( is.null( strTerm ) ) { next }
+
+      #If you cant find the metadata name, write
+      if( is.na( strTerm ) )
+      {
+        c_logrMaaslin$error( "Unknown coefficient: %s", strMetadata )
+        next
+      }
+
+      #Collect metadata names
+      astrTerms <- c(astrTerms, strTerm)
+      vstrKeepMetadata = c(vstrKeepMetadata,strTerm)
+    }
+  } else { astrTerms = lsForcedParameters }
+
+#  funcBoostInfluencePlot(vdRelInf=lsSum$rel.inf, sFeature=lsParameters$sBugName, vsPredictorNames=lsSum$var, vstrKeepMetadata=vstrKeepMetadata, vstrUneven=vstrUneven)
+
+  return(unique(c(astrTerms,lsForcedParameters)))
+  ### Return a vector of predictor names to use in a reduced model
+}
+
+#Glmnet default is to standardize the variables.
+#used as an example for implementation
+#http://r.789695.n4.nabble.com/estimating-survival-times-with-glmnet-and-coxph-td4614225.html
+funcPenalizedModel <- function(
+### Perform penalized regularization for variable selection
+strFormula,
+### The formula of the full model before boosting
+frmeTmp,
+### The data on which to perform analysis
+adCur,
+### The response data
+lsParameters,
+### User controlled parameters needed specific to boosting
+lsForcedParameters = NULL,
+### Force these predictors to be in the model
+strLog
+### File to which to document logging
+){
+  #Convert the data frame to a model matrix
+  mtrxDesign = model.matrix(as.formula(strFormula), data=frmeTmp)
+
+  #Cross validate the lambda
+  cvRet = cv.glmnet(x=mtrxDesign,y=adCur,alpha=lsParameters$dPAlpha)
+
+  #Perform lasso
+  glmnetMod = glmnet(x=mtrxDesign,y=adCur,family=lsParameters$family,alpha=lsParameters$dPAlpha,lambda=cvRet$lambda.min)
+
+  #Get non zero beta and return column names for covariate names.
+  ldBeta = glmnetMod$beta[,which.max(glmnetMod$dev.ratio)]
+  ldBeta = names(ldBeta[which(abs(ldBeta)>0)])
+  return(sapply(ldBeta,funcCoef2Col,frmeData=frmeTmp))
+}
+
+# OK
+funcForwardModel <- function(
+### Perform model selection with forward stepwise selection
+strFormula,
+### lm style string defining reposne and predictors 
+frmeTmp,
+### Data on which to perform analysis
+adCur,
+### Response data
+lsParameters,
+### User controlled parameters needed specific to boosting
+lsForcedParameters = NULL,
+### Force these predictors to be in the model
+strLog
+### File to which to document logging
+){
+  funcWrite( c("#Forward formula", strFormula), strLog )
+
+  strNullFormula = "adCur ~ 1"
+  if(!is.null(lsForcedParameters))
+  {
+    strNullFormula = paste( "adCur ~", paste( sprintf( "`%s`", lsForcedParameters ), collapse = " + " ))
+  }
+  lmodNull <- try( lm(as.formula( strNullFormula ), data=frmeTmp))
+  lmodFull <- try( lm(as.formula( strFormula ), data=frmeTmp ))
+  if(!("try-error" %in% c(class( lmodNull ),class( lmodFull ))))
+  {
+    lmod = stepAIC(lmodNull, scope=list(lower=lmodNull,upper=lmodFull), direction="forward", trace=0)
+    return(funcGetStepPredictors(lmod, frmeTmp, strLog))
+  }
+  return( lsForcedParameters )
+  ### Return a vector of predictor names to use in a reduced model or NA on error
+}
+
+# OK
+# Select model with backwards selection
+funcBackwardsModel <- function(
+### Perform model selection with backwards stepwise selection
+strFormula,
+### lm style string defining reponse and predictors 
+frmeTmp,
+### Data on which to perform analysis
+adCur,
+### Response data
+lsParameters,
+### User controlled parameters needed specific to boosting
+lsForcedParameters = NULL,
+### Force these predictors to be in the model
+strLog
+### File to which to document logging
+){
+  funcWrite( c("#Backwards formula", strFormula), strLog )
+
+  strNullFormula = "adCur ~ 1"
+  if(!is.null(lsForcedParameters))
+  {
+    strNullFormula = paste( "adCur ~", paste( sprintf( "`%s`", lsForcedParameters ), collapse = " + " ))
+  }
+
+  lmodNull <- try( lm(as.formula( strNullFormula ), data=frmeTmp))
+  lmodFull <- try( lm(as.formula( strFormula ), data=frmeTmp ))
+
+  if(! class( lmodFull ) == "try-error" )
+  {
+    lmod = stepAIC(lmodFull, scope=list(lower=lmodNull, upper=lmodFull), direction="backward")
+    return(funcGetStepPredictors(lmod, frmeTmp, strLog))
+  } else { 
+    return( lsForcedParameters ) }
+  ### Return a vector of predictor names to use in a reduced model or NA on error
+}
+
+### Analysis methods
+### Univariate options
+
+# Sparse Dir. Model
+#TODO# Implement in sfle
+
+# Tested
+# Correlation
+# NOTE: Ignores the idea of random and fixed covariates
+funcSpearman <- function(
+### Perform multiple univariate comparisons producing spearman correlations for association
+strFormula,
+### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables
+frmeTmp,
+### Data on which to perform analysis
+iTaxon,
+### Index of the response data
+lsQCCounts,
+### List recording anything important to QC
+strRandomFormula = NULL
+### Has the formula for random covariates
+){
+  return(funcMakeContrasts(strFormula=strFormula, strRandomFormula=strRandomFormula, frmeTmp=frmeTmp, iTaxon=iTaxon,
+    functionContrast=function(x,adCur,dfData)
+    {
+      retList = list()
+      ret = cor.test(as.formula(paste("~",x,"+ adCur")), data=dfData, method="spearman", na.action=c_strNA_Action)
+      #Returning rho for the coef in a named vector
+      vdCoef = c()
+      vdCoef[[x]]=ret$estimate
+      retList[[1]]=list(p.value=ret$p.value,SD=sd(dfData[[x]]),name=x,coef=vdCoef)
+      return(retList)
+    }, lsQCCounts))
+  ### List of contrast information, pvalue, contrast and std per univariate test
+}
+
+# Tested
+# Wilcoxon (T-Test)
+# NOTE: Ignores the idea of random and fixed covariates
+funcWilcoxon <- function(
+### Perform multiple univariate comparisons performing wilcoxon tests on discontinuous data with 2 levels
+strFormula,
+### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables
+frmeTmp,
+### Data on which to perform analysis
+iTaxon,
+### Index of the response data
+lsQCCounts,
+### List recording anything important to QC
+strRandomFormula = NULL
+### Has the formula for random covariates
+){
+  return(funcMakeContrasts(strFormula=strFormula, strRandomFormula=strRandomFormula, frmeTmp=frmeTmp, iTaxon=iTaxon,
+    functionContrast=function(x,adCur,dfData)
+    {
+      retList = list()
+      ret = wilcox.test(as.formula(paste("adCur",x,sep=" ~ ")), data=dfData, na.action=c_strNA_Action)
+      #Returning NA for the coef in a named vector
+      vdCoef = c()
+      vdCoef[[x]]=ret$statistic
+      retList[[1]]=list(p.value=ret$p.value,SD=sd(dfData[[x]]),name=x,coef=vdCoef)
+      return(retList)
+    }, lsQCCounts))
+  ### List of contrast information, pvalue, contrast and std per univariate test
+}
+
+# Tested
+# Kruskal.Wallis (Nonparameteric anova)
+# NOTE: Ignores the idea of random and fixed covariates
+funcKruskalWallis <- function(
+### Perform multiple univariate comparisons performing Kruskal wallis rank sum tests on discontuous data with more than 2 levels
+strFormula,
+### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables
+frmeTmp,
+### Data on which to perform analysis
+iTaxon,
+### Index of the response data
+lsQCCounts,
+### List recording anything important to QC
+strRandomFormula = NULL
+### Has the formula for random covariates
+){
+  return(funcMakeContrasts(strFormula=strFormula, strRandomFormula=strRandomFormula, frmeTmp=frmeTmp, iTaxon=iTaxon,
+    functionContrast=function(x,adCur,dfData)
+    {
+      retList = list()
+      lmodKW = kruskal(adCur,dfData[[x]],group=FALSE,p.adj="holm")
+
+      asLevels = levels(dfData[[x]])
+      # The names of the generated comparisons, sometimes the control is first sometimes it is not so
+      # We will just check which is in the names and use that
+      asComparisons = row.names(lmodKW$comparisons)
+      #Get the comparison with the control
+      for(sLevel in asLevels[2:length(asLevels)])
+      {
+        sComparison = intersect(c(paste(asLevels[1],sLevel,sep=" - "),paste(sLevel,asLevels[1],sep=" - ")),asComparisons)
+        #Returning NA for the coef in a named vector
+        vdCoef = c()
+        vdCoef[[paste(x,sLevel,sep="")]]=lmodKW$comparisons[sComparison,"Difference"]
+#        vdCoef[[paste(x,sLevel,sep="")]]=NA
+        retList[[length(retList)+1]]=list(p.value=lmodKW$comparisons[sComparison,"p.value"],SD=1.0,name=paste(x,sLevel,sep=""),coef=vdCoef)
+      }
+      return(retList)
+    }, lsQCCounts))
+  ### List of contrast information, pvalue, contrast and std per univariate test
+}
+
+# Tested
+# NOTE: Ignores the idea of random and fixed covariates
+funcDoUnivariate <- function(
+### Perform multiple univariate comparisons producing spearman correlations for association
+strFormula,
+### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables
+frmeTmp,
+### Data on which to perform analysis
+iTaxon,
+### Index of the response data
+lsHistory,
+### List recording p-values, association information, and QC counts
+strRandomFormula = NULL,
+### Has the formula for random covariates
+fZeroInflate = FALSE
+){
+  if(fZeroInflate)
+  {
+    throw("There are no zero-inflated univariate models to perform your analysis.")
+  }
+
+  # Get covariates
+  astrCovariates = unique(c(funcFormulaStrToList(strFormula),funcFormulaStrToList(strRandomFormula)))
+
+  # For each covariate
+  for(sCovariate in astrCovariates)
+  {
+    ## Check to see if it is discrete
+    axData = frmeTmp[[sCovariate]]
+    lsRet = NA
+    if(is.factor(axData) || is.logical(axData))
+    {
+      ## If discrete check how many levels
+      lsDataLevels = levels(axData)
+      ## If 2 levels do wilcoxon test
+      if(length(lsDataLevels) < 3)
+      {
+        lsRet = funcWilcoxon(strFormula=paste("adCur",sCovariate,sep=" ~ "), frmeTmp=frmeTmp, iTaxon=iTaxon, lsQCCounts=lsHistory$lsQCCounts)
+      } else {
+      ## If 3 or more levels do kruskal wallis test
+        lsRet = funcKruskalWallis(strFormula=paste("adCur",sCovariate,sep=" ~ "), frmeTmp=frmeTmp, iTaxon=iTaxon, lsQCCounts=lsHistory$lsQCCounts)
+      }
+    } else {
+      ## If not discrete do spearman test (list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts))
+      lsRet = funcSpearman(strFormula=paste("adCur",sCovariate,sep=" ~ "), frmeTmp=frmeTmp, iTaxon=iTaxon, lsQCCounts=lsHistory$lsQCCounts)
+    }
+    lsHistory[["adP"]] = c(lsHistory[["adP"]], lsRet[["adP"]])
+    lsHistory[["lsSig"]] = c(lsHistory[["lsSig"]], lsRet[["lsSig"]])
+    lsHistory[["lsQCCounts"]] = lsRet[["lsQCCounts"]]
+  }
+  return(lsHistory)
+}
+
+### Multivariate
+
+# Tested
+funcLM <- function(
+### Perform vanilla linear regression
+strFormula,
+### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables
+frmeTmp,
+### Data on which to perform analysis
+iTaxon,
+### Index of the response data
+lsHistory,
+### List recording p-values, association information, and QC counts
+strRandomFormula = NULL,
+### Has the formula for random covariates
+fZeroInflated = FALSE
+### Turns on the zero inflated model
+){
+  adCur = frmeTmp[,iTaxon]
+  if(fZeroInflated)
+  {
+    return(try(gamlss(formula=as.formula(strFormula), family=BEZI, data=frmeTmp))) # gamlss
+  } else {
+    if(!is.null(strRandomFormula))
+    {
+      return(try(glmmPQL(fixed=as.formula(strFormula), random=as.formula(strRandomFormula), family=gaussian(link="identity"), data=frmeTmp)))
+      #lme4 package but does not have pvalues for the fixed variables (have to use a mcmcsamp/pvals.fnc function which are currently disabled)
+      #return(try( lmer(as.formula(strFormula), data=frmeTmp, na.action=c_strNA_Action) ))
+    } else {
+      return(try( lm(as.formula(strFormula), data=frmeTmp, na.action=c_strNA_Action) ))
+    }
+  }
+  ### lmod result object from lm
+}
+
+# Tested
+funcBinomialMult <- function(
+### Perform linear regression with negative binomial link
+strFormula,
+### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables
+frmeTmp,
+### Data on which to perform analysis
+iTaxon,
+### Index of the response data
+lsHistory,
+### List recording p-values, association information, and QC counts
+strRandomFormula = NULL,
+### Has the formula for random covariates
+fZeroInflated = FALSE
+### Turns on the zero inflated model
+){
+  adCur = frmeTmp[,iTaxon]
+  if(fZeroInflated)
+  {
+    return(try(zeroinfl(as.formula(strFormula), data=frmeTmp, dist="negbin"))) # pscl
+#    return(try(gamlss(as.formula(strFormula), family=ZINBI, data=frmeTmp))) # pscl
+  } else {
+    if(!is.null(strRandomFormula))
+    {
+      throw("This analysis flow is not completely developed, please choose an option other than negative bionomial with random covariates")
+      #TODO need to estimate the theta
+      #return(try(glmmPQL(fixed=as.formula(strFormula), random=as.formula(strRandomFormula), family=negative.binomial(theta = 2, link=log), data=frmeTmp)))
+      #lme4 package but does not have pvalues for the fixed variables (have to use a mcmcsamp/pvals.fnc function which are currently disabled)
+    } else {
+      return(try( glm.nb(as.formula(strFormula), data=frmeTmp, na.action=c_strNA_Action )))
+    }
+  }
+  ### lmod result object from lm
+}
+
+# Tested
+funcQuasiMult <- function(
+### Perform linear regression with quasi-poisson link
+strFormula,
+### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables
+frmeTmp,
+### Data on which to perform analysis
+iTaxon,
+### Index of the response data
+lsHistory,
+### List recording p-values, association information, and QC counts
+strRandomFormula = NULL,
+### Has the formula for random covariates
+fZeroInflated = FALSE
+### Turns on a zero infalted model
+){
+  adCur = frmeTmp[,iTaxon]
+  if(fZeroInflated)
+  {
+#    return(try(gamlss(formula=as.formula(strFormula), family=ZIP, data=frmeTmp))) # gamlss
+    return(try(zeroinfl(as.formula(strFormula), data=frmeTmp, dist="poisson"))) # pscl
+  } else {
+    #Check to see if | is in the model, if so use a lmm otherwise the standard glm is ok
+    if(!is.null(strRandomFormula))
+    {
+      return(try(glmmPQL(fixed=as.formula(strFormula), random=as.formula(strRandomFormula), family= quasipoisson, data=frmeTmp)))
+      #lme4 package but does not have pvalues for the fixed variables (have to use a mcmcsamp/pvals.fnc function which are currently disabled)
+      #return(try ( glmer(as.formula(strFormula), data=frmeTmp, family=quasipoisson, na.action=c_strNA_Action) ))
+    } else {
+      return(try( glm(as.formula(strFormula), family=quasipoisson, data=frmeTmp, na.action=c_strNA_Action) ))
+    }
+  }
+  ### lmod result object from lm
+}
+
+### Transformations
+# Tested
+funcArcsinSqrt <- function(
+# Transform data with arcsin sqrt transformation
+aData
+### The data on which to perform the transformation
+){
+  return(asin(sqrt(aData)))
+  ### Transformed data
+}
+
+funcSquareSin <- function(
+# Transform data with square sin transformation
+# Opposite of the funcArcsinSqrt
+aData
+### The data on which to perform the transformation
+){
+  return(sin(aData)^2)
+  ### Transformed data
+}
+
+# Tested
+funcNoTransform <-function(
+### Pass data without transform
+aData
+### The data on which to perform the transformation
+### Only given here to preserve the pattern, not used.
+){
+  return(aData)
+  ### Transformed data
+}
+
+funcGetAnalysisMethods <- function(
+### Returns the appropriate functions for regularization, analysis, data transformation, and analysis object inspection.
+### This allows modular customization per analysis step.
+### To add a new method insert an entry in the switch for either the selection, transform, or method
+### Insert them by using the pattern optparse_keyword_without_quotes = function_in_AnalysisModules
+### Order in the return listy is currently set and expected to be selection, transforms/links, analysis method
+### none returns null
+sModelSelectionKey,
+### Keyword defining the method of model selection
+sTransformKey,
+### Keyword defining the method of data transformation
+sMethodKey,
+### Keyword defining the method of analysis
+fZeroInflated = FALSE
+### Indicates if using zero inflated models
+){
+  lRetMethods = list()
+  #Insert selection methods here
+  lRetMethods[[c_iSelection]] = switch(sModelSelectionKey,
+    boost = funcBoostModel,
+    penalized = funcPenalizedModel,
+    forward = funcForwardModel,
+    backward = funcBackwardsModel,
+    none = NA)
+
+  #Insert transforms
+  lRetMethods[[c_iTransform]] = switch(sTransformKey,
+    asinsqrt = funcArcsinSqrt,
+    none = funcNoTransform)
+
+  #Insert untransform
+  lRetMethods[[c_iUnTransform]] = switch(sTransformKey,
+    asinsqrt = funcNoTransform,
+    none = funcNoTransform)
+
+  #Insert analysis
+  lRetMethods[[c_iAnalysis]] = switch(sMethodKey,
+    neg_binomial = funcBinomialMult,
+    quasi = funcQuasiMult,
+    univariate = funcDoUnivariate,
+    lm = funcLM,
+    none = NA)
+
+  # If a univariate method is used it is required to set this to true
+  # For correct handling.
+  lRetMethods[[c_iIsUnivariate]]=sMethodKey=="univariate"
+
+  #Insert method to get results
+  if(fZeroInflated)
+  {
+    lRetMethods[[c_iResults]] = switch(sMethodKey,
+      neg_binomial = funcGetZeroInflatedResults,
+      quasi = funcGetZeroInflatedResults,
+      univariate = funcGetUnivariateResults,
+      lm = funcGetZeroInflatedResults,
+      none = NA)
+  } else {
+    lRetMethods[[c_iResults]] = switch(sMethodKey,
+      neg_binomial = funcGetLMResults,
+      quasi = funcGetLMResults,
+      univariate = funcGetUnivariateResults,
+      lm = funcGetLMResults,
+      none = NA)
+  }
+
+  return(lRetMethods)
+  ### Returns a list of functions to be passed for regularization, data transformation, analysis,
+  ### and custom analysis results introspection functions to pull from return objects data of interest
+}
author	george-weingart
date	Tue, 13 May 2014 22:00:40 -0400
parents
children