Mercurial > repos > george-weingart > maaslin

diff src/lib/BoostGLM.R @ 0:e0b5980139d9
maaslin
author: george-weingart
date: Tue, 13 May 2014 22:00:40 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/BoostGLM.R	Tue May 13 22:00:40 2014 -0400
@@ -0,0 +1,887 @@
+#####################################################################################
+#Copyright (C) <2012>
+#
+#Permission is hereby granted, free of charge, to any person obtaining a copy of
+#this software and associated documentation files (the "Software"), to deal in the
+#Software without restriction, including without limitation the rights to use, copy,
+#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+#and to permit persons to whom the Software is furnished to do so, subject to
+#the following conditions:
+#
+#The above copyright notice and this permission notice shall be included in all copies
+#or substantial portions of the Software.
+#
+#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# This file is a component of the MaAsLin (Multivariate Associations Using Linear Models), 
+# authored by the Huttenhower lab at the Harvard School of Public Health
+# (contact Timothy Tickle, ttickle@hsph.harvard.edu).
+#####################################################################################
+
+inlinedocs <- function(
+##author<< Curtis Huttenhower <chuttenh@hsph.harvard.edu> and Timothy Tickle <ttickle@hsph.harvard.edu>
+##description<< Manages the quality control of data and the performance of analysis (univariate or multivariate), regularization, and data (response) transformation.
+) { return( pArgs ) }
+
+### Load libraries quietly
+suppressMessages(library( gam, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+suppressMessages(library( gbm, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+suppressMessages(library( logging, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+suppressMessages(library( outliers, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+suppressMessages(library( robustbase, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+suppressMessages(library( pscl, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
+
+### Get constants
+#source(file.path("input","maaslin","src","Constants.R"))
+#source("Constants.R")
+
+## Get logger
+c_logrMaaslin <- getLogger( "maaslin" )
+
+funcDoGrubbs <- function(
+### Use the Grubbs Test to identify outliers
+iData,
+### Column index in the data frame to test
+frmeData,
+### The data frame holding the data
+dPOutlier,
+### P-value threshold to indicate an outlier is significant
+lsQC
+### List holding the QC info of the cleaning step. Which indices are outliers is added.
+){
+  adData <- frmeData[,iData]
+
+  # Original number of NA
+  viNAOrig = which(is.na(adData))
+
+  while( TRUE )
+  {
+    lsTest <- try( grubbs.test( adData ), silent = TRUE )
+    if( ( class( lsTest ) == "try-error" ) || is.na( lsTest$p.value ) || ( lsTest$p.value > dPOutlier ) )
+    {break}
+    viOutliers = outlier( adData, logical = TRUE )
+    adData[viOutliers] <- NA
+  }
+
+  # Record removed data
+  viNAAfter = which(is.na(adData))
+
+  # If all were set to NA then ignore the filtering
+  if(length(adData)==length(viNAAfter))
+  {
+    viNAAfter = viNAOrig
+    adData = frmeData[,iData]
+    c_logrMaaslin$info( paste("Grubbs Test:: Identifed all data as outliers so was inactived for index=",iData," data=",paste(as.vector(frmeData[,iData]),collapse=","), "number zeros=", length(which(frmeData[,iData]==0)), sep = " " ))
+  } else if(mean(adData, na.rm=TRUE) == 0) {
+    viNAAfter = viNAOrig
+    adData = frmeData[,iData]
+    c_logrMaaslin$info( paste("Grubbs Test::Removed all values but 0, ignored. Index=",iData,".",sep=" " ) )
+  } else {
+    # Document removal
+    if( sum( is.na( adData ) ) )
+    {
+      c_logrMaaslin$info( "Grubbs Test::Removing %d outliers from %s", sum( is.na( adData ) ), colnames(frmeData)[iData] )
+			  c_logrMaaslin$info( format( rownames( frmeData )[is.na( adData )] ))
+    }
+  }
+
+  return(list(data=adData,outliers=length(viNAAfter)-length(viNAOrig),indices=setdiff(viNAAfter,viNAOrig)))
+}
+
+funcDoFenceTest <- function(
+### Use a threshold based on the quartiles of the data to identify outliers
+iData,
+### Column index in the data frame to test
+frmeData,
+### The data frame holding the data
+dFence
+### The fence outside the first and third quartiles to use as a threshold for cutt off.
+### This many times the interquartile range +/- to the 3rd/1st quartiles
+){
+  # Establish fence
+  adData <- frmeData[,iData]
+  adQ <- quantile( adData, c(0.25, 0.5, 0.75), na.rm = TRUE )
+
+  dIQR <- adQ[3] - adQ[1]
+  if(!dIQR)
+  {
+    dIQR = sd(adData,na.rm = TRUE)
+  }
+  dUF <- adQ[3] + ( dFence * dIQR )
+  dLF <- adQ[1] - ( dFence * dIQR )
+
+  # Record indices of values outside of fence to remove and remove.
+  aiRemove <- c()
+  for( j in 1:length( adData ) )
+  {
+    d <- adData[j]
+    if( !is.na( d ) && ( ( d < dLF ) || ( d > dUF ) ) )
+    {
+      aiRemove <- c(aiRemove, j)
+    }
+  }
+
+  if(length(aiRemove)==length(adData))
+  {
+    aiRemove = c()
+    c_logrMaaslin$info( "OutliersByFence:: Identified all data as outlier so was inactivated for index=", iData,"data=", paste(as.vector(frmeData[,iData]),collapse=","), "number zeros=", length(which(frmeData[,iData]==0)), sep=" " )
+  } else {
+    adData[aiRemove] <- NA
+
+    # Document to screen
+    if( length( aiRemove ) )
+    {
+      c_logrMaaslin$info( "OutliersByFence::Removing %d outliers from %s", length( aiRemove ), colnames(frmeData)[iData] )
+      c_logrMaaslin$info( format( rownames( frmeData )[aiRemove] ))
+    }
+  }
+
+  return(list(data=adData,outliers=length(aiRemove),indices=aiRemove))
+}
+
+funcZerosAreUneven = function(
+### 
+vdRawData,
+### Raw data to be checked during transformation
+funcTransform,
+### Data transform to perform
+vsStratificationFeatures,
+### Groupings to check for unevenness
+dfData
+### Data frame holding the features
+){
+  # Return indicator of unevenness
+  fUneven = FALSE
+
+  # Transform the data to compare
+  vdTransformed = funcTransform( vdRawData )
+
+  # Go through each stratification of data
+  for( sStratification in vsStratificationFeatures )
+  {
+    # Current stratification
+    vFactorStrats = dfData[[ sStratification ]]
+
+    # If the metadata is not a factor then skip
+    # Only binned data can be evaluated this way.
+    if( !is.factor( vFactorStrats )){ next }
+    
+    viZerosCountsRaw = c()
+    for( sLevel in levels( vFactorStrats ) )
+    {
+      vdTest = vdRawData[ which( vFactorStrats == sLevel ) ]
+      viZerosCountsRaw = c( viZerosCountsRaw, length(which(vdTest == 0)))
+      vdTest = vdTransformed[ which( vFactorStrats == sLevel ) ]
+    }
+    dExpectation = 1 / length( viZerosCountsRaw )
+    dMin = dExpectation / 2
+    dMax = dExpectation + dMin
+    viZerosCountsRaw = viZerosCountsRaw / sum( viZerosCountsRaw )
+    if( ( length( which( viZerosCountsRaw <= dMin ) ) > 0 ) || ( length( which( viZerosCountsRaw >= dMax ) ) > 0 ) )
+    {
+      return( TRUE )
+    }
+  }
+  return( fUneven )
+}
+
+funcTransformIncreasesOutliers = function(
+### Checks if a data transform increases outliers in a distribution
+vdRawData,
+### Raw data to check for outlier zeros
+funcTransform
+){
+  iUnOutliers = length( boxplot( vdRawData, plot = FALSE )$out )
+  iTransformedOutliers = length( boxplot( funcTransform( vdRawData ), plot = FALSE )$out )
+
+  return( iUnOutliers <= iTransformedOutliers ) 
+}
+
+funcClean <- function(
+### Properly clean / get data ready for analysis
+### Includes custom analysis from the custom R script if it exists
+frmeData,
+### Data frame, input data to be acted on
+funcDataProcess,
+### Custom script that can be given to perform specialized processing before MaAsLin does.
+aiMetadata,
+### Indices of columns in frmeData which are metadata for analysis.
+aiData,
+### Indices of column in frmeData which are (abundance) data for analysis.
+lsQCCounts,
+### List that will hold the quality control information which is written in the output directory.
+astrNoImpute = c(),
+### An array of column names of frmeData not to impute.
+dMinSamp,
+### Minimum number of samples
+dMinAbd,
+# Minimum sample abundance
+dFence,
+### How many quartile ranges defines the fence to define outliers.
+funcTransform,
+### The data transformation function or a dummy function that does not affect the data
+dPOutlier = 0.05
+### The significance threshold for the grubbs test to identify an outlier.
+){
+  # Call the custom script and set current data and indicies to the processes data and indicies.
+  c_logrMaaslin$debug( "Start Clean")
+  if( !is.null( funcDataProcess ) )
+  {
+    c_logrMaaslin$debug("Additional preprocess function attempted.")
+
+    pTmp <- funcDataProcess( frmeData=frmeData, aiMetadata=aiMetadata, aiData=aiData)
+    frmeData = pTmp$frmeData
+    aiMetadata = pTmp$aiMetadata
+    aiData = pTmp$aiData
+    lsQCCounts$lsQCCustom = pTmp$lsQCCounts
+  }
+  # Set data indicies after custom QC process.
+  lsQCCounts$aiAfterPreprocess = aiData
+
+  # Remove missing data, remove any sample that has less than dMinSamp * the number of data or low abundance
+  aiRemove = c()
+  aiRemoveLowAbundance = c()
+  for( iCol in aiData )
+  {
+    adCol = frmeData[,iCol]
+    adCol[!is.finite( adCol )] <- NA
+    if( ( sum( !is.na( adCol ) ) < ( dMinSamp * length( adCol ) ) ) ||
+      ( length( unique( na.omit( adCol ) ) ) < 2 ) )
+    {
+        aiRemove = c(aiRemove, iCol)
+    }
+    if( sum(adCol > dMinAbd, na.rm=TRUE ) < (dMinSamp * length( adCol)))
+    {
+      aiRemoveLowAbundance = c(aiRemoveLowAbundance, iCol)
+    }
+  }
+  # Remove and document
+  aiData = setdiff( aiData, aiRemove )
+  aiData = setdiff( aiData, aiRemoveLowAbundance )
+  lsQCCounts$iMissingData = aiRemove
+  lsQCCounts$iLowAbundanceData = aiRemoveLowAbundance
+  if(length(aiRemove))
+  {
+    c_logrMaaslin$info( "Removing the following for data lower bound.")
+    c_logrMaaslin$info( format( colnames( frmeData )[aiRemove] ))
+  }
+  if(length(aiRemoveLowAbundance))
+  {
+    c_logrMaaslin$info( "Removing the following for too many low abundance bugs.")
+    c_logrMaaslin$info( format( colnames( frmeData )[aiRemoveLowAbundance] ))
+  }
+
+  #Transform data
+  iTransformed = 0
+  viNotTransformedData = c()
+  for(aiDatum in aiData)
+  {
+    adValues = frmeData[,aiDatum]
+#    if( ! funcTransformIncreasesOutliers( adValues, funcTransform ) )
+#    {
+      frmeData[,aiDatum] = funcTransform( adValues )
+#      iTransformed = iTransformed + 1
+#    } else {
+#      viNotTransformedData = c( viNotTransformedData, aiDatum )
+#    }
+  }
+  c_logrMaaslin$info(paste("Number of features transformed = ",iTransformed))
+
+  # Metadata: Properly factorize all logical data and integer and number data with less than iNonFactorLevelThreshold
+  # Also record which are numeric metadata
+  aiNumericMetadata = c()
+  for( i in aiMetadata )
+  {
+    if( ( class( frmeData[,i] ) %in% c("integer", "numeric", "logical") ) &&
+      ( length( unique( frmeData[,i] ) ) < c_iNonFactorLevelThreshold ) ) {
+      c_logrMaaslin$debug(paste("Changing metadatum from numeric/integer/logical to factor",colnames(frmeData)[i],sep="="))
+      frmeData[,i] = factor( frmeData[,i] )
+    } 
+    if( class( frmeData[,i] ) %in% c("integer","numeric") )
+    {
+      aiNumericMetadata = c(aiNumericMetadata,i)
+    }
+  }
+
+  # Remove outliers
+  # If the dFence Value is set use the method of defining the outllier as
+  # dFence * the interquartile range + or - the 3rd and first quartile respectively.
+  # If not the gibbs test is used.
+  lsQCCounts$aiDataSumOutlierPerDatum = c()
+  lsQCCounts$aiMetadataSumOutlierPerDatum = c()
+  lsQCCounts$liOutliers = list()
+
+  if( dFence > 0.0 )
+  {
+    # For data
+    for( iData in aiData )
+    {
+      lOutlierInfo <- funcDoFenceTest(iData=iData,frmeData=frmeData,dFence=dFence)
+      frmeData[,iData] <- lOutlierInfo[["data"]]
+      lsQCCounts$aiDataSumOutlierPerDatum <- c(lsQCCounts$aiDataSumOutlierPerDatum,lOutlierInfo[["outliers"]])
+      if(lOutlierInfo[["outliers"]]>0)
+      {
+        lsQCCounts$liOutliers[[paste(iData,sep="")]] <- lOutlierInfo[["indices"]]
+      }
+    }
+
+    # Remove outlier non-factor metadata
+    for( iMetadata in aiNumericMetadata )
+    {
+      lOutlierInfo <- funcDoFenceTest(iData=iMetadata,frmeData=frmeData,dFence=dFence)
+      frmeData[,iMetadata] <- lOutlierInfo[["data"]]
+      lsQCCounts$aiMetadataSumOutlierPerDatum <- c(lsQCCounts$aiMetadataSumOutlierPerDatum,lOutlierInfo[["outliers"]])
+      if(lOutlierInfo[["outliers"]]>0)
+      {
+        lsQCCounts$liOutliers[[paste(iMetadata,sep="")]] <- lOutlierInfo[["indices"]]
+      }
+    }
+  #Do not use the fence, use the Grubbs test
+  } else if(dPOutlier!=0.0){
+    # For data
+    for( iData in aiData )
+    {
+      lOutlierInfo <- funcDoGrubbs(iData=iData,frmeData=frmeData,dPOutlier=dPOutlier)
+      frmeData[,iData] <- lOutlierInfo[["data"]]
+      lsQCCounts$aiDataSumOutlierPerDatum <- c(lsQCCounts$aiDataSumOutlierPerDatum,lOutlierInfo[["outliers"]])
+      if(lOutlierInfo[["outliers"]]>0)
+      {
+        lsQCCounts$liOutliers[[paste(iData,sep="")]] <- lOutlierInfo[["indices"]]
+      }
+    }
+    for( iMetadata in aiNumericMetadata )
+    {
+      lOutlierInfo <- funcDoGrubbs(iData=iMetadata,frmeData=frmeData,dPOutlier=dPOutlier)
+      frmeData[,iMetadata] <- lOutlierInfo[["data"]]
+      lsQCCounts$aiMetadataSumOutlierPerDatum <- c(lsQCCounts$aiMetadataSumOutlierPerDatum,lOutlierInfo[["outliers"]])
+      if(lOutlierInfo[["outliers"]]>0)
+      {
+        lsQCCounts$liOutliers[[paste(iMetadata,sep="")]] <- lOutlierInfo[["indices"]]
+      }
+    }
+  }
+
+  # Metadata: Remove missing data
+  # This is defined as if there is only one non-NA value or
+  # if the number of NOT NA data is less than a percentage of the data defined by dMinSamp
+  aiRemove = c()
+  for( iCol in c(aiMetadata) )
+  {
+    adCol = frmeData[,iCol]
+    if( ( sum( !is.na( adCol ) ) < ( dMinSamp * length( adCol ) ) ) ||
+      ( length( unique( na.omit( adCol ) ) ) < 2 ) )
+    {
+      aiRemove = c(aiRemove, iCol)
+    }
+  }
+
+  # Remove metadata
+  aiMetadata = setdiff( aiMetadata, aiRemove )
+
+  # Update the data which was removed.
+  lsQCCounts$iMissingMetadata = aiRemove
+  if(length(aiRemove))
+  {
+    c_logrMaaslin$info("Removing the following metadata for too much missing data or only one data value outside of NA.")
+    c_logrMaaslin$info(format(colnames( frmeData )[aiRemove]))
+  }
+
+  # Keep track of factor levels in a list for later use
+  lslsFactors <- list()
+  for( iCol in c(aiMetadata) )
+  {
+    aCol <- frmeData[,iCol]
+    if( class( aCol ) == "factor" )
+    {
+      lslsFactors[[length( lslsFactors ) + 1]] <- list(iCol, levels( aCol ))
+    }
+  }
+
+  # Replace missing data values by the mean of the data column.
+  # Remove samples that were all NA from the cleaning and so could not be imputed.
+  aiRemoveData = c()
+  for( iCol in aiData )
+  {
+    adCol <- frmeData[,iCol]
+    adCol[is.infinite( adCol )] <- NA
+    adCol[is.na( adCol )] <- mean( adCol[which(adCol>0)], na.rm = TRUE )
+    frmeData[,iCol] <- adCol
+
+    if(length(which(is.na(frmeData[,iCol]))) == length(frmeData[,iCol]))
+    {
+      c_logrMaaslin$info( paste("Removing data", iCol, "for being all NA after QC"))
+      aiRemoveData = c(aiRemoveData,iCol)
+    }
+  }
+
+  # Remove and document
+  aiData = setdiff( aiData, aiRemoveData )
+  lsQCCounts$iMissingData = c(lsQCCounts$iMissingData,aiRemoveData)
+  if(length(aiRemoveData))
+  {
+    c_logrMaaslin$info( "Removing the following for having only NAs after cleaning (maybe due to only having NA after outlier testing).")
+    c_logrMaaslin$info( format( colnames( frmeData )[aiRemoveData] ))
+  }
+
+  #Use na.gam.replace to manage NA metadata
+  aiTmp <- setdiff( aiMetadata, which( colnames( frmeData ) %in% astrNoImpute ) )
+  # Keep tack of NAs so the may not be plotted later.
+  liNaIndices = list()
+  lsNames = names(frmeData)
+  for( i in aiTmp)
+  {
+    liNaIndices[[lsNames[i]]] = which(is.na(frmeData[,i]))
+  }
+  frmeData[,aiTmp] <- na.gam.replace( frmeData[,aiTmp] )
+
+  #If NA is a value in factor data, set the NA as a level.
+  for( lsFactor in lslsFactors )
+  {
+    iCol <- lsFactor[[1]]
+    aCol <- frmeData[,iCol]
+    if( "NA" %in% levels( aCol ) )
+    {
+      if(! lsNames[iCol] %in% astrNoImpute)
+      {
+        liNaIndices[[lsNames[iCol]]] = union(which(is.na(frmeData[,iCol])),which(frmeData[,iCol]=="NA"))
+      }
+      frmeData[,iCol] <- factor( aCol, levels = c(lsFactor[[2]], "NA") )
+    }
+  }
+
+  # Make sure there is a minimum number of non-0 measurements
+  aiRemove = c()
+  for( iCol in aiData )
+  {
+    adCol = frmeData[,iCol]
+    if(length( which(adCol!=0)) < ( dMinSamp * length( adCol ) ) )
+    {
+      aiRemove = c(aiRemove, iCol)
+    }
+  }
+
+  # Remove and document
+  aiData = setdiff( aiData, aiRemove)
+  lsQCCounts$iZeroDominantData = aiRemove
+  if(length(aiRemove))
+  {
+    c_logrMaaslin$info( "Removing the following for having not enough non-zero measurments for analysis.")
+    c_logrMaaslin$info( format( colnames( frmeData )[aiRemove] ))
+  }
+
+  c_logrMaaslin$debug("End FuncClean")
+  return( list(frmeData = frmeData, aiMetadata = aiMetadata, aiData = aiData, lsQCCounts = lsQCCounts, liNaIndices=liNaIndices, viNotTransformedData = viNotTransformedData) )
+  ### Return list of
+  ### frmeData: The Data after cleaning
+  ### aiMetadata: The indices of the metadata still being used after filtering
+  ### aiData: The indices of the data still being used after filtering
+  ### lsQCCOunts: QC info
+}
+
+funcBugs <- function(
+### Run analysis of all data features against all metadata
+frmeData,
+### Cleaned data including metadata, and data
+lsData,
+### This list is a general container for data as the analysis occurs, think about it as a cache for the analysis
+aiMetadata,
+### Indices of metadata used in analysis
+aiData,
+### Indices of response data
+aiNotTransformedData,
+### Indicies of the data not transformed
+strData,
+### Log file name
+dSig,
+### Significance threshold for the qvalue cut off
+fInvert=FALSE,
+### Invert images to have a black background
+strDirOut = NA,
+### Output project directory
+funcReg=NULL,
+### Function for regularization
+funcTransform=NULL,
+### Function used to transform the data
+funcUnTransform=NULL,
+### If a transform is used the opposite of that transfor must be used on the residuals in the partial residual plots
+lsNonPenalizedPredictors=NULL,
+### These predictors will not be penalized in the feature (model) selection step
+funcAnalysis=NULL,
+### Function to perform association analysis
+lsRandomCovariates=NULL,
+### List of string names of metadata which will be treated as random covariates
+funcGetResults=NULL,
+### Function to unpack results from analysis
+fDoRPlot=TRUE,
+### Plot residuals
+fOmitLogFile = FALSE,
+### Stops the creation of the log file
+fAllvAll=FALSE,
+### Flag to turn on all against all comparisons
+liNaIndices = list(),
+### Indicies of imputed NA data
+lxParameters=list(),
+### List holds parameters for different variable selection techniques
+strTestingCorrection = "BH",
+### Correction for multiple testing
+fIsUnivariate = FALSE,
+### Indicates if the function is univariate
+fZeroInflated = FALSE
+### Indicates to use a zero infalted model
+){
+  c_logrMaaslin$debug("Start funcBugs")
+  # If no output directory is indicated
+  # Then make it the current directory
+  if( is.na( strDirOut ) || is.null( strDirOut ) )
+  {
+    if( !is.na( strData ) )
+    {
+      strDirOut <- paste( dirname( strData ), "/", sep = "" )
+    } else { strDirOut = "" }
+  }
+
+  # Make th log file and output file names based on the log file name
+  strLog = NA
+  strBase = ""
+  if(!is.na(strData))
+  {
+    strBaseOut <- paste( strDirOut, sub( "\\.([^.]+)$", "", basename(strData) ), sep = "/" )
+    strLog <- paste( strBaseOut,c_sLogFileSuffix, ".txt", sep = "" )
+  }
+
+  # If indicated, stop the creation of the log file
+  # Otherwise delete the log file if it exists and log
+  if(fOmitLogFile){ strLog = NA }
+  if(!is.na(strLog))
+  {
+    c_logrMaaslin$info( "Outputting to: %s", strLog )
+    unlink( strLog )
+  }
+ 
+  # Will contain pvalues
+  adP = c()
+  adPAdj = c()
+
+  # List of lists with association information
+  lsSig <- list()
+  # Go through each data that was not previously removed and perform inference
+  for( iTaxon in aiData )
+  {
+    # Log to screen progress per 10 associations.
+    # Can be thown off if iTaxon is missing a mod 10 value
+    # So the taxons may not be logged every 10 but not a big deal
+    if( !( iTaxon %% 10 ) )
+    {
+      c_logrMaaslin$info( "Taxon %d/%d", iTaxon, max( aiData ) )
+    }
+
+    # Call analysis method
+    lsOne <- funcBugHybrid( iTaxon=iTaxon, frmeData=frmeData, lsData=lsData, aiMetadata=aiMetadata, dSig=dSig, adP=adP, lsSig=lsSig, funcTransform=funcTransform, funcUnTransform=funcUnTransform, strLog=strLog, funcReg=funcReg, lsNonPenalizedPredictors=lsNonPenalizedPredictors, funcAnalysis=funcAnalysis, lsRandomCovariates=lsRandomCovariates, funcGetResult=funcGetResults, fAllvAll=fAllvAll, fIsUnivariate=fIsUnivariate, lxParameters=lxParameters, fZeroInflated=fZeroInflated, fIsTransformed= ! iTaxon %in% aiNotTransformedData )
+
+    # If you get a NA (happens when the lmm gets all random covariates) move on
+    if( is.na( lsOne ) ){ next }
+
+    # The updating of the following happens in the inference method call in the funcBugHybrid call
+    # New pvalue array
+    adP <- lsOne$adP
+    # New lsSig contains data about significant feature v metadata comparisons
+    lsSig <- lsOne$lsSig
+    # New qc data
+    lsData$lsQCCounts = lsOne$lsQCCounts
+  }
+
+  # Log the QC info
+  c_logrMaaslin$debug("lsData$lsQCCounts")
+  c_logrMaaslin$debug(format(lsData$lsQCCounts))
+
+  if( is.null( adP ) ) { return( NULL ) }
+
+  # Perform bonferonni corrections on factor data (for levels), calculate the number of tests performed, and FDR adjust for multiple hypotheses
+  # Perform Bonferonni adjustment on factor data
+  for( iADIndex in 1:length( adP ) )
+  {
+    # Only perform on factor data
+    if( is.factor( lsSig[[ iADIndex ]]$metadata ) )
+    {
+      adPAdj = c( adPAdj, funcBonferonniCorrectFactorData( dPvalue = adP[ iADIndex ], vsFactors = lsSig[[ iADIndex ]]$metadata, fIgnoreNAs = length(liNaIndices)>0) )
+    } else {
+      adPAdj = c( adPAdj, adP[ iADIndex ] )
+    }
+  }
+
+  iTests = funcCalculateTestCounts(iDataCount = length(aiData), asMetadata = intersect( lsData$astrMetadata, colnames( frmeData )[aiMetadata] ), asForced = lsNonPenalizedPredictors, asRandom = lsRandomCovariates, fAllvAll = fAllvAll)
+
+  #Get indices of sorted data after the factor correction but before the multiple hypothesis corrections.
+  aiSig <- sort.list( adPAdj )
+
+  # Perform FDR BH
+  adQ = p.adjust(adPAdj, method=strTestingCorrection, n=max(length(adPAdj), iTests))
+
+  # Find all covariates that had significant associations
+  astrNames <- c()
+  for( i in 1:length( lsSig ) )
+  {
+    astrNames <- c(astrNames, lsSig[[i]]$name)
+  }
+  astrNames <- unique( astrNames )
+
+  # Sets up named label return for global plotting
+  lsReturnTaxa <- list()
+  for( j in aiSig )
+  {
+    if( adQ[j] > dSig ) { next }
+    strTaxon <- lsSig[[j]]$taxon
+    if(strTaxon %in% names(lsReturnTaxa))
+    {
+      lsReturnTaxa[[strTaxon]] = min(lsReturnTaxa[[strTaxon]],adQ[j])
+    } else { lsReturnTaxa[[strTaxon]] = adQ[j]}
+  }
+
+  # For each covariate with significant associations
+  # Write out a file with association information
+  for( strName in astrNames )
+  {
+    strFileTXT <- NA
+    strFilePDF <- NA
+    for( j in aiSig )
+    {
+      lsCur		<- lsSig[[j]]
+      strCur		<- lsCur$name
+
+      if( strCur != strName ) { next }
+
+      strTaxon		<- lsCur$taxon
+      adData		<- lsCur$data
+      astrFactors	<- lsCur$factors
+      adCur		<- lsCur$metadata
+      adY <- adData
+
+      if( is.na( strData ) ) { next }
+
+      ## If the text file output is not written to yet
+      ## make the file names, and delete any previous file output 
+      if( is.na( strFileTXT ) )
+      {
+        strFileTXT <- sprintf( "%s-%s.txt", strBaseOut, strName )
+        unlink(strFileTXT)
+        funcWrite( c("Variable", "Feature", "Value", "Coefficient", "N", "N not 0", "P-value", "Q-value"), strFileTXT )
+      }
+
+      ## Write text output
+      funcWrite( c(strName, strTaxon, lsCur$orig, lsCur$value, length( adData ), sum( adData > 0 ), adP[j], adQ[j]), strFileTXT )
+
+      ## If the significance meets the threshold
+      ## Write PDF file output
+      if( adQ[j] > dSig ) { next }
+
+      # Do not make residuals plots if univariate is selected
+      strFilePDF = funcPDF( frmeTmp=frmeData, lsCur=lsCur, curPValue=adP[j], curQValue=adQ[j], strFilePDF=strFilePDF, strBaseOut=strBaseOut, strName=strName, funcUnTransform= funcUnTransform, fDoResidualPlot=fDoRPlot, fInvert=fInvert, liNaIndices=liNaIndices )
+   }
+    if( dev.cur( ) != 1 ) { dev.off( ) }
+  }
+  aiTmp <- aiData
+
+  logdebug("End funcBugs", c_logMaaslin)
+  return(list(lsReturnBugs=lsReturnTaxa, lsQCCounts=lsData$lsQCCounts))
+  ### List of data features successfully associated without error and quality control data
+}
+
+#Lightly Tested
+### Performs analysis for 1 feature
+### iTaxon: integer Taxon index to be associated with data
+### frmeData: Data frame The full data
+### lsData: List of all associated data
+### aiMetadata: Numeric vector of indices
+### dSig: Numeric significance threshold for q-value cut off
+### adP: List of pvalues from associations
+### lsSig: List which serves as a cache of data about significant associations
+### strLog: String file to log to
+funcBugHybrid <- function(
+### Performs analysis for 1 feature
+iTaxon,
+### integer Taxon index to be associated with data
+frmeData,
+### Data frame, the full data
+lsData,
+### List of all associated data
+aiMetadata,
+### Numeric vector of indices
+dSig,
+### Numeric significance threshold for q-value cut off
+adP,
+### List of pvalues from associations
+lsSig,
+### List which serves as a cache of data about significant associations
+funcTransform,
+### The tranform used on the data
+funcUnTransform,
+### The reverse transform on the data
+strLog = NA,
+### String, file to which to log
+funcReg=NULL,
+### Function to perform regularization
+lsNonPenalizedPredictors=NULL,
+### These predictors will not be penalized in the feature (model) selection step
+funcAnalysis=NULL,
+### Function to perform association analysis
+lsRandomCovariates=NULL,
+### List of string names of metadata which will be treated as random covariates
+funcGetResult=NULL,
+### Function to unpack results from analysis
+fAllvAll=FALSE,
+### Flag to turn on all against all comparisons
+fIsUnivariate = FALSE,
+### Indicates the analysis function is univariate
+lxParameters=list(),
+### List holds parameters for different variable selection techniques
+fZeroInflated = FALSE,
+### Indicates if to use a zero infalted model
+fIsTransformed = TRUE
+### Indicates that the bug is transformed
+){
+#dTime00 <- proc.time()[3]
+  #Get metadata column names
+  astrMetadata = intersect( lsData$astrMetadata, colnames( frmeData )[aiMetadata] )
+
+  #Get data measurements that are not NA
+  aiRows <- which( !is.na( frmeData[,iTaxon] ) )
+
+  #Get the dataframe of non-na data measurements
+  frmeTmp <- frmeData[aiRows,]
+
+  #Set the min boosting selection frequency to a default if not given
+  if( is.na( lxParameters$dFreq ) )
+  {
+    lxParameters$dFreq <- 0.5 / length( c(astrMetadata) )
+  }
+
+  # Get the full data for the bug feature
+  adCur = frmeTmp[,iTaxon]
+  lxParameters$sBugName = names(frmeTmp[iTaxon])
+
+  # This can run multiple models so some of the results are held in lists and some are not
+  llmod = list()
+  liTaxon = list()
+  lastrTerms = list()
+
+  # Build formula for simple mixed effects models
+  # Removes random covariates from variable selection
+  astrMetadata  = setdiff(astrMetadata, lsRandomCovariates)
+  strFormula <- paste( "adCur ~", paste( sprintf( "`%s`", astrMetadata ), collapse = " + " ), sep = " " )
+
+  # Document the model
+  funcWrite( c("#taxon", colnames( frmeTmp )[iTaxon]), strLog )
+  funcWrite( c("#metadata", astrMetadata), strLog )
+  funcWrite( c("#samples", rownames( frmeTmp )), strLog )
+
+  #Model terms
+  astrTerms <- c()
+
+  # Attempt feature (model) selection
+  if(!is.na(funcReg))
+  {
+    #Count model selection method attempts
+    lsData$lsQCCounts$iBoosts = lsData$lsQCCounts$iBoosts + 1
+    #Perform model selection
+    astrTerms <- funcReg(strFormula=strFormula, frmeTmp=frmeTmp, adCur=adCur, lsParameters=lxParameters, lsForcedParameters=lsNonPenalizedPredictors, strLog=strLog)
+    #If the feature selection function is set to None, set all terms of the model to all the metadata
+  } else { astrTerms = astrMetadata }
+
+  # Get look through the boosting results to get a model
+  # Holds the predictors in the predictors in the model that were selected by the boosting
+  if(is.null( astrTerms )){lsData$lsQCCounts$iBoostErrors = lsData$lsQCCounts$iBoostErrors + 1}
+
+  # Get the indices that are transformed
+  # Of those indices check for uneven metadata
+  # Untransform any of the metadata that failed
+  # Failed means true for uneven occurences of zeros
+#  if( fIsTransformed )
+#  {
+#    vdUnevenZeroCheck = funcUnTransform( frmeData[[ iTaxon ]] )
+#    if( funcZerosAreUneven( vdRawData=vdUnevenZeroCheck, funcTransform=funcTransform, vsStratificationFeatures=astrTerms, dfData=frmeData ) )
+#    {
+#      frmeData[[ iTaxon ]] = vdUnevenZeroCheck
+#      c_logrMaaslin$debug( paste( "Taxon transformation reversed due to unevenness of zero distribution.", iTaxon ) )
+#    }
+#  }
+
+  # Run association analysis if predictors exist and an analysis function is specified
+  # Run analysis
+  if(!is.na(funcAnalysis) )
+  {
+    #If there are selected and forced fixed covariates
+    if( length( astrTerms ) )
+    {
+      #Count the association attempt
+      lsData$lsQCCounts$iLms = lsData$lsQCCounts$iLms + 1
+
+      #Make the lm formula
+      #Build formula for simple mixed effects models using random covariates
+      strRandomCovariatesFormula = NULL
+      #Random covariates are forced
+      if(length(lsRandomCovariates)>0)
+      {
+        #Format for lme
+        #Needed for changes to not allowing random covariates through the boosting process
+        strRandomCovariatesFormula <- paste( "adCur ~ ", paste( sprintf( "1|`%s`", lsRandomCovariates), collapse = " + " ))
+      }
+
+      #Set up a list of formula containing selected fixed variables changing and the forced fixed covariates constant
+      vstrFormula = c()
+      #Set up suppressing forced covariates in a all v all scenario only
+      asSuppress = c()
+      #Enable all against all comparisons
+      if(fAllvAll && !fIsUnivariate)
+      {
+        lsVaryingCovariates = setdiff(astrTerms,lsNonPenalizedPredictors)
+        lsConstantCovariates = setdiff(lsNonPenalizedPredictors,lsRandomCovariates)
+        strConstantFormula = paste( sprintf( "`%s`", lsConstantCovariates ), collapse = " + " )
+        asSuppress = lsConstantCovariates
+
+        if(length(lsVaryingCovariates)==0L)
+        {
+          vstrFormula <- c( paste( "adCur ~ ", paste( sprintf( "`%s`", lsConstantCovariates ), collapse = " + " )) )
+        } else {
+          for( sVarCov in lsVaryingCovariates )
+          {
+            strTempFormula = paste( "adCur ~ `", sVarCov,"`",sep="")
+            if(length(lsConstantCovariates)>0){ strTempFormula = paste(strTempFormula,strConstantFormula,sep=" + ") }
+            vstrFormula <- c( vstrFormula, strTempFormula )
+          }
+        }
+      } else {
+        #This is either the multivariate case formula for all covariates in an lm or fixed covariates in the lmm
+        vstrFormula <- c( paste( "adCur ~ ", paste( sprintf( "`%s`", astrTerms ), collapse = " + " )) )
+      }
+
+      #Run the association
+      for( strAnalysisFormula in vstrFormula )
+      {
+        i = length(llmod)+1
+        llmod[[i]] = funcAnalysis(strFormula=strAnalysisFormula, frmeTmp=frmeTmp, iTaxon=iTaxon, lsHistory=list(adP=adP, lsSig=lsSig, lsQCCounts=lsData$lsQCCounts), strRandomFormula=strRandomCovariatesFormula, fZeroInflated=fZeroInflated)
+
+        liTaxon[[i]] = iTaxon
+        lastrTerms[[i]] = funcFormulaStrToList(strAnalysisFormula)
+      }
+    } else {
+      #If there are no selected or forced fixed covariates
+      lsData$lsQCCounts$iNoTerms = lsData$lsQCCounts$iNoTerms + 1
+      return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsData$lsQCCounts))
+    }
+  }
+
+  #Call funcBugResults and return it's return
+  if(!is.na(funcGetResult))
+  {
+    #Format the results to a consistent expected result.
+    return( funcGetResult( llmod=llmod, frmeData=frmeData, liTaxon=liTaxon, dSig=dSig, adP=adP, lsSig=lsSig, strLog=strLog, lsQCCounts=lsData$lsQCCounts, lastrCols=lastrTerms, asSuppressCovariates=asSuppress ) )
+  } else {
+    return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsData$lsQCCounts))
+  }
+  ### List containing a list of pvalues, a list of significant data per association, and a list of QC data
+}
author	george-weingart
date	Tue, 13 May 2014 22:00:40 -0400
parents
children