maaslin: src/lib/BoostGLM.R @ 8:e9677425c6c3 (default, tip)

author  | george.weingart@gmail.com
date    | Mon, 09 Feb 2015 12:17:40 -0500
summary | Updated the structure of the libraries
parents | e0b5980139d9
#####################################################################################
#Copyright (C) <2012>
#
#Permission is hereby granted, free of charge, to any person obtaining a copy of
#this software and associated documentation files (the "Software"), to deal in the
#Software without restriction, including without limitation the rights to use, copy,
#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
#and to permit persons to whom the Software is furnished to do so, subject to
#the following conditions:
#
#The above copyright notice and this permission notice shall be included in all copies
#or substantial portions of the Software.
#
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# This file is a component of MaAsLin (Multivariate Associations Using Linear Models),
# authored by the Huttenhower lab at the Harvard School of Public Health
# (contact Timothy Tickle, ttickle@hsph.harvard.edu).
#####################################################################################

inlinedocs <- function(
##author<< Curtis Huttenhower <chuttenh@hsph.harvard.edu> and Timothy Tickle <ttickle@hsph.harvard.edu>
##description<< Manages the quality control of data and the performance of analysis (univariate or multivariate), regularization, and data (response) transformation.
) { return( pArgs ) }

### Load libraries quietly
suppressMessages(library( gam,        warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
suppressMessages(library( gbm,        warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
suppressMessages(library( logging,    warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
suppressMessages(library( outliers,   warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
suppressMessages(library( robustbase, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))
suppressMessages(library( pscl,       warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE))

### Get constants
#source(file.path("input","maaslin","src","Constants.R"))
#source("Constants.R")

## Get logger
c_logrMaaslin <- getLogger( "maaslin" )

funcDoGrubbs <- function(
### Use the Grubbs test to identify outliers
iData,
### Column index in the data frame to test
frmeData,
### The data frame holding the data
dPOutlier,
### P-value threshold below which an outlier is considered significant
lsQC
### List holding the QC info of the cleaning step. Which indices are outliers is added.
){
  adData <- frmeData[,iData]

  # Original number of NA
  viNAOrig = which(is.na(adData))

  # Repeatedly remove the most extreme value until the Grubbs test
  # no longer reports a significant outlier (or errors out).
  while( TRUE )
  {
    lsTest <- try( grubbs.test( adData ), silent = TRUE )
    if( inherits( lsTest, "try-error" ) || is.na( lsTest$p.value ) || ( lsTest$p.value > dPOutlier ) ){ break }
    viOutliers = outlier( adData, logical = TRUE )
    adData[viOutliers] <- NA
  }

  # Record removed data
  viNAAfter = which(is.na(adData))

  # If all were set to NA then ignore the filtering
  if(length(adData)==length(viNAAfter))
  {
    viNAAfter = viNAOrig
    adData = frmeData[,iData]
    c_logrMaaslin$info( paste("Grubbs Test::Identified all data as outliers so was inactivated for index=",iData," data=",paste(as.vector(frmeData[,iData]),collapse=","), "number zeros=", length(which(frmeData[,iData]==0)), sep = " " ))
  } else if(mean(adData, na.rm=TRUE) == 0) {
    viNAAfter = viNAOrig
    adData = frmeData[,iData]
    c_logrMaaslin$info( paste("Grubbs Test::Removed all values but 0, ignored. Index=",iData,".",sep=" " ) )
  } else {
    # Document removal
    if( sum( is.na( adData ) ) )
    {
      c_logrMaaslin$info( "Grubbs Test::Removing %d outliers from %s", sum( is.na( adData ) ), colnames(frmeData)[iData] )
      c_logrMaaslin$info( format( rownames( frmeData )[is.na( adData )] ))
    }
  }

  return(list(data=adData,outliers=length(viNAAfter)-length(viNAOrig),indices=setdiff(viNAAfter,viNAOrig)))
}
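# Illustrative usage sketch (not part of the pipeline): funcDoGrubbs on a
# hypothetical one-column data frame with a single extreme value. Guarded by
# if( FALSE ) so sourcing this file does not execute it.
if( FALSE )
{
  frmeToy <- data.frame( Bug1 = c( rnorm( 20 ), 25 ) )  # hypothetical feature
  lsGrubbs <- funcDoGrubbs( iData = 1, frmeData = frmeToy, dPOutlier = 0.05, lsQC = list() )
  lsGrubbs$outliers  # count of values newly set to NA
  lsGrubbs$indices   # row indices flagged as outliers
}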
funcDoFenceTest <- function(
### Use a threshold based on the quartiles of the data to identify outliers
iData,
### Column index in the data frame to test
frmeData,
### The data frame holding the data
dFence
### The fence outside the first and third quartiles to use as a threshold for the cutoff.
### This many times the interquartile range +/- the 3rd/1st quartiles
){
  # Establish fence
  adData <- frmeData[,iData]
  adQ <- quantile( adData, c(0.25, 0.5, 0.75), na.rm = TRUE )
  dIQR <- adQ[3] - adQ[1]
  if(!dIQR)
  {
    dIQR = sd(adData,na.rm = TRUE)
  }
  dUF <- adQ[3] + ( dFence * dIQR )
  dLF <- adQ[1] - ( dFence * dIQR )

  # Record indices of values outside of the fence to remove, and remove them.
  aiRemove <- c()
  for( j in 1:length( adData ) )
  {
    d <- adData[j]
    if( !is.na( d ) && ( ( d < dLF ) || ( d > dUF ) ) )
    {
      aiRemove <- c(aiRemove, j)
    }
  }

  if(length(aiRemove)==length(adData))
  {
    aiRemove = c()
    c_logrMaaslin$info( paste( "OutliersByFence::Identified all data as outliers so was inactivated for index=", iData, "data=", paste(as.vector(frmeData[,iData]),collapse=","), "number zeros=", length(which(frmeData[,iData]==0)), sep=" " ) )
  } else {
    adData[aiRemove] <- NA

    # Document to screen
    if( length( aiRemove ) )
    {
      c_logrMaaslin$info( "OutliersByFence::Removing %d outliers from %s", length( aiRemove ), colnames(frmeData)[iData] )
      c_logrMaaslin$info( format( rownames( frmeData )[aiRemove] ))
    }
  }

  return(list(data=adData,outliers=length(aiRemove),indices=aiRemove))
}

funcZerosAreUneven = function(
### Check whether zeros are unevenly distributed across the levels of factor metadata
vdRawData,
### Raw data to be checked during transformation
funcTransform,
### Data transform to perform
vsStratificationFeatures,
### Groupings to check for unevenness
dfData
### Data frame holding the features
){
  # Return indicator of unevenness
  fUneven = FALSE

  # Transform the data to compare
  vdTransformed = funcTransform( vdRawData )

  # Go through each stratification of data
  for( sStratification in vsStratificationFeatures )
  {
    # Current stratification
    vFactorStrats = dfData[[ sStratification ]]

    # If the metadata is not a factor then skip
    # Only binned data can be evaluated this way.
    if( !is.factor( vFactorStrats )){ next }

    viZerosCountsRaw = c()
    for( sLevel in levels( vFactorStrats ) )
    {
      vdTest = vdRawData[ which( vFactorStrats == sLevel ) ]
      viZerosCountsRaw = c( viZerosCountsRaw, length(which(vdTest == 0)))
      vdTest = vdTransformed[ which( vFactorStrats == sLevel ) ]
    }

    dExpectation = 1 / length( viZerosCountsRaw )
    dMin = dExpectation / 2
    dMax = dExpectation + dMin
    viZerosCountsRaw = viZerosCountsRaw / sum( viZerosCountsRaw )

    if( ( length( which( viZerosCountsRaw <= dMin ) ) > 0 ) || ( length( which( viZerosCountsRaw >= dMax ) ) > 0 ) )
    {
      return( TRUE )
    }
  }
  return( fUneven )
}
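# Illustrative usage sketch (not part of the pipeline): funcDoFenceTest with a
# hypothetical fence multiplier of 2.5, so values outside
# [Q1 - 2.5 * IQR, Q3 + 2.5 * IQR] are set to NA.
if( FALSE )
{
  frmeToy <- data.frame( Bug1 = c( rnorm( 20 ), 25 ) )  # hypothetical feature
  lsFence <- funcDoFenceTest( iData = 1, frmeData = frmeToy, dFence = 2.5 )
  lsFence$indices  # row indices outside the fence
}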
funcTransformIncreasesOutliers = function(
### Checks if a data transform increases outliers in a distribution
vdRawData,
### Raw data to check for outlier zeros
funcTransform
){
  iUnOutliers = length( boxplot( vdRawData, plot = FALSE )$out )
  iTransformedOutliers = length( boxplot( funcTransform( vdRawData ), plot = FALSE )$out )

  return( iUnOutliers <= iTransformedOutliers )
}
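# Illustrative sketch (not part of the pipeline): comparing boxplot outlier
# counts before and after a hypothetical log transform; TRUE means the
# transform did not reduce the outlier count.
if( FALSE )
{
  vdToy <- c( rexp( 30 ), 10 )  # hypothetical skewed data
  funcTransformIncreasesOutliers( vdToy, function( x ) log( x + 1 ) )
}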
funcClean <- function(
### Properly clean / get data ready for analysis
### Includes custom analysis from the custom R script if it exists
frmeData,
### Data frame, input data to be acted on
funcDataProcess,
### Custom script that can be given to perform specialized processing before MaAsLin does.
aiMetadata,
### Indices of columns in frmeData which are metadata for analysis.
aiData,
### Indices of columns in frmeData which are (abundance) data for analysis.
lsQCCounts,
### List that will hold the quality control information which is written in the output directory.
astrNoImpute = c(),
### An array of column names of frmeData not to impute.
dMinSamp,
### Minimum number of samples
dMinAbd,
### Minimum sample abundance
dFence,
### How many quartile ranges defines the fence used to define outliers.
funcTransform,
### The data transformation function, or a dummy function that does not affect the data
dPOutlier = 0.05
### The significance threshold for the Grubbs test to identify an outlier.
){
  # Call the custom script and set current data and indices to the processed data and indices.
  c_logrMaaslin$debug( "Start Clean")
  if( !is.null( funcDataProcess ) )
  {
    c_logrMaaslin$debug("Additional preprocess function attempted.")
    pTmp <- funcDataProcess( frmeData=frmeData, aiMetadata=aiMetadata, aiData=aiData)
    frmeData = pTmp$frmeData
    aiMetadata = pTmp$aiMetadata
    aiData = pTmp$aiData
    lsQCCounts$lsQCCustom = pTmp$lsQCCounts
  }

  # Set data indices after the custom QC process.
  lsQCCounts$aiAfterPreprocess = aiData

  # Remove missing data; remove any sample that has less than dMinSamp * the number of data or low abundance
  aiRemove = c()
  aiRemoveLowAbundance = c()
  for( iCol in aiData )
  {
    adCol = frmeData[,iCol]
    adCol[!is.finite( adCol )] <- NA
    if( ( sum( !is.na( adCol ) ) < ( dMinSamp * length( adCol ) ) ) || ( length( unique( na.omit( adCol ) ) ) < 2 ) )
    {
      aiRemove = c(aiRemove, iCol)
    }
    if( sum(adCol > dMinAbd, na.rm=TRUE ) < (dMinSamp * length( adCol)))
    {
      aiRemoveLowAbundance = c(aiRemoveLowAbundance, iCol)
    }
  }

  # Remove and document
  aiData = setdiff( aiData, aiRemove )
  aiData = setdiff( aiData, aiRemoveLowAbundance )
  lsQCCounts$iMissingData = aiRemove
  lsQCCounts$iLowAbundanceData = aiRemoveLowAbundance
  if(length(aiRemove))
  {
    c_logrMaaslin$info( "Removing the following for data lower bound.")
    c_logrMaaslin$info( format( colnames( frmeData )[aiRemove] ))
  }
  if(length(aiRemoveLowAbundance))
  {
    c_logrMaaslin$info( "Removing the following for too many low abundance bugs.")
    c_logrMaaslin$info( format( colnames( frmeData )[aiRemoveLowAbundance] ))
  }

  # Transform data
  iTransformed = 0
  viNotTransformedData = c()
  for(aiDatum in aiData)
  {
    adValues = frmeData[,aiDatum]
#    if( ! funcTransformIncreasesOutliers( adValues, funcTransform ) )
#    {
      frmeData[,aiDatum] = funcTransform( adValues )
#      iTransformed = iTransformed + 1
#    } else {
#      viNotTransformedData = c( viNotTransformedData, aiDatum )
#    }
  }
  c_logrMaaslin$info(paste("Number of features transformed = ",iTransformed))

  # Metadata: Properly factorize all logical data, and integer and numeric data with less than c_iNonFactorLevelThreshold levels
  # Also record which are numeric metadata
  aiNumericMetadata = c()
  for( i in aiMetadata )
  {
    if( ( class( frmeData[,i] ) %in% c("integer", "numeric", "logical") ) && ( length( unique( frmeData[,i] ) ) < c_iNonFactorLevelThreshold ) )
    {
      c_logrMaaslin$debug(paste("Changing metadatum from numeric/integer/logical to factor",colnames(frmeData)[i],sep="="))
      frmeData[,i] = factor( frmeData[,i] )
    }
    if( class( frmeData[,i] ) %in% c("integer","numeric") )
    {
      aiNumericMetadata = c(aiNumericMetadata,i)
    }
  }

  # Remove outliers
  # If the dFence value is set, define an outlier as a value beyond
  # dFence * the interquartile range above/below the 3rd/1st quartile respectively.
  # Otherwise the Grubbs test is used.
  lsQCCounts$aiDataSumOutlierPerDatum = c()
  lsQCCounts$aiMetadataSumOutlierPerDatum = c()
  lsQCCounts$liOutliers = list()

  if( dFence > 0.0 )
  {
    # For data
    for( iData in aiData )
    {
      lOutlierInfo <- funcDoFenceTest(iData=iData,frmeData=frmeData,dFence=dFence)
      frmeData[,iData] <- lOutlierInfo[["data"]]
      lsQCCounts$aiDataSumOutlierPerDatum <- c(lsQCCounts$aiDataSumOutlierPerDatum,lOutlierInfo[["outliers"]])
      if(lOutlierInfo[["outliers"]]>0)
      {
        lsQCCounts$liOutliers[[paste(iData,sep="")]] <- lOutlierInfo[["indices"]]
      }
    }
    # Remove outlier non-factor metadata
    for( iMetadata in aiNumericMetadata )
    {
      lOutlierInfo <- funcDoFenceTest(iData=iMetadata,frmeData=frmeData,dFence=dFence)
      frmeData[,iMetadata] <- lOutlierInfo[["data"]]
      lsQCCounts$aiMetadataSumOutlierPerDatum <- c(lsQCCounts$aiMetadataSumOutlierPerDatum,lOutlierInfo[["outliers"]])
      if(lOutlierInfo[["outliers"]]>0)
      {
        lsQCCounts$liOutliers[[paste(iMetadata,sep="")]] <- lOutlierInfo[["indices"]]
      }
    }
  # Do not use the fence; use the Grubbs test
  } else if(dPOutlier!=0.0){
    # For data
    for( iData in aiData )
    {
      lOutlierInfo <- funcDoGrubbs(iData=iData,frmeData=frmeData,dPOutlier=dPOutlier)
      frmeData[,iData] <- lOutlierInfo[["data"]]
      lsQCCounts$aiDataSumOutlierPerDatum <- c(lsQCCounts$aiDataSumOutlierPerDatum,lOutlierInfo[["outliers"]])
      if(lOutlierInfo[["outliers"]]>0)
      {
        lsQCCounts$liOutliers[[paste(iData,sep="")]] <- lOutlierInfo[["indices"]]
      }
    }
    for( iMetadata in aiNumericMetadata )
    {
      lOutlierInfo <- funcDoGrubbs(iData=iMetadata,frmeData=frmeData,dPOutlier=dPOutlier)
      frmeData[,iMetadata] <- lOutlierInfo[["data"]]
      lsQCCounts$aiMetadataSumOutlierPerDatum <- c(lsQCCounts$aiMetadataSumOutlierPerDatum,lOutlierInfo[["outliers"]])
      if(lOutlierInfo[["outliers"]]>0)
      {
        lsQCCounts$liOutliers[[paste(iMetadata,sep="")]] <- lOutlierInfo[["indices"]]
      }
    }
  }

  # Metadata: Remove missing data
  # Missing is defined as having only one non-NA value, or
  # having fewer non-NA values than the fraction of the data given by dMinSamp
  aiRemove = c()
  for( iCol in c(aiMetadata) )
  {
    adCol = frmeData[,iCol]
    if( ( sum( !is.na( adCol ) ) < ( dMinSamp * length( adCol ) ) ) || ( length( unique( na.omit( adCol ) ) ) < 2 ) )
    {
      aiRemove = c(aiRemove, iCol)
    }
  }

  # Remove metadata
  aiMetadata = setdiff( aiMetadata, aiRemove )
  # Record the data which was removed.
  lsQCCounts$iMissingMetadata = aiRemove
  if(length(aiRemove))
  {
    c_logrMaaslin$info("Removing the following metadata for too much missing data or only one data value outside of NA.")
    c_logrMaaslin$info(format(colnames( frmeData )[aiRemove]))
  }

  # Keep track of factor levels in a list for later use
  lslsFactors <- list()
  for( iCol in c(aiMetadata) )
  {
    aCol <- frmeData[,iCol]
    if( class( aCol ) == "factor" )
    {
      lslsFactors[[length( lslsFactors ) + 1]] <- list(iCol, levels( aCol ))
    }
  }

  # Replace missing data values by the mean of the data column.
  # Remove samples that were all NA from the cleaning and so could not be imputed.
  aiRemoveData = c()
  for( iCol in aiData )
  {
    adCol <- frmeData[,iCol]
    adCol[is.infinite( adCol )] <- NA
    adCol[is.na( adCol )] <- mean( adCol[which(adCol>0)], na.rm = TRUE )
    frmeData[,iCol] <- adCol

    if(length(which(is.na(frmeData[,iCol]))) == length(frmeData[,iCol]))
    {
      c_logrMaaslin$info( paste("Removing data", iCol, "for being all NA after QC"))
      aiRemoveData = c(aiRemoveData,iCol)
    }
  }

  # Remove and document
  aiData = setdiff( aiData, aiRemoveData )
  lsQCCounts$iMissingData = c(lsQCCounts$iMissingData,aiRemoveData)
  if(length(aiRemoveData))
  {
    c_logrMaaslin$info( "Removing the following for having only NAs after cleaning (maybe due to only having NA after outlier testing).")
    c_logrMaaslin$info( format( colnames( frmeData )[aiRemoveData] ))
  }

  # Use na.gam.replace to manage NA metadata
  aiTmp <- setdiff( aiMetadata, which( colnames( frmeData ) %in% astrNoImpute ) )

  # Keep track of NAs so they are not plotted later.
  liNaIndices = list()
  lsNames = names(frmeData)
  for( i in aiTmp)
  {
    liNaIndices[[lsNames[i]]] = which(is.na(frmeData[,i]))
  }
  frmeData[,aiTmp] <- na.gam.replace( frmeData[,aiTmp] )

  # If NA is a value in factor data, set the NA as a level.
  for( lsFactor in lslsFactors )
  {
    iCol <- lsFactor[[1]]
    aCol <- frmeData[,iCol]
    if( "NA" %in% levels( aCol ) )
    {
      if(! lsNames[iCol] %in% astrNoImpute)
      {
        liNaIndices[[lsNames[iCol]]] = union(which(is.na(frmeData[,iCol])),which(frmeData[,iCol]=="NA"))
      }
      frmeData[,iCol] <- factor( aCol, levels = c(lsFactor[[2]], "NA") )
    }
  }

  # Make sure there is a minimum number of non-0 measurements
  aiRemove = c()
  for( iCol in aiData )
  {
    adCol = frmeData[,iCol]
    if(length( which(adCol!=0)) < ( dMinSamp * length( adCol ) ) )
    {
      aiRemove = c(aiRemove, iCol)
    }
  }

  # Remove and document
  aiData = setdiff( aiData, aiRemove)
  lsQCCounts$iZeroDominantData = aiRemove
  if(length(aiRemove))
  {
    c_logrMaaslin$info( "Removing the following for not having enough non-zero measurements for analysis.")
    c_logrMaaslin$info( format( colnames( frmeData )[aiRemove] ))
  }

  c_logrMaaslin$debug("End FuncClean")
  return( list(frmeData = frmeData, aiMetadata = aiMetadata, aiData = aiData, lsQCCounts = lsQCCounts, liNaIndices=liNaIndices, viNotTransformedData = viNotTransformedData) )
  ### Return list of
  ### frmeData: The data after cleaning
  ### aiMetadata: The indices of the metadata still being used after filtering
  ### aiData: The indices of the data still being used after filtering
  ### lsQCCounts: QC info
}
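# Illustrative sketch of a funcClean call (not part of the pipeline). The toy
# data frame, indices, and thresholds below are hypothetical, and the MaAsLin
# constants (e.g. c_iNonFactorLevelThreshold from Constants.R) are assumed to
# have been sourced already.
if( FALSE )
{
  frmeToy <- data.frame(
    Age  = c( 21, 34, NA, 55, 40, 38, 29, 61, 47, 33 ),
    Sex  = factor( c( "F","M","F","M","F","M","F","M","F","M" ) ),
    Bug1 = c( 0.10, 0.20, 0.00, 0.40, 0.30, 0.25, 0.15, 0.50, 0.45, 0.20 ),
    Bug2 = c( 0.00, 0.00, 0.00, 0.10, 0.00, 0.00, 0.00, 0.20, 0.00, 0.00 ) )
  lsCleaned <- funcClean( frmeData = frmeToy, funcDataProcess = NULL,
    aiMetadata = 1:2, aiData = 3:4, lsQCCounts = list(),
    dMinSamp = 0.1, dMinAbd = 0.0001, dFence = 3.0,
    funcTransform = function( x ) asin( sqrt( x ) ) )
  lsCleaned$aiData  # indices of abundance features kept after QC
}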
funcBugs <- function(
### Run analysis of all data features against all metadata
frmeData,
### Cleaned data including metadata and data
lsData,
### This list is a general container for data as the analysis occurs; think of it as a cache for the analysis
aiMetadata,
### Indices of metadata used in analysis
aiData,
### Indices of response data
aiNotTransformedData,
### Indices of the data not transformed
strData,
### Name of the data file; used to build the log and output file names
dSig,
### Significance threshold for the q-value cutoff
fInvert=FALSE,
### Invert images to have a black background
strDirOut = NA,
### Output project directory
funcReg=NULL,
### Function for regularization
funcTransform=NULL,
### Function used to transform the data
funcUnTransform=NULL,
### If a transform is used, the inverse of that transform must be used on the residuals in the partial residual plots
lsNonPenalizedPredictors=NULL,
### These predictors will not be penalized in the feature (model) selection step
funcAnalysis=NULL,
### Function to perform association analysis
lsRandomCovariates=NULL,
### List of string names of metadata which will be treated as random covariates
funcGetResults=NULL,
### Function to unpack results from analysis
fDoRPlot=TRUE,
### Plot residuals
fOmitLogFile = FALSE,
### Stops the creation of the log file
fAllvAll=FALSE,
### Flag to turn on all-against-all comparisons
liNaIndices = list(),
### Indices of imputed NA data
lxParameters=list(),
### List holding parameters for different variable selection techniques
strTestingCorrection = "BH",
### Correction for multiple testing
fIsUnivariate = FALSE,
### Indicates if the function is univariate
fZeroInflated = FALSE
### Indicates to use a zero-inflated model
){
  c_logrMaaslin$debug("Start funcBugs")

  # If no output directory is indicated,
  # make it the current directory
  if( is.na( strDirOut ) || is.null( strDirOut ) )
  {
    if( !is.na( strData ) )
    {
      strDirOut <- paste( dirname( strData ), "/", sep = "" )
    } else { strDirOut = "" }
  }

  # Make the log file and output file names based on the input data file name
  strLog = NA
  strBase = ""
  if(!is.na(strData))
  {
    strBaseOut <- paste( strDirOut, sub( "\\.([^.]+)$", "", basename(strData) ), sep = "/" )
    strLog <- paste( strBaseOut,c_sLogFileSuffix, ".txt", sep = "" )
  }

  # If indicated, stop the creation of the log file
  # Otherwise delete the log file if it exists and log
  if(fOmitLogFile){ strLog = NA }
  if(!is.na(strLog))
  {
    c_logrMaaslin$info( "Outputting to: %s", strLog )
    unlink( strLog )
  }

  # Will contain pvalues
  adP = c()
  adPAdj = c()

  # List of lists with association information
  lsSig <- list()

  # Go through each data feature that was not previously removed and perform inference
  for( iTaxon in aiData )
  {
    # Log progress to screen every 10 associations.
    # Can be thrown off if iTaxon is missing a mod-10 value,
    # so the taxa may not be logged exactly every 10; not a big deal.
    if( !( iTaxon %% 10 ) )
    {
      c_logrMaaslin$info( "Taxon %d/%d", iTaxon, max( aiData ) )
    }

    # Call analysis method
    lsOne <- funcBugHybrid(
      iTaxon=iTaxon,
      frmeData=frmeData,
      lsData=lsData,
      aiMetadata=aiMetadata,
      dSig=dSig,
      adP=adP,
      lsSig=lsSig,
      funcTransform=funcTransform,
      funcUnTransform=funcUnTransform,
      strLog=strLog,
      funcReg=funcReg,
      lsNonPenalizedPredictors=lsNonPenalizedPredictors,
      funcAnalysis=funcAnalysis,
      lsRandomCovariates=lsRandomCovariates,
      funcGetResult=funcGetResults,
      fAllvAll=fAllvAll,
      fIsUnivariate=fIsUnivariate,
      lxParameters=lxParameters,
      fZeroInflated=fZeroInflated,
      fIsTransformed= ! iTaxon %in% aiNotTransformedData )
    # If you get an NA (happens when the lmm gets all random covariates) move on
    if( !is.list( lsOne ) && is.na( lsOne ) ){ next }

    # The updating of the following happens in the inference method call in the funcBugHybrid call
    # New pvalue array
    adP <- lsOne$adP
    # New lsSig contains data about significant feature v metadata comparisons
    lsSig <- lsOne$lsSig
    # New qc data
    lsData$lsQCCounts = lsOne$lsQCCounts
  }

  # Log the QC info
  c_logrMaaslin$debug("lsData$lsQCCounts")
  c_logrMaaslin$debug(format(lsData$lsQCCounts))

  if( is.null( adP ) ) { return( NULL ) }

  # Perform Bonferroni corrections on factor data (for levels), calculate the number of tests performed, and FDR adjust for multiple hypotheses
  # Perform Bonferroni adjustment on factor data
  for( iADIndex in 1:length( adP ) )
  {
    # Only perform on factor data
    if( is.factor( lsSig[[ iADIndex ]]$metadata ) )
    {
      adPAdj = c( adPAdj, funcBonferonniCorrectFactorData( dPvalue = adP[ iADIndex ], vsFactors = lsSig[[ iADIndex ]]$metadata, fIgnoreNAs = length(liNaIndices)>0) )
    } else {
      adPAdj = c( adPAdj, adP[ iADIndex ] )
    }
  }

  iTests = funcCalculateTestCounts(iDataCount = length(aiData),
    asMetadata = intersect( lsData$astrMetadata, colnames( frmeData )[aiMetadata] ),
    asForced = lsNonPenalizedPredictors,
    asRandom = lsRandomCovariates,
    fAllvAll = fAllvAll)

  # Get indices of sorted data after the factor correction but before the multiple hypothesis corrections.
  aiSig <- sort.list( adPAdj )

  # Perform FDR BH
  adQ = p.adjust(adPAdj, method=strTestingCorrection, n=max(length(adPAdj), iTests))

  # Find all covariates that had significant associations
  astrNames <- c()
  for( i in 1:length( lsSig ) )
  {
    astrNames <- c(astrNames, lsSig[[i]]$name)
  }
  astrNames <- unique( astrNames )

  # Set up named label return for global plotting
  lsReturnTaxa <- list()
  for( j in aiSig )
  {
    if( adQ[j] > dSig ) { next }
    strTaxon <- lsSig[[j]]$taxon
    if(strTaxon %in% names(lsReturnTaxa))
    {
      lsReturnTaxa[[strTaxon]] = min(lsReturnTaxa[[strTaxon]],adQ[j])
    } else { lsReturnTaxa[[strTaxon]] = adQ[j]}
  }

  # For each covariate with significant associations,
  # write out a file with association information
  for( strName in astrNames )
  {
    strFileTXT <- NA
    strFilePDF <- NA
    for( j in aiSig )
    {
      lsCur <- lsSig[[j]]
      strCur <- lsCur$name
      if( strCur != strName ) { next }

      strTaxon <- lsCur$taxon
      adData <- lsCur$data
      astrFactors <- lsCur$factors
      adCur <- lsCur$metadata
      adY <- adData

      if( is.na( strData ) ) { next }

      ## If the text file output is not written to yet,
      ## make the file names and delete any previous file output
      if( is.na( strFileTXT ) )
      {
        strFileTXT <- sprintf( "%s-%s.txt", strBaseOut, strName )
        unlink(strFileTXT)
        funcWrite( c("Variable", "Feature", "Value", "Coefficient", "N", "N not 0", "P-value", "Q-value"), strFileTXT )
      }

      ## Write text output
      funcWrite( c(strName, strTaxon, lsCur$orig, lsCur$value, length( adData ), sum( adData > 0 ), adP[j], adQ[j]), strFileTXT )

      ## If the significance meets the threshold,
      ## write PDF file output
      if( adQ[j] > dSig ) { next }

      # Do not make residual plots if univariate is selected
      strFilePDF = funcPDF( frmeTmp=frmeData, lsCur=lsCur, curPValue=adP[j], curQValue=adQ[j], strFilePDF=strFilePDF, strBaseOut=strBaseOut, strName=strName, funcUnTransform=funcUnTransform, fDoResidualPlot=fDoRPlot, fInvert=fInvert, liNaIndices=liNaIndices )
    }
    if( dev.cur( ) != 1 ) { dev.off( ) }
  }

  aiTmp <- aiData

  c_logrMaaslin$debug("End funcBugs")
  return(list(lsReturnBugs=lsReturnTaxa, lsQCCounts=lsData$lsQCCounts))
  ### List of data features successfully associated without error and quality control data
}
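# The multiple-testing step above adjusts the factor-corrected p-values with
# p.adjust, using as `n` the larger of the number of p-values collected and
# the number of tests calculated by funcCalculateTestCounts. A standalone
# sketch with made-up numbers:
if( FALSE )
{
  adPToy <- c( 0.001, 0.04, 0.20 )  # hypothetical per-association p-values
  iTestsToy <- 10                   # hypothetical total test count
  p.adjust( adPToy, method = "BH", n = max( length( adPToy ), iTestsToy ) )
}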
#Lightly Tested
### Performs analysis for 1 feature
### iTaxon: integer Taxon index to be associated with data
### frmeData: Data frame The full data
### lsData: List of all associated data
### aiMetadata: Numeric vector of indices
### dSig: Numeric significance threshold for q-value cutoff
### adP: List of pvalues from associations
### lsSig: List which serves as a cache of data about significant associations
### strLog: String file to log to
funcBugHybrid <- function(
### Performs analysis for 1 feature
iTaxon,
### integer Taxon index to be associated with data
frmeData,
### Data frame, the full data
lsData,
### List of all associated data
aiMetadata,
### Numeric vector of indices
dSig,
### Numeric significance threshold for q-value cutoff
adP,
### List of pvalues from associations
lsSig,
### List which serves as a cache of data about significant associations
funcTransform,
### The transform used on the data
funcUnTransform,
### The reverse transform on the data
strLog = NA,
### String, file to which to log
funcReg=NULL,
### Function to perform regularization
lsNonPenalizedPredictors=NULL,
### These predictors will not be penalized in the feature (model) selection step
funcAnalysis=NULL,
### Function to perform association analysis
lsRandomCovariates=NULL,
### List of string names of metadata which will be treated as random covariates
funcGetResult=NULL,
### Function to unpack results from analysis
fAllvAll=FALSE,
### Flag to turn on all-against-all comparisons
fIsUnivariate = FALSE,
### Indicates the analysis function is univariate
lxParameters=list(),
### List holding parameters for different variable selection techniques
fZeroInflated = FALSE,
### Indicates whether to use a zero-inflated model
fIsTransformed = TRUE
### Indicates that the bug is transformed
){
  #dTime00 <- proc.time()[3]

  # Get metadata column names
  astrMetadata = intersect( lsData$astrMetadata, colnames( frmeData )[aiMetadata] )

  # Get data measurements that are not NA
  aiRows <- which( !is.na( frmeData[,iTaxon] ) )

  # Get the data frame of non-NA data measurements
  frmeTmp <- frmeData[aiRows,]

  # Set the min boosting selection frequency to a default if not given
  if( is.null( lxParameters$dFreq ) || is.na( lxParameters$dFreq ) )
  {
    lxParameters$dFreq <- 0.5 / length( c(astrMetadata) )
  }

  # Get the full data for the bug feature
  adCur = frmeTmp[,iTaxon]
  lxParameters$sBugName = names(frmeTmp[iTaxon])

  # This can run multiple models, so some of the results are held in lists and some are not
  llmod = list()
  liTaxon = list()
  lastrTerms = list()

  # Build formula for simple mixed effects models
  # Removes random covariates from variable selection
  astrMetadata = setdiff(astrMetadata, lsRandomCovariates)
  strFormula <- paste( "adCur ~", paste( sprintf( "`%s`", astrMetadata ), collapse = " + " ), sep = " " )

  # Document the model
  funcWrite( c("#taxon", colnames( frmeTmp )[iTaxon]), strLog )
  funcWrite( c("#metadata", astrMetadata), strLog )
  funcWrite( c("#samples", rownames( frmeTmp )), strLog )

  # Model terms
  astrTerms <- c()

  # Attempt feature (model) selection
  if(!is.na(funcReg))
  {
    # Count model selection method attempts
    lsData$lsQCCounts$iBoosts = lsData$lsQCCounts$iBoosts + 1
    # Perform model selection
    astrTerms <- funcReg(strFormula=strFormula, frmeTmp=frmeTmp, adCur=adCur, lsParameters=lxParameters, lsForcedParameters=lsNonPenalizedPredictors, strLog=strLog)
  # If the feature selection function is set to none, set the terms of the model to all the metadata
  } else { astrTerms = astrMetadata }

  # Look through the boosting results to get a model
  # Holds the predictors in the model that were selected by the boosting
  if(is.null( astrTerms )){ lsData$lsQCCounts$iBoostErrors = lsData$lsQCCounts$iBoostErrors + 1 }

  # Get the indices that are transformed
  # Of those indices check for uneven metadata
  # Untransform any of the metadata that failed
  # Failed means true for uneven occurrences of zeros
#  if( fIsTransformed )
#  {
#    vdUnevenZeroCheck = funcUnTransform( frmeData[[ iTaxon ]] )
#    if( funcZerosAreUneven( vdRawData=vdUnevenZeroCheck, funcTransform=funcTransform, vsStratificationFeatures=astrTerms, dfData=frmeData ) )
#    {
#      frmeData[[ iTaxon ]] = vdUnevenZeroCheck
#      c_logrMaaslin$debug( paste( "Taxon transformation reversed due to unevenness of zero distribution.", iTaxon ) )
#    }
#  }

  # Run association analysis if predictors exist and an analysis function is specified
  # Run analysis
  if(!is.na(funcAnalysis) )
  {
    # If there are selected and forced fixed covariates
    if( length( astrTerms ) )
    {
      # Count the association attempt
      lsData$lsQCCounts$iLms = lsData$lsQCCounts$iLms + 1

      # Make the lm formula
      # Build formula for simple mixed effects models using random covariates
      strRandomCovariatesFormula = NULL
      # Random covariates are forced
      if(length(lsRandomCovariates)>0)
      {
        # Format for lme
        # Needed for changes to not allowing random covariates through the boosting process
        strRandomCovariatesFormula <- paste( "adCur ~ ", paste( sprintf( "1|`%s`", lsRandomCovariates), collapse = " + " ))
      }

      # Set up a list of formulas with the selected fixed variables changing and the forced fixed covariates constant
      vstrFormula = c()
      # Set up suppressing forced covariates in an all-v-all scenario only
      asSuppress = c()
      # Enable all-against-all comparisons
      if(fAllvAll && !fIsUnivariate)
      {
        lsVaryingCovariates = setdiff(astrTerms,lsNonPenalizedPredictors)
        lsConstantCovariates = setdiff(lsNonPenalizedPredictors,lsRandomCovariates)
        strConstantFormula = paste( sprintf( "`%s`", lsConstantCovariates ), collapse = " + " )
        asSuppress = lsConstantCovariates

        if(length(lsVaryingCovariates)==0L)
        {
          vstrFormula <- c( paste( "adCur ~ ", paste( sprintf( "`%s`", lsConstantCovariates ), collapse = " + " )) )
        } else {
          for( sVarCov in lsVaryingCovariates )
          {
            strTempFormula = paste( "adCur ~ `", sVarCov,"`",sep="")
            if(length(lsConstantCovariates)>0){ strTempFormula = paste(strTempFormula,strConstantFormula,sep=" + ") }
            vstrFormula <- c( vstrFormula, strTempFormula )
          }
        }
      } else {
        # This is either the multivariate-case formula for all covariates in an lm, or the fixed covariates in the lmm
        vstrFormula <- c( paste( "adCur ~ ", paste( sprintf( "`%s`", astrTerms ), collapse = " + " )) )
      }

      # Run the association
      for( strAnalysisFormula in vstrFormula )
      {
        i = length(llmod)+1
        llmod[[i]] = funcAnalysis(strFormula=strAnalysisFormula, frmeTmp=frmeTmp, iTaxon=iTaxon, lsHistory=list(adP=adP, lsSig=lsSig, lsQCCounts=lsData$lsQCCounts), strRandomFormula=strRandomCovariatesFormula, fZeroInflated=fZeroInflated)
        liTaxon[[i]] = iTaxon
        lastrTerms[[i]] = funcFormulaStrToList(strAnalysisFormula)
      }
    } else {
      # If there are no selected or forced fixed covariates
      lsData$lsQCCounts$iNoTerms = lsData$lsQCCounts$iNoTerms + 1
      return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsData$lsQCCounts))
    }
  }

  # Call funcBugResults and return its result
  if(!is.na(funcGetResult))
  {
    # Format the results to a consistent expected result.
    return( funcGetResult( llmod=llmod, frmeData=frmeData, liTaxon=liTaxon, dSig=dSig, adP=adP, lsSig=lsSig, strLog=strLog, lsQCCounts=lsData$lsQCCounts, lastrCols=lastrTerms, asSuppressCovariates=asSuppress ) )
  } else {
    return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsData$lsQCCounts))
  }
  ### List containing a list of pvalues, a list of significant data per association, and a list of QC data
}
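# funcBugHybrid assembles model formulas as strings before handing them to the
# selection and analysis functions. A standalone sketch of the fixed-effects
# formula it builds (metadata names here are hypothetical):
if( FALSE )
{
  astrMetadataToy <- c( "Age", "Sex" )
  strFormulaToy <- paste( "adCur ~", paste( sprintf( "`%s`", astrMetadataToy ), collapse = " + " ), sep = " " )
  strFormulaToy  # "adCur ~ `Age` + `Sex`"
  # And the random-covariate formula used when mixed effects models are requested:
  paste( "adCur ~ ", paste( sprintf( "1|`%s`", c( "Subject" ) ), collapse = " + " ) )  # "adCur ~  1|`Subject`"
}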