Mercurial > repos > george-weingart > maaslin
diff src/lib/BoostGLM.R @ 0:e0b5980139d9
maaslin
author | george-weingart |
---|---|
date | Tue, 13 May 2014 22:00:40 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib/BoostGLM.R Tue May 13 22:00:40 2014 -0400 @@ -0,0 +1,887 @@ +##################################################################################### +#Copyright (C) <2012> +# +#Permission is hereby granted, free of charge, to any person obtaining a copy of +#this software and associated documentation files (the "Software"), to deal in the +#Software without restriction, including without limitation the rights to use, copy, +#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, +#and to permit persons to whom the Software is furnished to do so, subject to +#the following conditions: +# +#The above copyright notice and this permission notice shall be included in all copies +#or substantial portions of the Software. +# +#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# This file is a component of the MaAsLin (Multivariate Associations Using Linear Models), +# authored by the Huttenhower lab at the Harvard School of Public Health +# (contact Timothy Tickle, ttickle@hsph.harvard.edu). +##################################################################################### + +inlinedocs <- function( +##author<< Curtis Huttenhower <chuttenh@hsph.harvard.edu> and Timothy Tickle <ttickle@hsph.harvard.edu> +##description<< Manages the quality control of data and the performance of analysis (univariate or multivariate), regularization, and data (response) transformation. +) { return( pArgs ) } + +### Load libraries quietly +suppressMessages(library( gam, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) +suppressMessages(library( gbm, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) +suppressMessages(library( logging, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) +suppressMessages(library( outliers, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) +suppressMessages(library( robustbase, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) +suppressMessages(library( pscl, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) + +### Get constants +#source(file.path("input","maaslin","src","Constants.R")) +#source("Constants.R") + +## Get logger +c_logrMaaslin <- getLogger( "maaslin" ) + +funcDoGrubbs <- function( +### Use the Grubbs Test to identify outliers +iData, +### Column index in the data frame to test +frmeData, +### The data frame holding the data +dPOutlier, +### P-value threshold to indicate an outlier is significant +lsQC +### List holding the QC info of the cleaning step. Which indices are outliers is added. +){ + adData <- frmeData[,iData] + + # Original number of NA + viNAOrig = which(is.na(adData)) + + while( TRUE ) + { + lsTest <- try( grubbs.test( adData ), silent = TRUE ) + if( ( class( lsTest ) == "try-error" ) || is.na( lsTest$p.value ) || ( lsTest$p.value > dPOutlier ) ) + {break} + viOutliers = outlier( adData, logical = TRUE ) + adData[viOutliers] <- NA + } + + # Record removed data + viNAAfter = which(is.na(adData)) + + # If all were set to NA then ignore the filtering + if(length(adData)==length(viNAAfter)) + { + viNAAfter = viNAOrig + adData = frmeData[,iData] + c_logrMaaslin$info( paste("Grubbs Test:: Identifed all data as outliers so was inactived for index=",iData," data=",paste(as.vector(frmeData[,iData]),collapse=","), "number zeros=", length(which(frmeData[,iData]==0)), sep = " " )) + } else if(mean(adData, na.rm=TRUE) == 0) { + viNAAfter = viNAOrig + adData = frmeData[,iData] + c_logrMaaslin$info( paste("Grubbs Test::Removed all values but 0, ignored. Index=",iData,".",sep=" " ) ) + } else { + # Document removal + if( sum( is.na( adData ) ) ) + { + c_logrMaaslin$info( "Grubbs Test::Removing %d outliers from %s", sum( is.na( adData ) ), colnames(frmeData)[iData] ) + c_logrMaaslin$info( format( rownames( frmeData )[is.na( adData )] )) + } + } + + return(list(data=adData,outliers=length(viNAAfter)-length(viNAOrig),indices=setdiff(viNAAfter,viNAOrig))) +} + +funcDoFenceTest <- function( +### Use a threshold based on the quartiles of the data to identify outliers +iData, +### Column index in the data frame to test +frmeData, +### The data frame holding the data +dFence +### The fence outside the first and third quartiles to use as a threshold for cutt off. +### This many times the interquartile range +/- to the 3rd/1st quartiles +){ + # Establish fence + adData <- frmeData[,iData] + adQ <- quantile( adData, c(0.25, 0.5, 0.75), na.rm = TRUE ) + + dIQR <- adQ[3] - adQ[1] + if(!dIQR) + { + dIQR = sd(adData,na.rm = TRUE) + } + dUF <- adQ[3] + ( dFence * dIQR ) + dLF <- adQ[1] - ( dFence * dIQR ) + + # Record indices of values outside of fence to remove and remove. + aiRemove <- c() + for( j in 1:length( adData ) ) + { + d <- adData[j] + if( !is.na( d ) && ( ( d < dLF ) || ( d > dUF ) ) ) + { + aiRemove <- c(aiRemove, j) + } + } + + if(length(aiRemove)==length(adData)) + { + aiRemove = c() + c_logrMaaslin$info( "OutliersByFence:: Identified all data as outlier so was inactivated for index=", iData,"data=", paste(as.vector(frmeData[,iData]),collapse=","), "number zeros=", length(which(frmeData[,iData]==0)), sep=" " ) + } else { + adData[aiRemove] <- NA + + # Document to screen + if( length( aiRemove ) ) + { + c_logrMaaslin$info( "OutliersByFence::Removing %d outliers from %s", length( aiRemove ), colnames(frmeData)[iData] ) + c_logrMaaslin$info( format( rownames( frmeData )[aiRemove] )) + } + } + + return(list(data=adData,outliers=length(aiRemove),indices=aiRemove)) +} + +funcZerosAreUneven = function( +### +vdRawData, +### Raw data to be checked during transformation +funcTransform, +### Data transform to perform +vsStratificationFeatures, +### Groupings to check for unevenness +dfData +### Data frame holding the features +){ + # Return indicator of unevenness + fUneven = FALSE + + # Transform the data to compare + vdTransformed = funcTransform( vdRawData ) + + # Go through each stratification of data + for( sStratification in vsStratificationFeatures ) + { + # Current stratification + vFactorStrats = dfData[[ sStratification ]] + + # If the metadata is not a factor then skip + # Only binned data can be evaluated this way. + if( !is.factor( vFactorStrats )){ next } + + viZerosCountsRaw = c() + for( sLevel in levels( vFactorStrats ) ) + { + vdTest = vdRawData[ which( vFactorStrats == sLevel ) ] + viZerosCountsRaw = c( viZerosCountsRaw, length(which(vdTest == 0))) + vdTest = vdTransformed[ which( vFactorStrats == sLevel ) ] + } + dExpectation = 1 / length( viZerosCountsRaw ) + dMin = dExpectation / 2 + dMax = dExpectation + dMin + viZerosCountsRaw = viZerosCountsRaw / sum( viZerosCountsRaw ) + if( ( length( which( viZerosCountsRaw <= dMin ) ) > 0 ) || ( length( which( viZerosCountsRaw >= dMax ) ) > 0 ) ) + { + return( TRUE ) + } + } + return( fUneven ) +} + +funcTransformIncreasesOutliers = function( +### Checks if a data transform increases outliers in a distribution +vdRawData, +### Raw data to check for outlier zeros +funcTransform +){ + iUnOutliers = length( boxplot( vdRawData, plot = FALSE )$out ) + iTransformedOutliers = length( boxplot( funcTransform( vdRawData ), plot = FALSE )$out ) + + return( iUnOutliers <= iTransformedOutliers ) +} + +funcClean <- function( +### Properly clean / get data ready for analysis +### Includes custom analysis from the custom R script if it exists +frmeData, +### Data frame, input data to be acted on +funcDataProcess, +### Custom script that can be given to perform specialized processing before MaAsLin does. +aiMetadata, +### Indices of columns in frmeData which are metadata for analysis. +aiData, +### Indices of column in frmeData which are (abundance) data for analysis. +lsQCCounts, +### List that will hold the quality control information which is written in the output directory. +astrNoImpute = c(), +### An array of column names of frmeData not to impute. +dMinSamp, +### Minimum number of samples +dMinAbd, +# Minimum sample abundance +dFence, +### How many quartile ranges defines the fence to define outliers. +funcTransform, +### The data transformation function or a dummy function that does not affect the data +dPOutlier = 0.05 +### The significance threshold for the grubbs test to identify an outlier. +){ + # Call the custom script and set current data and indicies to the processes data and indicies. + c_logrMaaslin$debug( "Start Clean") + if( !is.null( funcDataProcess ) ) + { + c_logrMaaslin$debug("Additional preprocess function attempted.") + + pTmp <- funcDataProcess( frmeData=frmeData, aiMetadata=aiMetadata, aiData=aiData) + frmeData = pTmp$frmeData + aiMetadata = pTmp$aiMetadata + aiData = pTmp$aiData + lsQCCounts$lsQCCustom = pTmp$lsQCCounts + } + # Set data indicies after custom QC process. + lsQCCounts$aiAfterPreprocess = aiData + + # Remove missing data, remove any sample that has less than dMinSamp * the number of data or low abundance + aiRemove = c() + aiRemoveLowAbundance = c() + for( iCol in aiData ) + { + adCol = frmeData[,iCol] + adCol[!is.finite( adCol )] <- NA + if( ( sum( !is.na( adCol ) ) < ( dMinSamp * length( adCol ) ) ) || + ( length( unique( na.omit( adCol ) ) ) < 2 ) ) + { + aiRemove = c(aiRemove, iCol) + } + if( sum(adCol > dMinAbd, na.rm=TRUE ) < (dMinSamp * length( adCol))) + { + aiRemoveLowAbundance = c(aiRemoveLowAbundance, iCol) + } + } + # Remove and document + aiData = setdiff( aiData, aiRemove ) + aiData = setdiff( aiData, aiRemoveLowAbundance ) + lsQCCounts$iMissingData = aiRemove + lsQCCounts$iLowAbundanceData = aiRemoveLowAbundance + if(length(aiRemove)) + { + c_logrMaaslin$info( "Removing the following for data lower bound.") + c_logrMaaslin$info( format( colnames( frmeData )[aiRemove] )) + } + if(length(aiRemoveLowAbundance)) + { + c_logrMaaslin$info( "Removing the following for too many low abundance bugs.") + c_logrMaaslin$info( format( colnames( frmeData )[aiRemoveLowAbundance] )) + } + + #Transform data + iTransformed = 0 + viNotTransformedData = c() + for(aiDatum in aiData) + { + adValues = frmeData[,aiDatum] +# if( ! funcTransformIncreasesOutliers( adValues, funcTransform ) ) +# { + frmeData[,aiDatum] = funcTransform( adValues ) +# iTransformed = iTransformed + 1 +# } else { +# viNotTransformedData = c( viNotTransformedData, aiDatum ) +# } + } + c_logrMaaslin$info(paste("Number of features transformed = ",iTransformed)) + + # Metadata: Properly factorize all logical data and integer and number data with less than iNonFactorLevelThreshold + # Also record which are numeric metadata + aiNumericMetadata = c() + for( i in aiMetadata ) + { + if( ( class( frmeData[,i] ) %in% c("integer", "numeric", "logical") ) && + ( length( unique( frmeData[,i] ) ) < c_iNonFactorLevelThreshold ) ) { + c_logrMaaslin$debug(paste("Changing metadatum from numeric/integer/logical to factor",colnames(frmeData)[i],sep="=")) + frmeData[,i] = factor( frmeData[,i] ) + } + if( class( frmeData[,i] ) %in% c("integer","numeric") ) + { + aiNumericMetadata = c(aiNumericMetadata,i) + } + } + + # Remove outliers + # If the dFence Value is set use the method of defining the outllier as + # dFence * the interquartile range + or - the 3rd and first quartile respectively. + # If not the gibbs test is used. + lsQCCounts$aiDataSumOutlierPerDatum = c() + lsQCCounts$aiMetadataSumOutlierPerDatum = c() + lsQCCounts$liOutliers = list() + + if( dFence > 0.0 ) + { + # For data + for( iData in aiData ) + { + lOutlierInfo <- funcDoFenceTest(iData=iData,frmeData=frmeData,dFence=dFence) + frmeData[,iData] <- lOutlierInfo[["data"]] + lsQCCounts$aiDataSumOutlierPerDatum <- c(lsQCCounts$aiDataSumOutlierPerDatum,lOutlierInfo[["outliers"]]) + if(lOutlierInfo[["outliers"]]>0) + { + lsQCCounts$liOutliers[[paste(iData,sep="")]] <- lOutlierInfo[["indices"]] + } + } + + # Remove outlier non-factor metadata + for( iMetadata in aiNumericMetadata ) + { + lOutlierInfo <- funcDoFenceTest(iData=iMetadata,frmeData=frmeData,dFence=dFence) + frmeData[,iMetadata] <- lOutlierInfo[["data"]] + lsQCCounts$aiMetadataSumOutlierPerDatum <- c(lsQCCounts$aiMetadataSumOutlierPerDatum,lOutlierInfo[["outliers"]]) + if(lOutlierInfo[["outliers"]]>0) + { + lsQCCounts$liOutliers[[paste(iMetadata,sep="")]] <- lOutlierInfo[["indices"]] + } + } + #Do not use the fence, use the Grubbs test + } else if(dPOutlier!=0.0){ + # For data + for( iData in aiData ) + { + lOutlierInfo <- funcDoGrubbs(iData=iData,frmeData=frmeData,dPOutlier=dPOutlier) + frmeData[,iData] <- lOutlierInfo[["data"]] + lsQCCounts$aiDataSumOutlierPerDatum <- c(lsQCCounts$aiDataSumOutlierPerDatum,lOutlierInfo[["outliers"]]) + if(lOutlierInfo[["outliers"]]>0) + { + lsQCCounts$liOutliers[[paste(iData,sep="")]] <- lOutlierInfo[["indices"]] + } + } + for( iMetadata in aiNumericMetadata ) + { + lOutlierInfo <- funcDoGrubbs(iData=iMetadata,frmeData=frmeData,dPOutlier=dPOutlier) + frmeData[,iMetadata] <- lOutlierInfo[["data"]] + lsQCCounts$aiMetadataSumOutlierPerDatum <- c(lsQCCounts$aiMetadataSumOutlierPerDatum,lOutlierInfo[["outliers"]]) + if(lOutlierInfo[["outliers"]]>0) + { + lsQCCounts$liOutliers[[paste(iMetadata,sep="")]] <- lOutlierInfo[["indices"]] + } + } + } + + # Metadata: Remove missing data + # This is defined as if there is only one non-NA value or + # if the number of NOT NA data is less than a percentage of the data defined by dMinSamp + aiRemove = c() + for( iCol in c(aiMetadata) ) + { + adCol = frmeData[,iCol] + if( ( sum( !is.na( adCol ) ) < ( dMinSamp * length( adCol ) ) ) || + ( length( unique( na.omit( adCol ) ) ) < 2 ) ) + { + aiRemove = c(aiRemove, iCol) + } + } + + # Remove metadata + aiMetadata = setdiff( aiMetadata, aiRemove ) + + # Update the data which was removed. + lsQCCounts$iMissingMetadata = aiRemove + if(length(aiRemove)) + { + c_logrMaaslin$info("Removing the following metadata for too much missing data or only one data value outside of NA.") + c_logrMaaslin$info(format(colnames( frmeData )[aiRemove])) + } + + # Keep track of factor levels in a list for later use + lslsFactors <- list() + for( iCol in c(aiMetadata) ) + { + aCol <- frmeData[,iCol] + if( class( aCol ) == "factor" ) + { + lslsFactors[[length( lslsFactors ) + 1]] <- list(iCol, levels( aCol )) + } + } + + # Replace missing data values by the mean of the data column. + # Remove samples that were all NA from the cleaning and so could not be imputed. + aiRemoveData = c() + for( iCol in aiData ) + { + adCol <- frmeData[,iCol] + adCol[is.infinite( adCol )] <- NA + adCol[is.na( adCol )] <- mean( adCol[which(adCol>0)], na.rm = TRUE ) + frmeData[,iCol] <- adCol + + if(length(which(is.na(frmeData[,iCol]))) == length(frmeData[,iCol])) + { + c_logrMaaslin$info( paste("Removing data", iCol, "for being all NA after QC")) + aiRemoveData = c(aiRemoveData,iCol) + } + } + + # Remove and document + aiData = setdiff( aiData, aiRemoveData ) + lsQCCounts$iMissingData = c(lsQCCounts$iMissingData,aiRemoveData) + if(length(aiRemoveData)) + { + c_logrMaaslin$info( "Removing the following for having only NAs after cleaning (maybe due to only having NA after outlier testing).") + c_logrMaaslin$info( format( colnames( frmeData )[aiRemoveData] )) + } + + #Use na.gam.replace to manage NA metadata + aiTmp <- setdiff( aiMetadata, which( colnames( frmeData ) %in% astrNoImpute ) ) + # Keep tack of NAs so the may not be plotted later. + liNaIndices = list() + lsNames = names(frmeData) + for( i in aiTmp) + { + liNaIndices[[lsNames[i]]] = which(is.na(frmeData[,i])) + } + frmeData[,aiTmp] <- na.gam.replace( frmeData[,aiTmp] ) + + #If NA is a value in factor data, set the NA as a level. + for( lsFactor in lslsFactors ) + { + iCol <- lsFactor[[1]] + aCol <- frmeData[,iCol] + if( "NA" %in% levels( aCol ) ) + { + if(! lsNames[iCol] %in% astrNoImpute) + { + liNaIndices[[lsNames[iCol]]] = union(which(is.na(frmeData[,iCol])),which(frmeData[,iCol]=="NA")) + } + frmeData[,iCol] <- factor( aCol, levels = c(lsFactor[[2]], "NA") ) + } + } + + # Make sure there is a minimum number of non-0 measurements + aiRemove = c() + for( iCol in aiData ) + { + adCol = frmeData[,iCol] + if(length( which(adCol!=0)) < ( dMinSamp * length( adCol ) ) ) + { + aiRemove = c(aiRemove, iCol) + } + } + + # Remove and document + aiData = setdiff( aiData, aiRemove) + lsQCCounts$iZeroDominantData = aiRemove + if(length(aiRemove)) + { + c_logrMaaslin$info( "Removing the following for having not enough non-zero measurments for analysis.") + c_logrMaaslin$info( format( colnames( frmeData )[aiRemove] )) + } + + c_logrMaaslin$debug("End FuncClean") + return( list(frmeData = frmeData, aiMetadata = aiMetadata, aiData = aiData, lsQCCounts = lsQCCounts, liNaIndices=liNaIndices, viNotTransformedData = viNotTransformedData) ) + ### Return list of + ### frmeData: The Data after cleaning + ### aiMetadata: The indices of the metadata still being used after filtering + ### aiData: The indices of the data still being used after filtering + ### lsQCCOunts: QC info +} + +funcBugs <- function( +### Run analysis of all data features against all metadata +frmeData, +### Cleaned data including metadata, and data +lsData, +### This list is a general container for data as the analysis occurs, think about it as a cache for the analysis +aiMetadata, +### Indices of metadata used in analysis +aiData, +### Indices of response data +aiNotTransformedData, +### Indicies of the data not transformed +strData, +### Log file name +dSig, +### Significance threshold for the qvalue cut off +fInvert=FALSE, +### Invert images to have a black background +strDirOut = NA, +### Output project directory +funcReg=NULL, +### Function for regularization +funcTransform=NULL, +### Function used to transform the data +funcUnTransform=NULL, +### If a transform is used the opposite of that transfor must be used on the residuals in the partial residual plots +lsNonPenalizedPredictors=NULL, +### These predictors will not be penalized in the feature (model) selection step +funcAnalysis=NULL, +### Function to perform association analysis +lsRandomCovariates=NULL, +### List of string names of metadata which will be treated as random covariates +funcGetResults=NULL, +### Function to unpack results from analysis +fDoRPlot=TRUE, +### Plot residuals +fOmitLogFile = FALSE, +### Stops the creation of the log file +fAllvAll=FALSE, +### Flag to turn on all against all comparisons +liNaIndices = list(), +### Indicies of imputed NA data +lxParameters=list(), +### List holds parameters for different variable selection techniques +strTestingCorrection = "BH", +### Correction for multiple testing +fIsUnivariate = FALSE, +### Indicates if the function is univariate +fZeroInflated = FALSE +### Indicates to use a zero infalted model +){ + c_logrMaaslin$debug("Start funcBugs") + # If no output directory is indicated + # Then make it the current directory + if( is.na( strDirOut ) || is.null( strDirOut ) ) + { + if( !is.na( strData ) ) + { + strDirOut <- paste( dirname( strData ), "/", sep = "" ) + } else { strDirOut = "" } + } + + # Make th log file and output file names based on the log file name + strLog = NA + strBase = "" + if(!is.na(strData)) + { + strBaseOut <- paste( strDirOut, sub( "\\.([^.]+)$", "", basename(strData) ), sep = "/" ) + strLog <- paste( strBaseOut,c_sLogFileSuffix, ".txt", sep = "" ) + } + + # If indicated, stop the creation of the log file + # Otherwise delete the log file if it exists and log + if(fOmitLogFile){ strLog = NA } + if(!is.na(strLog)) + { + c_logrMaaslin$info( "Outputting to: %s", strLog ) + unlink( strLog ) + } + + # Will contain pvalues + adP = c() + adPAdj = c() + + # List of lists with association information + lsSig <- list() + # Go through each data that was not previously removed and perform inference + for( iTaxon in aiData ) + { + # Log to screen progress per 10 associations. + # Can be thown off if iTaxon is missing a mod 10 value + # So the taxons may not be logged every 10 but not a big deal + if( !( iTaxon %% 10 ) ) + { + c_logrMaaslin$info( "Taxon %d/%d", iTaxon, max( aiData ) ) + } + + # Call analysis method + lsOne <- funcBugHybrid( iTaxon=iTaxon, frmeData=frmeData, lsData=lsData, aiMetadata=aiMetadata, dSig=dSig, adP=adP, lsSig=lsSig, funcTransform=funcTransform, funcUnTransform=funcUnTransform, strLog=strLog, funcReg=funcReg, lsNonPenalizedPredictors=lsNonPenalizedPredictors, funcAnalysis=funcAnalysis, lsRandomCovariates=lsRandomCovariates, funcGetResult=funcGetResults, fAllvAll=fAllvAll, fIsUnivariate=fIsUnivariate, lxParameters=lxParameters, fZeroInflated=fZeroInflated, fIsTransformed= ! iTaxon %in% aiNotTransformedData ) + + # If you get a NA (happens when the lmm gets all random covariates) move on + if( is.na( lsOne ) ){ next } + + # The updating of the following happens in the inference method call in the funcBugHybrid call + # New pvalue array + adP <- lsOne$adP + # New lsSig contains data about significant feature v metadata comparisons + lsSig <- lsOne$lsSig + # New qc data + lsData$lsQCCounts = lsOne$lsQCCounts + } + + # Log the QC info + c_logrMaaslin$debug("lsData$lsQCCounts") + c_logrMaaslin$debug(format(lsData$lsQCCounts)) + + if( is.null( adP ) ) { return( NULL ) } + + # Perform bonferonni corrections on factor data (for levels), calculate the number of tests performed, and FDR adjust for multiple hypotheses + # Perform Bonferonni adjustment on factor data + for( iADIndex in 1:length( adP ) ) + { + # Only perform on factor data + if( is.factor( lsSig[[ iADIndex ]]$metadata ) ) + { + adPAdj = c( adPAdj, funcBonferonniCorrectFactorData( dPvalue = adP[ iADIndex ], vsFactors = lsSig[[ iADIndex ]]$metadata, fIgnoreNAs = length(liNaIndices)>0) ) + } else { + adPAdj = c( adPAdj, adP[ iADIndex ] ) + } + } + + iTests = funcCalculateTestCounts(iDataCount = length(aiData), asMetadata = intersect( lsData$astrMetadata, colnames( frmeData )[aiMetadata] ), asForced = lsNonPenalizedPredictors, asRandom = lsRandomCovariates, fAllvAll = fAllvAll) + + #Get indices of sorted data after the factor correction but before the multiple hypothesis corrections. + aiSig <- sort.list( adPAdj ) + + # Perform FDR BH + adQ = p.adjust(adPAdj, method=strTestingCorrection, n=max(length(adPAdj), iTests)) + + # Find all covariates that had significant associations + astrNames <- c() + for( i in 1:length( lsSig ) ) + { + astrNames <- c(astrNames, lsSig[[i]]$name) + } + astrNames <- unique( astrNames ) + + # Sets up named label return for global plotting + lsReturnTaxa <- list() + for( j in aiSig ) + { + if( adQ[j] > dSig ) { next } + strTaxon <- lsSig[[j]]$taxon + if(strTaxon %in% names(lsReturnTaxa)) + { + lsReturnTaxa[[strTaxon]] = min(lsReturnTaxa[[strTaxon]],adQ[j]) + } else { lsReturnTaxa[[strTaxon]] = adQ[j]} + } + + # For each covariate with significant associations + # Write out a file with association information + for( strName in astrNames ) + { + strFileTXT <- NA + strFilePDF <- NA + for( j in aiSig ) + { + lsCur <- lsSig[[j]] + strCur <- lsCur$name + + if( strCur != strName ) { next } + + strTaxon <- lsCur$taxon + adData <- lsCur$data + astrFactors <- lsCur$factors + adCur <- lsCur$metadata + adY <- adData + + if( is.na( strData ) ) { next } + + ## If the text file output is not written to yet + ## make the file names, and delete any previous file output + if( is.na( strFileTXT ) ) + { + strFileTXT <- sprintf( "%s-%s.txt", strBaseOut, strName ) + unlink(strFileTXT) + funcWrite( c("Variable", "Feature", "Value", "Coefficient", "N", "N not 0", "P-value", "Q-value"), strFileTXT ) + } + + ## Write text output + funcWrite( c(strName, strTaxon, lsCur$orig, lsCur$value, length( adData ), sum( adData > 0 ), adP[j], adQ[j]), strFileTXT ) + + ## If the significance meets the threshold + ## Write PDF file output + if( adQ[j] > dSig ) { next } + + # Do not make residuals plots if univariate is selected + strFilePDF = funcPDF( frmeTmp=frmeData, lsCur=lsCur, curPValue=adP[j], curQValue=adQ[j], strFilePDF=strFilePDF, strBaseOut=strBaseOut, strName=strName, funcUnTransform= funcUnTransform, fDoResidualPlot=fDoRPlot, fInvert=fInvert, liNaIndices=liNaIndices ) + } + if( dev.cur( ) != 1 ) { dev.off( ) } + } + aiTmp <- aiData + + logdebug("End funcBugs", c_logMaaslin) + return(list(lsReturnBugs=lsReturnTaxa, lsQCCounts=lsData$lsQCCounts)) + ### List of data features successfully associated without error and quality control data +} + +#Lightly Tested +### Performs analysis for 1 feature +### iTaxon: integer Taxon index to be associated with data +### frmeData: Data frame The full data +### lsData: List of all associated data +### aiMetadata: Numeric vector of indices +### dSig: Numeric significance threshold for q-value cut off +### adP: List of pvalues from associations +### lsSig: List which serves as a cache of data about significant associations +### strLog: String file to log to +funcBugHybrid <- function( +### Performs analysis for 1 feature +iTaxon, +### integer Taxon index to be associated with data +frmeData, +### Data frame, the full data +lsData, +### List of all associated data +aiMetadata, +### Numeric vector of indices +dSig, +### Numeric significance threshold for q-value cut off +adP, +### List of pvalues from associations +lsSig, +### List which serves as a cache of data about significant associations +funcTransform, +### The tranform used on the data +funcUnTransform, +### The reverse transform on the data +strLog = NA, +### String, file to which to log +funcReg=NULL, +### Function to perform regularization +lsNonPenalizedPredictors=NULL, +### These predictors will not be penalized in the feature (model) selection step +funcAnalysis=NULL, +### Function to perform association analysis +lsRandomCovariates=NULL, +### List of string names of metadata which will be treated as random covariates +funcGetResult=NULL, +### Function to unpack results from analysis +fAllvAll=FALSE, +### Flag to turn on all against all comparisons +fIsUnivariate = FALSE, +### Indicates the analysis function is univariate +lxParameters=list(), +### List holds parameters for different variable selection techniques +fZeroInflated = FALSE, +### Indicates if to use a zero infalted model +fIsTransformed = TRUE +### Indicates that the bug is transformed +){ +#dTime00 <- proc.time()[3] + #Get metadata column names + astrMetadata = intersect( lsData$astrMetadata, colnames( frmeData )[aiMetadata] ) + + #Get data measurements that are not NA + aiRows <- which( !is.na( frmeData[,iTaxon] ) ) + + #Get the dataframe of non-na data measurements + frmeTmp <- frmeData[aiRows,] + + #Set the min boosting selection frequency to a default if not given + if( is.na( lxParameters$dFreq ) ) + { + lxParameters$dFreq <- 0.5 / length( c(astrMetadata) ) + } + + # Get the full data for the bug feature + adCur = frmeTmp[,iTaxon] + lxParameters$sBugName = names(frmeTmp[iTaxon]) + + # This can run multiple models so some of the results are held in lists and some are not + llmod = list() + liTaxon = list() + lastrTerms = list() + + # Build formula for simple mixed effects models + # Removes random covariates from variable selection + astrMetadata = setdiff(astrMetadata, lsRandomCovariates) + strFormula <- paste( "adCur ~", paste( sprintf( "`%s`", astrMetadata ), collapse = " + " ), sep = " " ) + + # Document the model + funcWrite( c("#taxon", colnames( frmeTmp )[iTaxon]), strLog ) + funcWrite( c("#metadata", astrMetadata), strLog ) + funcWrite( c("#samples", rownames( frmeTmp )), strLog ) + + #Model terms + astrTerms <- c() + + # Attempt feature (model) selection + if(!is.na(funcReg)) + { + #Count model selection method attempts + lsData$lsQCCounts$iBoosts = lsData$lsQCCounts$iBoosts + 1 + #Perform model selection + astrTerms <- funcReg(strFormula=strFormula, frmeTmp=frmeTmp, adCur=adCur, lsParameters=lxParameters, lsForcedParameters=lsNonPenalizedPredictors, strLog=strLog) + #If the feature selection function is set to None, set all terms of the model to all the metadata + } else { astrTerms = astrMetadata } + + # Get look through the boosting results to get a model + # Holds the predictors in the predictors in the model that were selected by the boosting + if(is.null( astrTerms )){lsData$lsQCCounts$iBoostErrors = lsData$lsQCCounts$iBoostErrors + 1} + + # Get the indices that are transformed + # Of those indices check for uneven metadata + # Untransform any of the metadata that failed + # Failed means true for uneven occurences of zeros +# if( fIsTransformed ) +# { +# vdUnevenZeroCheck = funcUnTransform( frmeData[[ iTaxon ]] ) +# if( funcZerosAreUneven( vdRawData=vdUnevenZeroCheck, funcTransform=funcTransform, vsStratificationFeatures=astrTerms, dfData=frmeData ) ) +# { +# frmeData[[ iTaxon ]] = vdUnevenZeroCheck +# c_logrMaaslin$debug( paste( "Taxon transformation reversed due to unevenness of zero distribution.", iTaxon ) ) +# } +# } + + # Run association analysis if predictors exist and an analysis function is specified + # Run analysis + if(!is.na(funcAnalysis) ) + { + #If there are selected and forced fixed covariates + if( length( astrTerms ) ) + { + #Count the association attempt + lsData$lsQCCounts$iLms = lsData$lsQCCounts$iLms + 1 + + #Make the lm formula + #Build formula for simple mixed effects models using random covariates + strRandomCovariatesFormula = NULL + #Random covariates are forced + if(length(lsRandomCovariates)>0) + { + #Format for lme + #Needed for changes to not allowing random covariates through the boosting process + strRandomCovariatesFormula <- paste( "adCur ~ ", paste( sprintf( "1|`%s`", lsRandomCovariates), collapse = " + " )) + } + + #Set up a list of formula containing selected fixed variables changing and the forced fixed covariates constant + vstrFormula = c() + #Set up suppressing forced covariates in a all v all scenario only + asSuppress = c() + #Enable all against all comparisons + if(fAllvAll && !fIsUnivariate) + { + lsVaryingCovariates = setdiff(astrTerms,lsNonPenalizedPredictors) + lsConstantCovariates = setdiff(lsNonPenalizedPredictors,lsRandomCovariates) + strConstantFormula = paste( sprintf( "`%s`", lsConstantCovariates ), collapse = " + " ) + asSuppress = lsConstantCovariates + + if(length(lsVaryingCovariates)==0L) + { + vstrFormula <- c( paste( "adCur ~ ", paste( sprintf( "`%s`", lsConstantCovariates ), collapse = " + " )) ) + } else { + for( sVarCov in lsVaryingCovariates ) + { + strTempFormula = paste( "adCur ~ `", sVarCov,"`",sep="") + if(length(lsConstantCovariates)>0){ strTempFormula = paste(strTempFormula,strConstantFormula,sep=" + ") } + vstrFormula <- c( vstrFormula, strTempFormula ) + } + } + } else { + #This is either the multivariate case formula for all covariates in an lm or fixed covariates in the lmm + vstrFormula <- c( paste( "adCur ~ ", paste( sprintf( "`%s`", astrTerms ), collapse = " + " )) ) + } + + #Run the association + for( strAnalysisFormula in vstrFormula ) + { + i = length(llmod)+1 + llmod[[i]] = funcAnalysis(strFormula=strAnalysisFormula, frmeTmp=frmeTmp, iTaxon=iTaxon, lsHistory=list(adP=adP, lsSig=lsSig, lsQCCounts=lsData$lsQCCounts), strRandomFormula=strRandomCovariatesFormula, fZeroInflated=fZeroInflated) + + liTaxon[[i]] = iTaxon + lastrTerms[[i]] = funcFormulaStrToList(strAnalysisFormula) + } + } else { + #If there are no selected or forced fixed covariates + lsData$lsQCCounts$iNoTerms = lsData$lsQCCounts$iNoTerms + 1 + return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsData$lsQCCounts)) + } + } + + #Call funcBugResults and return it's return + if(!is.na(funcGetResult)) + { + #Format the results to a consistent expected result. + return( funcGetResult( llmod=llmod, frmeData=frmeData, liTaxon=liTaxon, dSig=dSig, adP=adP, lsSig=lsSig, strLog=strLog, lsQCCounts=lsData$lsQCCounts, lastrCols=lastrTerms, asSuppressCovariates=asSuppress ) ) + } else { + return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsData$lsQCCounts)) + } + ### List containing a list of pvalues, a list of significant data per association, and a list of QC data +}