Mercurial > repos > george-weingart > maaslin
comparison src/lib/BoostGLM.R @ 0:e0b5980139d9
maaslin
author | george-weingart |
---|---|
date | Tue, 13 May 2014 22:00:40 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e0b5980139d9 |
---|---|
1 ##################################################################################### | |
2 #Copyright (C) <2012> | |
3 # | |
4 #Permission is hereby granted, free of charge, to any person obtaining a copy of | |
5 #this software and associated documentation files (the "Software"), to deal in the | |
6 #Software without restriction, including without limitation the rights to use, copy, | |
7 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
8 #and to permit persons to whom the Software is furnished to do so, subject to | |
9 #the following conditions: | |
10 # | |
11 #The above copyright notice and this permission notice shall be included in all copies | |
12 #or substantial portions of the Software. | |
13 # | |
14 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | |
15 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A | |
16 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
17 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | |
18 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |
19 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
20 # | |
21 # This file is a component of the MaAsLin (Multivariate Associations Using Linear Models), | |
22 # authored by the Huttenhower lab at the Harvard School of Public Health | |
23 # (contact Timothy Tickle, ttickle@hsph.harvard.edu). | |
24 ##################################################################################### | |
25 | |
26 inlinedocs <- function( | |
27 ##author<< Curtis Huttenhower <chuttenh@hsph.harvard.edu> and Timothy Tickle <ttickle@hsph.harvard.edu> | |
28 ##description<< Manages the quality control of data and the performance of analysis (univariate or multivariate), regularization, and data (response) transformation. | |
29 ) { return( pArgs ) } | |
30 | |
31 ### Load libraries quietly | |
32 suppressMessages(library( gam, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
33 suppressMessages(library( gbm, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
34 suppressMessages(library( logging, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
35 suppressMessages(library( outliers, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
36 suppressMessages(library( robustbase, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
37 suppressMessages(library( pscl, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
38 | |
39 ### Get constants | |
40 #source(file.path("input","maaslin","src","Constants.R")) | |
41 #source("Constants.R") | |
42 | |
43 ## Get logger | |
44 c_logrMaaslin <- getLogger( "maaslin" ) | |
45 | |
46 funcDoGrubbs <- function( | |
47 ### Use the Grubbs Test to identify outliers | |
48 iData, | |
49 ### Column index in the data frame to test | |
50 frmeData, | |
51 ### The data frame holding the data | |
52 dPOutlier, | |
53 ### P-value threshold to indicate an outlier is significant | |
54 lsQC | |
55 ### List holding the QC info of the cleaning step. Which indices are outliers is added. | |
56 ){ | |
57 adData <- frmeData[,iData] | |
58 | |
59 # Original number of NA | |
60 viNAOrig = which(is.na(adData)) | |
61 | |
62 while( TRUE ) | |
63 { | |
64 lsTest <- try( grubbs.test( adData ), silent = TRUE ) | |
65 if( ( class( lsTest ) == "try-error" ) || is.na( lsTest$p.value ) || ( lsTest$p.value > dPOutlier ) ) | |
66 {break} | |
67 viOutliers = outlier( adData, logical = TRUE ) | |
68 adData[viOutliers] <- NA | |
69 } | |
70 | |
71 # Record removed data | |
72 viNAAfter = which(is.na(adData)) | |
73 | |
74 # If all were set to NA then ignore the filtering | |
75 if(length(adData)==length(viNAAfter)) | |
76 { | |
77 viNAAfter = viNAOrig | |
78 adData = frmeData[,iData] | |
79 c_logrMaaslin$info( paste("Grubbs Test:: Identifed all data as outliers so was inactived for index=",iData," data=",paste(as.vector(frmeData[,iData]),collapse=","), "number zeros=", length(which(frmeData[,iData]==0)), sep = " " )) | |
80 } else if(mean(adData, na.rm=TRUE) == 0) { | |
81 viNAAfter = viNAOrig | |
82 adData = frmeData[,iData] | |
83 c_logrMaaslin$info( paste("Grubbs Test::Removed all values but 0, ignored. Index=",iData,".",sep=" " ) ) | |
84 } else { | |
85 # Document removal | |
86 if( sum( is.na( adData ) ) ) | |
87 { | |
88 c_logrMaaslin$info( "Grubbs Test::Removing %d outliers from %s", sum( is.na( adData ) ), colnames(frmeData)[iData] ) | |
89 c_logrMaaslin$info( format( rownames( frmeData )[is.na( adData )] )) | |
90 } | |
91 } | |
92 | |
93 return(list(data=adData,outliers=length(viNAAfter)-length(viNAOrig),indices=setdiff(viNAAfter,viNAOrig))) | |
94 } | |
95 | |
96 funcDoFenceTest <- function( | |
97 ### Use a threshold based on the quartiles of the data to identify outliers | |
98 iData, | |
99 ### Column index in the data frame to test | |
100 frmeData, | |
101 ### The data frame holding the data | |
102 dFence | |
103 ### The fence outside the first and third quartiles to use as a threshold for cutt off. | |
104 ### This many times the interquartile range +/- to the 3rd/1st quartiles | |
105 ){ | |
106 # Establish fence | |
107 adData <- frmeData[,iData] | |
108 adQ <- quantile( adData, c(0.25, 0.5, 0.75), na.rm = TRUE ) | |
109 | |
110 dIQR <- adQ[3] - adQ[1] | |
111 if(!dIQR) | |
112 { | |
113 dIQR = sd(adData,na.rm = TRUE) | |
114 } | |
115 dUF <- adQ[3] + ( dFence * dIQR ) | |
116 dLF <- adQ[1] - ( dFence * dIQR ) | |
117 | |
118 # Record indices of values outside of fence to remove and remove. | |
119 aiRemove <- c() | |
120 for( j in 1:length( adData ) ) | |
121 { | |
122 d <- adData[j] | |
123 if( !is.na( d ) && ( ( d < dLF ) || ( d > dUF ) ) ) | |
124 { | |
125 aiRemove <- c(aiRemove, j) | |
126 } | |
127 } | |
128 | |
129 if(length(aiRemove)==length(adData)) | |
130 { | |
131 aiRemove = c() | |
132 c_logrMaaslin$info( "OutliersByFence:: Identified all data as outlier so was inactivated for index=", iData,"data=", paste(as.vector(frmeData[,iData]),collapse=","), "number zeros=", length(which(frmeData[,iData]==0)), sep=" " ) | |
133 } else { | |
134 adData[aiRemove] <- NA | |
135 | |
136 # Document to screen | |
137 if( length( aiRemove ) ) | |
138 { | |
139 c_logrMaaslin$info( "OutliersByFence::Removing %d outliers from %s", length( aiRemove ), colnames(frmeData)[iData] ) | |
140 c_logrMaaslin$info( format( rownames( frmeData )[aiRemove] )) | |
141 } | |
142 } | |
143 | |
144 return(list(data=adData,outliers=length(aiRemove),indices=aiRemove)) | |
145 } | |
146 | |
147 funcZerosAreUneven = function( | |
148 ### | |
149 vdRawData, | |
150 ### Raw data to be checked during transformation | |
151 funcTransform, | |
152 ### Data transform to perform | |
153 vsStratificationFeatures, | |
154 ### Groupings to check for unevenness | |
155 dfData | |
156 ### Data frame holding the features | |
157 ){ | |
158 # Return indicator of unevenness | |
159 fUneven = FALSE | |
160 | |
161 # Transform the data to compare | |
162 vdTransformed = funcTransform( vdRawData ) | |
163 | |
164 # Go through each stratification of data | |
165 for( sStratification in vsStratificationFeatures ) | |
166 { | |
167 # Current stratification | |
168 vFactorStrats = dfData[[ sStratification ]] | |
169 | |
170 # If the metadata is not a factor then skip | |
171 # Only binned data can be evaluated this way. | |
172 if( !is.factor( vFactorStrats )){ next } | |
173 | |
174 viZerosCountsRaw = c() | |
175 for( sLevel in levels( vFactorStrats ) ) | |
176 { | |
177 vdTest = vdRawData[ which( vFactorStrats == sLevel ) ] | |
178 viZerosCountsRaw = c( viZerosCountsRaw, length(which(vdTest == 0))) | |
179 vdTest = vdTransformed[ which( vFactorStrats == sLevel ) ] | |
180 } | |
181 dExpectation = 1 / length( viZerosCountsRaw ) | |
182 dMin = dExpectation / 2 | |
183 dMax = dExpectation + dMin | |
184 viZerosCountsRaw = viZerosCountsRaw / sum( viZerosCountsRaw ) | |
185 if( ( length( which( viZerosCountsRaw <= dMin ) ) > 0 ) || ( length( which( viZerosCountsRaw >= dMax ) ) > 0 ) ) | |
186 { | |
187 return( TRUE ) | |
188 } | |
189 } | |
190 return( fUneven ) | |
191 } | |
192 | |
193 funcTransformIncreasesOutliers = function( | |
194 ### Checks if a data transform increases outliers in a distribution | |
195 vdRawData, | |
196 ### Raw data to check for outlier zeros | |
197 funcTransform | |
198 ){ | |
199 iUnOutliers = length( boxplot( vdRawData, plot = FALSE )$out ) | |
200 iTransformedOutliers = length( boxplot( funcTransform( vdRawData ), plot = FALSE )$out ) | |
201 | |
202 return( iUnOutliers <= iTransformedOutliers ) | |
203 } | |
204 | |
205 funcClean <- function( | |
206 ### Properly clean / get data ready for analysis | |
207 ### Includes custom analysis from the custom R script if it exists | |
208 frmeData, | |
209 ### Data frame, input data to be acted on | |
210 funcDataProcess, | |
211 ### Custom script that can be given to perform specialized processing before MaAsLin does. | |
212 aiMetadata, | |
213 ### Indices of columns in frmeData which are metadata for analysis. | |
214 aiData, | |
215 ### Indices of column in frmeData which are (abundance) data for analysis. | |
216 lsQCCounts, | |
217 ### List that will hold the quality control information which is written in the output directory. | |
218 astrNoImpute = c(), | |
219 ### An array of column names of frmeData not to impute. | |
220 dMinSamp, | |
221 ### Minimum number of samples | |
222 dMinAbd, | |
223 # Minimum sample abundance | |
224 dFence, | |
225 ### How many quartile ranges defines the fence to define outliers. | |
226 funcTransform, | |
227 ### The data transformation function or a dummy function that does not affect the data | |
228 dPOutlier = 0.05 | |
229 ### The significance threshold for the grubbs test to identify an outlier. | |
230 ){ | |
231 # Call the custom script and set current data and indicies to the processes data and indicies. | |
232 c_logrMaaslin$debug( "Start Clean") | |
233 if( !is.null( funcDataProcess ) ) | |
234 { | |
235 c_logrMaaslin$debug("Additional preprocess function attempted.") | |
236 | |
237 pTmp <- funcDataProcess( frmeData=frmeData, aiMetadata=aiMetadata, aiData=aiData) | |
238 frmeData = pTmp$frmeData | |
239 aiMetadata = pTmp$aiMetadata | |
240 aiData = pTmp$aiData | |
241 lsQCCounts$lsQCCustom = pTmp$lsQCCounts | |
242 } | |
243 # Set data indicies after custom QC process. | |
244 lsQCCounts$aiAfterPreprocess = aiData | |
245 | |
246 # Remove missing data, remove any sample that has less than dMinSamp * the number of data or low abundance | |
247 aiRemove = c() | |
248 aiRemoveLowAbundance = c() | |
249 for( iCol in aiData ) | |
250 { | |
251 adCol = frmeData[,iCol] | |
252 adCol[!is.finite( adCol )] <- NA | |
253 if( ( sum( !is.na( adCol ) ) < ( dMinSamp * length( adCol ) ) ) || | |
254 ( length( unique( na.omit( adCol ) ) ) < 2 ) ) | |
255 { | |
256 aiRemove = c(aiRemove, iCol) | |
257 } | |
258 if( sum(adCol > dMinAbd, na.rm=TRUE ) < (dMinSamp * length( adCol))) | |
259 { | |
260 aiRemoveLowAbundance = c(aiRemoveLowAbundance, iCol) | |
261 } | |
262 } | |
263 # Remove and document | |
264 aiData = setdiff( aiData, aiRemove ) | |
265 aiData = setdiff( aiData, aiRemoveLowAbundance ) | |
266 lsQCCounts$iMissingData = aiRemove | |
267 lsQCCounts$iLowAbundanceData = aiRemoveLowAbundance | |
268 if(length(aiRemove)) | |
269 { | |
270 c_logrMaaslin$info( "Removing the following for data lower bound.") | |
271 c_logrMaaslin$info( format( colnames( frmeData )[aiRemove] )) | |
272 } | |
273 if(length(aiRemoveLowAbundance)) | |
274 { | |
275 c_logrMaaslin$info( "Removing the following for too many low abundance bugs.") | |
276 c_logrMaaslin$info( format( colnames( frmeData )[aiRemoveLowAbundance] )) | |
277 } | |
278 | |
279 #Transform data | |
280 iTransformed = 0 | |
281 viNotTransformedData = c() | |
282 for(aiDatum in aiData) | |
283 { | |
284 adValues = frmeData[,aiDatum] | |
285 # if( ! funcTransformIncreasesOutliers( adValues, funcTransform ) ) | |
286 # { | |
287 frmeData[,aiDatum] = funcTransform( adValues ) | |
288 # iTransformed = iTransformed + 1 | |
289 # } else { | |
290 # viNotTransformedData = c( viNotTransformedData, aiDatum ) | |
291 # } | |
292 } | |
293 c_logrMaaslin$info(paste("Number of features transformed = ",iTransformed)) | |
294 | |
295 # Metadata: Properly factorize all logical data and integer and number data with less than iNonFactorLevelThreshold | |
296 # Also record which are numeric metadata | |
297 aiNumericMetadata = c() | |
298 for( i in aiMetadata ) | |
299 { | |
300 if( ( class( frmeData[,i] ) %in% c("integer", "numeric", "logical") ) && | |
301 ( length( unique( frmeData[,i] ) ) < c_iNonFactorLevelThreshold ) ) { | |
302 c_logrMaaslin$debug(paste("Changing metadatum from numeric/integer/logical to factor",colnames(frmeData)[i],sep="=")) | |
303 frmeData[,i] = factor( frmeData[,i] ) | |
304 } | |
305 if( class( frmeData[,i] ) %in% c("integer","numeric") ) | |
306 { | |
307 aiNumericMetadata = c(aiNumericMetadata,i) | |
308 } | |
309 } | |
310 | |
311 # Remove outliers | |
312 # If the dFence Value is set use the method of defining the outllier as | |
313 # dFence * the interquartile range + or - the 3rd and first quartile respectively. | |
314 # If not the gibbs test is used. | |
315 lsQCCounts$aiDataSumOutlierPerDatum = c() | |
316 lsQCCounts$aiMetadataSumOutlierPerDatum = c() | |
317 lsQCCounts$liOutliers = list() | |
318 | |
319 if( dFence > 0.0 ) | |
320 { | |
321 # For data | |
322 for( iData in aiData ) | |
323 { | |
324 lOutlierInfo <- funcDoFenceTest(iData=iData,frmeData=frmeData,dFence=dFence) | |
325 frmeData[,iData] <- lOutlierInfo[["data"]] | |
326 lsQCCounts$aiDataSumOutlierPerDatum <- c(lsQCCounts$aiDataSumOutlierPerDatum,lOutlierInfo[["outliers"]]) | |
327 if(lOutlierInfo[["outliers"]]>0) | |
328 { | |
329 lsQCCounts$liOutliers[[paste(iData,sep="")]] <- lOutlierInfo[["indices"]] | |
330 } | |
331 } | |
332 | |
333 # Remove outlier non-factor metadata | |
334 for( iMetadata in aiNumericMetadata ) | |
335 { | |
336 lOutlierInfo <- funcDoFenceTest(iData=iMetadata,frmeData=frmeData,dFence=dFence) | |
337 frmeData[,iMetadata] <- lOutlierInfo[["data"]] | |
338 lsQCCounts$aiMetadataSumOutlierPerDatum <- c(lsQCCounts$aiMetadataSumOutlierPerDatum,lOutlierInfo[["outliers"]]) | |
339 if(lOutlierInfo[["outliers"]]>0) | |
340 { | |
341 lsQCCounts$liOutliers[[paste(iMetadata,sep="")]] <- lOutlierInfo[["indices"]] | |
342 } | |
343 } | |
344 #Do not use the fence, use the Grubbs test | |
345 } else if(dPOutlier!=0.0){ | |
346 # For data | |
347 for( iData in aiData ) | |
348 { | |
349 lOutlierInfo <- funcDoGrubbs(iData=iData,frmeData=frmeData,dPOutlier=dPOutlier) | |
350 frmeData[,iData] <- lOutlierInfo[["data"]] | |
351 lsQCCounts$aiDataSumOutlierPerDatum <- c(lsQCCounts$aiDataSumOutlierPerDatum,lOutlierInfo[["outliers"]]) | |
352 if(lOutlierInfo[["outliers"]]>0) | |
353 { | |
354 lsQCCounts$liOutliers[[paste(iData,sep="")]] <- lOutlierInfo[["indices"]] | |
355 } | |
356 } | |
357 for( iMetadata in aiNumericMetadata ) | |
358 { | |
359 lOutlierInfo <- funcDoGrubbs(iData=iMetadata,frmeData=frmeData,dPOutlier=dPOutlier) | |
360 frmeData[,iMetadata] <- lOutlierInfo[["data"]] | |
361 lsQCCounts$aiMetadataSumOutlierPerDatum <- c(lsQCCounts$aiMetadataSumOutlierPerDatum,lOutlierInfo[["outliers"]]) | |
362 if(lOutlierInfo[["outliers"]]>0) | |
363 { | |
364 lsQCCounts$liOutliers[[paste(iMetadata,sep="")]] <- lOutlierInfo[["indices"]] | |
365 } | |
366 } | |
367 } | |
368 | |
369 # Metadata: Remove missing data | |
370 # This is defined as if there is only one non-NA value or | |
371 # if the number of NOT NA data is less than a percentage of the data defined by dMinSamp | |
372 aiRemove = c() | |
373 for( iCol in c(aiMetadata) ) | |
374 { | |
375 adCol = frmeData[,iCol] | |
376 if( ( sum( !is.na( adCol ) ) < ( dMinSamp * length( adCol ) ) ) || | |
377 ( length( unique( na.omit( adCol ) ) ) < 2 ) ) | |
378 { | |
379 aiRemove = c(aiRemove, iCol) | |
380 } | |
381 } | |
382 | |
383 # Remove metadata | |
384 aiMetadata = setdiff( aiMetadata, aiRemove ) | |
385 | |
386 # Update the data which was removed. | |
387 lsQCCounts$iMissingMetadata = aiRemove | |
388 if(length(aiRemove)) | |
389 { | |
390 c_logrMaaslin$info("Removing the following metadata for too much missing data or only one data value outside of NA.") | |
391 c_logrMaaslin$info(format(colnames( frmeData )[aiRemove])) | |
392 } | |
393 | |
394 # Keep track of factor levels in a list for later use | |
395 lslsFactors <- list() | |
396 for( iCol in c(aiMetadata) ) | |
397 { | |
398 aCol <- frmeData[,iCol] | |
399 if( class( aCol ) == "factor" ) | |
400 { | |
401 lslsFactors[[length( lslsFactors ) + 1]] <- list(iCol, levels( aCol )) | |
402 } | |
403 } | |
404 | |
405 # Replace missing data values by the mean of the data column. | |
406 # Remove samples that were all NA from the cleaning and so could not be imputed. | |
407 aiRemoveData = c() | |
408 for( iCol in aiData ) | |
409 { | |
410 adCol <- frmeData[,iCol] | |
411 adCol[is.infinite( adCol )] <- NA | |
412 adCol[is.na( adCol )] <- mean( adCol[which(adCol>0)], na.rm = TRUE ) | |
413 frmeData[,iCol] <- adCol | |
414 | |
415 if(length(which(is.na(frmeData[,iCol]))) == length(frmeData[,iCol])) | |
416 { | |
417 c_logrMaaslin$info( paste("Removing data", iCol, "for being all NA after QC")) | |
418 aiRemoveData = c(aiRemoveData,iCol) | |
419 } | |
420 } | |
421 | |
422 # Remove and document | |
423 aiData = setdiff( aiData, aiRemoveData ) | |
424 lsQCCounts$iMissingData = c(lsQCCounts$iMissingData,aiRemoveData) | |
425 if(length(aiRemoveData)) | |
426 { | |
427 c_logrMaaslin$info( "Removing the following for having only NAs after cleaning (maybe due to only having NA after outlier testing).") | |
428 c_logrMaaslin$info( format( colnames( frmeData )[aiRemoveData] )) | |
429 } | |
430 | |
431 #Use na.gam.replace to manage NA metadata | |
432 aiTmp <- setdiff( aiMetadata, which( colnames( frmeData ) %in% astrNoImpute ) ) | |
433 # Keep tack of NAs so the may not be plotted later. | |
434 liNaIndices = list() | |
435 lsNames = names(frmeData) | |
436 for( i in aiTmp) | |
437 { | |
438 liNaIndices[[lsNames[i]]] = which(is.na(frmeData[,i])) | |
439 } | |
440 frmeData[,aiTmp] <- na.gam.replace( frmeData[,aiTmp] ) | |
441 | |
442 #If NA is a value in factor data, set the NA as a level. | |
443 for( lsFactor in lslsFactors ) | |
444 { | |
445 iCol <- lsFactor[[1]] | |
446 aCol <- frmeData[,iCol] | |
447 if( "NA" %in% levels( aCol ) ) | |
448 { | |
449 if(! lsNames[iCol] %in% astrNoImpute) | |
450 { | |
451 liNaIndices[[lsNames[iCol]]] = union(which(is.na(frmeData[,iCol])),which(frmeData[,iCol]=="NA")) | |
452 } | |
453 frmeData[,iCol] <- factor( aCol, levels = c(lsFactor[[2]], "NA") ) | |
454 } | |
455 } | |
456 | |
457 # Make sure there is a minimum number of non-0 measurements | |
458 aiRemove = c() | |
459 for( iCol in aiData ) | |
460 { | |
461 adCol = frmeData[,iCol] | |
462 if(length( which(adCol!=0)) < ( dMinSamp * length( adCol ) ) ) | |
463 { | |
464 aiRemove = c(aiRemove, iCol) | |
465 } | |
466 } | |
467 | |
468 # Remove and document | |
469 aiData = setdiff( aiData, aiRemove) | |
470 lsQCCounts$iZeroDominantData = aiRemove | |
471 if(length(aiRemove)) | |
472 { | |
473 c_logrMaaslin$info( "Removing the following for having not enough non-zero measurments for analysis.") | |
474 c_logrMaaslin$info( format( colnames( frmeData )[aiRemove] )) | |
475 } | |
476 | |
477 c_logrMaaslin$debug("End FuncClean") | |
478 return( list(frmeData = frmeData, aiMetadata = aiMetadata, aiData = aiData, lsQCCounts = lsQCCounts, liNaIndices=liNaIndices, viNotTransformedData = viNotTransformedData) ) | |
479 ### Return list of | |
480 ### frmeData: The Data after cleaning | |
481 ### aiMetadata: The indices of the metadata still being used after filtering | |
482 ### aiData: The indices of the data still being used after filtering | |
483 ### lsQCCOunts: QC info | |
484 } | |
485 | |
486 funcBugs <- function( | |
487 ### Run analysis of all data features against all metadata | |
488 frmeData, | |
489 ### Cleaned data including metadata, and data | |
490 lsData, | |
491 ### This list is a general container for data as the analysis occurs, think about it as a cache for the analysis | |
492 aiMetadata, | |
493 ### Indices of metadata used in analysis | |
494 aiData, | |
495 ### Indices of response data | |
496 aiNotTransformedData, | |
497 ### Indicies of the data not transformed | |
498 strData, | |
499 ### Log file name | |
500 dSig, | |
501 ### Significance threshold for the qvalue cut off | |
502 fInvert=FALSE, | |
503 ### Invert images to have a black background | |
504 strDirOut = NA, | |
505 ### Output project directory | |
506 funcReg=NULL, | |
507 ### Function for regularization | |
508 funcTransform=NULL, | |
509 ### Function used to transform the data | |
510 funcUnTransform=NULL, | |
511 ### If a transform is used the opposite of that transfor must be used on the residuals in the partial residual plots | |
512 lsNonPenalizedPredictors=NULL, | |
513 ### These predictors will not be penalized in the feature (model) selection step | |
514 funcAnalysis=NULL, | |
515 ### Function to perform association analysis | |
516 lsRandomCovariates=NULL, | |
517 ### List of string names of metadata which will be treated as random covariates | |
518 funcGetResults=NULL, | |
519 ### Function to unpack results from analysis | |
520 fDoRPlot=TRUE, | |
521 ### Plot residuals | |
522 fOmitLogFile = FALSE, | |
523 ### Stops the creation of the log file | |
524 fAllvAll=FALSE, | |
525 ### Flag to turn on all against all comparisons | |
526 liNaIndices = list(), | |
527 ### Indicies of imputed NA data | |
528 lxParameters=list(), | |
529 ### List holds parameters for different variable selection techniques | |
530 strTestingCorrection = "BH", | |
531 ### Correction for multiple testing | |
532 fIsUnivariate = FALSE, | |
533 ### Indicates if the function is univariate | |
534 fZeroInflated = FALSE | |
535 ### Indicates to use a zero infalted model | |
536 ){ | |
537 c_logrMaaslin$debug("Start funcBugs") | |
538 # If no output directory is indicated | |
539 # Then make it the current directory | |
540 if( is.na( strDirOut ) || is.null( strDirOut ) ) | |
541 { | |
542 if( !is.na( strData ) ) | |
543 { | |
544 strDirOut <- paste( dirname( strData ), "/", sep = "" ) | |
545 } else { strDirOut = "" } | |
546 } | |
547 | |
548 # Make th log file and output file names based on the log file name | |
549 strLog = NA | |
550 strBase = "" | |
551 if(!is.na(strData)) | |
552 { | |
553 strBaseOut <- paste( strDirOut, sub( "\\.([^.]+)$", "", basename(strData) ), sep = "/" ) | |
554 strLog <- paste( strBaseOut,c_sLogFileSuffix, ".txt", sep = "" ) | |
555 } | |
556 | |
557 # If indicated, stop the creation of the log file | |
558 # Otherwise delete the log file if it exists and log | |
559 if(fOmitLogFile){ strLog = NA } | |
560 if(!is.na(strLog)) | |
561 { | |
562 c_logrMaaslin$info( "Outputting to: %s", strLog ) | |
563 unlink( strLog ) | |
564 } | |
565 | |
566 # Will contain pvalues | |
567 adP = c() | |
568 adPAdj = c() | |
569 | |
570 # List of lists with association information | |
571 lsSig <- list() | |
572 # Go through each data that was not previously removed and perform inference | |
573 for( iTaxon in aiData ) | |
574 { | |
575 # Log to screen progress per 10 associations. | |
576 # Can be thown off if iTaxon is missing a mod 10 value | |
577 # So the taxons may not be logged every 10 but not a big deal | |
578 if( !( iTaxon %% 10 ) ) | |
579 { | |
580 c_logrMaaslin$info( "Taxon %d/%d", iTaxon, max( aiData ) ) | |
581 } | |
582 | |
583 # Call analysis method | |
584 lsOne <- funcBugHybrid( iTaxon=iTaxon, frmeData=frmeData, lsData=lsData, aiMetadata=aiMetadata, dSig=dSig, adP=adP, lsSig=lsSig, funcTransform=funcTransform, funcUnTransform=funcUnTransform, strLog=strLog, funcReg=funcReg, lsNonPenalizedPredictors=lsNonPenalizedPredictors, funcAnalysis=funcAnalysis, lsRandomCovariates=lsRandomCovariates, funcGetResult=funcGetResults, fAllvAll=fAllvAll, fIsUnivariate=fIsUnivariate, lxParameters=lxParameters, fZeroInflated=fZeroInflated, fIsTransformed= ! iTaxon %in% aiNotTransformedData ) | |
585 | |
586 # If you get a NA (happens when the lmm gets all random covariates) move on | |
587 if( is.na( lsOne ) ){ next } | |
588 | |
589 # The updating of the following happens in the inference method call in the funcBugHybrid call | |
590 # New pvalue array | |
591 adP <- lsOne$adP | |
592 # New lsSig contains data about significant feature v metadata comparisons | |
593 lsSig <- lsOne$lsSig | |
594 # New qc data | |
595 lsData$lsQCCounts = lsOne$lsQCCounts | |
596 } | |
597 | |
598 # Log the QC info | |
599 c_logrMaaslin$debug("lsData$lsQCCounts") | |
600 c_logrMaaslin$debug(format(lsData$lsQCCounts)) | |
601 | |
602 if( is.null( adP ) ) { return( NULL ) } | |
603 | |
604 # Perform bonferonni corrections on factor data (for levels), calculate the number of tests performed, and FDR adjust for multiple hypotheses | |
605 # Perform Bonferonni adjustment on factor data | |
606 for( iADIndex in 1:length( adP ) ) | |
607 { | |
608 # Only perform on factor data | |
609 if( is.factor( lsSig[[ iADIndex ]]$metadata ) ) | |
610 { | |
611 adPAdj = c( adPAdj, funcBonferonniCorrectFactorData( dPvalue = adP[ iADIndex ], vsFactors = lsSig[[ iADIndex ]]$metadata, fIgnoreNAs = length(liNaIndices)>0) ) | |
612 } else { | |
613 adPAdj = c( adPAdj, adP[ iADIndex ] ) | |
614 } | |
615 } | |
616 | |
617 iTests = funcCalculateTestCounts(iDataCount = length(aiData), asMetadata = intersect( lsData$astrMetadata, colnames( frmeData )[aiMetadata] ), asForced = lsNonPenalizedPredictors, asRandom = lsRandomCovariates, fAllvAll = fAllvAll) | |
618 | |
619 #Get indices of sorted data after the factor correction but before the multiple hypothesis corrections. | |
620 aiSig <- sort.list( adPAdj ) | |
621 | |
622 # Perform FDR BH | |
623 adQ = p.adjust(adPAdj, method=strTestingCorrection, n=max(length(adPAdj), iTests)) | |
624 | |
625 # Find all covariates that had significant associations | |
626 astrNames <- c() | |
627 for( i in 1:length( lsSig ) ) | |
628 { | |
629 astrNames <- c(astrNames, lsSig[[i]]$name) | |
630 } | |
631 astrNames <- unique( astrNames ) | |
632 | |
633 # Sets up named label return for global plotting | |
634 lsReturnTaxa <- list() | |
635 for( j in aiSig ) | |
636 { | |
637 if( adQ[j] > dSig ) { next } | |
638 strTaxon <- lsSig[[j]]$taxon | |
639 if(strTaxon %in% names(lsReturnTaxa)) | |
640 { | |
641 lsReturnTaxa[[strTaxon]] = min(lsReturnTaxa[[strTaxon]],adQ[j]) | |
642 } else { lsReturnTaxa[[strTaxon]] = adQ[j]} | |
643 } | |
644 | |
645 # For each covariate with significant associations | |
646 # Write out a file with association information | |
647 for( strName in astrNames ) | |
648 { | |
649 strFileTXT <- NA | |
650 strFilePDF <- NA | |
651 for( j in aiSig ) | |
652 { | |
653 lsCur <- lsSig[[j]] | |
654 strCur <- lsCur$name | |
655 | |
656 if( strCur != strName ) { next } | |
657 | |
658 strTaxon <- lsCur$taxon | |
659 adData <- lsCur$data | |
660 astrFactors <- lsCur$factors | |
661 adCur <- lsCur$metadata | |
662 adY <- adData | |
663 | |
664 if( is.na( strData ) ) { next } | |
665 | |
666 ## If the text file output is not written to yet | |
667 ## make the file names, and delete any previous file output | |
668 if( is.na( strFileTXT ) ) | |
669 { | |
670 strFileTXT <- sprintf( "%s-%s.txt", strBaseOut, strName ) | |
671 unlink(strFileTXT) | |
672 funcWrite( c("Variable", "Feature", "Value", "Coefficient", "N", "N not 0", "P-value", "Q-value"), strFileTXT ) | |
673 } | |
674 | |
675 ## Write text output | |
676 funcWrite( c(strName, strTaxon, lsCur$orig, lsCur$value, length( adData ), sum( adData > 0 ), adP[j], adQ[j]), strFileTXT ) | |
677 | |
678 ## If the significance meets the threshold | |
679 ## Write PDF file output | |
680 if( adQ[j] > dSig ) { next } | |
681 | |
682 # Do not make residuals plots if univariate is selected | |
683 strFilePDF = funcPDF( frmeTmp=frmeData, lsCur=lsCur, curPValue=adP[j], curQValue=adQ[j], strFilePDF=strFilePDF, strBaseOut=strBaseOut, strName=strName, funcUnTransform= funcUnTransform, fDoResidualPlot=fDoRPlot, fInvert=fInvert, liNaIndices=liNaIndices ) | |
684 } | |
685 if( dev.cur( ) != 1 ) { dev.off( ) } | |
686 } | |
687 aiTmp <- aiData | |
688 | |
689 logdebug("End funcBugs", c_logMaaslin) | |
690 return(list(lsReturnBugs=lsReturnTaxa, lsQCCounts=lsData$lsQCCounts)) | |
691 ### List of data features successfully associated without error and quality control data | |
692 } | |
693 | |
694 #Lightly Tested | |
695 ### Performs analysis for 1 feature | |
696 ### iTaxon: integer Taxon index to be associated with data | |
697 ### frmeData: Data frame The full data | |
698 ### lsData: List of all associated data | |
699 ### aiMetadata: Numeric vector of indices | |
700 ### dSig: Numeric significance threshold for q-value cut off | |
701 ### adP: List of pvalues from associations | |
702 ### lsSig: List which serves as a cache of data about significant associations | |
703 ### strLog: String file to log to | |
704 funcBugHybrid <- function( | |
705 ### Performs analysis for 1 feature | |
706 iTaxon, | |
707 ### integer Taxon index to be associated with data | |
708 frmeData, | |
709 ### Data frame, the full data | |
710 lsData, | |
711 ### List of all associated data | |
712 aiMetadata, | |
713 ### Numeric vector of indices | |
714 dSig, | |
715 ### Numeric significance threshold for q-value cut off | |
716 adP, | |
717 ### List of pvalues from associations | |
718 lsSig, | |
719 ### List which serves as a cache of data about significant associations | |
720 funcTransform, | |
721 ### The tranform used on the data | |
722 funcUnTransform, | |
723 ### The reverse transform on the data | |
724 strLog = NA, | |
725 ### String, file to which to log | |
726 funcReg=NULL, | |
727 ### Function to perform regularization | |
728 lsNonPenalizedPredictors=NULL, | |
729 ### These predictors will not be penalized in the feature (model) selection step | |
730 funcAnalysis=NULL, | |
731 ### Function to perform association analysis | |
732 lsRandomCovariates=NULL, | |
733 ### List of string names of metadata which will be treated as random covariates | |
734 funcGetResult=NULL, | |
735 ### Function to unpack results from analysis | |
736 fAllvAll=FALSE, | |
737 ### Flag to turn on all against all comparisons | |
738 fIsUnivariate = FALSE, | |
739 ### Indicates the analysis function is univariate | |
740 lxParameters=list(), | |
741 ### List holds parameters for different variable selection techniques | |
742 fZeroInflated = FALSE, | |
743 ### Indicates if to use a zero infalted model | |
744 fIsTransformed = TRUE | |
745 ### Indicates that the bug is transformed | |
746 ){ | |
747 #dTime00 <- proc.time()[3] | |
748 #Get metadata column names | |
749 astrMetadata = intersect( lsData$astrMetadata, colnames( frmeData )[aiMetadata] ) | |
750 | |
751 #Get data measurements that are not NA | |
752 aiRows <- which( !is.na( frmeData[,iTaxon] ) ) | |
753 | |
754 #Get the dataframe of non-na data measurements | |
755 frmeTmp <- frmeData[aiRows,] | |
756 | |
757 #Set the min boosting selection frequency to a default if not given | |
758 if( is.na( lxParameters$dFreq ) ) | |
759 { | |
760 lxParameters$dFreq <- 0.5 / length( c(astrMetadata) ) | |
761 } | |
762 | |
763 # Get the full data for the bug feature | |
764 adCur = frmeTmp[,iTaxon] | |
765 lxParameters$sBugName = names(frmeTmp[iTaxon]) | |
766 | |
767 # This can run multiple models so some of the results are held in lists and some are not | |
768 llmod = list() | |
769 liTaxon = list() | |
770 lastrTerms = list() | |
771 | |
772 # Build formula for simple mixed effects models | |
773 # Removes random covariates from variable selection | |
774 astrMetadata = setdiff(astrMetadata, lsRandomCovariates) | |
775 strFormula <- paste( "adCur ~", paste( sprintf( "`%s`", astrMetadata ), collapse = " + " ), sep = " " ) | |
776 | |
777 # Document the model | |
778 funcWrite( c("#taxon", colnames( frmeTmp )[iTaxon]), strLog ) | |
779 funcWrite( c("#metadata", astrMetadata), strLog ) | |
780 funcWrite( c("#samples", rownames( frmeTmp )), strLog ) | |
781 | |
782 #Model terms | |
783 astrTerms <- c() | |
784 | |
785 # Attempt feature (model) selection | |
786 if(!is.na(funcReg)) | |
787 { | |
788 #Count model selection method attempts | |
789 lsData$lsQCCounts$iBoosts = lsData$lsQCCounts$iBoosts + 1 | |
790 #Perform model selection | |
791 astrTerms <- funcReg(strFormula=strFormula, frmeTmp=frmeTmp, adCur=adCur, lsParameters=lxParameters, lsForcedParameters=lsNonPenalizedPredictors, strLog=strLog) | |
792 #If the feature selection function is set to None, set all terms of the model to all the metadata | |
793 } else { astrTerms = astrMetadata } | |
794 | |
795 # Get look through the boosting results to get a model | |
796 # Holds the predictors in the predictors in the model that were selected by the boosting | |
797 if(is.null( astrTerms )){lsData$lsQCCounts$iBoostErrors = lsData$lsQCCounts$iBoostErrors + 1} | |
798 | |
799 # Get the indices that are transformed | |
800 # Of those indices check for uneven metadata | |
801 # Untransform any of the metadata that failed | |
802 # Failed means true for uneven occurences of zeros | |
803 # if( fIsTransformed ) | |
804 # { | |
805 # vdUnevenZeroCheck = funcUnTransform( frmeData[[ iTaxon ]] ) | |
806 # if( funcZerosAreUneven( vdRawData=vdUnevenZeroCheck, funcTransform=funcTransform, vsStratificationFeatures=astrTerms, dfData=frmeData ) ) | |
807 # { | |
808 # frmeData[[ iTaxon ]] = vdUnevenZeroCheck | |
809 # c_logrMaaslin$debug( paste( "Taxon transformation reversed due to unevenness of zero distribution.", iTaxon ) ) | |
810 # } | |
811 # } | |
812 | |
813 # Run association analysis if predictors exist and an analysis function is specified | |
814 # Run analysis | |
815 if(!is.na(funcAnalysis) ) | |
816 { | |
817 #If there are selected and forced fixed covariates | |
818 if( length( astrTerms ) ) | |
819 { | |
820 #Count the association attempt | |
821 lsData$lsQCCounts$iLms = lsData$lsQCCounts$iLms + 1 | |
822 | |
823 #Make the lm formula | |
824 #Build formula for simple mixed effects models using random covariates | |
825 strRandomCovariatesFormula = NULL | |
826 #Random covariates are forced | |
827 if(length(lsRandomCovariates)>0) | |
828 { | |
829 #Format for lme | |
830 #Needed for changes to not allowing random covariates through the boosting process | |
831 strRandomCovariatesFormula <- paste( "adCur ~ ", paste( sprintf( "1|`%s`", lsRandomCovariates), collapse = " + " )) | |
832 } | |
833 | |
834 #Set up a list of formula containing selected fixed variables changing and the forced fixed covariates constant | |
835 vstrFormula = c() | |
836 #Set up suppressing forced covariates in a all v all scenario only | |
837 asSuppress = c() | |
838 #Enable all against all comparisons | |
839 if(fAllvAll && !fIsUnivariate) | |
840 { | |
841 lsVaryingCovariates = setdiff(astrTerms,lsNonPenalizedPredictors) | |
842 lsConstantCovariates = setdiff(lsNonPenalizedPredictors,lsRandomCovariates) | |
843 strConstantFormula = paste( sprintf( "`%s`", lsConstantCovariates ), collapse = " + " ) | |
844 asSuppress = lsConstantCovariates | |
845 | |
846 if(length(lsVaryingCovariates)==0L) | |
847 { | |
848 vstrFormula <- c( paste( "adCur ~ ", paste( sprintf( "`%s`", lsConstantCovariates ), collapse = " + " )) ) | |
849 } else { | |
850 for( sVarCov in lsVaryingCovariates ) | |
851 { | |
852 strTempFormula = paste( "adCur ~ `", sVarCov,"`",sep="") | |
853 if(length(lsConstantCovariates)>0){ strTempFormula = paste(strTempFormula,strConstantFormula,sep=" + ") } | |
854 vstrFormula <- c( vstrFormula, strTempFormula ) | |
855 } | |
856 } | |
857 } else { | |
858 #This is either the multivariate case formula for all covariates in an lm or fixed covariates in the lmm | |
859 vstrFormula <- c( paste( "adCur ~ ", paste( sprintf( "`%s`", astrTerms ), collapse = " + " )) ) | |
860 } | |
861 | |
862 #Run the association | |
863 for( strAnalysisFormula in vstrFormula ) | |
864 { | |
865 i = length(llmod)+1 | |
866 llmod[[i]] = funcAnalysis(strFormula=strAnalysisFormula, frmeTmp=frmeTmp, iTaxon=iTaxon, lsHistory=list(adP=adP, lsSig=lsSig, lsQCCounts=lsData$lsQCCounts), strRandomFormula=strRandomCovariatesFormula, fZeroInflated=fZeroInflated) | |
867 | |
868 liTaxon[[i]] = iTaxon | |
869 lastrTerms[[i]] = funcFormulaStrToList(strAnalysisFormula) | |
870 } | |
871 } else { | |
872 #If there are no selected or forced fixed covariates | |
873 lsData$lsQCCounts$iNoTerms = lsData$lsQCCounts$iNoTerms + 1 | |
874 return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsData$lsQCCounts)) | |
875 } | |
876 } | |
877 | |
878 #Call funcBugResults and return it's return | |
879 if(!is.na(funcGetResult)) | |
880 { | |
881 #Format the results to a consistent expected result. | |
882 return( funcGetResult( llmod=llmod, frmeData=frmeData, liTaxon=liTaxon, dSig=dSig, adP=adP, lsSig=lsSig, strLog=strLog, lsQCCounts=lsData$lsQCCounts, lastrCols=lastrTerms, asSuppressCovariates=asSuppress ) ) | |
883 } else { | |
884 return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsData$lsQCCounts)) | |
885 } | |
886 ### List containing a list of pvalues, a list of significant data per association, and a list of QC data | |
887 } |