Mercurial > repos > george-weingart > maaslin
comparison maaslin-4450aa4ecc84/src/lib/AnalysisModules.R @ 1:a87d5a5f2776
Uploaded the version running on the prod server
author | george-weingart |
---|---|
date | Sun, 08 Feb 2015 23:08:38 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:e0b5980139d9 | 1:a87d5a5f2776 |
---|---|
1 ##################################################################################### | |
2 #Copyright (C) <2012> | |
3 # | |
4 #Permission is hereby granted, free of charge, to any person obtaining a copy of | |
5 #this software and associated documentation files (the "Software"), to deal in the | |
6 #Software without restriction, including without limitation the rights to use, copy, | |
7 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
8 #and to permit persons to whom the Software is furnished to do so, subject to | |
9 #the following conditions: | |
10 # | |
11 #The above copyright notice and this permission notice shall be included in all copies | |
12 #or substantial portions of the Software. | |
13 # | |
14 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | |
15 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A | |
16 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
17 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | |
18 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |
19 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
20 # | |
21 # This file is a component of the MaAsLin (Multivariate Associations Using Linear Models), | |
22 # authored by the Huttenhower lab at the Harvard School of Public Health | |
23 # (contact Timothy Tickle, ttickle@hsph.harvard.edu). | |
24 ##################################################################################### | |
25 | |
26 inlinedocs <- function( | |
27 ##author<< Curtis Huttenhower <chuttenh@hsph.harvard.edu> and Timothy Tickle <ttickle@hsph.harvard.edu> | |
28 ##description<< Allows one to plug in new modules to perform analysis (univariate or multivariate), regularization, and data (response) transformation. | |
29 ) { return( pArgs ) } | |
30 | |
31 # Libraries | |
32 suppressMessages(library( agricolae, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
33 # Needed for the pot-hoc Kruskal wallis comparisons | |
34 suppressMessages(library( penalized, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
35 # Needed for stepAIC | |
36 suppressMessages(library( MASS, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
37 # Needed for na action behavior | |
38 suppressMessages(library( gam, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
39 # Needed for boosting | |
40 suppressMessages(library( gbm, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
41 # Needed for LASSO | |
42 suppressMessages(library( glmnet, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
43 # Needed for mixed models | |
44 #suppressMessages(library( lme4, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
45 suppressMessages(library( nlme, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
46 | |
47 # Needed for zero inflated models | |
48 #suppressMessages(library( MCMCglmm, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
49 suppressMessages(library( pscl, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
50 suppressMessages(library( gamlss, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
51 # Do not use #suppressMessages(library( glmmADMB, warn.conflicts=FALSE, quietly=TRUE, verbose=FALSE)) | |
52 | |
53 fAddBack = TRUE | |
54 dUnevenMax = .9 | |
55 | |
56 | |
57 ### Helper functions | |
58 # OK | |
59 funcMakeContrasts <- function | |
60 ### Makes univariate contrasts of all predictors in the model formula with the response. | |
61 (strFormula, | |
62 ### lm style string defining reponse and predictors | |
63 strRandomFormula, | |
64 ### mixed model string defining the fixed covariates | |
65 frmeTmp, | |
66 ### The data frame to find predictor data in | |
67 iTaxon, | |
68 ### Taxon | |
69 functionContrast, | |
70 ### functionContrast The univariate test to perform | |
71 lsQCCounts | |
72 ### QC info | |
73 ){ | |
74 #TODO are we updating the QCCounts? | |
75 lsSig = list() | |
76 ### Holds all the significance results from the tests | |
77 adP = c() | |
78 ### Holds the p-values | |
79 sCurDataName = names(frmeTmp)[iTaxon] | |
80 ### The name of the taxon (data row) that is being associated (always assumed to be numeric) | |
81 #Get test comparisons (predictor names from formula string) | |
82 asComparisons = unique(c(funcFormulaStrToList(strFormula),funcFormulaStrToList(strRandomFormula))) | |
83 | |
84 #Change metadata in formula to univariate comparisons | |
85 for(sComparison in asComparisons) | |
86 { | |
87 # Metadata values | |
88 vxTest = frmeTmp[[sComparison]] | |
89 | |
90 # Get the levels in the comparison | |
91 # Can ignore the first level because it is the reference level | |
92 asLevels = sComparison | |
93 if(is.factor(vxTest)){asLevels = levels(vxTest)[2:length(vxTest)]} | |
94 | |
95 lasComparisonResults = functionContrast(x=sComparison, adCur=frmeTmp[[sCurDataName]], dfData=frmeTmp) | |
96 for(asComparison in lasComparisonResults) | |
97 { | |
98 if( is.na( asComparison$p.value ) ) { next } | |
99 # Get pvalue | |
100 adP = c(adP, asComparison$p.value) | |
101 # Get SD, if not available, give SD of covariate | |
102 dSTD = asComparison$SD | |
103 # TODO Is using sd on factor and binary data correct? | |
104 if(is.na(dSTD) || is.null(dSTD)){dSTD = sd(vxTest)} | |
105 | |
106 lsSig[[length( lsSig ) + 1]] <- list( | |
107 #Current metadata name (string column name) ok | |
108 name = sComparison, | |
109 #Current metadatda name (string, as a factor level if existing as such) ok | |
110 orig = asComparison$name, | |
111 #Taxon feature name (string) ok | |
112 taxon = colnames( frmeTmp )[iTaxon], | |
113 #Taxon data / response (double vector) ok | |
114 data = frmeTmp[,iTaxon], | |
115 #Name of column ok | |
116 factors = sComparison, | |
117 #Metadata values (metadata as a factor or raw numeric) ok | |
118 metadata = vxTest, | |
119 #Current coefficient value (named coef value with level name (from coefs) ok | |
120 value = asComparison$coef, | |
121 #Standard deviation (numeric) ok | |
122 std = dSTD, | |
123 #Model coefficients (output from coefs with intercept) ok | |
124 allCoefs = asComparison$coef) | |
125 } | |
126 } | |
127 return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts)) | |
128 ### Returns a list of p-value, standard deviation, and comparison which produced the p-value | |
129 } | |
130 | |
131 #Ok | |
132 funcGetStepPredictors <- function | |
133 ### Retrieve the predictors of the reduced model after stepwise model selection | |
134 (lmod, | |
135 ### Linear model resulting from step-wise model selection | |
136 frmeTmp, | |
137 strLog | |
138 ### File to document logging | |
139 ){ | |
140 #Document | |
141 funcWrite( "#model", strLog ) | |
142 funcWrite( lmod$fit, strLog ) | |
143 #TODO funcWrite( lmod$train.error, strLog ) | |
144 #TODO funcWrite( lmod$Terms, strLog ) | |
145 funcWrite( "#summary-gbm", strLog ) | |
146 funcWrite( summary(lmod), strLog ) | |
147 | |
148 #Get Names from coefficients | |
149 asStepCoefsFactors = coefficients(lmod) | |
150 astrCoefNames = setdiff(names(asStepCoefsFactors[as.vector(!is.na(asStepCoefsFactors))==TRUE]),"(Intercept)") | |
151 asStepCoefsFactors = unique(as.vector(sapply(astrCoefNames,funcCoef2Col, frmeData=frmeTmp))) | |
152 | |
153 if(length(asStepCoefsFactors)<1){ return(NA) } | |
154 return( asStepCoefsFactors ) | |
155 ### Vector of string predictor names | |
156 } | |
157 | |
158 funcGetUnivariateResults <- function | |
159 ### Reduce the list of list of results to the correct format | |
160 ( llmod, | |
161 ### The list of univariate models | |
162 frmeData, | |
163 ### Data analysis is performed on | |
164 liTaxon, | |
165 ### The response id | |
166 dSig, | |
167 ### Significance level for q-values | |
168 adP, | |
169 ### List of pvalues from all associations performed | |
170 lsSig, | |
171 ### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients | |
172 strLog, | |
173 ### File to which to document logging | |
174 lsQCCounts, | |
175 ### Records of counts associated with quality control | |
176 lastrCols, | |
177 ### Predictors used in the association | |
178 asSuppressCovariates=c() | |
179 ### Vector of covariates to suppress and not give results for | |
180 ){ | |
181 adP = c() | |
182 lsSig = list() | |
183 for(lmod in llmod) | |
184 { | |
185 adP = c(adP,lmod$adP) | |
186 lsSig = c(lsSig,lmod$lsSig) | |
187 } | |
188 return(list(adP=adP, lsSig=lsSig, lsQCCounts=llmod[length(llmod)]$lsQCCounts)) | |
189 } | |
190 | |
191 # OK | |
192 funcGetLMResults <- function | |
193 ### Reduce the lm object return to just the data needed for further analysis | |
194 ( llmod, | |
195 ### The result from a linear model | |
196 frmeData, | |
197 ### Data analysis is performed on | |
198 liTaxon, | |
199 ### The response id | |
200 dSig, | |
201 ### Significance level for q-values | |
202 adP, | |
203 ### List of pvalues from all associations performed | |
204 lsSig, | |
205 ### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients | |
206 strLog, | |
207 ### File to which to document logging | |
208 lsQCCounts, | |
209 ### Records of counts associated with quality control | |
210 lastrCols, | |
211 ### Predictors used in the association | |
212 asSuppressCovariates=c() | |
213 ### Vector of covariates to suppress and not give results for | |
214 ){ | |
215 ilmodIndex = 0 | |
216 for( lmod in llmod ) | |
217 { | |
218 ilmodIndex = ilmodIndex + 1 | |
219 lmod = llmod[[ ilmodIndex ]] | |
220 iTaxon = liTaxon[[ ilmodIndex ]] | |
221 astrCols = lastrCols[[ ilmodIndex ]] | |
222 | |
223 #Exclude none and errors | |
224 if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) ) | |
225 { | |
226 #holds the location of the pvlaues if an lm, if lmm is detected this will be changed | |
227 iPValuePosition = 4 | |
228 | |
229 #Get the column name of the iTaxon index | |
230 #frmeTmp needs to be what? | |
231 strTaxon = colnames( frmeData )[iTaxon] | |
232 #Get summary information from the linear model | |
233 lsSum = try( summary( lmod ) ) | |
234 #The following can actually happen when the stranger regressors return broken results | |
235 if( class( lsSum ) == "try-error" ) | |
236 { | |
237 next | |
238 } | |
239 | |
240 #Write summary information to log file | |
241 funcWrite( "#model", strLog ) | |
242 funcWrite( lmod, strLog ) | |
243 funcWrite( "#summary", strLog ) | |
244 #Unbelievably, some of the more unusual regression methods crash out when _printing_ their results | |
245 try( funcWrite( lsSum, strLog ) ) | |
246 | |
247 #Get the coefficients | |
248 #This should work for linear models | |
249 frmeCoefs <- try( coefficients( lsSum ) ) | |
250 | |
251 if( ( class(frmeCoefs ) == "try-error" ) || is.null( frmeCoefs ) ) | |
252 { | |
253 adCoefs = try( coefficients( lmod )) | |
254 if(class( adCoefs ) == "try-error") | |
255 { | |
256 adCoefs = coef(lmod) | |
257 } | |
258 frmeCoefs <- NA | |
259 } else { | |
260 if( class( frmeCoefs ) == "list" ) | |
261 { | |
262 frmeCoefs <- frmeCoefs$count | |
263 } | |
264 adCoefs = frmeCoefs[,1] | |
265 } | |
266 | |
267 #Go through each coefficient | |
268 astrRows <- names( adCoefs ) | |
269 | |
270 ##lmm | |
271 if( is.null( astrRows ) ) | |
272 { | |
273 astrRows = rownames( lsSum$tTable ) | |
274 frmeCoefs = lsSum$tTable | |
275 iPValuePosition = 5 | |
276 adCoefs = frmeCoefs[,1] | |
277 } | |
278 | |
279 for( iMetadata in 1:length( astrRows ) ) | |
280 { | |
281 #Current coef which is being evaluated | |
282 strOrig = astrRows[ iMetadata ] | |
283 #Skip y interscept | |
284 if( strOrig %in% c("(Intercept)", "Intercept", "Log(theta)") ) { next } | |
285 #Skip suppressed covariates | |
286 if( funcCoef2Col( strOrig, frmeData ) %in% asSuppressCovariates){ next } | |
287 | |
288 #Extract pvalue and std in standard model | |
289 dP = frmeCoefs[ strOrig, iPValuePosition ] | |
290 dStd = frmeCoefs[ strOrig, 2 ] | |
291 | |
292 #Attempt to extract the pvalue and std in mixed effects summary | |
293 #Could not get the pvalue so skip result | |
294 if( is.nan( dP ) || is.na( dP ) || is.null( dP ) ) { next } | |
295 | |
296 dCoef = adCoefs[ iMetadata ] | |
297 | |
298 #Setting adMetadata | |
299 #Metadata values | |
300 strMetadata = funcCoef2Col( strOrig, frmeData, astrCols ) | |
301 if( is.na( strMetadata ) ) | |
302 { | |
303 if( substring( strOrig, nchar( strOrig ) - 1 ) == "NA" ) { next } | |
304 c_logrMaaslin$error( "Unknown coefficient: %s", strOrig ) | |
305 } | |
306 if( substring( strOrig, nchar( strMetadata ) + 1 ) == "NA" ) { next } | |
307 adMetadata <- frmeData[, strMetadata ] | |
308 | |
309 # Store (factor level modified) p-value | |
310 # Store general results for each coef | |
311 adP <- c( adP, dP ) | |
312 | |
313 # Append to the list of information about associations | |
314 lsSig[[ length( lsSig ) + 1 ]] <- list( | |
315 # Current metadata name | |
316 name = strMetadata, | |
317 # Current metadatda name (as a factor level if existing as such) | |
318 orig = strOrig, | |
319 # Taxon feature name | |
320 taxon = strTaxon, | |
321 # Taxon data / response | |
322 data = frmeData[, iTaxon ], | |
323 # All levels | |
324 factors = c( strMetadata ), | |
325 # Metadata values | |
326 metadata = adMetadata, | |
327 # Current coeficient value | |
328 value = dCoef, | |
329 # Standard deviation | |
330 std = dStd, | |
331 # Model coefficients | |
332 allCoefs = adCoefs ) | |
333 } | |
334 } | |
335 } | |
336 return( list( adP = adP, lsSig = lsSig, lsQCCounts = lsQCCounts ) ) | |
337 ### List containing a list of pvalues, a list of significant data per association, and a list of QC data | |
338 } | |
339 | |
340 funcGetZeroInflatedResults <- function | |
341 ### Reduce the lm object return to just the data needed for further analysis | |
342 ( llmod, | |
343 ### The result from a linear model | |
344 frmeData, | |
345 ### Data analysis is performed on | |
346 liTaxon, | |
347 ### The response id | |
348 dSig, | |
349 ### Significance level for q-values | |
350 adP, | |
351 ### List of pvalues from all associations performed | |
352 lsSig, | |
353 ### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients | |
354 strLog, | |
355 ### File to which to document logging | |
356 lsQCCounts, | |
357 ### Records of counts associated with quality control | |
358 lastrCols, | |
359 ### Predictors used in the association | |
360 asSuppressCovariates=c() | |
361 ### Vector of covariates to suppress and not give results for | |
362 ){ | |
363 ilmodIndex = 0 | |
364 for(lmod in llmod) | |
365 { | |
366 ilmodIndex = ilmodIndex + 1 | |
367 lmod = llmod[[ilmodIndex]] | |
368 iTaxon = liTaxon[[ilmodIndex]] | |
369 astrCols = lastrCols[[ilmodIndex]] | |
370 | |
371 #Exclude none and errors | |
372 if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) ) | |
373 { | |
374 #holds the location of the pvlaues if an lm, if lmm is detected this will be changed | |
375 iPValuePosition = 4 | |
376 | |
377 #Get the column name of the iTaxon index | |
378 #frmeTmp needs to be what? | |
379 strTaxon = colnames( frmeData )[iTaxon] | |
380 | |
381 #Write summary information to log file | |
382 funcWrite( "#model", strLog ) | |
383 funcWrite( lmod, strLog ) | |
384 | |
385 #Get the coefficients | |
386 #This should work for linear models | |
387 frmeCoefs <- summary(lmod) | |
388 if(! is.null( frmeCoefs$coefficients$count ) ) # Needed for zeroinfl | |
389 { | |
390 frmeCoefs = frmeCoefs$coefficients$count | |
391 } | |
392 adCoefs = frmeCoefs[,1] | |
393 names(adCoefs) = row.names(frmeCoefs) | |
394 | |
395 funcWrite( "#Coefs", strLog ) | |
396 funcWrite( frmeCoefs, strLog ) | |
397 | |
398 #Go through each coefficient | |
399 astrRows <- row.names( frmeCoefs ) | |
400 | |
401 for( iMetadata in 1:length( astrRows ) ) | |
402 { | |
403 #Current coef which is being evaluated | |
404 strOrig = astrRows[iMetadata] | |
405 #Skip y interscept | |
406 if( strOrig %in% c("(Intercept)", "Intercept", "Log(theta)") ) { next } | |
407 #Skip suppressed covariates | |
408 if( funcCoef2Col(strOrig,frmeData) %in% asSuppressCovariates){next} | |
409 | |
410 #Extract pvalue and std in standard model | |
411 dP = frmeCoefs[strOrig, iPValuePosition] | |
412 if(is.nan(dP)){next} | |
413 dStd = frmeCoefs[strOrig,2] | |
414 | |
415 dCoef = adCoefs[iMetadata] | |
416 | |
417 #Setting adMetadata | |
418 #Metadata values | |
419 strMetadata = funcCoef2Col( strOrig, frmeData, astrCols ) | |
420 if( is.na( strMetadata ) ) | |
421 { | |
422 if( substring( strOrig, nchar( strOrig ) - 1 ) == "NA" ) { next } | |
423 c_logrMaaslin$error( "Unknown coefficient: %s", strOrig ) | |
424 } | |
425 if( substring( strOrig, nchar( strMetadata ) + 1 ) == "NA" ) { next } | |
426 adMetadata <- frmeData[,strMetadata] | |
427 | |
428 #Store (factor level modified) p-value | |
429 #Store general results for each coef | |
430 adP <- c(adP, dP) | |
431 lsSig[[length( lsSig ) + 1]] <- list( | |
432 #Current metadata name | |
433 name = strMetadata, | |
434 #Current metadatda name (as a factor level if existing as such) | |
435 orig = strOrig,# | |
436 #Taxon feature name | |
437 taxon = strTaxon, | |
438 #Taxon data / response | |
439 data = frmeData[,iTaxon], | |
440 #All levels | |
441 factors = c(strMetadata), | |
442 #Metadata values | |
443 metadata = adMetadata, | |
444 #Current coeficient value | |
445 value = dCoef, | |
446 #Standard deviation | |
447 std = dStd, | |
448 #Model coefficients | |
449 allCoefs = adCoefs) | |
450 } | |
451 } | |
452 } | |
453 return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts)) | |
454 ### List containing a list of pvalues, a list of significant data per association, and a list of QC data | |
455 } | |
456 | |
457 oldfuncGetZeroInflatedResults <- function | |
458 ### Reduce the lm object return to just the data needed for further analysis | |
459 ( llmod, | |
460 ### The result from a linear model | |
461 frmeData, | |
462 ### Data analysis is performed on | |
463 liTaxon, | |
464 ### The response id | |
465 dSig, | |
466 ### Significance level for q-values | |
467 adP, | |
468 ### List of pvalues from all associations performed | |
469 lsSig, | |
470 ### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients | |
471 strLog, | |
472 ### File to which to document logging | |
473 lsQCCounts, | |
474 ### Records of counts associated with quality control | |
475 lastrCols, | |
476 ### Predictors used in the association | |
477 asSuppressCovariates=c() | |
478 ### Vector of covariates to suppress and not give results for | |
479 ){ | |
480 ilmodIndex = 0 | |
481 for(lmod in llmod) | |
482 { | |
483 ilmodIndex = ilmodIndex + 1 | |
484 lmod = llmod[[ilmodIndex]] | |
485 iTaxon = liTaxon[[ilmodIndex]] | |
486 astrCols = lastrCols[[ilmodIndex]] | |
487 | |
488 #Exclude none and errors | |
489 if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) ) | |
490 { | |
491 #holds the location of the pvlaues if an lm, if lmm is detected this will be changed | |
492 iPValuePosition = 4 | |
493 | |
494 #Get the column name of the iTaxon index | |
495 #frmeTmp needs to be what? | |
496 strTaxon = colnames( frmeData )[iTaxon] | |
497 | |
498 #Write summary information to log file | |
499 funcWrite( "#model", strLog ) | |
500 funcWrite( lmod, strLog ) | |
501 | |
502 #Get the coefficients | |
503 #This should work for linear models | |
504 frmeCoefs <- summary(lmod) | |
505 adCoefs = frmeCoefs[,1] | |
506 names(adCoefs) = row.names(frmeCoefs) | |
507 | |
508 #Go through each coefficient | |
509 astrRows <- row.names( frmeCoefs ) | |
510 | |
511 for( iMetadata in 1:length( astrRows ) ) | |
512 { | |
513 #Current coef which is being evaluated | |
514 strOrig = astrRows[iMetadata] | |
515 #Skip y interscept | |
516 if( strOrig %in% c("(Intercept)", "Intercept", "Log(theta)") ) { next } | |
517 #Skip suppressed covariates | |
518 if( funcCoef2Col(strOrig,frmeData) %in% asSuppressCovariates){next} | |
519 | |
520 #Extract pvalue and std in standard model | |
521 dP = frmeCoefs[strOrig, iPValuePosition] | |
522 dStd = frmeCoefs[strOrig,2] | |
523 | |
524 dCoef = adCoefs[iMetadata] | |
525 | |
526 #Setting adMetadata | |
527 #Metadata values | |
528 strMetadata = funcCoef2Col( strOrig, frmeData, astrCols ) | |
529 if( is.na( strMetadata ) ) | |
530 { | |
531 if( substring( strOrig, nchar( strOrig ) - 1 ) == "NA" ) { next } | |
532 c_logrMaaslin$error( "Unknown coefficient: %s", strOrig ) | |
533 } | |
534 if( substring( strOrig, nchar( strMetadata ) + 1 ) == "NA" ) { next } | |
535 adMetadata <- frmeData[,strMetadata] | |
536 | |
537 #Store (factor level modified) p-value | |
538 #Store general results for each coef | |
539 adP <- c(adP, dP) | |
540 lsSig[[length( lsSig ) + 1]] <- list( | |
541 #Current metadata name | |
542 name = strMetadata, | |
543 #Current metadatda name (as a factor level if existing as such) | |
544 orig = strOrig,# | |
545 #Taxon feature name | |
546 taxon = strTaxon, | |
547 #Taxon data / response | |
548 data = frmeData[,iTaxon], | |
549 #All levels | |
550 factors = c(strMetadata), | |
551 #Metadata values | |
552 metadata = adMetadata, | |
553 #Current coeficient value | |
554 value = dCoef, | |
555 #Standard deviation | |
556 std = dStd, | |
557 #Model coefficients | |
558 allCoefs = adCoefs) | |
559 } | |
560 } | |
561 } | |
562 return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts)) | |
563 ### List containing a list of pvalues, a list of significant data per association, and a list of QC data | |
564 } | |
565 | |
566 | |
567 notfuncGetZeroInflatedResults <- function | |
568 ### Reduce the lm object return to just the data needed for further analysis | |
569 ( llmod, | |
570 ### The result from a linear model | |
571 frmeData, | |
572 ### Data analysis is performed on | |
573 liTaxon, | |
574 ### The response id | |
575 dSig, | |
576 ### Significance level for q-values | |
577 adP, | |
578 ### List of pvalues from all associations performed | |
579 lsSig, | |
580 ### List of information from the lm containing, metadata name, metadatda name (as a factor level if existing as such), Taxon feature name, Taxon data / response, All levels, Metadata values, Current coeficient value, Standard deviation, Model coefficients | |
581 strLog, | |
582 ### File to which to document logging | |
583 lsQCCounts, | |
584 ### Records of counts associated with quality control | |
585 lastrCols, | |
586 ### Predictors used in the association | |
587 asSuppressCovariates=c() | |
588 ### Vector of covariates to suppress and not give results for | |
589 ){ | |
590 ilmodIndex = 0 | |
591 for(lmod in llmod) | |
592 { | |
593 ilmodIndex = ilmodIndex + 1 | |
594 lmod = llmod[[ilmodIndex]] | |
595 iTaxon = liTaxon[[ilmodIndex]] | |
596 astrCols = lastrCols[[ilmodIndex]] | |
597 | |
598 #Exclude none and errors | |
599 if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) ) | |
600 { | |
601 #holds the location of the pvlaues if an lm, if lmm is detected this will be changed | |
602 iPValuePosition = 4 | |
603 | |
604 #Get the column name of the iTaxon index | |
605 #frmeTmp needs to be what? | |
606 strTaxon = colnames( frmeData )[iTaxon] | |
607 | |
608 #Write summary information to log file | |
609 funcWrite( "#model", strLog ) | |
610 funcWrite( lmod, strLog ) | |
611 | |
612 #Get the coefficients | |
613 #This should work for linear models | |
614 frmeCoefs <- summary(lmod) | |
615 frmeCoefs = frmeCoefs$coefficients$count | |
616 funcWrite( "#Coefs", strLog ) | |
617 funcWrite( frmeCoefs, strLog ) | |
618 | |
619 adCoefs = frmeCoefs[,1] | |
620 names(adCoefs) = row.names(frmeCoefs) | |
621 | |
622 #Go through each coefficient | |
623 astrRows <- row.names( frmeCoefs ) | |
624 | |
625 for( iMetadata in 1:length( astrRows ) ) | |
626 { | |
627 #Current coef which is being evaluated | |
628 strOrig = astrRows[iMetadata] | |
629 | |
630 #Skip y interscept | |
631 if( strOrig %in% c("(Intercept)", "Intercept", "Log(theta)") ) { next } | |
632 #Skip suppressed covariates | |
633 if( funcCoef2Col(strOrig,frmeData) %in% asSuppressCovariates){next} | |
634 | |
635 #Extract pvalue and std in standard model | |
636 dP = frmeCoefs[strOrig, iPValuePosition] | |
637 if(is.nan(dP)){ next } | |
638 dStd = frmeCoefs[strOrig,2] | |
639 dCoef = adCoefs[iMetadata] | |
640 | |
641 #Setting adMetadata | |
642 #Metadata values | |
643 strMetadata = funcCoef2Col( strOrig, frmeData, astrCols ) | |
644 if( is.na( strMetadata ) ) | |
645 { | |
646 if( substring( strOrig, nchar( strOrig ) - 1 ) == "NA" ) { next } | |
647 c_logrMaaslin$error( "Unknown coefficient: %s", strOrig ) | |
648 } | |
649 if( substring( strOrig, nchar( strMetadata ) + 1 ) == "NA" ) { next } | |
650 adMetadata <- frmeData[,strMetadata] | |
651 | |
652 #Store (factor level modified) p-value | |
653 #Store general results for each coef | |
654 adP <- c(adP, dP) | |
655 lsSig[[length( lsSig ) + 1]] <- list( | |
656 #Current metadata name | |
657 name = strMetadata, | |
658 #Current metadatda name (as a factor level if existing as such) | |
659 orig = strOrig,# | |
660 #Taxon feature name | |
661 taxon = strTaxon, | |
662 #Taxon data / response | |
663 data = frmeData[,iTaxon], | |
664 #All levels | |
665 factors = c(strMetadata), | |
666 #Metadata values | |
667 metadata = adMetadata, | |
668 #Current coeficient value | |
669 value = dCoef, | |
670 #Standard deviation | |
671 std = dStd, | |
672 #Model coefficients | |
673 allCoefs = adCoefs) | |
674 } | |
675 } | |
676 } | |
677 return(list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts)) | |
678 ### List containing a list of pvalues, a list of significant data per association, and a list of QC data | |
679 } | |
680 | |
681 | |
682 ### Options for variable selection | |
683 # OK | |
684 funcBoostModel <- function( | |
685 ### Perform model selection / regularization with boosting | |
686 strFormula, | |
687 ### The formula of the full model before boosting | |
688 frmeTmp, | |
689 ### The data on which to perform analysis | |
690 adCur, | |
691 ### The response data | |
692 lsParameters, | |
693 ### User controlled parameters needed specific to boosting | |
694 lsForcedParameters = NULL, | |
695 ### Force these predictors to be in the model | |
696 strLog | |
697 ### File to which to document logging | |
698 ){ | |
699 funcWrite( c("#Boost formula", strFormula), strLog ) | |
700 lmod = try( gbm( as.formula( strFormula ), data=frmeTmp, distribution="laplace", verbose=FALSE, n.minobsinnode=min(10, round(0.1 * nrow( frmeTmp ) ) ), n.trees=1000 ) ) | |
701 #TODO# lmod = try( gbm( as.formula( strFormula ), data=frmeTmp, distribution="gaussian", verbose=FALSE, n.minobsinnode=min(10, round(0.1 * nrow( frmeTmp ) ) ), n.trees=1000 ) ) | |
702 | |
703 astrTerms <- c() | |
704 if( !is.na( lmod ) && ( class( lmod ) != "try-error" ) ) | |
705 { | |
706 #Get boosting summary results | |
707 lsSum <- summary( lmod, plotit = FALSE ) | |
708 | |
709 #Document | |
710 funcWrite( "#model-gbm", strLog ) | |
711 funcWrite( lmod$fit, strLog ) | |
712 funcWrite( lmod$train.error, strLog ) | |
713 funcWrite( lmod$Terms, strLog ) | |
714 funcWrite( "#summary-gbm", strLog ) | |
715 funcWrite( lsSum, strLog ) | |
716 | |
717 # Uneven metadata | |
718 vstrUneven = c() | |
719 # Kept metadata | |
720 vstrKeepMetadata = c() | |
721 | |
722 #Select model predictors | |
723 #Check the frequency of selection and skip if not selected more than set threshold dFreq | |
724 for( strMetadata in lmod$var.names ) | |
725 { | |
726 #Get the name of the metadata | |
727 strTerm <- funcCoef2Col( strMetadata, frmeTmp, c(astrMetadata, astrGenetics) ) | |
728 | |
729 #Add back in uneven metadata | |
730 if(fAddBack) | |
731 { | |
732 ldMetadata = frmeTmp[[strMetadata]] | |
733 if(length(which(table(ldMetadata)/length(ldMetadata)>dUnevenMax))>0) | |
734 { | |
735 astrTerms <- c(astrTerms, strTerm) | |
736 vstrUneven = c(vstrUneven,strMetadata) | |
737 next | |
738 } | |
739 } | |
740 | |
741 #If the selprob is less than a certain frequency, skip | |
742 dSel <- lsSum$rel.inf[which( lsSum$var == strMetadata )] / 100 | |
743 if( is.na(dSel) || ( dSel <= lsParameters$dFreq ) ){ next } | |
744 #TODO# if( is.na(dSel) || ( dSel < lsParameters$dFreq ) ){ next } | |
745 | |
746 #If you should ignore the metadata, continue | |
747 if( is.null( strTerm ) ) { next } | |
748 | |
749 #If you cant find the metadata name, write | |
750 if( is.na( strTerm ) ) | |
751 { | |
752 c_logrMaaslin$error( "Unknown coefficient: %s", strMetadata ) | |
753 next | |
754 } | |
755 | |
756 #Collect metadata names | |
757 astrTerms <- c(astrTerms, strTerm) | |
758 vstrKeepMetadata = c(vstrKeepMetadata,strTerm) | |
759 } | |
760 } else { astrTerms = lsForcedParameters } | |
761 | |
762 # funcBoostInfluencePlot(vdRelInf=lsSum$rel.inf, sFeature=lsParameters$sBugName, vsPredictorNames=lsSum$var, vstrKeepMetadata=vstrKeepMetadata, vstrUneven=vstrUneven) | |
763 | |
764 return(unique(c(astrTerms,lsForcedParameters))) | |
765 ### Return a vector of predictor names to use in a reduced model | |
766 } | |
767 | |
768 #Glmnet default is to standardize the variables. | |
769 #used as an example for implementation | |
770 #http://r.789695.n4.nabble.com/estimating-survival-times-with-glmnet-and-coxph-td4614225.html | |
771 funcPenalizedModel <- function( | |
772 ### Perform penalized regularization for variable selection | |
773 strFormula, | |
774 ### The formula of the full model before boosting | |
775 frmeTmp, | |
776 ### The data on which to perform analysis | |
777 adCur, | |
778 ### The response data | |
779 lsParameters, | |
780 ### User controlled parameters needed specific to boosting | |
781 lsForcedParameters = NULL, | |
782 ### Force these predictors to be in the model | |
783 strLog | |
784 ### File to which to document logging | |
785 ){ | |
786 #Convert the data frame to a model matrix | |
787 mtrxDesign = model.matrix(as.formula(strFormula), data=frmeTmp) | |
788 | |
789 #Cross validate the lambda | |
790 cvRet = cv.glmnet(x=mtrxDesign,y=adCur,alpha=lsParameters$dPAlpha) | |
791 | |
792 #Perform lasso | |
793 glmnetMod = glmnet(x=mtrxDesign,y=adCur,family=lsParameters$family,alpha=lsParameters$dPAlpha,lambda=cvRet$lambda.min) | |
794 | |
795 #Get non zero beta and return column names for covariate names. | |
796 ldBeta = glmnetMod$beta[,which.max(glmnetMod$dev.ratio)] | |
797 ldBeta = names(ldBeta[which(abs(ldBeta)>0)]) | |
798 return(sapply(ldBeta,funcCoef2Col,frmeData=frmeTmp)) | |
799 } | |
800 | |
801 # OK | |
802 funcForwardModel <- function( | |
803 ### Perform model selection with forward stepwise selection | |
804 strFormula, | |
805 ### lm style string defining reposne and predictors | |
806 frmeTmp, | |
807 ### Data on which to perform analysis | |
808 adCur, | |
809 ### Response data | |
810 lsParameters, | |
811 ### User controlled parameters needed specific to boosting | |
812 lsForcedParameters = NULL, | |
813 ### Force these predictors to be in the model | |
814 strLog | |
815 ### File to which to document logging | |
816 ){ | |
817 funcWrite( c("#Forward formula", strFormula), strLog ) | |
818 | |
819 strNullFormula = "adCur ~ 1" | |
820 if(!is.null(lsForcedParameters)) | |
821 { | |
822 strNullFormula = paste( "adCur ~", paste( sprintf( "`%s`", lsForcedParameters ), collapse = " + " )) | |
823 } | |
824 lmodNull <- try( lm(as.formula( strNullFormula ), data=frmeTmp)) | |
825 lmodFull <- try( lm(as.formula( strFormula ), data=frmeTmp )) | |
826 if(!("try-error" %in% c(class( lmodNull ),class( lmodFull )))) | |
827 { | |
828 lmod = stepAIC(lmodNull, scope=list(lower=lmodNull,upper=lmodFull), direction="forward", trace=0) | |
829 return(funcGetStepPredictors(lmod, frmeTmp, strLog)) | |
830 } | |
831 return( lsForcedParameters ) | |
832 ### Return a vector of predictor names to use in a reduced model or NA on error | |
833 } | |
834 | |
835 # OK | |
836 # Select model with backwards selection | |
837 funcBackwardsModel <- function( | |
838 ### Perform model selection with backwards stepwise selection | |
839 strFormula, | |
840 ### lm style string defining reponse and predictors | |
841 frmeTmp, | |
842 ### Data on which to perform analysis | |
843 adCur, | |
844 ### Response data | |
845 lsParameters, | |
846 ### User controlled parameters needed specific to boosting | |
847 lsForcedParameters = NULL, | |
848 ### Force these predictors to be in the model | |
849 strLog | |
850 ### File to which to document logging | |
851 ){ | |
852 funcWrite( c("#Backwards formula", strFormula), strLog ) | |
853 | |
854 strNullFormula = "adCur ~ 1" | |
855 if(!is.null(lsForcedParameters)) | |
856 { | |
857 strNullFormula = paste( "adCur ~", paste( sprintf( "`%s`", lsForcedParameters ), collapse = " + " )) | |
858 } | |
859 | |
860 lmodNull <- try( lm(as.formula( strNullFormula ), data=frmeTmp)) | |
861 lmodFull <- try( lm(as.formula( strFormula ), data=frmeTmp )) | |
862 | |
863 if(! class( lmodFull ) == "try-error" ) | |
864 { | |
865 lmod = stepAIC(lmodFull, scope=list(lower=lmodNull, upper=lmodFull), direction="backward") | |
866 return(funcGetStepPredictors(lmod, frmeTmp, strLog)) | |
867 } else { | |
868 return( lsForcedParameters ) } | |
869 ### Return a vector of predictor names to use in a reduced model or NA on error | |
870 } | |
871 | |
872 ### Analysis methods | |
873 ### Univariate options | |
874 | |
875 # Sparse Dir. Model | |
876 #TODO# Implement in sfle | |
877 | |
878 # Tested | |
879 # Correlation | |
880 # NOTE: Ignores the idea of random and fixed covariates | |
881 funcSpearman <- function( | |
882 ### Perform multiple univariate comparisons producing spearman correlations for association | |
883 strFormula, | |
884 ### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables | |
885 frmeTmp, | |
886 ### Data on which to perform analysis | |
887 iTaxon, | |
888 ### Index of the response data | |
889 lsQCCounts, | |
890 ### List recording anything important to QC | |
891 strRandomFormula = NULL | |
892 ### Has the formula for random covariates | |
893 ){ | |
894 return(funcMakeContrasts(strFormula=strFormula, strRandomFormula=strRandomFormula, frmeTmp=frmeTmp, iTaxon=iTaxon, | |
895 functionContrast=function(x,adCur,dfData) | |
896 { | |
897 retList = list() | |
898 ret = cor.test(as.formula(paste("~",x,"+ adCur")), data=dfData, method="spearman", na.action=c_strNA_Action) | |
899 #Returning rho for the coef in a named vector | |
900 vdCoef = c() | |
901 vdCoef[[x]]=ret$estimate | |
902 retList[[1]]=list(p.value=ret$p.value,SD=sd(dfData[[x]]),name=x,coef=vdCoef) | |
903 return(retList) | |
904 }, lsQCCounts)) | |
905 ### List of contrast information, pvalue, contrast and std per univariate test | |
906 } | |
907 | |
908 # Tested | |
909 # Wilcoxon (T-Test) | |
910 # NOTE: Ignores the idea of random and fixed covariates | |
911 funcWilcoxon <- function( | |
912 ### Perform multiple univariate comparisons performing wilcoxon tests on discontinuous data with 2 levels | |
913 strFormula, | |
914 ### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables | |
915 frmeTmp, | |
916 ### Data on which to perform analysis | |
917 iTaxon, | |
918 ### Index of the response data | |
919 lsQCCounts, | |
920 ### List recording anything important to QC | |
921 strRandomFormula = NULL | |
922 ### Has the formula for random covariates | |
923 ){ | |
924 return(funcMakeContrasts(strFormula=strFormula, strRandomFormula=strRandomFormula, frmeTmp=frmeTmp, iTaxon=iTaxon, | |
925 functionContrast=function(x,adCur,dfData) | |
926 { | |
927 retList = list() | |
928 ret = wilcox.test(as.formula(paste("adCur",x,sep=" ~ ")), data=dfData, na.action=c_strNA_Action) | |
929 #Returning NA for the coef in a named vector | |
930 vdCoef = c() | |
931 vdCoef[[x]]=ret$statistic | |
932 retList[[1]]=list(p.value=ret$p.value,SD=sd(dfData[[x]]),name=x,coef=vdCoef) | |
933 return(retList) | |
934 }, lsQCCounts)) | |
935 ### List of contrast information, pvalue, contrast and std per univariate test | |
936 } | |
937 | |
938 # Tested | |
939 # Kruskal.Wallis (Nonparameteric anova) | |
940 # NOTE: Ignores the idea of random and fixed covariates | |
941 funcKruskalWallis <- function( | |
942 ### Perform multiple univariate comparisons performing Kruskal wallis rank sum tests on discontuous data with more than 2 levels | |
943 strFormula, | |
944 ### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables | |
945 frmeTmp, | |
946 ### Data on which to perform analysis | |
947 iTaxon, | |
948 ### Index of the response data | |
949 lsQCCounts, | |
950 ### List recording anything important to QC | |
951 strRandomFormula = NULL | |
952 ### Has the formula for random covariates | |
953 ){ | |
954 return(funcMakeContrasts(strFormula=strFormula, strRandomFormula=strRandomFormula, frmeTmp=frmeTmp, iTaxon=iTaxon, | |
955 functionContrast=function(x,adCur,dfData) | |
956 { | |
957 retList = list() | |
958 lmodKW = kruskal(adCur,dfData[[x]],group=FALSE,p.adj="holm") | |
959 | |
960 asLevels = levels(dfData[[x]]) | |
961 # The names of the generated comparisons, sometimes the control is first sometimes it is not so | |
962 # We will just check which is in the names and use that | |
963 asComparisons = row.names(lmodKW$comparisons) | |
964 #Get the comparison with the control | |
965 for(sLevel in asLevels[2:length(asLevels)]) | |
966 { | |
967 sComparison = intersect(c(paste(asLevels[1],sLevel,sep=" - "),paste(sLevel,asLevels[1],sep=" - ")),asComparisons) | |
968 #Returning NA for the coef in a named vector | |
969 vdCoef = c() | |
970 vdCoef[[paste(x,sLevel,sep="")]]=lmodKW$comparisons[sComparison,"Difference"] | |
971 # vdCoef[[paste(x,sLevel,sep="")]]=NA | |
972 retList[[length(retList)+1]]=list(p.value=lmodKW$comparisons[sComparison,"p.value"],SD=1.0,name=paste(x,sLevel,sep=""),coef=vdCoef) | |
973 } | |
974 return(retList) | |
975 }, lsQCCounts)) | |
976 ### List of contrast information, pvalue, contrast and std per univariate test | |
977 } | |
978 | |
979 # Tested | |
980 # NOTE: Ignores the idea of random and fixed covariates | |
981 funcDoUnivariate <- function( | |
982 ### Perform multiple univariate comparisons producing spearman correlations for association | |
983 strFormula, | |
984 ### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables | |
985 frmeTmp, | |
986 ### Data on which to perform analysis | |
987 iTaxon, | |
988 ### Index of the response data | |
989 lsHistory, | |
990 ### List recording p-values, association information, and QC counts | |
991 strRandomFormula = NULL, | |
992 ### Has the formula for random covariates | |
993 fZeroInflate = FALSE | |
994 ){ | |
995 if(fZeroInflate) | |
996 { | |
997 throw("There are no zero-inflated univariate models to perform your analysis.") | |
998 } | |
999 | |
1000 # Get covariates | |
1001 astrCovariates = unique(c(funcFormulaStrToList(strFormula),funcFormulaStrToList(strRandomFormula))) | |
1002 | |
1003 # For each covariate | |
1004 for(sCovariate in astrCovariates) | |
1005 { | |
1006 ## Check to see if it is discrete | |
1007 axData = frmeTmp[[sCovariate]] | |
1008 lsRet = NA | |
1009 if(is.factor(axData) || is.logical(axData)) | |
1010 { | |
1011 ## If discrete check how many levels | |
1012 lsDataLevels = levels(axData) | |
1013 ## If 2 levels do wilcoxon test | |
1014 if(length(lsDataLevels) < 3) | |
1015 { | |
1016 lsRet = funcWilcoxon(strFormula=paste("adCur",sCovariate,sep=" ~ "), frmeTmp=frmeTmp, iTaxon=iTaxon, lsQCCounts=lsHistory$lsQCCounts) | |
1017 } else { | |
1018 ## If 3 or more levels do kruskal wallis test | |
1019 lsRet = funcKruskalWallis(strFormula=paste("adCur",sCovariate,sep=" ~ "), frmeTmp=frmeTmp, iTaxon=iTaxon, lsQCCounts=lsHistory$lsQCCounts) | |
1020 } | |
1021 } else { | |
1022 ## If not discrete do spearman test (list(adP=adP, lsSig=lsSig, lsQCCounts=lsQCCounts)) | |
1023 lsRet = funcSpearman(strFormula=paste("adCur",sCovariate,sep=" ~ "), frmeTmp=frmeTmp, iTaxon=iTaxon, lsQCCounts=lsHistory$lsQCCounts) | |
1024 } | |
1025 lsHistory[["adP"]] = c(lsHistory[["adP"]], lsRet[["adP"]]) | |
1026 lsHistory[["lsSig"]] = c(lsHistory[["lsSig"]], lsRet[["lsSig"]]) | |
1027 lsHistory[["lsQCCounts"]] = lsRet[["lsQCCounts"]] | |
1028 } | |
1029 return(lsHistory) | |
1030 } | |
1031 | |
1032 ### Multivariate | |
1033 | |
1034 # Tested | |
1035 funcLM <- function( | |
1036 ### Perform vanilla linear regression | |
1037 strFormula, | |
1038 ### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables | |
1039 frmeTmp, | |
1040 ### Data on which to perform analysis | |
1041 iTaxon, | |
1042 ### Index of the response data | |
1043 lsHistory, | |
1044 ### List recording p-values, association information, and QC counts | |
1045 strRandomFormula = NULL, | |
1046 ### Has the formula for random covariates | |
1047 fZeroInflated = FALSE | |
1048 ### Turns on the zero inflated model | |
1049 ){ | |
1050 adCur = frmeTmp[,iTaxon] | |
1051 if(fZeroInflated) | |
1052 { | |
1053 return(try(gamlss(formula=as.formula(strFormula), family=BEZI, data=frmeTmp))) # gamlss | |
1054 } else { | |
1055 if(!is.null(strRandomFormula)) | |
1056 { | |
1057 return(try(glmmPQL(fixed=as.formula(strFormula), random=as.formula(strRandomFormula), family=gaussian(link="identity"), data=frmeTmp))) | |
1058 #lme4 package but does not have pvalues for the fixed variables (have to use a mcmcsamp/pvals.fnc function which are currently disabled) | |
1059 #return(try( lmer(as.formula(strFormula), data=frmeTmp, na.action=c_strNA_Action) )) | |
1060 } else { | |
1061 return(try( lm(as.formula(strFormula), data=frmeTmp, na.action=c_strNA_Action) )) | |
1062 } | |
1063 } | |
1064 ### lmod result object from lm | |
1065 } | |
1066 | |
1067 # Tested | |
1068 funcBinomialMult <- function( | |
1069 ### Perform linear regression with negative binomial link | |
1070 strFormula, | |
1071 ### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables | |
1072 frmeTmp, | |
1073 ### Data on which to perform analysis | |
1074 iTaxon, | |
1075 ### Index of the response data | |
1076 lsHistory, | |
1077 ### List recording p-values, association information, and QC counts | |
1078 strRandomFormula = NULL, | |
1079 ### Has the formula for random covariates | |
1080 fZeroInflated = FALSE | |
1081 ### Turns on the zero inflated model | |
1082 ){ | |
1083 adCur = frmeTmp[,iTaxon] | |
1084 if(fZeroInflated) | |
1085 { | |
1086 return(try(zeroinfl(as.formula(strFormula), data=frmeTmp, dist="negbin"))) # pscl | |
1087 # return(try(gamlss(as.formula(strFormula), family=ZINBI, data=frmeTmp))) # pscl | |
1088 } else { | |
1089 if(!is.null(strRandomFormula)) | |
1090 { | |
1091 throw("This analysis flow is not completely developed, please choose an option other than negative bionomial with random covariates") | |
1092 #TODO need to estimate the theta | |
1093 #return(try(glmmPQL(fixed=as.formula(strFormula), random=as.formula(strRandomFormula), family=negative.binomial(theta = 2, link=log), data=frmeTmp))) | |
1094 #lme4 package but does not have pvalues for the fixed variables (have to use a mcmcsamp/pvals.fnc function which are currently disabled) | |
1095 } else { | |
1096 return(try( glm.nb(as.formula(strFormula), data=frmeTmp, na.action=c_strNA_Action ))) | |
1097 } | |
1098 } | |
1099 ### lmod result object from lm | |
1100 } | |
1101 | |
1102 # Tested | |
1103 funcQuasiMult <- function( | |
1104 ### Perform linear regression with quasi-poisson link | |
1105 strFormula, | |
1106 ### lm style string defining reponse and predictors, for mixed effects models this holds the fixed variables | |
1107 frmeTmp, | |
1108 ### Data on which to perform analysis | |
1109 iTaxon, | |
1110 ### Index of the response data | |
1111 lsHistory, | |
1112 ### List recording p-values, association information, and QC counts | |
1113 strRandomFormula = NULL, | |
1114 ### Has the formula for random covariates | |
1115 fZeroInflated = FALSE | |
1116 ### Turns on a zero infalted model | |
1117 ){ | |
1118 adCur = frmeTmp[,iTaxon] | |
1119 if(fZeroInflated) | |
1120 { | |
1121 # return(try(gamlss(formula=as.formula(strFormula), family=ZIP, data=frmeTmp))) # gamlss | |
1122 return(try(zeroinfl(as.formula(strFormula), data=frmeTmp, dist="poisson"))) # pscl | |
1123 } else { | |
1124 #Check to see if | is in the model, if so use a lmm otherwise the standard glm is ok | |
1125 if(!is.null(strRandomFormula)) | |
1126 { | |
1127 return(try(glmmPQL(fixed=as.formula(strFormula), random=as.formula(strRandomFormula), family= quasipoisson, data=frmeTmp))) | |
1128 #lme4 package but does not have pvalues for the fixed variables (have to use a mcmcsamp/pvals.fnc function which are currently disabled) | |
1129 #return(try ( glmer(as.formula(strFormula), data=frmeTmp, family=quasipoisson, na.action=c_strNA_Action) )) | |
1130 } else { | |
1131 return(try( glm(as.formula(strFormula), family=quasipoisson, data=frmeTmp, na.action=c_strNA_Action) )) | |
1132 } | |
1133 } | |
1134 ### lmod result object from lm | |
1135 } | |
1136 | |
1137 ### Transformations | |
1138 # Tested | |
1139 funcArcsinSqrt <- function( | |
1140 # Transform data with arcsin sqrt transformation | |
1141 aData | |
1142 ### The data on which to perform the transformation | |
1143 ){ | |
1144 return(asin(sqrt(aData))) | |
1145 ### Transformed data | |
1146 } | |
1147 | |
1148 funcSquareSin <- function( | |
1149 # Transform data with square sin transformation | |
1150 # Opposite of the funcArcsinSqrt | |
1151 aData | |
1152 ### The data on which to perform the transformation | |
1153 ){ | |
1154 return(sin(aData)^2) | |
1155 ### Transformed data | |
1156 } | |
1157 | |
1158 # Tested | |
1159 funcNoTransform <-function( | |
1160 ### Pass data without transform | |
1161 aData | |
1162 ### The data on which to perform the transformation | |
1163 ### Only given here to preserve the pattern, not used. | |
1164 ){ | |
1165 return(aData) | |
1166 ### Transformed data | |
1167 } | |
1168 | |
1169 funcGetAnalysisMethods <- function( | |
1170 ### Returns the appropriate functions for regularization, analysis, data transformation, and analysis object inspection. | |
1171 ### This allows modular customization per analysis step. | |
1172 ### To add a new method insert an entry in the switch for either the selection, transform, or method | |
1173 ### Insert them by using the pattern optparse_keyword_without_quotes = function_in_AnalysisModules | |
1174 ### Order in the return listy is currently set and expected to be selection, transforms/links, analysis method | |
1175 ### none returns null | |
1176 sModelSelectionKey, | |
1177 ### Keyword defining the method of model selection | |
1178 sTransformKey, | |
1179 ### Keyword defining the method of data transformation | |
1180 sMethodKey, | |
1181 ### Keyword defining the method of analysis | |
1182 fZeroInflated = FALSE | |
1183 ### Indicates if using zero inflated models | |
1184 ){ | |
1185 lRetMethods = list() | |
1186 #Insert selection methods here | |
1187 lRetMethods[[c_iSelection]] = switch(sModelSelectionKey, | |
1188 boost = funcBoostModel, | |
1189 penalized = funcPenalizedModel, | |
1190 forward = funcForwardModel, | |
1191 backward = funcBackwardsModel, | |
1192 none = NA) | |
1193 | |
1194 #Insert transforms | |
1195 lRetMethods[[c_iTransform]] = switch(sTransformKey, | |
1196 asinsqrt = funcArcsinSqrt, | |
1197 none = funcNoTransform) | |
1198 | |
1199 #Insert untransform | |
1200 lRetMethods[[c_iUnTransform]] = switch(sTransformKey, | |
1201 asinsqrt = funcNoTransform, | |
1202 none = funcNoTransform) | |
1203 | |
1204 #Insert analysis | |
1205 lRetMethods[[c_iAnalysis]] = switch(sMethodKey, | |
1206 neg_binomial = funcBinomialMult, | |
1207 quasi = funcQuasiMult, | |
1208 univariate = funcDoUnivariate, | |
1209 lm = funcLM, | |
1210 none = NA) | |
1211 | |
1212 # If a univariate method is used it is required to set this to true | |
1213 # For correct handling. | |
1214 lRetMethods[[c_iIsUnivariate]]=sMethodKey=="univariate" | |
1215 | |
1216 #Insert method to get results | |
1217 if(fZeroInflated) | |
1218 { | |
1219 lRetMethods[[c_iResults]] = switch(sMethodKey, | |
1220 neg_binomial = funcGetZeroInflatedResults, | |
1221 quasi = funcGetZeroInflatedResults, | |
1222 univariate = funcGetUnivariateResults, | |
1223 lm = funcGetZeroInflatedResults, | |
1224 none = NA) | |
1225 } else { | |
1226 lRetMethods[[c_iResults]] = switch(sMethodKey, | |
1227 neg_binomial = funcGetLMResults, | |
1228 quasi = funcGetLMResults, | |
1229 univariate = funcGetUnivariateResults, | |
1230 lm = funcGetLMResults, | |
1231 none = NA) | |
1232 } | |
1233 | |
1234 return(lRetMethods) | |
1235 ### Returns a list of functions to be passed for regularization, data transformation, analysis, | |
1236 ### and custom analysis results introspection functions to pull from return objects data of interest | |
1237 } |