comparison mqppep_anova_script.Rmd @ 1:b76c75521d91 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 43e7a43b545c24b2dc33d039198551c032aa79be
author galaxyp
date Fri, 28 Oct 2022 18:26:42 +0000
parents 8dfd5d2b5903
children a5e7469dfdfa
comparison
equal deleted inserted replaced
0:8dfd5d2b5903 1:b76c75521d91
5 - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]" 5 - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]"
6 - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]" 6 - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]"
7 date: 7 date:
8 - "May 28, 2018" 8 - "May 28, 2018"
9 - "; revised June 23, 2022" 9 - "; revised June 23, 2022"
10 lot: true
10 output: 11 output:
11 pdf_document: 12 pdf_document:
12 toc: true 13 toc: true
13 toc_depth: 3 14 toc_depth: 2
14 keep_tex: true 15 keep_tex: true
15 header-includes: 16 dev: pdf
16 - \usepackage{longtable} 17 includes:
17 - \newcommand\T{\rule{0pt}{2.6ex}} % Top strut 18 in_header: mqppep_anova_preamble.tex
18 - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut 19 latex_macros: false
20 raw_tex: true
21 urlcolor: blue
19 params: 22 params:
20 alphaFile: "test-data/alpha_levels.tabular" 23 alphaFile: "test-data/alpha_levels.tabular"
21 inputFile: "test-data/test_input_for_anova.tabular" 24 inputFile: "test-data/test_input_for_anova.tabular"
22 preprocDb: "test-data/test_input_for_anova.sqlite" 25 preprocDb: "test-data/test_input_for_anova.sqlite"
23 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] 26 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
24 show_toc: true
25 firstDataColumn: "^Intensity[^_]"
26 imputationMethod: !r c("group-median", "median", "mean", "random")[1]
27 meanPercentile: 1
28 sdPercentile: 1.0
29 regexSampleNames: "\\.\\d+[A-Z]$" 27 regexSampleNames: "\\.\\d+[A-Z]$"
30 regexSampleGrouping: "\\d+" 28 regexSampleGrouping: "\\d+"
29 groupFilterPatterns: ".+"
30 groupFilter: !r c("none", "exclude", "include")[1]
31 imputationMethod: !r c("group-median", "median", "mean", "random")[4]
32 kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5]
33 #imputationMethod: !r c("group-median", "median", "mean", "random")[1]
34
35 # how should sample groups be interpreted?
36 # - "f": fixed patterns (like `grep -F`)
37 # - "p": PERL-compatible (like `grep -P`)
38 # - "r": extended grep patterns (like `grep -E`)
39 # use what case sensitivity?
40 # - "i": case insensitive matching (like `grep -i`)
41 groupFilterMode: !r c("r", "ri", "p", "pi", "f", "fi")[1]
42 # what pattern should be used for the first column
43 # (extended grep pattern, case sensitive)
44 firstDataColumn: "^Intensity[^_]"
45 # for small random value imputation, what percentile should be center?
46 meanPercentile: 50
47 #meanPercentile: 1
48 # for small random value imputation, what should `s / mean(x)` ratio be?
49 sdPercentile: 1.0
50 # output path for imputed data file
31 imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" 51 imputedDataFilename: "test-data/limbo/imputedDataFilename.txt"
52 # output path for imputed/quantile-normalized/log-transformed data file
32 imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" 53 imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt"
54 # output path for contents of `stats_metadata_v` table
33 anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt" 55 anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt"
56 # how to test one variable with > 2 categories (e.g., aov or kruskal.test)
34 oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1] 57 oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1]
58 # how to test one variable with 2 categories (e.g., oneway.test)
35 oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3] 59 oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3]
36 kseaCutoffStatistic: !r c("p.value", "FDR")[2] 60 # what should be the minimum quality for consideration in both
37 kseaCutoffThreshold: !r c( 0.1, 0.05)[2] 61 minQuality: 0
38 kseaMinKinaseCount: 1 62 # correct KSEA with FDR (recommended) or raw p-value
39 intensityHeatmapRows: 75 63 kseaCutoffStatistic: !r c("FDR", "p.value")[1]
64 # correct KSEA threshold 0.05 (conventional) or higher (perhaps better)
65 # "perhaps better" meaning that KSEA is an hypothesis-generator, not -test
66 #kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5)[1]
67 # minimum number of substrates required for a kinase to be considered in KSEA
68 kseaMinSubstrateCount: 1
69 # Should KSEA be performed aggregating signed log2FC or absolute?
70 # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores
71 # TRUE use abs(log2FC) for KSEA as Justin Drake requested; this is a
72 # justifiable deviation from the KSEAapp::KSEA.Scores algorithm.
73 kseaUseAbsoluteLog2FC: TRUE
74 #kseaUseAbsoluteLog2FC: FALSE
75 # minimum number of observed values per sample-group
76 intensityMinValuesPerGroup: 1
77 # maximum number of heatmap rows (result are poor when > 50)
78 intensityHeatmapRows: 50
79 # what should be the primary criterion to eliminate excessive heatmap rows
80 intensityHeatmapCriteria: !r c("quality", "na_count", "p_value")[1]
81 # should correlation among substrates be used (rather than covariance)
82 correlateSubstrates: TRUE
83 # only show covariance among variables having variance > 1
84 filterCovVarGT1: TRUE
85 # maximum number of residues to display for ppeps in rownames or columnames
86 ppepTruncN: 10
87 # maximum number of characters of subgenes to display in rownames or columnames
88 subgeneTruncN: 10
89 # maximum number of characters for paste(subgene, ppep) for enrichment plots
90 substTruncN: 20
91 # should boxplots use variable-width boxes to reflect # of samples
92 boxPlotVarWidth: TRUE
93 # should boxplots use notched boxes to reflect difference between samples
94 boxPlotNotch: TRUE
95 # look-up tables for kinase descriptions
96 kinaseNameUprtLutBz2: "./kinase_name_uniprot_lut.tabular.bz2"
97 kinaseUprtDescLutBz2: "./kinase_uniprot_description_lut.tabular.bz2"
98 # should debugging trace messages be printed?
99 showEnrichedSubstrates: FALSE
100
101 # should debugging nb/nbe messages be printed?
102 printNBMsgs: FALSE
103 # should debugging trace messages be printed?
104 printTraceMsgs: FALSE
105 # when debugging files are needed, set debugFileBasePath to the path
106 # to the directory where they should be writtn
107 debugFileBasePath: !r if (TRUE) NULL else "test-data"
40 --- 108 ---
41 <!-- 109
42 kseaCutoffStatistic: !r c("p.value", "FDR")[2] 110 ```{r setup, include = FALSE, results = 'asis'}
43 kseaCutoffThreshold: !r c(0.05, 0.1)[1] 111
44 112 # simple debug messaging
45 alphaFile: "test-data/alpha_levels.tabular" 113 print_nb_messages <- params$printNBMsgs
46 inputFile: "test-data/test_input_for_anova.tabular" 114
47 preprocDb: "test-data/test_input_for_anova.sqlite" 115 nb <- if (!print_nb_messages) {
48 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] 116 function(...) invisible()
49 117 } else {
50 alphaFile: "test-data/alpha_levels.tabular" 118 function(..., f = cat) f("\n$\\exists{}\\supset\\forall{}$", ...)
51 inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular" 119 }
52 preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite" 120
53 kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2] 121 nbe <- if (!print_nb_messages) {
54 122 function(...) invisible()
55 alphaFile: "test-data/alpha_levels.tabular" 123 } else {
56 inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular" 124 function(..., f = cat, file = stderr()) {
57 preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite" 125 cat(
58 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] 126 stringi::stri_unescape_unicode("\nNBE \\u2203\\u2283\\u2200"),
59 127 ...,
60 alphaFile: "test-data/alpha_levels.tabular" 128 file = file
61 inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular" 129 )
62 preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite" 130 }
63 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] 131 }
64 132
65 inputFile: "test-data/density_failure.preproc_tab.tabular"
66 kseaAppPrepDb: !r c(":memory:", "mqppep.sqlite")[2]
67 latex_document: default
68 -->
69 ```{r setup, include = FALSE}
70 #ref for debugging: https://yihui.org/tinytex/r/#debugging 133 #ref for debugging: https://yihui.org/tinytex/r/#debugging
71 options(tinytex.verbose = TRUE) 134 options(tinytex.verbose = TRUE)
72 135
73 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 136 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
74 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 137 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355
75 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) 138 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10), dpi = 300)
76 139
77 # freeze the random number generator so the same results will be produced 140 # freeze the random number generator so the same results will be produced
78 # from run to run 141 # from run to run
79 set.seed(28571) 142 set.seed(28571)
80 143
81 ### LIBRARIES 144 ### LIBRARIES
145
146 if (print_nb_messages) nbe("library(gplots)")
82 library(gplots) 147 library(gplots)
148 if (print_nb_messages) nbe("library(caret)")
149 # load caret for nearZeroVar
150 if (print_nb_messages) nbe("Please ignore the messages about systemd, if any.\n")
151 library(caret)
152 if (print_nb_messages) nbe("library(DBI)")
83 library(DBI) 153 library(DBI)
154 if (print_nb_messages) nbe("library(RSQLite)")
84 library(RSQLite) 155 library(RSQLite)
156 if (print_nb_messages) nbe("library(sqldf)\n")
85 # Suppress "Warning: no DISPLAY variable so Tk is not available" 157 # Suppress "Warning: no DISPLAY variable so Tk is not available"
86 suppressWarnings(suppressMessages(library(sqldf))) 158 suppressWarnings(suppressMessages(library(sqldf)))
87 159
88 # required but not added to search list: 160 # required but not added to search list:
89 # - DBI 161 # - DBI
95 # - reshape2 167 # - reshape2
96 # - vioplot 168 # - vioplot
97 169
98 ### CONSTANTS 170 ### CONSTANTS
99 171
100 const_parfin <- par("fin") 172 const_boxplot_fill <- "grey94"
101 const_boxplot_fill <- "grey94"
102 const_stripchart_cex <- 0.5
103 const_stripsmall_cex <-
104 sqrt(const_stripchart_cex * const_stripchart_cex / 2)
105 const_stripchart_jitter <- 0.3
106 const_write_debug_files <- FALSE
107 const_table_anchor_bp <- "bp"
108 const_table_anchor_ht <- "ht"
109 const_table_anchor_p <- "p"
110 const_table_anchor_tbp <- "tbp"
111
112
113 const_ksea_astrsk_kinases <- 1 173 const_ksea_astrsk_kinases <- 1
114 const_ksea_nonastrsk_kinases <- 2 174 const_ksea_nonastrsk_kinases <- 2
115 const_ksea_all_kinases <- 3 175 const_ksea_all_kinases <- 3
116 176 const_log10_e <- log10(exp(1))
117 const_log10_e <- log10(exp(1)) 177 const_stripchart_cex <- 0.5
118 178 const_stripchart_jitter <- 0.3
119 ### FUNCTIONS 179 const_table_anchor_bp <- "bp"
120 180 const_table_anchor_ht <- "ht"
121 # from `demo(error.catching)` 181 const_table_anchor_p <- "p"
182 const_table_anchor_t <- "t"
183 const_table_anchor_tbp <- "tbp"
184
185
186 ### GLOBAL VARIABLES (params)
187
188 ## functions to process params
189
190 is_string_null_or_empty <- function(x) {
191 # N. B. non-strings are intentionally treated as NULL
192 if (is.null(x))
193 TRUE
194 else if (!is.character(x))
195 TRUE
196 else x == ""
197 }
198
122 ##' Catch *and* save both errors and warnings, and in the case of 199 ##' Catch *and* save both errors and warnings, and in the case of
123 ##' a warning, also keep the computed result. 200 ##' a warning, also keep the computed result.
201 ##' return result as list(value = ..., warning = ...)
202 ##' - value will be:
203 ##' - the result if no exception is thrown
204 ##' - the exception if an exception is caught
205 ##' - warning will be a string except perhaps when warning argument is not NULL
206 ##'
207 ##' adapted from `demo(error.catching)`
124 ##' 208 ##'
125 ##' @title tryCatch both warnings (with value) and errors 209 ##' @title tryCatch both warnings (with value) and errors
126 ##' @param expr an \R expression to evaluate 210 ##' @param expr an \R expression to evaluate
127 ##' @return a list with 'value' and 'warning', where 211 ##' @return a list with 'value' and 'warning', where
128 ##' 'value' may be an error caught. 212 ##' 'value' may be an error caught.
129 ##' @author Martin Maechler; 213 ##' @author Martin Maechler;
130 ##' Copyright (C) 2010-2012 The R Core Team 214 ##' Copyright (C) 2010-2012 The R Core Team
131 try_catch_w_e <- function(expr) { 215 try_catch_w_e <-
132 wrn <- NULL 216 function(expr, error = function(e) e, warning = NULL) {
133 # warning handler 217 wrn <- NULL
134 w_handler <- function(w) { 218 # warning handler
135 wrn <<- w 219 w_handler <-
136 invokeRestart("muffleWarning") 220 if (is.function(warning))
137 } 221 warning
138 list( 222 else
139 value = withCallingHandlers( 223 function(w) {
140 tryCatch( 224 wrn <<- w
141 expr, 225 invokeRestart("muffleWarning")
142 error = function(e) e 226 }
227 e_handler <-
228 if (is.function(error))
229 error
230 else
231 function(e) e
232 # return result as list(value = ..., warning = ...)
233 # - value will be:
234 # - the result if no exception is thrown
235 # - the exception if an exception is caught
236 list(
237 value = withCallingHandlers(
238 tryCatch(
239 expr,
240 error = e_handler
241 ),
242 warning = w_handler
143 ), 243 ),
144 warning = w_handler 244 warning = wrn
145 ), 245 )
146 warning = wrn 246 }
147 ) 247
148 } 248 see_kvp <-
149 249 function(format, key, value, suffix = "") {
150 250 if (
151 write_debug_file <- function(s) { 251 !all(
152 if (const_write_debug_files) { 252 is.character(format),
153 s_path <- sprintf("test-data/%s.txt", deparse(substitute(s))) 253 is.character(key),
154 print(sprintf("DEBUG writing file %s", spath)) 254 is.character(value),
255 is.character(suffix)
256 )
257 ) {
258 cat("all arguments to see_kvp should be character")
259 knitr::knit_exit()
260 }
261 result <- sprintf(format, value)
262 if (length(result) > 1) {
263 sprintf(
264 "%s = c(%s)%s",
265 whack_underscores(key),
266 paste(result, collapse = ", "),
267 suffix
268 )
269 } else {
270 sprintf(
271 "%s = %s%s",
272 key,
273 result,
274 suffix
275 )
276 }
277 }
278
279 see_logical <-
280 function(x, suffix = "", xprssn = deparse1(substitute(x))) {
281 result <- as.character(as.logical(x))
282 # handle NAs and NaNs
283 result[is.na(result)] <- "NA"
284 see_kvp(
285 format = "%s",
286 key = xprssn,
287 value = result,
288 suffix = suffix
289 )
290 }
291
292 see_numeric <-
293 function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) {
294 if (is.numeric(digits) && is.numeric(x)) {
295 digits <- as.integer(digits)
296 digits <- min(16, max(0, digits))
297 format <- paste0("%0.", as.character(digits), "g")
298 result <- sprintf(format, x)
299 see_kvp(
300 format = "%s",
301 key = xprssn,
302 value = result,
303 suffix = suffix
304 )
305 }
306 }
307
308 see_character <-
309 function(x, suffix = "", xprssn = deparse1(substitute(x))) {
310 if (is.character(x)) {
311 see_kvp(
312 format = "%s",
313 key = xprssn,
314 value = sprintf("\"%s\"", x),
315 suffix = suffix
316 )
317 }
318 }
319
320 see_variable <-
321 function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) {
322 if (is.character(x)) {
323 see_character(x, suffix, xprssn)
324 } else if (is.numeric(x)) {
325 see_numeric(x, suffix, digits, xprssn)
326 } else if (is.logical(x)) {
327 see_logical(x, suffix, xprssn)
328 } else {
329 f <- file("")
330 sink(f)
331 str(x)
332 msg <- paste(readLines(f), collapse = "\n")
333 sink()
334 close(f)
335 paste0(
336 "see_variable - str(",
337 xprssn,
338 "):\n",
339 msg, "\n"
340 )
341 }
342 }
343
344 # ref: https://tug.org/texinfohtml/latex2e.html
345 # LaTeX sets aside the following characters for special purposes.
346 # For example, the percent sign % is for comments.
347 # They are called reserved characters or special characters.
348 # They are all discussed elsewhere in this manual.
349 #
350 # $ % & { } _ ~ ^ \ #
351 #
352 # If you want a reserved character to be printed as itself, in the text body
353 # font, for all but the final three characters in that list simply put
354 # a backslash \ in front of the character.
355 # Thus, typing \$1.23 will produce $1.23 in your output.
356 #
357 # As to the last three characters, to get a tilde in the text body font,
358 # use \~{} (omitting the curly braces would result in the next character
359 # receiving a tilde accent).
360 # Similarly, to get a text body font circumflex use \^{}.
361 # To get a backslash in the font of the text body enter \textbackslash{}.
362 whack_math <-
363 function(v) {
364 v <- as.character(v)
365 w <- gsub("\\", "\\textbackslash ", v, fixed = TRUE)
366 w <- Reduce(
367 f = function(l, r) {
368 gsub(r, paste0("\\", r), l, fixed = TRUE)
369 },
370 x = c("#", "$", "%", "&", "{", "}", "_"),
371 init = w
372 )
373 w <- gsub("^", "\\^{}", w, fixed = TRUE)
374 return(w)
375 if (all(v == w))
376 v
377 else
378 paste0("\\texttt{", w, "}")
379 }
380 whack_underscores <- whack_math
381
382 ## dump params to stderr (remove this eventually)
383
384 if (FALSE) nbe(see_variable(params))
385
386 ## unlist params for eventual output
387
388 param_unlist <- unlist(as.list(params))
389
390 # no need to whack underscores and dollars because this is verbatim
391 param_df <- data.frame(
392 parameter = paste0("\\verb@", names(param_unlist), "@"),
393 value = paste0(
394 "\n\\begin{tiny}\n\\verb@",
395 param_unlist,
396 "@\n\\end{tiny}"
397 )
398 )
399 param_df <- data.frame(
400 parameter = names(param_unlist),
401 value = param_unlist
402 )
403 param_df <- param_df[order(param_df$parameter), ]
404
405 ## general output control
406
407 debug_file_base_path <- params$debugFileBasePath
408 print_trace_messages <- params$printTraceMsgs
409 show_enriched_substrates <- params$showEnrichedSubstrates
410 boxplot_varwidth <- params$boxPlotVarWidth
411 boxplot_notch <- params$boxPlotNotch
412
413 ## parameters for static data
414
415 kinase_name_uprt_lut_bz2 <- params$kinaseNameUprtLutBz2
416 kinase_uprt_desc_lut_bz2 <- params$kinaseUprtDescLutBz2
417
418 ## parameters for input file
419
420 preproc_db <- params$preprocDb
421 alpha_file <- params$alphaFile
422 input_file <- params$inputFile
423
424 # First data column - ideally, this could be detected via
425 # regexSampleNames, but for now leave it as is.
426 first_data_column <- params$firstDataColumn
427 fdc_is_integer <- is.integer(first_data_column)
428 if (fdc_is_integer) {
429 first_data_column <- as.integer(params$firstDataColumn)
430 }
431
432 ## parameters for output files
433
434 ksea_app_prep_db <- params$kseaAppPrepDb
435 imputed_data_filename <- params$imputedDataFilename
436 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
437 anova_ksea_mtdt_file <- params$anovaKseaMetadata
438
439 ## parameters for imputation
440
441 # Imputation method, should be one of
442 # "random", "group-median", "median", or "mean"
443 imputation_method <- params$imputationMethod
444
445 # Selection of percentile of logvalue data to set the mean for random number
446 # generation when using random imputation
447 mean_percentile <- params$meanPercentile / 100.0
448
449 # deviation adjustment-factor for random values; real number.
450 sd_percentile <- params$sdPercentile
451
452 ## parameters for group parsing and filtering
453
454 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
455 regex_sample_names <- params$regexSampleNames
456 # Regular expression to extract Sample Grouping from Sample Name;
457 # if error occurs, compare smpl_trt vs. sample_name_matches
458 # to see if groupings/pairs line up
459 # e.g., "(\\d+)"
460
461 regex_sample_grouping <- params$regexSampleGrouping
462 # What are the patterns for filtering sample groups?
463 # How should sample groups be filtered?
464 # - none: do not filter
465 # - include: include sample groups matching filter
466 # - exclude: include sample groups not matching filter
467
468 sample_group_filter <- params$groupFilter
469 if (grepl("f", params$groupFilterMode, fixed = TRUE)) {
470 sample_group_filter_perl <- FALSE
471 sample_group_filter_fixed <- TRUE
472 } else if (grepl("p", params$groupFilterMode, fixed = TRUE)) {
473 sample_group_filter_perl <- TRUE
474 sample_group_filter_fixed <- FALSE
475 } else { # normal regex
476 sample_group_filter_perl <- FALSE
477 sample_group_filter_fixed <- FALSE
478 }
479
480 sample_group_filter_nocase <-
481 grepl("i", params$groupFilterMode, fixed = TRUE)
482
483 # What PCRE patterns should be included or excluded
484 group_filter_patterns_csv <- params$groupFilterPatterns
485 sample_group_filter_patterns <- strsplit(
486 x = group_filter_patterns_csv,
487 split = ",",
488 fixed = TRUE
489 )[[1]]
490
491 ## parameters for hypothesis testing
492
493 one_way_all_categories_fname <- params$oneWayManyCategories
494
495 one_way_all_categories <- try_catch_w_e(
496 match.fun(one_way_all_categories_fname))
497
498 if (!is.function(one_way_all_categories$value)) {
499 write("fatal error for parameter oneWayManyCategories:", stderr())
500 write(one_way_all_categories$value$message, stderr())
501 if (sys.nframe() > 0) {
502 cat("Cannot continue and quit() failed. Goodbye.")
503 knitr::knit_exit()
504 quit(save = "no", status = 1)
505 }
506 }
507
508 one_way_all_categories <- one_way_all_categories$value
509
510 one_way_two_categories_fname <- params$oneWayManyCategories
511 one_way_two_categories <- try_catch_w_e(
512 match.fun(one_way_two_categories_fname))
513 if (!is.function(one_way_two_categories$value)) {
514 cat("fatal error for parameter oneWayTwoCategories: \n")
515 cat(one_way_two_categories$value$message, fill = TRUE)
516 if (sys.nframe() > 0) {
517 cat("Cannot continue and quit() failed. Goodbye.")
518 knitr::knit_exit()
519 quit(save = "no", status = 1)
520 }
521 }
522 one_way_two_categories <- one_way_two_categories$value
523
524 ## parameters for KSEA
525
526 ksea_cutoff_statistic <- params$kseaCutoffStatistic
527 ksea_cutoff_threshold <- params$kseaCutoffThreshold
528 ksea_min_substrate_count <- params$kseaMinSubstrateCount
529
530 ## parameters for global variables consumed by functions
531
532 # intensityHeatmapCriteria: !r c("na_count", "p_value")[2] # TODO switch to 1
533 # TODO Validate within list
534 g_intensity_hm_criteria <- params$intensityHeatmapCriteria
535 if (is_string_null_or_empty(g_intensity_hm_criteria)) {
536 cat("invalid intensityHeatmapCriteria parameter (must be string)")
537 knitr::knit_exit()
538 }
539 switch(
540 g_intensity_hm_criteria,
541 "quality" = NULL,
542 "na_count" = NULL,
543 "p_value" = NULL,
544 {
545 with(
546 params,
547 cat(
548 sprintf(
549 "invalid %s (must be %s)",
550 see_variable(intensityHeatmapCriteria),
551 "one of quality or na_count or p_value"
552 )
553 )
554 )
555 knitr::knit_exit()
556 }
557 )
558
559 # intensityHeatmapRows: 50
560 # TODO Validate >> 0 < 75
561 g_intensity_hm_rows <- params$intensityHeatmapRows
562 if (!is.integer(g_intensity_hm_rows) || g_intensity_hm_rows < 1) {
563 cat("invalid intensityHeatmapRows (must be integer > 0)")
564 knitr::knit_exit()
565 }
566
567 g_intensity_min_per_class <- params$intensityMinValuesPerGroup
568 if (!is.integer(g_intensity_min_per_class) || g_intensity_min_per_class < 0) {
569 cat("invalid intensityMinValuesPerGroup (must be integer > -1")
570 knitr::knit_exit()
571 }
572
573 if (is.na(as.logical(g_correlate_substrates <- params$correlateSubstrates))) {
574 cat("invalid correlateSubstrates (must be TRUE or FALSE)")
575 knitr::knit_exit()
576 }
577
578 if (is.na(as.logical(g_filter_cov_var_gt_1 <- params$filterCovVarGT1))) {
579 cat("invalid filterCovVarGT1 parameter (must be TRUE or FALSE)")
580 knitr::knit_exit()
581 }
582
583 # TODO Validate >> 0 < 30
584 g_ppep_trunc_n <- params$ppepTruncN
585
586 # TODO Validate >> 0 < 30
587 g_subgene_trunc_n <- params$subgeneTruncN
588
589 # TODO Validate >> 0 < 30
590 g_sbstr_trunc_n <- params$substTruncN
591
592
593 ### OPERATORS
594
595 # Test for exclusion
596 # ref: https://www.reneshbedre.com/blog/in-operator-r.html
597 `%notin%` <- Negate(`%in%`)
598
599 # Augmented assignment
600 # ref: https://www2.cs.arizona.edu/icon/refernce/infix2.htm#aug_assign
601 `%||:=%` <- function(lvalue, ...) {
602 pf <- parent.frame()
603 rvalue <- Reduce(paste0, x = ..., init = lvalue)
604 assign(
605 x = as.character(substitute(lvalue)),
606 value = rvalue,
607 pos = pf
608 )
609 invisible(rvalue)
610 }
611
612 ### FUNCTIONS
613
614 no_op <-
615 function() {
616 }
617 # this function is not used in this file and should be removed while
618 # factoring out reusable code
619 all_apply <- function(f, v, na_rm = TRUE, ...) {
620 Reduce(
621 f = function(l, r) if (na_rm && is.na(r)) TRUE else l && r,
622 x = sapply(X = v, FUN = f, ...),
623 init = TRUE
624 )
625 }
626
627 write_debug_file <- function(data_frame) {
628 if (!is.null(debug_file_base_path)) {
629 s_path <-
630 sprintf(
631 "%s/%s.txt",
632 debug_file_base_path,
633 deparse(substitute(data_frame))
634 )
155 write.table( 635 write.table(
156 s, 636 data_frame,
157 file = s_path, 637 file = s_path,
158 sep = "\t", 638 sep = "\t",
159 col.names = TRUE, 639 col.names = TRUE,
160 row.names = TRUE, 640 row.names = TRUE,
161 quote = FALSE 641 quote = FALSE
172 # Hence, `x <- 1; get("x", new_env())` fails by design. 652 # Hence, `x <- 1; get("x", new_env())` fails by design.
173 new_env <- function() { 653 new_env <- function() {
174 new.env(parent = emptyenv()) 654 new.env(parent = emptyenv())
175 } 655 }
176 656
657 # make apply readable for rows
658 row_apply <- function(x, fun, ..., simplify = TRUE) {
659 apply(x, MARGIN = 1, fun, ..., simplify = TRUE)
660 }
661
662 # make apply readable for columns
663 column_apply <- function(x, fun, ..., simplify = TRUE) {
664 apply(x, MARGIN = 2, fun, ..., simplify = TRUE)
665 }
666
667 ##' Produce a vector of boolean values whose i-th value is TRUE when any
668 ##' member of v matches the i-th membr of s, where i in 1:seq_len(length(s))
669 ##'
670 ##' @title Search multiple strings for matches of multiple substrings
671 ##' @param v a vector of substrings to match
672 ##' @param s a vector of strings to search for matches
673 ##' @param ... additional arguments to grepl
674 ##' @return a list with keys in s and valuse that are vectors of elements of v
675 ##' @author Art Eschenlauer
676 ##' Copyright (C) 2022 Art Eschenlauer;
677 ##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms
678 mgrepl <- function(v, s, ...) {
679 grpl_rslt <- rep_len(0, length(s))
680 for (vi in v) {
681 grpl_rslt_v <- sapply(
682 X = s,
683 FUN = function(t) {
684 Reduce(
685 f = function(l, r) if (is.null(l)) r else c(l, r),
686 x = sapply(
687 X = vi,
688 FUN = function(f) grepl(f, t, ...)
689 ),
690 init = c()
691 )
692 },
693 simplify = "array"
694 )
695 grpl_rslt <- grpl_rslt + grpl_rslt_v
696 }
697 rslt <- unname(grpl_rslt > 0)
698 }
699
700 ##' Produce positions in a vector where succeeding value != current valus
701 ##'
702 ##' @title Search vector for neighboring positions having different values
703 ##' @param v a vector of comparable numeric values (e.g. integers)
704 ##' @return a vector of positions i where v[i] != v[i + 1]
705 ##' @author Art Eschenlauer
706 ##' Copyright (C) 2022 Art Eschenlauer;
707 ##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms
708 transition_positions <- function(v) {
709 Reduce(
710 f = function(l, i) if ((i != 1) && (v[i - 1] != v[i])) c(l, i - 1) else l,
711 x = seq_along(v)[-1:0],
712 init = c()
713 )
714 }
715
716 ### figure debug functions
717
718 cat_par_vector <- function(par_name, lbl = "", newlines = TRUE) {
719 cat(
720 sprintf(
721 "%spar(%s) = c(%s)%s",
722 lbl,
723 par_name,
724 paste(par(par_name), collapse = ", "),
725 if (newlines) "\n\n" else ""
726 )
727 )
728 }
729
730 cat_margins <- function(lbl = NULL) {
731 for (p in c("fig", "fin", "mar", "mai", "omd", "omi", "oma"))
732 cat_par_vector(p, if (!is.null(lbl)) paste0(lbl, " ") else NULL)
733 }
734
735 cat_variable <-
736 function(x, suffix = "", digits = 3, force_str = FALSE) {
737 xprssn <- deparse1(substitute(x))
738 if (force_str || is.matrix(x) || is.list(x) || is.data.frame(x)) {
739 cat(
740 paste0(
741 "\n\\texttt{\\textbf{",
742 whack_underscores(xprssn),
743 "}} [",
744 typeof(x),
745 ",",
746 mode(x),
747 "] =\n"
748 )
749 )
750 cat("\n\\begin{verbatim}\n")
751 str(x)
752 cat("\n\\end{verbatim}\n")
753 } else {
754 cat("\n", see_variable(x, suffix, digits, xprssn))
755 }
756 }
757
758 ### structure helper functions
759
760 # ref: staque.R - Icon-oriented stack and queue operations
761 # - https://gist.github.com/eschen42/917690355e53918b9e7ba7138a02d1f8
762 #
763 # sq_get(v):x produces the leftmost element of v and removes it from v,
764 # but produces NA if v is empty
765 sq_get <- function(v) {
766 if (length(v) == 0) return(NA)
767 assign(as.character(substitute(v)), v[-1], parent.frame())
768 return(v[1])
769 }
770 #
771 # sq_put(v,x1,...,xn):v puts x1, x2, ..., xn onto the right end of v,
772 # producing v.
773 # Values are pushed in order from left to right,
774 # so xn becomes the last (rightmost) value on v.
775 # sq_put(v) with no second argument does nothing.
776 sq_put <- function(v, x = NA, ...) {
777 pf <- parent.frame()
778 if (is.null(x)) return(pf$v)
779 if (
780 !(length(x) > 1) &&
781 !rlang::is_closure(x) &&
782 is.na(x)
783 ) return(pf$v)
784 assign(as.character(substitute(v)), c(v, x, ...), pf)
785 pf[[as.character(substitute(v))]]
786 }
787
177 ### numerical/statistical helper functions 788 ### numerical/statistical helper functions
178 789
179 any_nan <- function(x) { 790 any_nan <- function(x) {
180 !any(x == "NaN") 791 !any(x == "NaN")
181 } 792 }
184 sd_finite <- function(x) { 795 sd_finite <- function(x) {
185 ok <- is.finite(x) 796 ok <- is.finite(x)
186 sd(x[ok]) 797 sd(x[ok])
187 } 798 }
188 799
800 # compute anova raw p-value
189 anova_func <- function(x, grouping_factor, one_way_f) { 801 anova_func <- function(x, grouping_factor, one_way_f) {
190 subject <- data.frame( 802 subject <- data.frame(
191 intensity = x 803 intensity = x
192 ) 804 )
193 x_aov <- 805 x_aov <-
201 else 813 else
202 pvalue <- x_aov$p.value 814 pvalue <- x_aov$p.value
203 pvalue 815 pvalue
204 } 816 }
205 817
818 # This code adapted from matrixcalc::is.positive.definite
819 # Notably, this simply tests without calling stop()
820 is_positive_definite <- function(x, tol = 1e-08) {
821 if (!is.matrix(x))
822 return(FALSE)
823 if (!is.numeric(x))
824 return(FALSE)
825 if (nrow(x) < 1)
826 return(FALSE)
827 if (ncol(x) < 1)
828 return(FALSE)
829 if (nrow(x) != ncol(x))
830 return(FALSE)
831 sum_symm <- sum(x == t(x), na.rm = TRUE)
832 value_count <- Reduce("*", dim(x))
833 if (sum_symm != value_count)
834 return(FALSE)
835 eigenvalues <- eigen(x, only.values = TRUE)$values
836 n <- nrow(x)
837 for (i in 1:n) {
838 if (abs(eigenvalues[i]) < tol) {
839 eigenvalues[i] <- 0
840 }
841 }
842 if (any(eigenvalues <= 0)) {
843 return(FALSE)
844 }
845 return(TRUE)
846 }
206 847
207 ### LaTeX functions 848 ### LaTeX functions
208 849
209 latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) {
210 v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v
211 cat(
212 paste0(
213 v_sub,
214 collapse = collapse_string
215 )
216 )
217 }
218
219 latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
220 cat("\\begin{itemize}\n\\item ")
221 latex_collapsed_vector(collapse_string, v, underscore_whack)
222 cat("\n\\end{itemize}\n")
223 }
224
225 latex_itemized_list <- function(v, underscore_whack = TRUE) {
226 latex_itemized_collapsed("\n\\item ", v, underscore_whack)
227 }
228
229 latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
230 cat("\\begin{enumerate}\n\\item ")
231 latex_collapsed_vector(collapse_string, v, underscore_whack)
232 cat("\n\\end{enumerate}\n")
233 }
234
235 latex_enumerated_list <- function(v) {
236 latex_enumerated_collapsed("\n\\item ", v)
237 }
238
239 latex_table_row <- function(v, extra = "", underscore_whack = TRUE) {
240 latex_collapsed_vector(" & ", v, underscore_whack)
241 cat(extra)
242 cat(" \\\\\n")
243 }
244
245 # Use this like print.data.frame, from which it is adapted: 850 # Use this like print.data.frame, from which it is adapted:
246 data_frame_latex <- 851 data_frame_table_latex <-
247 function( 852 function(
248 x, 853 x,
249 ...,
250 # digits to pass to format.data.frame 854 # digits to pass to format.data.frame
251 digits = NULL, 855 digits = NULL,
252 # TRUE -> right-justify columns; FALSE -> left-justify 856 # TRUE -> right-justify columns; FALSE -> left-justify
253 right = TRUE, 857 right = TRUE,
254 # maximumn number of rows to print 858 # maximumn number of rows to print
260 # optional caption 864 # optional caption
261 caption = NULL, 865 caption = NULL,
262 # h(inline); b(bottom); t (top) or p (separate page) 866 # h(inline); b(bottom); t (top) or p (separate page)
263 anchor = "h", 867 anchor = "h",
264 # set underscore_whack to TRUE to escape underscores 868 # set underscore_whack to TRUE to escape underscores
265 underscore_whack = TRUE 869 underscore_whack = TRUE,
870 # how to emit results
871 emit = cat
266 ) { 872 ) {
267 if (is.null(justification)) 873 if (is.null(justification))
268 justification <- 874 justification <-
269 Reduce( 875 Reduce(
270 f = paste, 876 f = paste,
271 x = rep_len(if (right) "r" else "l", length(colnames(x))) 877 x = rep_len(if (right) "r" else "l", length(colnames(x)))
272 ) 878 )
879 n <- length(rownames(x))
880 if (length(x) == 0L) {
881 emit(
882 sprintf(
883 # if n is one, use singular 'row', else use plural 'rows'
884 ngettext(
885 n,
886 "data frame with 0 columns and %d row",
887 "data frame with 0 columns and %d rows"
888 ),
889 n
890 ),
891 "\n",
892 sep = ""
893 )
894 } else if (n == 0L) {
895 emit("0 rows for:\n")
896 latex_itemized_list(
897 v = names(x),
898 underscore_whack = underscore_whack
899 )
900 } else {
901 if (is.null(max))
902 max <- getOption("max.print", 99999L)
903 if (!is.finite(max)) {
904 cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max)
905 knitr::knit_exit()
906 }
907 omit <- (n0 <- max %/% length(x)) < n
908 m <- as.matrix(
909 format.data.frame(
910 if (omit) x[seq_len(n0), , drop = FALSE] else x,
911 digits = digits,
912 na.encode = FALSE
913 )
914 )
915 emit(
916 # h(inline); b(bottom); t (top) or p (separate page)
917 paste0("\\begin{table}[", anchor, "]"),
918 "\\leavevmode",
919 sep = "\n"
920 )
921 if (!is.null(caption))
922 emit(paste0(" \\caption{", caption, "}"))
923 if (centered) emit("\\centering\n")
924 emit(
925 paste(
926 " \\begin{tabular}{",
927 justification,
928 "}\n",
929 sep = ""
930 )
931 )
932
933 # ref for top and bottom struts (\T and \B):
934 # https://tex.stackexchange.com/a/50355
935 if (!is.null(caption))
936 emit("\\B \\\\\n")
937 latex_table_row(
938 v = colnames(m),
939 extra = " \\T \\B",
940 underscore_whack = underscore_whack
941 )
942 emit("\\hline \\\\\n")
943 for (i in seq_len(length(m[, 1]))) {
944 latex_table_row(
945 v = m[i, ],
946 underscore_whack = underscore_whack
947 )
948 }
949 emit(
950 paste(
951 " \\end{tabular}",
952 "\\end{table}",
953 sep = "\n"
954 )
955 )
956 if (omit)
957 emit(" [ reached 'max' / getOption(\"max.print\") -- omitted",
958 n - n0, "rows ]\n")
959 }
960 invisible(x)
961 }
962
963 # Use this like print.data.frame, from which it is adapted:
964 data_frame_tabbing_latex <-
965 function(
966 x,
967 # vector of tab stops, in inches
968 tabstops,
969 # vector of headings, registered with tab-stops
970 headings = colnames(x),
971 # digits to pass to format.data.frame
972 digits = NULL,
973 # maximumn number of rows to print
974 max = NULL,
975 # optional caption
976 caption = NULL,
977 # set underscore_whack to TRUE to escape underscores
978 underscore_whack = TRUE,
979 # flag for landscape mode
980 landscape = FALSE,
981 # flag indicating that subsubsection should be used for caption
982 # rather than subsection
983 use_subsubsection_header = TRUE,
984 # character-size indicator; for possible values, see:
985 # https://tug.org/texinfohtml/latex2e.html#Font-sizes
986 charactersize = "small",
987 # set verbatim to TRUE to debug output
988 verbatim = FALSE
989 ) {
990
991 hlinport <- if (landscape) {
992 function() cat("\\hlinlscp \\\\\n")
993 } else {
994 function() cat("\\hlinport \\\\\n")
995 }
996
997 tabstops_tex <-
998 Reduce(
999 f = function(l, r) paste0(l, r),
1000 x = sprintf("\\hspace{%0.2fin}\\=", tabstops),
1001 init = ""
1002 )
1003
273 n <- length(rownames(x)) 1004 n <- length(rownames(x))
274 if (length(x) == 0L) { 1005 if (length(x) == 0L) {
275 cat( 1006 cat(
276 sprintf( 1007 sprintf(
277 # if n is one, use singular 'row', else use plural 'rows' 1008 # if n is one, use singular 'row', else use plural 'rows'
292 underscore_whack = underscore_whack 1023 underscore_whack = underscore_whack
293 ) 1024 )
294 } else { 1025 } else {
295 if (is.null(max)) 1026 if (is.null(max))
296 max <- getOption("max.print", 99999L) 1027 max <- getOption("max.print", 99999L)
297 if (!is.finite(max)) 1028 if (!is.finite(max)) {
298 stop("invalid 'max' / getOption(\"max.print\"): ", 1029 cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max)
299 max) 1030 knitr::knit_exit()
1031 }
300 omit <- (n0 <- max %/% length(x)) < n 1032 omit <- (n0 <- max %/% length(x)) < n
301 m <- as.matrix( 1033 m <- as.matrix(
302 format.data.frame( 1034 format.data.frame(
303 if (omit) x[seq_len(n0), , drop = FALSE] else x, 1035 if (omit) x[seq_len(n0), , drop = FALSE] else x,
304 digits = digits, 1036 digits = digits,
305 na.encode = FALSE 1037 na.encode = FALSE
306 ) 1038 )
307 ) 1039 )
308 cat( 1040 if (landscape)
309 # h(inline); b(bottom); t (top) or p (separate page) 1041 cat("\n\\begin{landscape}")
310 paste0("\\begin{table}[", anchor, "]\n") 1042 tex_caption <-
311 ) 1043 if (!is.null(caption)) sprintf("\\captionof{table}{%s}\n", caption)
312 if (!is.null(caption)) 1044 else "\n"
313 cat(paste0(" \\caption{", caption, "}")) 1045 # build the column names, which have multiple lines when
314 if (centered) cat("\\centering\n") 1046 # length(headings) is a multiple of the number of columns
315 cat( 1047 column_names <- ""
316 paste( 1048 while (length(headings) > 0) {
317 " \\begin{tabular}{", 1049 my_row <- c()
318 justification, 1050 for (i in 1:(1 + length(tabstops))) {
319 "}\n", 1051 my_field <- sq_get(headings)
320 sep = "" 1052 sq_put(my_row, if (is.na(my_field)) "" else my_field)
321 ) 1053 }
322 ) 1054 column_names %||:=% latex_tabbing_row(
323 # ref: https://tex.stackexchange.com/a/50353 1055 v = my_row,
324 # Describes use of \rule{0pt}{3ex} 1056 underscore_whack = underscore_whack,
325 if (!is.null(caption)) 1057 action = paste0
326 cat("\\B \\\\ \\hline\\hline\n")
327 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355
328 latex_table_row(
329 v = colnames(m),
330 extra = "\\T\\B",
331 underscore_whack = underscore_whack
332 )
333 cat("\\hline\n")
334 for (i in seq_len(length(m[, 1]))) {
335 latex_table_row(
336 v = m[i, ],
337 underscore_whack = underscore_whack
338 ) 1058 )
339 } 1059 }
1060
1061 # Begin tabbing environment after beginning charactersize environment
1062 if (verbatim) cat("\n\\begin{verbatim}")
340 cat( 1063 cat(
341 paste( 1064 paste0(
342 " \\end{tabular}", 1065 "\n\\begin{", charactersize, "}", tex_caption,
343 "\\end{table}", 1066 "\\begin{tabwrap}{", tabstops_tex, "}\n"
344 sep = "\n"
345 )
346 ) 1067 )
1068 )
1069 # emit column names
1070 cat(column_names)
1071 # emit hline
1072 hlinport()
1073 for (i in seq_len(length(m[, 1]))) {
1074 my_row <- latex_tabbing_row(
1075 v = m[i, ],
1076 underscore_whack = underscore_whack,
1077 action = paste0
1078 )
1079 if (FALSE)
1080 cat(my_row)
1081 else
1082 cat(my_row)
1083 }
1084 hlinport()
347 if (omit) 1085 if (omit)
348 cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", 1086 cat(" [ reached 'max' / getOption(\"max.print\") -- omitted",
349 n - n0, "rows ]\n") 1087 n - n0, "rows ]\n")
1088 # End charactersize environment after ending tabbing environment
1089 cat(paste0("\\end{tabwrap}\n\\end{", charactersize, "}\n"))
1090 if (verbatim) cat("\\end{verbatim}\n")
1091 if (landscape)
1092 cat("\\end{landscape}\n")
350 } 1093 }
351 invisible(x) 1094 invisible(x)
1095 }
1096
1097 param_df_noexit <-
1098 function(e = NULL) {
1099 data_frame_tabbing_latex(
1100 x = param_df,
1101 tabstops = c(1.75),
1102 underscore_whack = TRUE,
1103 caption = "Input parameters",
1104 verbatim = FALSE
1105 )
1106 if (!is.null(e)) {
1107 sink(stderr())
1108 cat("Caught fatal error:\n\n")
1109 str(e)
1110 sink()
1111 }
1112 }
1113
1114 param_df_exit <-
1115 function(e = NULL) {
1116 param_df_noexit(e)
1117 knitr::knit_exit()
1118 exit(-1)
1119 }
1120
1121 # exit with exit code (default 0) and optional msg
1122 exit <-
1123 function(code = 0, msg = NULL, use_stderr = FALSE) {
1124 if (!is.null(msg)) {
1125 if (use_stderr) sink(stderr())
1126 cat("\n\n", msg, "\n\n")
1127 if (use_stderr) sink()
1128 }
1129 q(save = "no", status = code)
1130 }
1131
1132 # make control sequences into printable latex sequences
1133 latex_printable_control_seqs <-
1134 function(s) {
1135 s <- gsub("[\\]", "xyzzy_plugh", s)
1136 s <- gsub("[$]", "\\\\$", s)
1137 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s)
1138 return(s)
1139 }
1140 nolatex_verbatim <-
1141 function(expr) eval(expr)
1142
1143 latex_verbatim <-
1144 function(expr) {
1145 arg_string <- deparse1(substitute(expr))
1146 cat("\n\\begin{verbatim}\n___\n")
1147 tryCatch(
1148 expr = expr,
1149 error = param_df_exit,
1150 #ACE error =
1151 #ACE function(e) {
1152 #ACE cat("Caught error:\n\n")
1153 #ACE str(e)
1154 #ACE knitr::knit_exit()
1155 #ACE stop(e)
1156 #ACE },
1157 finally = cat("...\n\\end{verbatim}\n")
1158 )
1159 }
1160
1161 latex_samepage <-
1162 function(expr) {
1163 arg_string <- deparse1(substitute(expr))
1164 cat("\n\\begin{samepage}\n")
1165 tryCatch(
1166 expr = expr,
1167 error = param_df_exit,
1168 #ACE error =
1169 #ACE function(e) {
1170 #ACE cat("Caught error:\n\n")
1171 #ACE str(e)
1172 #ACE knitr::knit_exit()
1173 #ACE stop(e)
1174 #ACE },
1175 finally = cat("\n\\end{samepage}\n")
1176 )
1177 }
1178
1179 # return the result of invocation after showing parameters
1180 # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/
1181 latex_show_invocation <-
1182 function(f, f_name = deparse1(substitute(f)), head_patch = FALSE) {
1183 function(...) {
1184 my_env <- (as.list(environment()))
1185 va <- list(...)
1186 my_rslt <- new_env()
1187 my_rslt$rslt <- NULL
1188 latex_verbatim(
1189 expr = {
1190 cat(sprintf("\n .. Local variables for '%s':\n\n", f_name))
1191 str(va)
1192 if (!head_patch) {
1193 # return this result
1194 # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/
1195 cat(sprintf("\n .. Invoking '%s'\n", f_name))
1196 tryCatch(
1197 {
1198 cat("\n\\end{verbatim}\n")
1199 rslt <- do.call(f, va)
1200 },
1201 error = param_df_exit,
1202 #ACE error = function(e) {
1203 #ACE cat("\n\\begin{verbatim}\n")
1204 #ACE str(e)
1205 #ACE cat("\n\\end{verbatim}\n")
1206 #ACE knitr::knit_exit()
1207 #ACE stop(e)
1208 #ACE },
1209 finally = cat("\n\\begin{verbatim}\n")
1210 )
1211 cat(sprintf("\n .. '%s' returned:\n", f_name))
1212 str(rslt)
1213 my_rslt$rslt <- rslt
1214 }
1215 }
1216 )
1217 # return the result of invocation with the shown parameters
1218 # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/
1219 if (head_patch) my_rslt$rslt <- do.call(f, va)
1220 (my_rslt$rslt)
1221 }
1222 }
1223
1224 latex_collapsed_vector <- function(
1225 collapse_string,
1226 v,
1227 underscore_whack = TRUE,
1228 action = cat0
1229 ) {
1230 v_sub <-
1231 if (underscore_whack) whack_underscores(v) else v
1232 action(
1233 paste0(
1234 v_sub,
1235 collapse = collapse_string
1236 )
1237 )
1238 }
1239
1240 latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
1241 cat("\\begin{itemize}\n\\item ")
1242 latex_collapsed_vector(collapse_string, v, underscore_whack)
1243 cat("\n\\end{itemize}\n")
1244 }
1245
1246 latex_itemized_list <- function(v, underscore_whack = TRUE) {
1247 latex_itemized_collapsed("\n\\item ", v, underscore_whack)
1248 }
1249
1250 latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
1251 cat("\\begin{enumerate}\n\\item ")
1252 latex_collapsed_vector(collapse_string, v, underscore_whack)
1253 cat("\n\\end{enumerate}\n")
1254 }
1255
1256 latex_enumerated_list <- function(v) {
1257 latex_enumerated_collapsed("\n\\item ", v)
1258 }
1259
1260 latex_table_row <- function(v, extra = "", underscore_whack = TRUE) {
1261 latex_collapsed_vector(" & ", v, underscore_whack)
1262 cat(extra)
1263 cat(" \\\\\n")
1264 }
1265
1266 latex_tabbing_row <- function(
1267 v,
1268 extra = "",
1269 underscore_whack = TRUE,
1270 action = cat0
1271 ) {
1272 # latex_collapsed_vector applies action to result of paste0;
1273 # by default, action = cat;
1274 # hence, a scalar string is assigned to v_collapsed
1275 v_collapsed <-
1276 latex_collapsed_vector(
1277 "} \\> \\tabfill{",
1278 v,
1279 underscore_whack,
1280 action = paste0
1281 )
1282 action(
1283 "\\tabfill{",
1284 v_collapsed,
1285 "}",
1286 extra,
1287 " \\\\\n"
1288 )
1289 }
1290
1291 # N.B. use con = "" to emulate regular cat
1292 fcat0 <-
1293 function(..., sprtr = " ", cnnctn = file()) {
1294 cat0(..., sep = sprtr, file = cnnctn)
1295 invisible(cnnctn)
352 } 1296 }
353 1297
354 hypersub <- 1298 hypersub <-
355 function(s) { 1299 function(s) {
356 hyper <- tolower(s) 1300 hyper <- tolower(s)
357 hyper <- gsub("[^a-z0-9]+", "-", hyper) 1301 hyper <- gsub("[^a-z0-9]+", "-", hyper)
358 hyper <- gsub("[-]+", "-", hyper) 1302 hyper <- gsub("[-]+", "-", hyper)
1303 hyper <- gsub("[_]+", "-", hyper)
359 hyper <- sub("^[-]", "", hyper) 1304 hyper <- sub("^[-]", "", hyper)
360 hyper <- sub("[-]$", "", hyper) 1305 hyper <- sub("[-]$", "", hyper)
361 return(hyper) 1306 return(hyper)
362 } 1307 }
363 1308
364 subsection_header <- 1309 table_href <- function(s = "offset", caption = "") {
365 function(s) { 1310 paste0("\\hyperlink{table.\\arabic{", s, "}}{Table \\arabic{", s, "}}")
1311 }
1312
1313 table_offset <- function(i = 0, s = "offset", new = FALSE) {
1314 paste0(
1315 if (new) paste0("\\newcounter{", s, "}\n") else "",
1316 "\\setcounter{", s, "}{\\value{table}}\n",
1317 paste0(if (i > 0) rep(paste0("\\stepcounter{", s, "}"), i), "\n")
1318 )
1319 }
1320
1321 a_section_header <-
1322 function(s, prefix = "") {
366 hyper <- hypersub(s) 1323 hyper <- hypersub(s)
367 cat( 1324 my_subsection_header <- sprintf(
368 sprintf( 1325 "\\hypertarget{%s}{\\%ssection{%s}\\label{%s}}\n",
369 "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n", 1326 hyper,
370 hyper, s, hyper 1327 prefix,
371 ) 1328 gsub("_", "\\_", s, fixed = TRUE),
372 ) 1329 hyper
373 } 1330 )
374 1331 my_subsection_header
375 subsubsection_header <- 1332 }
376 function(s) { 1333 section_header <- function(s) a_section_header(s, "")
377 hyper <- hypersub(s) 1334 subsection_header <- function(s) a_section_header(s, "sub")
378 cat( 1335 subsubsection_header <- function(s) a_section_header(s, "subsub")
379 sprintf(
380 "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n",
381 hyper, s, hyper
382 )
383 )
384 }
385 1336
386 ### SQLite functions 1337 ### SQLite functions
387 1338
388 ddl_exec <- function(db, sql) { 1339 ddl_exec <- function(db, sql) {
389 discard <- DBI::dbExecute(conn = db, statement = sql) 1340 discard <- DBI::dbExecute(conn = db, statement = sql)
417 } 1368 }
418 } 1369 }
419 1370
420 ### KSEA functions and helpers 1371 ### KSEA functions and helpers
421 1372
422 # Adapted from KSEAapp::KSEA.Scores to allow retrieval of: 1373 #' The KSEA App Analysis (KSEA Kinase Scores Only)
423 # - maximum log2(FC) 1374 #'
1375 #' Compute KSEA kinase scores and statistics from phoshoproteomics data input
1376 #' Adapted from KSEAapp::KSEA.Scores to allow retrieval of maximum log2(FC)
1377 #'
1378 #' Result is an R data.frame with column names
1379 #' "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
1380 #' "Please refer to the original Casado et al. publication for detailed
1381 #' description of these columns and what they represent:
1382 #'
1383 #' - Kinase.Gene indicates the gene name for each kinase.
1384 #' - mS represents the mean log2(fold change) of all the
1385 #' kinase's substrates.
1386 #' - Enrichment is the background-adjusted value of the kinase's mS.
1387 #' - m is the total number of detected substrates
1388 #' from the experimental dataset for each kinase.
1389 #' - z.score is the normalized score for each kinase, weighted by
1390 #' the number of identified substrates.
1391 #' - p.value represents the statistical assessment for the z.score.
1392 #' - FDR is the p-value adjusted for multiple hypothesis testing
1393 #' using the Benjamini & Hochberg method."
1394 #'
1395 #' @param ksdata the Kinase-Substrate dataset uploaded from the file
1396 #' prefaced with "PSP&NetworKIN_"
1397 #' available from github.com/casecpb/KSEA/
1398 #' @param px the experimental data file formatted as described in
1399 #' the KSEA.Complete() documentation
1400 #' @param networkin a binary input of TRUE or FALSE, indicating whether
1401 #' or not to include NetworKIN predictions, where
1402 #' \code{NetworKIN = TRUE}
1403 #' means include NetworKIN predictions
1404 #' @param networkin_cutoff a numeric value between 1 and infinity setting
1405 #' the minimum NetworKIN score
1406 #' (this can be omitted if NetworKIN = FALSE)
1407 #'
1408 #' @return creates a new R data.frame with all the KSEA kinase
1409 #' scores, along with each one's statistical
1410 #' assessment, as described herein.
1411 #'
1412 #' @references
1413 #'
1414 #' Casado et al. (2013) Sci Signal. 6(268):rs6
1415 #'
1416 #' Hornbeck et al. (2015) Nucleic Acids Res. 43:D512-20
1417 #'
1418 #' Horn et al. (2014) Nature Methods 11(6):603-4
1419 #'
424 ksea_scores <- function( 1420 ksea_scores <- function(
425
426 # For human data, typically, ksdata = KSEAapp::ksdata 1421 # For human data, typically, ksdata = KSEAapp::ksdata
427 ksdata, 1422 ksdata,
428 1423
429 # Input data file having columns: 1424 # Input data file having columns:
430 # - Protein : abbreviated protein name 1425 # - Protein : abbreviated protein name
442 # NetworKIN predictions 1437 # NetworKIN predictions
443 networkin, 1438 networkin,
444 1439
445 # A numeric value between 1 and infinity setting the minimum NetworKIN 1440 # A numeric value between 1 and infinity setting the minimum NetworKIN
446 # score (can be left out if networkin = FALSE) 1441 # score (can be left out if networkin = FALSE)
447 networkin_cutoff 1442 networkin_cutoff,
1443
1444 # Minimum substrate count, necessary to adjust the p-value appropriately.
1445 minimum_substrate_count
448 1446
449 ) { 1447 ) {
1448 # no px$FC should be <= 0, but abs(px$FC) is used below as a precaution.
450 if (length(grep(";", px$Residue.Both)) == 0) { 1449 if (length(grep(";", px$Residue.Both)) == 0) {
451 # There are no Residue.Both entries having semicolons, so new is 1450 # There are no Residue.Both entries having semicolons, so new is
452 # simply px except two columns are renamed and a column is added 1451 # simply px except two columns are renamed and a column is added
453 # for log2(abs(fold-change)) 1452 # for log2(abs(fold-change))
454 new <- px 1453 new <- px
490 # Convert any illegal values from NaN to NA 1489 # Convert any illegal values from NaN to NA
491 new[is.nan(new$log2_fc), "log2_fc"] <- NA 1490 new[is.nan(new$log2_fc), "log2_fc"] <- NA
492 # Eliminate rows having missing values (e.g., non-imputed data) 1491 # Eliminate rows having missing values (e.g., non-imputed data)
493 new <- new[complete.cases(new$log2_fc), ] 1492 new <- new[complete.cases(new$log2_fc), ]
494 } 1493 }
495 if (networkin == TRUE) { 1494 # At this point, new$log2_fc is signed according to which contrast has
496 # When NetworKIN is true, filter on NetworKIN.cutoff which includes 1495 # the greater intensity
497 # PhosphoSitePlus data *because its networkin_score is set to Inf* 1496 # To take the magnitude into account without taking the direction into
498 ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ] 1497 # account, set params$kseaUseAbsoluteLog2FC to TRUE
499 ksdata_filtered <- ksdata_filtered[ 1498 #
500 (ksdata_filtered$networkin_score >= networkin_cutoff), ] 1499 # Should KSEA be performed aggregating signed log2FC or absolute?
501 } else { 1500 # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores
502 # Otherwise, simply use PhosphSitePlus rows 1501 if (params$kseaUseAbsoluteLog2FC) {
503 ksdata_filtered <- ksdata[ 1502 # TRUE use abs(log2FC) for KSEA as Justin requested; this is a
504 grep("PhosphoSitePlus", ksdata$Source), ] 1503 # justifiable deviation from the KSEAapp::KSEA.Scores algorithm.
505 } 1504 new$log2_fc <- abs(new$log2_fc)
506 # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD 1505 }
1506
1507 monitor_filtration_on_stderr <- TRUE
1508 if (monitor_filtration_on_stderr) {
1509 # set to TRUE to monitor filtration on stderr
1510 sink(stderr())
1511 cat(see_variable(networkin, "\n"))
1512 }
1513 ksdata_filtered <-
1514 sqldf(
1515 sprintf("%s %s",
1516 "select * from ksdata where not Source = 'NetworKIN'",
1517 if (networkin)
1518 sprintf("or networkin_score >= %d", networkin_cutoff)
1519 else
1520 ""
1521 )
1522 )
1523 if (monitor_filtration_on_stderr) {
1524 cat(see_variable(sqldf(
1525 "select count(*), Source from ksdata group by Source"), "\n"))
1526 cat(see_variable(sqldf(
1527 "select count(*), Source from ksdata_filtered group by Source"), "\n"))
1528 sink()
1529 }
1530
1531 ############################################################################
1532 # Line numbers below refer to lines of:
1533 # https://github.com/casecpb/KSEAapp/blob/master/R/KSEA.Scores.R
1534 # I would put the original line in a comment but then lint would complain...
1535 # - Indeed, I had to rename all the variables because lint didn't like names
1536 # containing periods or capital letters.
1537 # ACE
1538 ############################################################################
1539 #
1540 # (1) Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD
507 # colnames of ksdata_filtered: 1541 # colnames of ksdata_filtered:
508 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID" 1542 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID"
509 # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID" 1543 # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID"
510 # "SITE_...7_AA" "networkin_score" "Source" 1544 # "SITE_...7_AA" "networkin_score" "Source"
511 # colnames of new: 1545 # colnames of new:
514 # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc 1548 # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc
515 # FROM ksdata_filtered a 1549 # FROM ksdata_filtered a
516 # INNER JOIN new b 1550 # INNER JOIN new b
517 # ON a.SUB_GENE = b.SUB_GENE 1551 # ON a.SUB_GENE = b.SUB_GENE
518 # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD 1552 # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD
1553 # (KSEA.Scores.R line # 105)
1554 # "Extract KSData.filtered annotations that are only found in new"
519 ksdata_dataset <- base::merge(ksdata_filtered, new) 1555 ksdata_dataset <- base::merge(ksdata_filtered, new)
520 # colnames of ksdata_dataset: 1556 # colnames of ksdata_dataset:
521 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" 1557 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE"
522 # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" 1558 # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD"
523 # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein" 1559 # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein"
524 # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform) 1560 # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform)
525 # Re-order dataset; prior to accounting for isoforms 1561 # Re-order dataset; prior to accounting for isoforms
1562 # (KSEA.Scores.R line # 106)
526 ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] 1563 ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ]
527 # Extract non-isoform accession in UniProtKB 1564 # Extract non-isoform accession in UniProtKB
1565 # (KSEA.Scores.R line # 107)
528 ksdata_dataset$uniprot_no_isoform <- sapply( 1566 ksdata_dataset$uniprot_no_isoform <- sapply(
529 ksdata_dataset$KIN_ACC_ID, 1567 ksdata_dataset$KIN_ACC_ID,
530 function(x) unlist(strsplit(as.character(x), split = "-"))[1] 1568 function(x) unlist(strsplit(as.character(x), split = "-"))[1]
531 ) 1569 )
1570 # "last expression collapses isoforms ... for easy processing"
532 # Discard previous results while selecting interesting columns ... 1571 # Discard previous results while selecting interesting columns ...
1572 # (KSEA.Scores.R line # 110)
533 ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] 1573 ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)]
534 # Column names are now: 1574 # Column names are now:
535 # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p" 1575 # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p"
536 # "FC" "log2_fc" "Source" 1576 # "FC" "log2_fc" "Source"
537 # Make column names human-readable 1577 # Make column names human-readable
1578 # (KSEA.Scores.R line # 111)
538 colnames(ksdata_dataset_abbrev) <- c( 1579 colnames(ksdata_dataset_abbrev) <- c(
539 "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p", 1580 "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p",
540 "FC", "log2FC", "Source" 1581 "FC", "log2FC", "Source"
541 ) 1582 )
542 # SELECT * FROM ksdata_dataset_abbrev 1583 # SELECT * FROM ksdata_dataset_abbrev
543 # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p 1584 # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p
1585 # (KSEA.Scores.R line # 112)
1586 # "Extract KSData.filtered annotations that are only found in new"
544 ksdata_dataset_abbrev <- 1587 ksdata_dataset_abbrev <-
545 ksdata_dataset_abbrev[ 1588 ksdata_dataset_abbrev[
546 order( 1589 order(
547 ksdata_dataset_abbrev$Kinase.Gene, 1590 ksdata_dataset_abbrev$Kinase.Gene,
548 ksdata_dataset_abbrev$Substrate.Gene, 1591 ksdata_dataset_abbrev$Substrate.Gene,
549 ksdata_dataset_abbrev$Substrate.Mod, 1592 ksdata_dataset_abbrev$Substrate.Mod,
550 ksdata_dataset_abbrev$p), 1593 ksdata_dataset_abbrev$p),
551 ] 1594 ]
1595 if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev))
552 # First aggregation step to account for multiply phosphorylated peptides 1596 # First aggregation step to account for multiply phosphorylated peptides
553 # and differing peptide sequences; the goal here is to combine results 1597 # and differing peptide sequences; the goal here is to combine results
554 # for all measurements of the same substrate. 1598 # for all measurements of the same substrate.
555 # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, 1599 # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
556 # `Source`, avg(log2FC) AS log2FC 1600 # `Source`, avg(log2FC) AS log2FC
558 # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, 1602 # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
559 # `Source` 1603 # `Source`
560 # ORDER BY `Kinase.Gene`; 1604 # ORDER BY `Kinase.Gene`;
561 # in two steps: 1605 # in two steps:
562 # (1) compute average log_2(fold-change) 1606 # (1) compute average log_2(fold-change)
1607 # "take the mean of the log2FC amongst phosphosite duplicates"
1608 # (KSEA.Scores.R line # 115)
563 ksdata_dataset_abbrev <- aggregate( 1609 ksdata_dataset_abbrev <- aggregate(
564 log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source, 1610 log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source,
565 data = ksdata_dataset_abbrev, 1611 data = ksdata_dataset_abbrev,
566 FUN = mean 1612 FUN = mean
567 ) 1613 )
1614 if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev))
568 # (2) order by Kinase.Gene 1615 # (2) order by Kinase.Gene
1616 # (KSEA.Scores.R line # 117)
569 ksdata_dataset_abbrev <- 1617 ksdata_dataset_abbrev <-
570 ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] 1618 ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ]
571 # SELECT `Kinase.Gene`, count(*) 1619 # SELECT `Kinase.Gene`, count(*)
572 # FROM ksdata_dataset_abbrev 1620 # FROM ksdata_dataset_abbrev
573 # GROUP BY `Kinase.Gene`; 1621 # GROUP BY `Kinase.Gene`;
574 # in two steps: 1622 # in two steps:
575 # (1) Extract the list of Kinase.Gene names 1623 # (1) Extract the list of Kinase.Gene names
1624 # "@@@@@@@@@@@@@@@@@@@@"
1625 # "Do analysis for KSEA"
1626 # "@@@@@@@@@@@@@@@@@@@@"
1627 # (KSEA.Scores.R line # 124)
576 kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) 1628 kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene)
577 # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev, 1629 # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev,
578 # named by Kinase.Gene 1630 # named by Kinase.Gene
1631 # (KSEA.Scores.R line # 125)
579 kinase_list <- as.matrix(table(kinase_list)) 1632 kinase_list <- as.matrix(table(kinase_list))
580 # Second aggregation step to account for all substrates per kinase 1633 # Second aggregation step to account for all substrates per kinase
581 # CREATE TABLE mean_fc 1634 # CREATE TABLE mean_fc
582 # AS 1635 # AS
583 # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC 1636 # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC
584 # FROM ksdata_dataset_abbrev 1637 # FROM ksdata_dataset_abbrev
585 # GROUP BY `Kinase.Gene` 1638 # GROUP BY `Kinase.Gene`
586 mean_fc <- aggregate( 1639 # (KSEA.Scores.R line # 127)
587 log2FC ~ Kinase.Gene, 1640 if (print_nb_messages) nb(see_variable(ksdata_dataset_abbrev), "\n")
588 data = ksdata_dataset_abbrev, 1641 mean_fc <-
589 FUN = mean 1642 aggregate(
590 )
591 # mean_fc columns: "Kinase.Gene", "log2FC"
592 if (FALSE) {
593 # I need to re-think this; I was trying to find the most-represented
594 # peptide, but that horse has already left the barn
595 # SELECT `Kinase.Gene`, max(abs(log2FC)) AS log2FC
596 # FROM ksdata_dataset_abbrev
597 # GROUP BY `Kinase.Gene`
598 max_fc <- aggregate(
599 log2FC ~ Kinase.Gene, 1643 log2FC ~ Kinase.Gene,
600 data = ksdata_dataset_abbrev, 1644 data = ksdata_dataset_abbrev,
601 FUN = function(r) max(abs(r)) 1645 FUN = mean
602 ) 1646 )
603 } 1647 if (print_nb_messages) nbe(see_variable(mean_fc), "\n")
1648
1649 # for contrast j
1650 # for each kinase i
1651 # extract log2 of fold-change (from `new` above)
1652 # (used in KSEA.Scores.R lines # 130 & 132)
1653 log2_fc_j_each_i <-
1654 new$log2_fc
1655
1656 # for contrast j
1657 # for all kinases i
1658 # compute mean of abs(log2 of fold-change)
1659 # (used in KSEA.Scores.R lines # 130)
1660 mean_abs_log2_fc_j_all_i <-
1661 mean(abs(log2_fc_j_each_i), na.rm = TRUE)
1662
1663 # for contrast j
1664 # for all kinases i
1665 # compute mean of log2 of fold-change
1666 # (used in KSEA.Scores.R lines # 132)
1667 mean_log2_fc_j_all_i <-
1668 mean(log2_fc_j_each_i, na.rm = TRUE)
1669
1670 # Reorder mean_fc, although I don't see why
1671 # (KSEA.Scores.R line 128
1672 mean_fc <- mean_fc[order(mean_fc[, 1]), ]
1673 # mean_fc columns so far: "Kinase.Gene", "log2FC"
1674 # - Kinase.Gene
1675 # indicates the gene name for each kinase.
604 1676
605 # Create column 3: mS 1677 # Create column 3: mS
606 mean_fc$m_s <- mean_fc[, 2] 1678 # - mS
1679 # represents the mean log2(fold change) of all the
1680 # kinase's substrates.
1681 # (KSEA.Scores.R line # 129)
1682 mean_fc$m_s <-
1683 mean_fc_m_s <- mean_fc[, 2]
1684
607 # Create column 4: Enrichment 1685 # Create column 4: Enrichment
608 mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE)) 1686 # - Enrichment
609 # Create column 5: m, count of substrates 1687 # is the background-adjusted value of the kinase's mS.
610 mean_fc$m <- kinase_list 1688 # (KSEA.Scores.R line # 130)
1689 mean_fc$enrichment <-
1690 mean_fc_m_s / mean_abs_log2_fc_j_all_i
1691
1692 # Create column 5: m, count of substrates of kinase (count of j for i)
1693 # - m
1694 # is the total number of detected substrates
1695 # from the experimental dataset for each kinase.
1696 # (KSEA.Scores.R line # 131)
1697 mean_fc$m <-
1698 mean_fc_m <- kinase_list
1699
1700
611 # Create column 6: z-score 1701 # Create column 6: z-score
612 mean_fc$z_score <- ( 1702 # - z.score
613 (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) * 1703 # is the normalized score for each kinase, weighted by
614 sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE) 1704 # the number of identified substrates.
1705 # (KSEA.Scores.R line # 132)
1706 mean_fc$z_score <-
1707 (mean_fc_m_s - mean_log2_fc_j_all_i) * sqrt(mean_fc_m) /
1708 sd(log2_fc_j_each_i, na.rm = TRUE)
1709
615 # Create column 7: p-value, deduced from z-score 1710 # Create column 7: p-value, deduced from z-score
616 mean_fc$p_value <- pnorm(-abs(mean_fc$z_score)) 1711 # - p.value
1712 # represents the statistical assessment for the z.score.
1713 # (KSEA.Scores.R line # 133)
1714 # "one-tailed p-value"
1715 mean_fc$p_value <-
1716 pnorm(-abs(mean_fc$z_score))
1717
1718 # zap excluded kinases; this must be done before adjusting p-value
1719 if (TRUE) {
1720 mean_fc <-
1721 mean_fc[
1722 mean_fc$m >= minimum_substrate_count,
1723 ,
1724 drop = FALSE
1725 ]
1726 }
1727
1728 #ACE nb(see_variable(nrow(mean_fc)), "\n")
617 # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value 1729 # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value
618 mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr") 1730 # - FDR
619 1731 # is the p-value adjusted for multiple hypothesis testing
620 # Remove log2FC column, which is duplicated as mS 1732 # using the Benjamini & Hochberg method."
621 mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2] 1733 # (KSEA.Scores.R line # 134)
1734 mean_fc$fdr <-
1735 p.adjust(mean_fc$p_value, method = "fdr")
1736
1737 # It makes no sense to leave Z-scores negative when using
1738 # absolute value of fold-change
1739 if (params$kseaUseAbsoluteLog2FC) {
1740 mean_fc$z_score <- abs(mean_fc$z_score)
1741 }
1742
1743 # Remove second column (log2FC), which is duplicated as mS
1744 # (KSEA.Scores.R line # 136)
1745 mean_fc <-
1746 mean_fc[order(mean_fc$Kinase.Gene), -2]
622 # Correct the column names which we had to hack because of the linter... 1747 # Correct the column names which we had to hack because of the linter...
623 colnames(mean_fc) <- c( 1748 colnames(mean_fc) <- c(
624 "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" 1749 "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
625 ) 1750 )
1751 # (KSEA.Scores.R line # 138)
626 return(mean_fc) 1752 return(mean_fc)
627 } 1753 }
628 1754
629 low_fdr_barplot <- function( 1755 ksea_low_fdr_barplot_factory <- function(
630 rslt, 1756 rslt,
631 i_cntrst, 1757 i_cntrst,
632 i, 1758 i,
633 a_level, 1759 a_level,
634 b_level, 1760 b_level,
656 k$fdr 1782 k$fdr
657 }, 1783 },
658 "p.value" = { 1784 "p.value" = {
659 k$p_value 1785 k$p_value
660 }, 1786 },
661 stop( 1787 {
662 sprintf( 1788 cat(
663 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", 1789 sprintf(
664 ksea_cutoff_statistic 1790 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
1791 ksea_cutoff_statistic
1792 )
665 ) 1793 )
666 ) 1794 param_df_exit()
667 ) 1795 knitr::knit_exit()
1796 }
1797 )
668 1798
669 k <- k[selector < ksea_cutoff_threshold, ] 1799 k <- k[selector < ksea_cutoff_threshold, ]
670 1800 nrow_k <- nrow(k)
671 if (nrow(k) > 1) { 1801
672 op <- par(mai = c(1, 1.5, 0.4, 0.4)) 1802 #ACE nbe(see_variable(fdr_barplot_dataframe <- k))
1803
1804 if (nrow_k > 0) {
1805 max_nchar_rowname <- max(nchar(rownames(k)))
1806 my_cex_names <- 1.0 / (1 + nrow_k / 50)
1807
1808 if (print_trace_messages) cat_margins("Initially")
1809 if (print_trace_messages) cat_variable(nrow_k, "\n\n", 0)
1810 if (print_trace_messages) cat_variable(my_cex_names, "\n\n", 0)
1811 if (print_trace_messages) cat_variable(max_nchar_rowname, "\n\n", 0)
1812
1813 # fin: The figure region dimensions, (width, height), in inches.
1814 # mar: A numerical vector of the form c(bottom, left, top, right)
1815 # that gives the number of lines of margin to be specified
1816 # on the four sides of the plot; default: c(5, 4, 4, 2) + 0.1
1817
1818 # mar: The figure region dimensions, (width, height), in inches.
673 numeric_z_score <- as.numeric(k$z_score) 1819 numeric_z_score <- as.numeric(k$z_score)
674 z_score_order <- order(numeric_z_score) 1820 bar_order <- order(-as.numeric(k$p_value))
675 kinase_name <- k$kinase_gene 1821 kinase_name <- k$kinase_gene
676 long_caption <- 1822 long_caption <-
677 sprintf( 1823 sprintf(
678 "Kinase z-score, %s < %s, %s", 1824 "Kinase z-score, %s, KSEA %s < %s",
1825 caption,
679 ksea_cutoff_statistic, 1826 ksea_cutoff_statistic,
680 ksea_cutoff_threshold, 1827 ksea_cutoff_threshold
681 caption
682 ) 1828 )
683 my_cex_caption <- 65.0 / max(65.0, nchar(long_caption)) 1829 my_cex_caption <- 65.0 / max(65.0, nchar(long_caption))
684 cat("\n\\clearpage\n") 1830 # return a function that draws the plot
685 barplot( 1831 function() {
686 height = numeric_z_score[z_score_order], 1832 par_fin <- par("fin") # vector of width_in_inches and height_in_inches)
687 border = NA, 1833 op <- par(
688 xpd = FALSE, 1834 bg = if (print_trace_messages) "yellow" else "white",
689 cex.names = 1.0, 1835 fin = c(par_fin[1], min(par_fin[2], 2.5 + nrow_k / 6)),
690 cex.axis = 1.0, 1836 mar = par("mar") +
691 main = long_caption, 1837 c(3 / nrow_k, (1 + max_nchar_rowname * my_cex_names) / 2, 0, 0)
692 cex.main = my_cex_caption, 1838 # bottom, left, top, right
693 names.arg = kinase_name[z_score_order], 1839 )
694 horiz = TRUE, 1840 on.exit(par(op))
695 srt = 45, 1841 if (print_trace_messages) cat_margins("Eventually")
696 las = 1) 1842
697 par(op) 1843 barplot(
1844 height = numeric_z_score[bar_order],
1845 border = NA,
1846 xpd = FALSE,
1847 cex.names = my_cex_names,
1848 main = long_caption,
1849 cex.main = my_cex_caption,
1850 names.arg = kinase_name[bar_order],
1851 horiz = TRUE,
1852 srt = 45,
1853 las = 1,
1854 cex.axis = 0.9
1855 )
1856 }
698 } 1857 }
1858 } else {
1859 no_op
699 } 1860 }
700 } 1861 }
701 1862
702 # note that this adds elements to the global variable `ksea_asterisk_hash` 1863 # note that this adds elements to the global variable `ksea_asterisk_hash`
703 1864
704 low_fdr_print <- function( 1865 ksea_low_fdr_print <- function(
705 rslt, 1866 rslt,
706 i_cntrst, 1867 i_cntrst,
707 i, 1868 i,
708 a_level, 1869 a_level,
709 b_level, 1870 b_level,
710 fold_change, 1871 fold_change,
711 caption 1872 caption,
1873 write_db = TRUE, # if TRUE, write to DB, else print table
1874 anchor = c(const_table_anchor_p, const_table_anchor_t)
712 ) { 1875 ) {
713 rslt_score_list_i <- rslt$score_list[[i]] 1876 rslt_score_list_i <- rslt$score_list[[i]]
714 if (!is.null(rslt_score_list_i)) { 1877 if (!is.null(rslt_score_list_i)) {
715 rslt_score_list_i_nrow <- nrow(rslt_score_list_i) 1878 rslt_score_list_i_nrow <- nrow(rslt_score_list_i)
716 k <- contrast_ksea_scores <- data.frame( 1879 k <- contrast_ksea_scores <- data.frame(
732 k$fdr 1895 k$fdr
733 }, 1896 },
734 "p.value" = { 1897 "p.value" = {
735 k$p_value 1898 k$p_value
736 }, 1899 },
737 stop( 1900 {
738 sprintf( 1901 cat(
739 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", 1902 sprintf(
740 ksea_cutoff_statistic 1903 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
1904 ksea_cutoff_statistic
1905 )
741 ) 1906 )
742 ) 1907 param_df_exit()
743 ) 1908 knitr::knit_exit()
1909 }
1910 )
744 1911
745 k <- k[selector < ksea_cutoff_threshold, ] 1912 k <- k[selector < ksea_cutoff_threshold, ]
746 # save kinase names to ksea_asterisk_hash 1913 # save kinase names to ksea_asterisk_hash
747 for (kinase_name in k$kinase_gene) { 1914 for (kinase_name in k$kinase_gene) {
748 ksea_asterisk_hash[[kinase_name]] <- 1 1915 ksea_asterisk_hash[[kinase_name]] <- 1
749 } 1916 }
750 1917
751 db_write_table_overwrite <- (i_cntrst < 2) 1918 if (write_db) {
752 db_write_table_append <- !db_write_table_overwrite 1919 db_write_table_overwrite <- (i_cntrst < 2)
753 RSQLite::dbWriteTable( 1920 db_write_table_append <- !db_write_table_overwrite
754 conn = db, 1921 RSQLite::dbWriteTable(
755 name = "contrast_ksea_scores", 1922 conn = db,
756 value = contrast_ksea_scores, 1923 name = "contrast_ksea_scores",
757 append = db_write_table_append 1924 value = contrast_ksea_scores,
758 ) 1925 append = db_write_table_append
759 selector <- switch( 1926 )
760 ksea_cutoff_statistic, 1927 ""
761 "FDR" = { 1928 } else {
762 contrast_ksea_scores$fdr 1929 selector <- switch(
763 }, 1930 ksea_cutoff_statistic,
764 "p.value" = { 1931 "FDR" = {
765 contrast_ksea_scores$p_value 1932 contrast_ksea_scores$fdr
766 }, 1933 },
767 stop( 1934 "p.value" = {
768 sprintf( 1935 contrast_ksea_scores$p_value
769 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", 1936 },
770 ksea_cutoff_statistic 1937 {
1938 cat(
1939 sprintf(
1940 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
1941 ksea_cutoff_statistic
1942 )
1943 )
1944 param_df_exit()
1945 knitr::knit_exit()
1946 }
1947 )
1948 if (print_nb_messages) nbe(see_variable(contrast_ksea_scores))
1949 output_df <- contrast_ksea_scores[
1950 selector < ksea_cutoff_threshold,
1951 c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count",
1952 "z_score", "p_value", "fdr")
1953 ]
1954 output_df$kinase_gene <-
1955 gsub(
1956 "_",
1957 "\\\\_",
1958 output_df$kinase_gene
1959 )
1960 colnames(output_df) <-
1961 c(
1962 colnames(output_df)[1],
1963 colnames(output_df)[2],
1964 "enrichment",
1965 "m_s",
1966 "z_score",
1967 "p_value",
1968 "fdr"
1969 )
1970 #ACE output_order <- with(output_df, order(fdr))
1971 output_order <- with(output_df, order(p_value))
1972 output_df <- output_df[output_order, ]
1973
1974 output_df[, 2] <- sprintf("%0.3g", output_df[, 2])
1975 output_df$fdr <- sprintf("%0.4f", output_df$fdr)
1976 output_df$p_value <- sprintf("%0.2e", output_df$p_value)
1977 output_df$z_score <- sprintf("%0.2f", output_df$z_score)
1978 output_df$m_s <- sprintf("%d", output_df$m_s)
1979 output_df$enrichment <- sprintf("%0.3g", output_df$enrichment)
1980 output_ncol <- ncol(output_df)
1981 colnames(output_df) <-
1982 c(
1983 "Kinase",
1984 "\\(\\overline{{\\lvert}\\log_2 (\\text{fold-change}){\\rvert}}\\)",
1985 "Enrichment",
1986 "Substrates",
1987 "z-score",
1988 "p-value",
1989 "FDR"
1990 )
1991 selector <- switch(
1992 ksea_cutoff_statistic,
1993 "FDR" = {
1994 rslt$score_list[[i]]$FDR
1995 },
1996 "p.value" = {
1997 rslt$score_list[[i]]$p.value
1998 },
1999 {
2000 cat(
2001 sprintf(
2002 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
2003 ksea_cutoff_statistic
2004 )
2005 )
2006 param_df_exit()
2007 knitr::knit_exit()
2008 }
2009 )
2010 if (sum(selector < ksea_cutoff_threshold) > 0) {
2011 if (print_nb_messages) nbe(see_variable(output_df))
2012 math_caption <- gsub("{", "\\{", caption, fixed = TRUE)
2013 math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE)
2014 # with (
2015 # output_df,
2016 # )
2017 if (TRUE) {
2018 output_df$Kinase <- whack_underscores(output_df$Kinase)
2019 data_frame_tabbing_latex(
2020 x = output_df,
2021 # vector of tab stops, in inches
2022 tabstops = c(1.0, 1.2, 1.0, 1.0, 1.0, 1.0),
2023 # vector of headings, registered with tab-stops
2024 headings = colnames(output_df),
2025 # digits to pass to format.data.frame
2026 digits = NULL,
2027 # maximumn number of rows to print
2028 max = NULL,
2029 # optional caption
2030 caption = sprintf(
2031 "\\text{%s}, KSEA %s < %s",
2032 math_caption,
2033 ksea_cutoff_statistic,
2034 ksea_cutoff_threshold
2035 ),
2036 # set underscore_whack to TRUE to escape underscores
2037 underscore_whack = FALSE,
2038 # flag for landscape mode
2039 landscape = FALSE,
2040 # flag indicating that subsubsection should be used for caption
2041 # rather than subsection
2042 use_subsubsection_header = TRUE,
2043 # character-size indicator; for possible values, see:
2044 # https://tug.org/texinfohtml/latex2e.html#Font-sizes
2045 charactersize = "small",
2046 # set verbatim to TRUE to debug output
2047 verbatim = FALSE
2048 )
2049 } else {
2050 data_frame_table_latex(
2051 x = output_df,
2052 justification = "l c c c c c c",
2053 centered = TRUE,
2054 caption = sprintf(
2055 "\\text{%s}, KSEA %s < %s",
2056 math_caption,
2057 ksea_cutoff_statistic,
2058 ksea_cutoff_threshold
2059 ),
2060 anchor = anchor,
2061 underscore_whack = FALSE
2062 )
2063 }
2064 } else {
2065 cat(
2066 sprintf(
2067 "\\break
2068 No kinases had
2069 \\(\\text{KSEA %s}_\\text{enrichment} < %s\\)
2070 for contrast %s\\hfill\\break\n",
2071 ksea_cutoff_statistic,
2072 ksea_cutoff_threshold,
2073 caption
771 ) 2074 )
772 ) 2075 )
773 ) 2076 }
774 output_df <- contrast_ksea_scores[
775 selector < ksea_cutoff_threshold,
776 c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count",
777 "z_score", "p_value", "fdr")
778 ]
779 output_order <- with(output_df, order(mean_log2_fc, kinase_gene))
780 output_df <- output_df[output_order, ]
781 colnames(output_df) <-
782 c(
783 colnames(output_df)[1],
784 colnames(output_df)[2],
785 "enrichment",
786 "m_s",
787 "z_score",
788 "p_value",
789 "fdr"
790 )
791 output_df$fdr <- sprintf("%0.4f", output_df$fdr)
792 output_df$p_value <- sprintf("%0.2e", output_df$p_value)
793 output_df$z_score <- sprintf("%0.2f", output_df$z_score)
794 output_df$m_s <- sprintf("%d", output_df$m_s)
795 output_df$enrichment <- sprintf("%0.2f", output_df$enrichment)
796 output_ncol <- ncol(output_df)
797 colnames(output_df) <-
798 c(
799 "Kinase",
800 "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)",
801 "Enrichment",
802 "Substrates",
803 "z-score",
804 "p-value",
805 "FDR"
806 )
807 selector <- switch(
808 ksea_cutoff_statistic,
809 "FDR" = {
810 rslt$score_list[[i]]$FDR
811 },
812 "p.value" = {
813 rslt$score_list[[i]]$p.value
814 },
815 stop(
816 sprintf(
817 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
818 ksea_cutoff_statistic
819 )
820 )
821 )
822 if (sum(selector < ksea_cutoff_threshold) > 0) {
823 math_caption <- gsub("{", "\\{", caption, fixed = TRUE)
824 math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE)
825 data_frame_latex(
826 x = output_df,
827 justification = "l c c c c c c",
828 centered = TRUE,
829 caption = sprintf(
830 "\\text{%s}, %s < %s",
831 math_caption,
832 ksea_cutoff_statistic,
833 ksea_cutoff_threshold
834 ),
835 anchor = const_table_anchor_p
836 )
837 } else {
838 cat(
839 sprintf(
840 "\\break
841 No kinases had
842 \\(\\text{%s}_\\text{enrichment} < %s\\)
843 for contrast %s\\hfill\\break\n",
844 ksea_cutoff_statistic,
845 ksea_cutoff_threshold,
846 caption
847 )
848 )
849 } 2077 }
2078 } else {
2079 ""
850 } 2080 }
851 } 2081 }
852 2082
853 # create_breaks is a helper for ksea_heatmap 2083 # create_breaks is a helper for ksea_heatmap
854 create_breaks <- function(merged_scores) { 2084 create_breaks <- function(merged_scores) {
2085 if (sum(!is.na(merged_scores)) < 2)
2086 return(NULL)
855 if (min(merged_scores, na.rm = TRUE) < -1.6) { 2087 if (min(merged_scores, na.rm = TRUE) < -1.6) {
856 breaks_neg <- seq(-1.6, 0, length.out = 30) 2088 breaks_neg <- seq(-1.6, 0, length.out = 30)
857 breaks_neg <- 2089 breaks_neg <-
858 append( 2090 append(
859 seq(min(merged_scores, na.rm = TRUE), -1.6, length.out = 10), 2091 seq(min(merged_scores, na.rm = TRUE), -1.6, length.out = 10),
887 mycol <- unique(append(mycol_neg, mycol_pos)) 2119 mycol <- unique(append(mycol_neg, mycol_pos))
888 color_breaks <- list(breaks_all, mycol) 2120 color_breaks <- list(breaks_all, mycol)
889 return(color_breaks) 2121 return(color_breaks)
890 } 2122 }
891 2123
2124 hm2plus <- function(
2125 x,
2126 mat = matrix(
2127 c(
2128 c(0, 4, 0),
2129 c(0, 3, 3),
2130 c(2, 1, 1)
2131 ),
2132 nrow = 3,
2133 ncol = 3,
2134 byrow = TRUE
2135 ),
2136 denwid = 0.5,
2137 denhgt = 0.15,
2138 widths = c(0.5, 2.5, 1.5),
2139 heights = c(0.4, 0.15, 3.95),
2140 divergent = FALSE,
2141 notecol = "grey50",
2142 trace = "none",
2143 margins = c(6, 20),
2144 srtcol = 90,
2145 srtrow = 0,
2146 density_info = "none",
2147 key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
2148 key_par = list(),
2149 hclustfun = hclust,
2150 ...
2151 ) {
2152
2153 varargs <- list(...)
2154 if (FALSE) # this is to avoid commenting out code to pass linting...
2155 my_hm2 <- latex_show_invocation(heatmap.2, head_patch = FALSE)
2156 else
2157 my_hm2 <- heatmap.2
2158
2159 x <- as.matrix(x)
2160 if (sum(!is.na(x)) < 1)
2161 return(NULL)
2162 color_count <- 1 + max(64, length(as.vector(x))) # 8 was not enough
2163 break_count <- 1 + color_count
2164 min_nonax <- min(x, na.rm = TRUE)
2165 max_nonax <- max(x, na.rm = TRUE)
2166 if (print_nb_messages) nb("within hm2plus", see_variable(divergent), "\n")
2167 if (divergent) {
2168 zlim <- max(abs(min_nonax), abs(max_nonax))
2169 if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n"))
2170 breaks <- (zlim) / (break_count:1)
2171 if (print_nb_messages) nb(see_variable(breaks, "\n"))
2172 breaks <- breaks - median(breaks)
2173 zlim <- c(-zlim, zlim)
2174 if (print_nb_messages) nb(see_variable(zlim, "\n"))
2175 } else {
2176 zlim <- max(abs(min_nonax), abs(max_nonax))
2177 if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n"))
2178 breaks <- zlim / (break_count:1)
2179 if (print_nb_messages) nb(see_variable(breaks, "\n"))
2180 if (max_nonax < 0) {
2181 breaks <- breaks - zlim
2182 zlim <- c(-zlim, 0)
2183 } else {
2184 zlim <- c(0, zlim)
2185 }
2186 if (print_nb_messages) nb(see_variable(zlim, "\n"))
2187 }
2188 nonax <- x
2189 nonax[is.na(x)] <- min_nonax
2190 if (is.null(widths)) widths <- c(denwid, 4 - denwid, 1.5)
2191 if (is.null(heights)) heights <- c(0.4, denhgt, 4.0)
2192 colors <-
2193 if (divergent && min_nonax < 0) {
2194 # divergent colors on both sides of zero
2195 colorRampPalette(c("red", "white", "blue"))(color_count)
2196 } else if (divergent && min_nonax > 0) {
2197 # "divergent" colors > zero
2198 colorRampPalette(c("white", "blue"))(color_count)
2199 } else if (divergent && max_nonax < 0) {
2200 # "divergent" colors < zero
2201 colorRampPalette(c("red", "white"))(color_count)
2202 } else {
2203 # "non-divergent" colors including zero
2204 hcl.colors(color_count, "YlOrRd", rev = TRUE)
2205 }
2206
2207 #ACE if (print_nb_messages) nb("within hm2plus", see_variable(key_par), "\n")
2208 #ACE if (print_nb_messages) nb(see_variable(colors, "\n"))
2209 #ACE key_par$col = colors
2210 #ACE key_par$breaks = breaks
2211
2212 if (print_nb_messages) nb(see_variable(par(), "\n")) #ACE TODO remove me
2213 if (print_nb_messages) cat("\\leavevmode\n\\linebreak\n") #ACE TODO remove me
2214 suppressWarnings(
2215 my_hm2(
2216 x = x,
2217 col = colors,
2218 #ACE symkey = FALSE,
2219 density.info = density_info,
2220 srtCol = srtcol,
2221 srtRow = srtrow,
2222 margins = margins,
2223 lwid = widths,
2224 lhei = heights,
2225 key.title = NA,
2226 key.xlab = key_xlab,
2227 key.par = key_par,
2228 lmat = mat,
2229 notecol = notecol,
2230 trace = trace,
2231 bg = "yellow",
2232 hclustfun = hclustfun,
2233 #ACE breaks = breaks,
2234 oldstyle = FALSE,
2235 ... # varargs
2236 )
2237 )
2238 # implicitly returning value returned by heatmap.2
2239 }
2240
892 # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap 2241 # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap
893 draw_kseaapp_summary_heatmap <- function( 2242 draw_kseaapp_summary_heatmap <- function(
894 x, 2243 x, # matrix with row/col names already formatted
895 sample_cluster, 2244 sample_cluster, # a binary input of TRUE or FALSE,
896 merged_asterisk, 2245 # indicating whether or not to perform
897 my_cex_row, 2246 # hierarchical clustering of the sample columns
898 color_breaks, 2247 merged_asterisk, # matrix having dimensions of x, values "*" or ""
899 margins, 2248 color_breaks, # breaks for color gradation, from create_breaks
2249 # passed to `breaks` argument of `image`
2250 margins = c(8, 15), # two integers setting the bottom and right margins
2251 # to accommodate row and column labels
2252 master_cex = 0.7, # basis for text sizes
900 ... 2253 ...
901 ) { 2254 ) {
902 merged_scores <- x 2255 merged_scores <- x
903 if (!is.matrix(x)) { 2256 if (!is.matrix(x)) {
904 cat( 2257 cat(
906 "No plot because \\texttt{typeof(x)} is '", 2259 "No plot because \\texttt{typeof(x)} is '",
907 typeof(x), 2260 typeof(x),
908 "' rather than 'matrix'.\n\n" 2261 "' rather than 'matrix'.\n\n"
909 ) 2262 )
910 ) 2263 )
911 } else if (nrow(x) < 2) { 2264 cat_variable(x)
912 cat("No plot because matrix x has ", nrow(x), " rows.\n\n") 2265 return(FALSE)
913 cat("\\begin{verbatim}\n") 2266 }
914 str(x) 2267 if (print_trace_messages) cat(sprintf("master_cex = %03f\n\n", master_cex))
915 cat("\\end{verbatim}\n") 2268 nrow_x <- nrow(x)
916 } else if (ncol(x) < 2) { 2269 ncol_x <- ncol(x)
917 cat("No plot because matrix x has ", ncol(x), " columns.\n\n") 2270 #if (nrow_x < 2) {
918 cat("\\begin{verbatim}\n") 2271 if (nrow_x < 1) {
919 str(x) 2272 cat("No plot because matrix has no rows.\n\n")
920 cat("\\end{verbatim}\n") 2273 return(FALSE)
921 } else { 2274 } else if (nrow_x < 2) {
922 gplots::heatmap.2( 2275 cat("No plot because matrix has one row. Matrix looks like this:\n\n")
923 x = merged_scores, 2276 cat("\n\\begin{verbatim}\n")
924 Colv = sample_cluster, 2277 print(x)
925 scale = "none", 2278 cat("\n\\end{verbatim}\n")
926 cellnote = merged_asterisk, 2279 return(FALSE)
927 notecol = "white", 2280 } else if (ncol_x < 2) {
928 cexCol = 0.9, 2281 cat("No plot because matrix x has ", ncol_x, " columns.\n\n")
929 # Heuristically assign size of row labels 2282 cat_variable(x)
930 cexRow = min(1.0, ((3 * my_cex_row) ^ 1.7) / 2.25), 2283 return(FALSE)
931 srtCol = 45, 2284 }
932 srtRow = 45, 2285 max_nchar_rowname <- max(nchar(rownames(x)))
933 notecex = 3 * my_cex_row, 2286 max_nchar_colname <- max(nchar(colnames(x)))
934 col = color_breaks[[2]], 2287 my_limit <- g_intensity_hm_rows
935 density.info = "none", 2288
936 trace = "none", 2289 my_row_cex_scale <- master_cex * 150 / nrow_x
937 breaks = color_breaks[[1]], 2290 my_col_cex_scale <- 3.0
938 lmat = rbind(c(0, 3), c(2, 1), c(0, 4)), 2291 my_asterisk_scale <- 0.4 * my_row_cex_scale
939 lhei = c(0.4, 8.0, 1.1), 2292 my_row_warp <- 1
940 lwid = c(0.5, 3), 2293 my_note_warp <- 2
941 key = FALSE, 2294 my_row_warp <- 1
942 margins = margins, 2295 my_row_cex_asterisk <-
943 ... 2296 master_cex * my_row_warp * my_asterisk_scale
944 ) 2297
945 } 2298
946 } 2299 my_col_cex <- my_col_cex_scale * master_cex
947 2300 my_row_cex <- min(3.5 * my_row_cex_asterisk, my_col_cex)
948 # Adapted from KSEAapp::KSEA.Heatmap 2301 my_key_cex <- 1.286
2302 my_hm2_cex <- 1 * master_cex
2303 my_offset <- (4.8 / (9 + nrow_x / 10)) - 0.4
2304 if (print_trace_messages) cat(sprintf("nrow_x = %03f\n\n", nrow_x))
2305 if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset))
2306 my_offset <- 0.05
2307 if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset))
2308 my_scale <- 3.0
2309 if (ncol_x < 10 && nrow_x < 10)
2310 my_scale <- my_scale * 10 / (10 - nrow_x) * 10 / (10 - ncol_x)
2311
2312 my_heights <- c(
2313 0.15,
2314 3.85 - my_offset,
2315 0.5 + my_offset
2316 )
2317 my_margins <- c(1, 1) +
2318 c(
2319 margins[1] * 0.08 * max_nchar_colname * my_col_cex,
2320 margins[2] * 0.04 * max_nchar_rowname * my_row_cex
2321 )
2322
2323 my_notecex <-
2324 my_scale *
2325 min(
2326 1.1,
2327 my_row_cex_asterisk * my_note_warp,
2328 my_col_cex * my_note_warp
2329 )
2330
2331 if (print_trace_messages) {
2332 cat_variable(my_heights, suffix = "; ")
2333 cat_variable(my_margins, suffix = "\n\n")
2334 cat_variable(my_row_cex_scale, suffix = "; ")
2335 cat_variable(my_col_cex_scale, suffix = "\n\n")
2336 cat_variable(my_row_cex_asterisk, suffix = "\n\n")
2337 cat_variable(my_row_cex, suffix = "; ")
2338 cat_variable(my_col_cex, suffix = "\n\n")
2339 cat_variable(my_row_cex, suffix = "; ")
2340 cat_variable(my_col_cex, suffix = "\n\n")
2341 }
2342
2343 hm2plus(
2344 x = merged_scores,
2345 Colv = sample_cluster,
2346 cellnote = merged_asterisk,
2347 cex = my_hm2_cex,
2348 cexCol = my_col_cex,
2349 cexRow = my_row_cex,
2350 denhgt = 0.15,
2351 density_info = "none",
2352 denwid = 0.5,
2353 divergent = TRUE,
2354 key_par = list(cex = my_key_cex),
2355 key_xlab = "Z-score",
2356 margins = my_margins,
2357 notecex = my_scale * min(
2358 1.5,
2359 my_row_cex_asterisk * my_note_warp,
2360 my_col_cex * my_note_warp
2361 ),
2362 notecol = "white",
2363 scale = "none",
2364 srtcol = 90,
2365 srtrow = 0,
2366 trace = "none",
2367 mat = matrix(
2368 c(
2369 c(0, 3, 3),
2370 c(2, 1, 1),
2371 c(0, 4, 0)
2372 ),
2373 nrow = 3,
2374 ncol = 3,
2375 byrow = TRUE
2376 ),
2377 widths = c(0.5, 3.1, 0.9),
2378 heights = my_heights,
2379 ...
2380 )
2381 return(TRUE)
2382 }
2383
2384 # function drawing heatmap of contrast fold-change for each kinase,
2385 # adapted from KSEAapp::KSEA.Heatmap
949 ksea_heatmap <- function( 2386 ksea_heatmap <- function(
950 # the data frame outputs from the KSEA.Scores() function, in list format 2387 # the data frame outputs from the KSEA.Scores() function, in list format
951 score_list, 2388 score_list,
952 # a character vector of all the sample names for heatmap annotation: 2389 # a character vector of all the sample names for heatmap annotation:
953 # - the names must be in the same order as the data in score_list 2390 # - the names must be in the same order as the data in score_list
959 # a numeric value between 0 and infinity indicating the min. number of 2396 # a numeric value between 0 and infinity indicating the min. number of
960 # substrates a kinase must have to be included in the heatmap 2397 # substrates a kinase must have to be included in the heatmap
961 m_cutoff, 2398 m_cutoff,
962 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff 2399 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
963 # for indicating significant kinases in the heatmap 2400 # for indicating significant kinases in the heatmap
964 p_cutoff = 2401 p_cutoff = {
965 stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"), 2402 cat("argument 'p_cutoff' is required for function 'ksea_heatmap'")
2403 param_df_exit()
2404 knitr::knit_exit()
2405 },
966 # a binary input of TRUE or FALSE, indicating whether or not to perform 2406 # a binary input of TRUE or FALSE, indicating whether or not to perform
967 # hierarchical clustering of the sample columns 2407 # hierarchical clustering of the sample columns
968 sample_cluster, 2408 sample_cluster,
969 # a binary input of TRUE or FALSE, indicating whether or not to export 2409 # a binary input of TRUE or FALSE, indicating whether or not to export
970 # the heatmap as a .png image into the working directory 2410 # the heatmap as a .png image into the working directory
971 export = FALSE, 2411 export = FALSE,
972 # bottom and right margins; adjust as needed if contrast names are too long 2412 # bottom and right margins; adjust as needehttps://tex.stackexchange.com/a/56795d if contrast names are too long
973 margins = c(6, 20), 2413 margins = c(6, 6),
974 # print which kinases? 2414 # print which kinases?
975 # - Mandatory argument, must be one of const_ksea_.*_kinases 2415 # - Mandatory argument, must be one of const_ksea_.*_kinases
976 which_kinases, 2416 which_kinases,
977 # additional arguments to gplots::heatmap.2, such as: 2417 # additional arguments to gplots::heatmap.2, such as:
978 # - main: main title of plot 2418 # - main: main title of plot
991 paste(names, i, sep = ".") 2431 paste(names, i, sep = ".")
992 } 2432 }
993 master <- 2433 master <-
994 Reduce( 2434 Reduce(
995 f = function(...) { 2435 f = function(...) {
996 base::merge(..., by = "Kinase.Gene", all = FALSE) 2436 base::merge(..., by = "Kinase.Gene", all = TRUE)
997 }, 2437 },
998 x = score_list_m 2438 x = score_list_m
999 ) 2439 )
1000 2440
1001 row.names(master) <- master$Kinase.Gene 2441 row.names(master) <- master$Kinase.Gene
1017 } 2457 }
1018 return(new) 2458 return(new)
1019 } 2459 }
1020 merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff)) 2460 merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff))
1021 2461
1022 # begin hack to print only significant rows
1023 asterisk_rows <- rowSums(merged_asterisk == "*") > 0 2462 asterisk_rows <- rowSums(merged_asterisk == "*") > 0
1024 all_rows <- rownames(merged_stats) 2463 all_rows <- rownames(merged_stats)
1025 names(asterisk_rows) <- all_rows 2464 names(asterisk_rows) <- all_rows
1026 non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE]) 2465 non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE])
1027 asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE]) 2466 asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE])
1028 merged_scores_asterisk <- merged_scores[names(asterisk_rows), ] 2467 merged_scores_asterisk <- merged_scores[names(asterisk_rows), , drop = FALSE]
1029 merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), ] 2468 merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), , drop = FALSE]
1030 # end hack to print only significant rows
1031 2469
1032 row_list <- list() 2470 row_list <- list()
1033 row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows 2471 row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows
1034 row_list[[const_ksea_all_kinases]] <- all_rows 2472 row_list[[const_ksea_all_kinases]] <- all_rows
1035 row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows 2473 row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows
1036 2474
1037 i <- which_kinases 2475 i <- which_kinases
1038 my_row_names <- row_list[[i]] 2476 my_row_names <- row_list[[i]]
1039 scrs <- merged_scores[my_row_names, ] 2477 scrs <- merged_scores[my_row_names, , drop = FALSE]
1040 stts <- merged_stats[my_row_names, ] 2478 stts <- merged_stats[my_row_names, , drop = FALSE]
1041 merged_asterisk <- as.matrix(asterisk(stts, p_cutoff)) 2479 merged_asterisk <- as.matrix(asterisk(stts, p_cutoff))
1042 2480
1043 color_breaks <- create_breaks(scrs) 2481 color_breaks <- create_breaks(scrs)
2482 if (is.null(color_breaks)) {
2483 cat("No plot because matrix has too few rows.\n\n")
2484 return(NULL)
2485 }
1044 plot_height <- nrow(scrs) ^ 0.55 2486 plot_height <- nrow(scrs) ^ 0.55
1045 plot_width <- ncol(scrs) ^ 0.7 2487 plot_width <- ncol(scrs) ^ 0.7
1046 my_cex_row <- 0.25 * 16 / plot_height
1047 if (export == "TRUE") { 2488 if (export == "TRUE") {
1048 png( 2489 png(
1049 "KSEA.Merged.Heatmap.png", 2490 "KSEA.Merged.Heatmap.png",
1050 width = plot_width * 300, 2491 width = plot_width * 300,
1051 height = 2 * plot_height * 300, 2492 height = 2 * plot_height * 300,
1052 res = 300, 2493 res = 300,
1053 pointsize = 14 2494 pointsize = 14
1054 ) 2495 )
1055 } 2496 }
1056 draw_kseaapp_summary_heatmap( 2497 did_draw <- draw_kseaapp_summary_heatmap(
1057 x = scrs, 2498 x = scrs,
1058 sample_cluster = sample_cluster, 2499 sample_cluster = sample_cluster,
1059 merged_asterisk = merged_asterisk, 2500 merged_asterisk = merged_asterisk,
1060 my_cex_row = my_cex_row,
1061 color_breaks = color_breaks, 2501 color_breaks = color_breaks,
1062 margins = margins 2502 margins = margins
1063 ) 2503 )
1064 if (export == "TRUE") { 2504 if (export == "TRUE") {
1065 dev.off() 2505 dev.off()
1066 } 2506 }
2507 if (!did_draw)
2508 return(NULL)
1067 return(my_row_names) 2509 return(my_row_names)
1068 } 2510 }
1069 2511
1070 # helper for heatmaps of phosphopeptide intensities 2512 # helpers for heatmaps of phosphopeptide intensities
1071 2513
1072 draw_intensity_heatmap <- 2514 # factory producing function to truncate string after n characters
2515 trunc_n <- function(n) {
2516 function(x) {
2517 sapply(
2518 X = x,
2519 FUN = function(s) {
2520 if (is.na(s))
2521 return("NA")
2522 cond <- try_catch_w_e(nchar(s) > n)
2523 if (!is.logical(cond$value)) {
2524 return(cond$value$message)
2525 } else if (cond$value) {
2526 paste0(
2527 strtrim(s, n),
2528 "..."
2529 )
2530 } else {
2531 s
2532 }
2533 },
2534 USE.NAMES = FALSE
2535 )
2536 }
2537 }
2538 trunc_long_ppep <- function(x) trunc_n(40)(x)
2539 trunc_ppep <- function(x) trunc_n(g_ppep_trunc_n)(x)
2540 trunc_subgene <- function(x) trunc_n(g_subgene_trunc_n)(x)
2541 trunc_enriched_substrate <- function(x) trunc_n(g_sbstr_trunc_n)(x)
2542
2543 # factory producing a function that returns a covariance
2544 # matrix's rows (and columns) having variance > v_min
2545 keep_cov_w_var_gtr_min <- function(v_min) {
2546 function(x) {
2547 if (!is.matrix(x))
2548 return(NULL)
2549 keepers <- sapply(
2550 X = seq_len(nrow(x)),
2551 FUN = function(i) {
2552 if (x[i, i] < v_min)
2553 NA
2554 else
2555 x[i, i]
2556 }
2557 )
2558 names(keepers) <- rownames(x)
2559 keepers <- keepers[!is.na(keepers)]
2560 keepers <- names(keepers)
2561 if (length(keepers) == 0)
2562 return(NULL)
2563 x[keepers, keepers]
2564 }
2565 }
2566 # function that returns a matrix's rows having variance > 1
2567 keep_cov_w_var_gtr_1 <- keep_cov_w_var_gtr_min(1)
2568
2569 # factory producing a function that returns
2570 # - either a matrix's rows (rows = TRUE)
2571 # - or a matrix's columns (rows = FALSE)
2572 # having variance > v_min
2573 keep_var_gtr_min <- function(v_min) {
2574 function(x, rows = TRUE) {
2575 nrowcol <- if (rows) nrow else ncol
2576 if (!is.matrix(x))
2577 return(NULL)
2578 keepers <- sapply(
2579 X = seq_len(nrowcol(x)),
2580 FUN = function(i) {
2581 row_var <- var(
2582 if (rows) x[i, ] else x[, i],
2583 na.rm = TRUE
2584 )
2585 if (is.na(row_var) || row_var <= v_min) NA else i
2586 }
2587 )
2588 keepers <- keepers[!is.na(keepers)]
2589 if (rows) x[keepers, ] else x[, keepers]
2590 }
2591 }
2592
2593 keep_var_gtr_0 <- keep_var_gtr_min(0)
2594
2595 # function drawing heatmap of phosphopeptide intensities
2596 ppep_heatmap <-
1073 function( 2597 function(
1074 m, # matrix with rownames already formatted 2598 m, # matrix with rownames already formatted
1075 cutoff, # cutoff used by hm_heading_function 2599 cutoff, # cutoff used by hm_heading_function
1076 hm_heading_function, # construct and cat heading from m and cutoff 2600 hm_heading_function, # construct $ cat heading from m and cutoff
1077 hm_main_title, # main title for plot (drawn below heading) 2601 hm_main_title, # main title for plot (drawn below heading)
1078 suppress_row_dendrogram = TRUE, # set to false to show dendrogram 2602 suppress_row_dendrogram = TRUE, # set to false to show dendrogram
1079 max_peptide_count # experimental: 2603 max_peptide_count = # experimental:
1080 = intensity_hm_rows, # values of 50 and 75 worked well 2604 g_intensity_hm_rows, # values of 50 and 75 worked well
1081 ... # passthru parameters for heatmap 2605 master_cex = 1.0, # basis for text sizes
2606 margins = NULL, # optional margins (bottom, right)
2607 cellnote = NULL, # optional matrix of character; dim = dim(m)
2608 adj = 0.5, # adjust text: 0 left, 0.5 middle, 1 right
2609 ... # passthru to hm2plus or heatmap.2
1082 ) { 2610 ) {
2611 use_heatmap_1 <- FALSE
1083 peptide_count <- 0 2612 peptide_count <- 0
1084 # emit the heading for the heatmap 2613 # emit the heading for the heatmap
1085 if (hm_heading_function(m, cutoff)) { 2614 if (hm_heading_function(m, cutoff)) {
1086 peptide_count <- min(max_peptide_count, nrow(m)) 2615 nrow_m <- nrow(m)
1087 if (nrow(m) > 1) { 2616 peptide_count <- min(max_peptide_count, nrow_m)
2617 if (nrow_m > 1) {
1088 m_margin <- m[peptide_count:1, ] 2618 m_margin <- m[peptide_count:1, ]
1089 # Margin setting was heuristically derived 2619 # Margin was heuristically derived to accommodate the widest label
1090 margins <- 2620 row_mchar_max <- max(nchar(rownames(m_margin)))
1091 c(0.5, # col 2621 col_mchar_max <- max(nchar(colnames(m_margin)))
1092 max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row 2622 row_margin <- master_cex * row_mchar_max * 2.6
1093 ) 2623 col_margin <- master_cex * col_mchar_max * 2.6
1094 } 2624 if (print_trace_messages) cat(sprintf("row_margin = %0.3f; ", row_margin))
1095 if (nrow(m) > 1) { 2625 if (print_trace_messages) cat(sprintf("col_margin = %0.3f; ", col_margin))
2626 hm_call <- NULL
1096 tryCatch( 2627 tryCatch(
1097 { 2628 {
1098 old_oma <- par("oma") 2629 # set non-argument parameters for hm_call inner function
1099 par(cex.main = 0.6) 2630 my_row_cex <-
1100 # Heuristically determined character size adjustment formula 2631 master_cex * 200000 / (
1101 char_contractor <- 2632 (max(nchar(rownames(m_margin)))^2) * g_intensity_hm_rows
1102 250000 / (
1103 max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows
1104 ) 2633 )
1105 heatmap( 2634 m_hm <- m[peptide_count:1, , drop = FALSE]
1106 m[peptide_count:1, ], 2635 if (is.null(cellnote)) {
1107 Rowv = if (suppress_row_dendrogram) NA else NULL, 2636 cellnote <- matrix("", nrow = nrow(m_hm), ncol = ncol(m_hm))
1108 Colv = NA, 2637 cellnote[is.na(m_hm)] <- "NA"
1109 cexRow = char_contractor, 2638 } else {
1110 cexCol = char_contractor * 50 / max_peptide_count, 2639 cellnote <- cellnote[peptide_count:1, , drop = FALSE]
1111 scale = "row", 2640 }
1112 margins = margins, 2641 m_hm[is.na(m_hm)] <- 0
1113 main = 2642 nrow_m_hm <- nrow(m_hm)
1114 "Unimputed, unnormalized log(intensities)", 2643 ncol_m_hm <- ncol(m_hm)
1115 xlab = "", 2644 if (nrow_m_hm < 1 || ncol_m_hm < 1)
1116 las = 1, 2645 return(peptide_count) # return zero as initialized above
1117 ... 2646 my_limit <- g_intensity_hm_rows
2647
2648
2649 my_row_cex <- master_cex * (100 / (2 + row_mchar_max))
2650 my_col_cex <- master_cex * 6 * row_margin / col_margin
2651 my_col_adj <- min(my_col_cex, my_row_cex) / my_col_cex
2652 my_col_cex <- min(my_col_cex, my_row_cex)
2653 col_margin <- sqrt(my_col_adj) * col_margin
2654 if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex))
2655 if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex))
2656 if (is.null(margins)) my_margins <-
2657 c(
2658 (my_col_cex + col_margin), # col
2659 (my_row_cex + row_margin) / my_row_cex # row
1118 ) 2660 )
2661 else
2662 my_margins <- margins
2663
2664 if (print_trace_messages) cat(
2665 sprintf(
2666 "my_margins = c(%s)\n\n",
2667 paste(my_margins, collapse = ", ")
2668 )
2669 )
2670 my_hm2_cex <- 2 * master_cex
2671 my_key_cex <- 0.9 - 0.1 * (g_intensity_hm_rows + nrow_m_hm) / g_intensity_hm_rows
2672 my_key_warp <- 1.5 * 22.75 / row_margin
2673 my_key_cex <- min(1.10, my_key_warp * my_key_cex)
2674 my_hgt_scale <- 3.70 - 0.4 * (max(1, 0.9 * my_row_cex) - 1)
2675 my_hgt_scale <- 3.75 # 3.615
2676 my_hgt_scale <- 3.60 # 3.615
2677 if (print_trace_messages)
2678 cat_variable(my_hgt_scale, "\n\n", 3)
2679 my_warp <- max(0.1, 1.4 * (7.5 + nrow_m) / g_intensity_hm_rows)
2680 if (print_trace_messages)
2681 cat_variable(my_warp, "\n\n", 3)
2682 # added 0.9 heuristically...
2683 my_plot_height <-
2684 (0.566 + 0.354 * (nrow_m / g_intensity_hm_rows)) *
2685 min(my_hgt_scale, my_hgt_scale * my_warp)
2686 my_plot_height <- min(3.65, my_plot_height * g_intensity_hm_rows / 50)
2687 my_heights <- c(
2688 0.3, # title and top dendrogram
2689 my_plot_height, # plot and bottom margin
2690 4.15 - my_hgt_scale, # legend
2691 0.05 + my_hgt_scale - my_plot_height # whitespace below legend
2692 )
2693 my_note_cex <- min(0.8, my_row_cex, my_col_cex)
2694 if (print_trace_messages) {
2695 cat_variable(my_plot_height, "\n\n", 3)
2696 cat_variable(4.19 - my_hgt_scale, "\n\n", 3)
2697 cat_variable(nrow_m_hm, "; ", 0)
2698 cat_variable(ncol_m_hm, "; ", 0)
2699 cat_variable(my_row_cex, "; ", 3)
2700 cat_variable(my_col_cex, "; ", 3)
2701 cat_variable(my_note_cex, "; ", 3)
2702 cat_variable(my_key_cex, "\n\n", 3)
2703 cat_variable(my_hgt_scale, "; ", 3)
2704 cat_variable(my_plot_height, "; ", 3)
2705 cat_variable(my_warp, "\n\n", 3)
2706 cat_variable(my_heights, "; ", 2)
2707 cat_variable(sum(my_heights), "\n\n", 3)
2708 }
2709
2710 # define hm_call inner function
2711 hm_call <- function(x, scaling, title) {
2712 my_cex_main <- min(5.0, 220 / nchar(title))
2713 op <- par(
2714 cex.main = my_cex_main * master_cex,
2715 adj = adj
2716 )
2717 if (
2718 !is.null(
2719 hm2plus(
2720 x,
2721 Colv = NA,
2722 Rowv = TRUE,
2723 cexRow = my_row_cex,
2724 cexCol = my_col_cex,
2725 dendrogram = "row",
2726 las = 1,
2727 main = title,
2728 key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
2729 cex = my_hm2_cex,
2730 key_par = list(cex = my_key_cex),
2731 margins = my_margins,
2732 widths = c(0.4, 2.6, 1.5),
2733 heights = my_heights,
2734 mat = matrix(
2735 c(
2736 c(0, 3, 3),
2737 c(2, 1, 1),
2738 c(0, 4, 0),
2739 c(0, 0, 0)
2740 ),
2741 nrow = 4,
2742 ncol = 3,
2743 byrow = TRUE
2744 ),
2745 na.rm = TRUE,
2746 scale = scaling,
2747 srtcol = 90,
2748 srtrow = 0,
2749 xlab = "",
2750 cellnote = cellnote,
2751 notecex = my_note_cex,
2752 ...
2753 )
2754 )
2755 ) {
2756 if (print_trace_messages) cat(
2757 sprintf(
2758 "my_heights = c(%s); sum = %0.3f\n\n",
2759 paste(
2760 sprintf("%0.3f", my_heights),
2761 collapse = ", "
2762 ),
2763 sum(my_heights)
2764 )
2765 )
2766 if (print_trace_messages) cat(
2767 sprintf("my_key_cex = %0.3f\n\n",
2768 my_key_cex)
2769 )
2770 if (print_trace_messages) cat(
2771 sprintf("my_key_cex/my_heights[3] = %0.3f\n\n",
2772 my_key_cex / my_heights[3])
2773 )
2774 if (print_trace_messages) cat(
2775 sprintf("my_heights[2]/my_heights[3] = %0.3f\n\n",
2776 my_heights[2] / my_heights[3])
2777 )
2778 }
2779 par(op)
2780 }
2781
2782 # invoke hm_call inner function
2783 if (sum(rowSums(!is.na(m_hm)) < 2))
2784 hm_call(
2785 m_hm,
2786 "none",
2787 "log(intensities), unscaled, unimputed, and unnormalized"
2788 )
2789 else
2790 hm_call(
2791 m_hm,
2792 "row",
2793 "log(intensities), row-scaled, unimputed, and unnormalized"
2794 )
1119 }, 2795 },
1120 error = function(e) { 2796 error = function(e) {
1121 cat( 2797 if (!is.null(hm_call)) {
1122 sprintf( 2798 m_hm[is.na(m_hm)] <- 0
1123 "\nCould not draw heatmap, possibly because of too many missing values. Internal message: %s\n", 2799 tryCatch(
1124 e$message 2800 {
2801 if (nrow(m_hm) > 1)
2802 hm_call(
2803 m_hm,
2804 "none",
2805 paste(
2806 "log(intensities), unscaled,",
2807 "zero-imputed, unnormalized"
2808 )
2809 )
2810 else
2811 cat("\nThere are too few peptides to produce a heatmap.\n")
2812 },
2813 error = function(r) {
2814 cat(
2815 sprintf(
2816 "\n%s %s Internal message: %s\n\\newline\n\n",
2817 "Failure drawing heatmap,",
2818 "possibly because of too many missing values.\n\\newline\n\n",
2819 r$message
2820 )
2821 )
2822 cat_margins()
2823 }
2824 )
2825 } else {
2826 cat(
2827 "\nFailure drawing heatmap, possibly because of too many missing values.\n"
1125 ) 2828 )
1126 ) 2829 }
1127 }, 2830 }
1128 finally = par(old_oma)
1129 ) 2831 )
1130 } 2832 }
1131 } 2833 }
1132 return(peptide_count) 2834 # return value:
1133 } 2835 peptide_count
2836 }
2837
2838 # function drawing heatmap of correlations if they exist, else covariances
2839 cov_heatmap <-
2840 function(
2841 m, # matrix with rownames already formatted
2842 top_substrates = FALSE,
2843 ... # passthru to hm2plus or heatmap.2
2844 ) {
2845 if (print_nb_messages) nbe(see_variable(m), " [", nrow(m), "x", ncol(m), "\n")
2846 #ACE nb(rowSums(m, na.rm = TRUE))
2847 #ACE bad_rows <- (rowSums(m, na.rm = TRUE) == 0)
2848 #ACE nb(see_variable(bad_rows))
2849 #ACE m <- m[-bad_rows, , drop = FALSE]
2850 colnames_m <- colnames(m)
2851 is_na_m <- is.na(m)
2852 tmp <- m
2853 tmp[is_na_m] <- 0
2854
2855 tmp <- m[, 0 < colSums(x = tmp)] # by default, na.rm is FALSE
2856
2857 colnames_tmp <- colnames(tmp)
2858
2859 my_low_p_seq <- seq(
2860 from = min(g_intensity_hm_rows, nrow(m)),
2861 to = 1,
2862 by = -1
2863 )
2864
2865 if (g_correlate_substrates) {
2866 # zap samples having zero or near-zero variance
2867 tmp[is.na(tmp)] <- 0
2868 nzv <- caret::nearZeroVar(
2869 tmp, # matrix of values, samples x variables
2870 freqCut = 1.01, # min(freq most prevalent value /
2871 # freq second most prevalent)
2872 uniqueCut = 99 # max(number of unique values /
2873 # total number of samples)
2874 )
2875 tmp <- if (length(nzv) > 0) {
2876 m[, -nzv, drop = FALSE]
2877 } else {
2878 m
2879 }
2880 } else {
2881 tmp <- m[my_low_p_seq, , drop = FALSE]
2882 }
2883
2884
2885 t_m <- t(tmp)
2886 t_m[is.na(t_m)] <- 0
2887 prefiltered_nrow <- ncol(t_m)
2888
2889 my_corcov <- cov(t_m)
2890 did_filter_rows <- did_filter_cols <- FALSE
2891 if (g_correlate_substrates && !is_positive_definite(my_corcov)) {
2892 my_correlate_substrates <- FALSE
2893 t_m <- t(m[my_low_p_seq, , drop = FALSE])
2894 t_m[is.na(t_m)] <- 0
2895 unfiltered_row_count <- ncol(t_m)
2896 unfiltered_col_count <- nrow(t_m)
2897
2898 # zap empty samples
2899 t_m <- t_m[0 < rowSums(x = t_m), ]
2900 # zap substrates present in fewer than two samples
2901 foo <- t_m > 0
2902 foo <- colSums(x = foo) > 1
2903 t_m <- t_m[, foo]
2904
2905 did_filter_rows <- unfiltered_row_count > ncol(t_m)
2906 did_filter_cols <- unfiltered_col_count > nrow(t_m)
2907
2908 colnames_tmp <- rownames(t_m)
2909 my_corcov <- cov(t_m)
2910 if (g_filter_cov_var_gt_1) {
2911 my_corcov <- keep_cov_w_var_gtr_1(my_corcov)
2912 }
2913 } else if (g_correlate_substrates) {
2914 my_corcov <- cov2cor(my_corcov)
2915 my_correlate_substrates <- TRUE
2916 } else {
2917 my_correlate_substrates <- FALSE
2918 if (g_filter_cov_var_gt_1) my_corcov <- keep_cov_w_var_gtr_1(my_corcov)
2919 }
2920
2921 omitted_samples <- colnames_m[colnames_m %notin% colnames_tmp]
2922 suffix <- if (length(omitted_samples) > 1) "s" else ""
2923
2924 f_omissions <-
2925 function(is_corr) {
2926 cat(
2927 sprintf(
2928 "Below is the %s plot for %s substrates",
2929 if (is_corr) "correlation" else "covariance",
2930 sprintf(
2931 if (top_substrates)
2932 "%0.0f \"highest-quality\""
2933 else
2934 "%0.0f",
2935 ncol(t_m)
2936 )
2937 )
2938 )
2939 if (did_filter_cols) {
2940 cat(sprintf(", omitting sample%s ", suffix))
2941 latex_collapsed_vector(", ", omitted_samples)
2942 }
2943 cat(".\n\n")
2944 }
2945
2946 if (is.null(my_corcov) || sum(!is.na(t_m)) < 2) {
2947 cat(
2948 sprintf(
2949 "\\newline\n%s %s plot.\n",
2950 "Insufficient covariance to produce",
2951 if (my_correlate_substrates)
2952 "correlation"
2953 else
2954 "covariance"
2955 ),
2956 "\\newpage\n"
2957 )
2958 return(NULL)
2959 }
2960
2961 cat("\\leavevmode\n", "\\newpage\n")
2962 f_omissions(my_correlate_substrates)
2963
2964 master_cex <- 0.4
2965 max_nchar <- max(nchar(rownames(t_m)))
2966 my_limit <- g_intensity_hm_rows
2967 diminution <- sqrt(my_limit / (my_limit + ncol(t_m)))
2968 my_row_cex <-
2969 my_col_cex <-
2970 min(1.75, master_cex * 9 * diminution ^ 1.5)
2971 my_margin <- 3 + my_row_cex * 64 / (8 + max_nchar)
2972 my_key_cex <- 1.4
2973 my_hm2_cex <- 1.0 * master_cex
2974 my_hgt_scale <- 3.50 - 0.26 * (max(0.4, my_key_cex) - 0.4)
2975 my_hgt_scale <- 2.7
2976
2977 my_legend_height <- 4.0 - my_hgt_scale
2978 my_legend_height <- 0.5 * my_key_cex
2979 my_warp <- 0.65 * (my_limit + ncol(t_m)) / my_limit
2980 my_warp <- 0.8
2981 my_legend_height <- 0.77
2982 my_legend_height <- 0.67
2983 my_plot_height <- my_hgt_scale + (1 - my_warp) * my_legend_height
2984 my_legend_height <- my_warp * my_legend_height
2985
2986 parjust <- par(adj = 0.5)
2987 on.exit(par(parjust))
2988 my_corcov <- my_corcov[order(rownames(my_corcov)), ]
2989 my_main <-
2990 sprintf("%s among %s substrates %s",
2991 if (my_correlate_substrates) "Correlation"
2992 else "Covariance",
2993 kinase_name,
2994 if (!my_correlate_substrates &&
2995 g_filter_cov_var_gt_1 &&
2996 did_filter_rows
2997 )
2998 "having variance > 1"
2999 else ""
3000 )
3001 my_main_nchar <- nchar(my_main)
3002 my_heights <- c(
3003 0.3,
3004 my_plot_height,
3005 my_legend_height # was 4.0 - my_hgt_scale # was 4.19
3006 )
3007 if (print_trace_messages) cat(sprintf("max_nchar = %0.3f; ", max_nchar))
3008 if (print_trace_messages) cat(sprintf("my_margin = %0.3f; ", my_margin))
3009 if (print_trace_messages) cat(sprintf("my_plot_height = %0.3f\n\n", my_plot_height))
3010 if (print_trace_messages) cat(sprintf("master_cex = %0.3f; ", master_cex))
3011 if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex))
3012 if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex))
3013 if (print_trace_messages) cat(sprintf("my_key_cex = %0.3f\n\n", my_key_cex))
3014 if (print_trace_messages) cat(sprintf("my_hgt_scale = %0.3f\n\n", my_hgt_scale))
3015 if (print_trace_messages) cat(sprintf("legend height = %0.3f\n\n", my_legend_height))
3016 if (print_trace_messages) cat(
3017 sprintf(
3018 "my_heights = c(%s); sum = %0.3f\n\n",
3019 paste(
3020 sprintf("%0.3f", my_heights),
3021 collapse = ", "
3022 ),
3023 sum(my_heights)
3024 )
3025 )
3026 op <- par(cex.main = (30 + my_main_nchar) / my_main_nchar)
3027 on.exit(par(op))
3028 hm2plus(
3029 x = my_corcov,
3030 cex = my_hm2_cex,
3031 cexCol = my_col_cex,
3032 cexRow = my_row_cex,
3033 density_info = "none",
3034 denhgt = 0.15,
3035 denwid = 0.5,
3036 divergent = TRUE,
3037 key_par = list(cex = my_key_cex),
3038 key_xlab = if (my_correlate_substrates) "Correlation"
3039 else "Covariance",
3040 main = my_main,
3041 mat = matrix(
3042 c(
3043 c(0, 3, 3),
3044 c(2, 1, 1),
3045 c(0, 4, 0)
3046 ),
3047 nrow = 3,
3048 ncol = 3,
3049 byrow = TRUE
3050 ),
3051 heights = my_heights,
3052 margins = c(my_margin, my_margin),
3053 widths = c(0.5, 3.1, 0.9),
3054 scale = "none",
3055 symkey = TRUE,
3056 symbreaks = TRUE,
3057 symm = FALSE #TODO evaluate TRUE
3058 # ...
3059 )
3060 } # end cov_heatmap
3061
3062 ### FILE IMPORT
3063
3064 # function reading bzipped file to data.frame
3065 bzip2df <- function(d, f, ctor = bzfile) {
3066 # read.delim file (by default, compressed by bzip2)
3067 if (file.exists(f)) {
3068 conn <- NULL
3069 pf <- parent.frame()
3070 tryCatch(
3071 assign(
3072 as.character(substitute(d)),
3073 read.delim(conn <- bzfile(f, open = "r")),
3074 pf
3075 ),
3076 finally = if (!is.null(conn)) close(conn)
3077 )
3078 }
3079 }
3080
1134 ``` 3081 ```
1135 3082
1136 ```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
1137 cat("\\listoftables\n")
1138 ```
1139 # Purpose 3083 # Purpose
1140 3084
1141 To perform for phosphopeptides: 3085 The purpose of this analysis is to perform for phosphopeptides:
1142 3086
1143 - imputation of missing values, 3087 - imputation of missing values,
1144 - quantile normalization, 3088 - quantile normalization,
1145 - ANOVA (using the R stats::`r params$oneWayManyCategories` function), and 3089 - ANOVA (using the R stats::`r params$oneWayManyCategories` function),
3090 - assignment of an FDR-adjusted $p$-value and a "quality score" to each phosphopeptide, and
1146 - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases: 3091 - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases:
1147 - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org) 3092 - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org)
1148 - The Human Proteome Database [http://hprd.org](http://hprd.org) 3093 - The Human Proteome Database [http://hprd.org](http://hprd.org)
1149 - NetworKIN [http://networkin.science/](http://networkin.science/) 3094 - NetworKIN [http://networkin.science/](http://networkin.science/)
1150 - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx) 3095 - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx)
1151 3096
1152 ```{r include = FALSE} 3097 ```{r include = FALSE}
1153 3098
1154 ### GLOBAL VARIABLES 3099 if (params$kseaUseAbsoluteLog2FC) {
1155 3100 sfc <- "|s|"
1156 # parameters for KSEA 3101 pfc <- "|p|"
1157 3102 pfc_txt <- "$\\text{absolute value}({\\log_2 (\\text{fold-change})})$"
1158 ksea_cutoff_statistic <- params$kseaCutoffStatistic 3103 } else {
1159 ksea_cutoff_threshold <- params$kseaCutoffThreshold 3104 sfc <- "s"
1160 ksea_min_kinase_count <- params$kseaMinKinaseCount 3105 pfc <- "p"
3106 pfc_txt <- "${\\log_2 (\\text{fold-change}})$"
3107 }
1161 3108
1162 ksea_heatmap_titles <- list() 3109 ksea_heatmap_titles <- list()
1163 ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <- 3110 ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <-
1164 sprintf( 3111 sprintf(
1165 "Summary for all kinases enriched in one or more contrasts at %s < %s", 3112 "Summary for all kinases enriched in one or more contrasts at %s < %s",
1175 ksea_cutoff_threshold 3122 ksea_cutoff_threshold
1176 ) 3123 )
1177 # hash to hold names of significantly enriched kinases 3124 # hash to hold names of significantly enriched kinases
1178 ksea_asterisk_hash <- new_env() 3125 ksea_asterisk_hash <- new_env()
1179 3126
1180 # READ PARAMETERS (mostly) 3127 # PROCESS (mostly read) PARAMETERS
1181
1182 intensity_hm_rows <- params$intensityHeatmapRows
1183 # Input Filename
1184 input_file <- params$inputFile
1185
1186 # First data column - ideally, this could be detected via regexSampleNames,
1187 # but for now leave it as is.
1188 first_data_column <- params$firstDataColumn
1189 fdc_is_integer <- is.integer(first_data_column)
1190 if (fdc_is_integer) {
1191 first_data_column <- as.integer(params$firstDataColumn)
1192 }
1193 3128
1194 # False discovery rate adjustment for ANOVA 3129 # False discovery rate adjustment for ANOVA
1195 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 3130 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05
1196 val_fdr <- 3131 val_fdr <- read.table(file = alpha_file, sep = "\t", header = FALSE, quote = "")
1197 read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "")
1198 3132
1199 if ( 3133 if (
1200 ncol(val_fdr) != 1 || 3134 ncol(val_fdr) != 1 ||
1201 sum(!is.numeric(val_fdr[, 1])) || 3135 sum(!is.numeric(val_fdr[, 1])) ||
1202 sum(val_fdr[, 1] < 0) || 3136 sum(val_fdr[, 1] < 0) ||
1203 sum(val_fdr[, 1] > 1) 3137 sum(val_fdr[, 1] > 1)
1204 ) { 3138 ) {
1205 stop("alphaFile should be one column of numbers within the range [0.0,1.0]") 3139 cat("alphaFile should be one column of numbers within the range [0.0,1.0]")
3140 param_df_exit()
3141 knitr::knit_exit()
1206 } 3142 }
1207 val_fdr <- val_fdr[, 1] 3143 val_fdr <- val_fdr[, 1]
1208 3144
1209 #Imputed Data filename
1210 imputed_data_filename <- params$imputedDataFilename
1211 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
1212 anova_ksea_mtdt_file <- params$anovaKseaMetadata
1213
1214 ``` 3145 ```
1215 3146
1216 ```{r echo = FALSE} 3147 ```{r echo = FALSE, results = 'asis'}
1217 # Imputation method, should be one of 3148
1218 # "random", "group-median", "median", or "mean" 3149
1219 imputation_method <- params$imputationMethod
1220
1221 # Selection of percentile of logvalue data to set the mean for random number
1222 # generation when using random imputation
1223 mean_percentile <- params$meanPercentile / 100.0
1224
1225 # deviation adjustment-factor for random values; real number.
1226 sd_percentile <- params$sdPercentile
1227
1228 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
1229 regex_sample_names <- params$regexSampleNames
1230
1231 # Regular expression to extract Sample Grouping from Sample Name;
1232 # if error occurs, compare sample_treatment_levels vs. sample_name_matches
1233 # to see if groupings/pairs line up
1234 # e.g., "(\\d+)"
1235 regex_sample_grouping <- params$regexSampleGrouping
1236
1237 one_way_all_categories_fname <- params$oneWayManyCategories
1238 one_way_all_categories <- try_catch_w_e(
1239 match.fun(one_way_all_categories_fname))
1240 if (!is.function(one_way_all_categories$value)) {
1241 write("fatal error for parameter oneWayManyCategories:", stderr())
1242 write(one_way_all_categories$value$message, stderr())
1243 if (sys.nframe() > 0) quit(save = "no", status = 1)
1244 stop("Cannot continue. Goodbye.")
1245 }
1246 one_way_all_categories <- one_way_all_categories$value
1247
1248 one_way_two_categories_fname <- params$oneWayManyCategories
1249 one_way_two_categories <- try_catch_w_e(
1250 match.fun(one_way_two_categories_fname))
1251 if (!is.function(one_way_two_categories$value)) {
1252 cat("fatal error for parameter oneWayTwoCategories: \n")
1253 cat(one_way_two_categories$value$message, fill = TRUE)
1254 if (sys.nframe() > 0) quit(save = "no", status = 1)
1255 stop("Cannot continue. Goodbye.")
1256 }
1257 one_way_two_categories <- one_way_two_categories$value
1258
1259 preproc_db <- params$preprocDb
1260 ksea_app_prep_db <- params$kseaAppPrepDb
1261 result <- file.copy( 3150 result <- file.copy(
1262 from = preproc_db, 3151 from = preproc_db,
1263 to = ksea_app_prep_db, 3152 to = ksea_app_prep_db,
1264 overwrite = TRUE 3153 overwrite = TRUE
1265 ) 3154 )
1270 preproc_db, 3159 preproc_db,
1271 ksea_app_prep_db, 3160 ksea_app_prep_db,
1272 ), 3161 ),
1273 stderr() 3162 stderr()
1274 ) 3163 )
1275 if (sys.nframe() > 0) quit(save = "no", status = 1) 3164 if (sys.nframe() > 0) {
1276 stop("Cannot continue. Goodbye.") 3165 cat("Cannot continue and quit() failed. Goodbye.")
1277 } 3166 param_df_exit()
3167 knitr::knit_exit()
3168 # in case knit_exit doesn't exit
3169 quit(save = "no", status = 1)
3170 }
3171 }
3172
3173 if (FALSE) {
3174 write.table(x = param_df, file = "test-data/params.txt")
3175 }
3176
1278 ``` 3177 ```
1279 3178
1280 ```{r echo = FALSE} 3179 ```{r echo = FALSE}
1281 ### READ DATA 3180 ### READ DATA
1282 3181
1287 sep = "\t", 3186 sep = "\t",
1288 header = TRUE, 3187 header = TRUE,
1289 quote = "", 3188 quote = "",
1290 check.names = FALSE 3189 check.names = FALSE
1291 ) 3190 )
3191
1292 ``` 3192 ```
1293 3193
1294 # Extract Sample Names and Treatment Levels 3194
1295 3195 # Extraction of Sample Classes and Names from Input Data
1296 Column names parsed from input file are shown in Table 1; sample names and treatment levels, in Table 2.
1297 3196
1298 ```{r echo = FALSE, results = 'asis'} 3197 ```{r echo = FALSE, results = 'asis'}
1299 3198
1300 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) 3199 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE)
3200 my_column_names <- names(full_data)
1301 3201
1302 if (!fdc_is_integer) { 3202 if (!fdc_is_integer) {
1303 if (length(data_column_indices) > 0) { 3203 if (length(data_column_indices) > 0) {
1304 first_data_column <- data_column_indices[1] 3204 first_data_column <- data_column_indices[1]
1305 } else { 3205 } else {
1306 stop(paste("failed to convert firstDataColumn:", first_data_column)) 3206 cat(paste("failed to convert firstDataColumn:", first_data_column))
3207 param_df_exit()
3208 knitr::knit_exit()
1307 } 3209 }
1308 } 3210 }
1309 3211
1310 cat( 3212 cat(
1311 sprintf( 3213 sprintf(
1312 paste( 3214 paste(
1313 "\n\nThe input data file has peptide-intensity data for each sample", 3215 "\n\nThe input data file has peptide-intensity data",
1314 "in one of columns %d through %d.\n\n" 3216 "in columns %d (\"%s\") through %d (\"%s\")."
1315 ), 3217 ),
1316 min(data_column_indices), 3218 tmp <- min(data_column_indices),
1317 max(data_column_indices) 3219 my_column_names[tmp],
1318 ) 3220 tmp <- max(data_column_indices),
1319 ) 3221 my_column_names[tmp]
1320 3222 )
1321 # Write column names as a LaTeX enumerated list. 3223 )
1322 column_name_df <- data.frame( 3224
1323 column = seq_len(length(colnames(full_data))), 3225 if (TRUE) {
1324 name = paste0("\\verb@", colnames(full_data), "@") 3226 cat0(
1325 ) 3227 table_offset(i = 1, new = TRUE),
1326 data_frame_latex( 3228 "Sample classes and names are shown in ",
1327 x = column_name_df, 3229 table_href(),
1328 justification = "l l", 3230 ".\n\n"
1329 centered = TRUE, 3231 )
1330 caption = "Input data column names", 3232 } else {
1331 anchor = const_table_anchor_bp, 3233 cat0(
1332 underscore_whack = FALSE 3234 "\\newcounter{offset}\n",
1333 ) 3235 "\\setcounter{offset}{\\value{table}}\n",
3236 "\\stepcounter{offset}\n",
3237 "Sample classes and names are shown in ",
3238 table_href(),
3239 ".\n\n"
3240 )
3241 }
3242
3243 #TODO remove this unused variable and assignment
3244 if (FALSE) {
3245 # Write column names as a LaTeX enumerated list.
3246 column_name_df <- data.frame(
3247 column = seq_len(length(colnames(full_data))),
3248 name = paste0("\\verb@", colnames(full_data), "@")
3249 )
3250 }
1334 3251
1335 ``` 3252 ```
1336 3253
1337 ```{r echo = FALSE, results = 'asis'} 3254 ```{r echo = FALSE, results = 'asis'}
3255 # extract intensity columns from full_data to quant_data
1338 quant_data <- full_data[first_data_column:length(full_data)] 3256 quant_data <- full_data[first_data_column:length(full_data)]
1339 quant_data[quant_data == 0] <- NA 3257 quant_data[quant_data == 0] <- NA
1340 rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide 3258 rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide
3259 full_data_names <- colnames(quant_data)
1341 # Extract factors and trt-replicates using regular expressions. 3260 # Extract factors and trt-replicates using regular expressions.
1342 # Typically: 3261 # Typically:
1343 # regex_sample_names is "\\.\\d+[A-Z]$" 3262 # regex_sample_names is "\\.\\d+[A-Z]$"
1344 # regex_sample_grouping is "\\d+" 3263 # regex_sample_grouping is "\\d+"
1345 # This would distinguish trt-replicates by terminal letter [A-Z] 3264 # This would distinguish trt-replicates by terminal letter [A-Z]
1353 colnames(quant_data) <- sample_name_matches 3272 colnames(quant_data) <- sample_name_matches
1354 3273
1355 write_debug_file(quant_data) 3274 write_debug_file(quant_data)
1356 3275
1357 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) 3276 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
1358 sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match)) 3277 smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match))
3278
3279 if (print_nb_messages) nbe(see_variable(smpl_trt, "\n\n"))
3280 if (print_nb_messages) nbe(see_variable(sample_name_matches, "\n\n"))
3281 if (print_nb_messages) nbe(see_variable(full_data_names, "\n\n"))
3282
3283 sample_treatment_df <-
3284 save_sample_treatment_df <-
3285 data.frame(
3286 class = smpl_trt,
3287 sample = sample_name_matches,
3288 full_sample_names = full_data_names
3289 )
3290
3291 if (print_nb_messages) nbe(see_variable(sample_treatment_df, "\n\n"))
3292
3293 # reorder data
3294 my_order <- with(sample_treatment_df, order(class, sample))
3295 quant_data <- quant_data[, my_order]
3296 sample_name_matches <- sample_name_matches[my_order]
3297 smpl_trt <- smpl_trt[my_order]
3298 sample_treatment_df <- data.frame(
3299 class = smpl_trt,
3300 sample = sample_name_matches
3301 )
3302
3303 # filter smpl_trt as appropriate
3304 if (sample_group_filter %in% c("include", "exclude")) {
3305 include_sample <-
3306 mgrepl(
3307 v = sample_group_filter_patterns,
3308 s = as.character(smpl_trt),
3309 fixed = sample_group_filter_fixed,
3310 perl = sample_group_filter_perl,
3311 ignore.case = sample_group_filter_nocase
3312 )
3313 if (sum(include_sample) < 2) {
3314 errmsg <-
3315 paste(
3316 "ERROR:",
3317 sum(include_sample),
3318 "samples are too few for analysis;",
3319 "check input parameters for sample-name parsing"
3320 )
3321 cat0(
3322 errmsg,
3323 "\\stepcounter{offset}\n",
3324 " in ",
3325 table_href(),
3326 ".\n\n"
3327 )
3328 data_frame_tabbing_latex(
3329 x = save_sample_treatment_df,
3330 tabstops = c(1.25, 1.25),
3331 caption = "Sample classes",
3332 use_subsubsection_header = FALSE
3333 )
3334 data_frame_tabbing_latex(
3335 x =
3336 param_df[
3337 c("regexSampleNames",
3338 "regexSampleGrouping",
3339 "groupFilterPatterns",
3340 "groupFilter",
3341 "groupFilterMode"
3342 ),
3343 ],
3344 tabstops = c(1.75),
3345 underscore_whack = TRUE,
3346 caption = "Input parameters for sample-name parsing",
3347 verbatim = FALSE
3348 )
3349 param_df_exit()
3350 knitr::knit_exit()
3351 return(invisible(-1))
3352 }
3353 sample_treatment_df <-
3354 if (sample_group_filter == "include")
3355 sample_treatment_df[include_sample, ]
3356 else
3357 sample_treatment_df[!include_sample, ]
3358 } else {
3359 include_sample <- rep.int(TRUE, length(smpl_trt))
3360 }
3361 sample_name_matches <- sample_treatment_df$sample
3362 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
3363 smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match))
3364 sample_treatment_df$class <- smpl_trt
3365
3366 # filter quant_data as appropriate
1359 number_of_samples <- length(sample_name_matches) 3367 number_of_samples <- length(sample_name_matches)
1360 sample_treatment_df <- data.frame( 3368 quant_data <- quant_data[, sample_name_matches]
1361 level = sample_treatment_levels, 3369
1362 sample = sample_name_matches 3370 sample_level_integers <- as.integer(smpl_trt)
1363 ) 3371 sample_treatment_levels <- levels(smpl_trt)
1364 data_frame_latex( 3372 count_of_treatment_levels <- length(sample_treatment_levels)
3373
3374 # for each phosphopeptide, across treatment levels, compute minimum
3375 # count of observed (i.e., non-missing) values
3376 my_env <- new_env()
3377 for (l in sample_treatment_levels)
3378 my_env[[as.character(l)]] <-
3379 as.vector(rowSums(!is.na(quant_data[l == smpl_trt])))
3380 min_group_obs_count <- row_apply(
3381 x = Reduce(
3382 f = function(l, r) cbind(l, my_env[[r]]),
3383 x = sample_treatment_levels,
3384 init = c()
3385 ),
3386 fun = min
3387 )
3388 names(min_group_obs_count) <- rownames(quant_data)
3389 rm(my_env)
3390
3391 # display (possibly-filtered) results
3392 cat("\\newpage\n")
3393
3394 if (sum(include_sample) > 1) {
3395 data_frame_tabbing_latex(
1365 x = sample_treatment_df, 3396 x = sample_treatment_df,
1366 justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}", 3397 tabstops = c(1.25),
1367 centered = TRUE, 3398 caption = "Sample classes",
1368 caption = "Treatment levels", 3399 use_subsubsection_header = FALSE
1369 anchor = const_table_anchor_tbp, 3400 )
1370 underscore_whack = FALSE 3401 }
1371 ) 3402 sample_name_grow <- 10 / (10 + max(nchar(sample_name_matches) + 6))
3403 sample_colsep <- transition_positions(as.integer(sample_treatment_df$class))
1372 ``` 3404 ```
1373 3405
1374 ```{r echo = FALSE, results = 'asis'} 3406 ```{r echo = FALSE, results = 'asis'}
1375 cat("\\newpage\n") 3407 cat("\\newpage\n")
1376 ``` 3408 ```
1377 3409
1378 ## Are the log-transformed sample distributions similar? 3410 ## Are the log-transformed sample distributions similar?
1379 3411
1380 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} 3412 ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'}
1381 3413
1382 quant_data[quant_data == 0] <- NA #replace 0 with NA 3414 quant_data[quant_data == 0] <- NA #replace 0 with NA
1383 quant_data_log <- log10(quant_data) 3415 quant_data_log <- log10(quant_data)
1384 3416
1385 rownames(quant_data_log) <- rownames(quant_data) 3417 rownames(quant_data_log) <- rownames(quant_data)
1386 colnames(quant_data_log) <- sample_name_matches 3418 colnames(quant_data_log) <- sample_name_matches
1387 3419
1388 write_debug_file(quant_data_log) 3420 write_debug_file(quant_data_log)
1389 3421
1390 # data visualization 3422 g_ppep_distrib_ctl <- new_env()
1391 old_par <- par( 3423 g_ppep_distrib_ctl$xlab_line <- 3.5 + 11.86 * (0.67 - sample_name_grow)
1392 mai = par("mai") + c(0.5, 0, 0, 0) 3424 g_ppep_distrib_ctl$mai_bottom <- (0.5 + 3.95 * (0.67 - sample_name_grow))
1393 ) 3425 g_ppep_distrib_ctl$axis <- (0.6 + 0.925 * (0.67 - sample_name_grow))
1394 # ref: https://r-charts.com/distribution/add-points-boxplot/ 3426
1395 # Vertical plot 3427 my_ppep_distrib_bxp <- function(
1396 boxplot( 3428 x
1397 quant_data_log 3429 , sample_name_grow = sample_name_grow
1398 , las = 1 3430 , main
1399 , col = const_boxplot_fill 3431 , varwidth = FALSE
1400 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") 3432 , sub = NULL
1401 , xlab = "Sample" 3433 , xlab
1402 ) 3434 , ylab
1403 par(old_par) 3435 , col = const_boxplot_fill
1404 3436 , notch = FALSE
1405 3437 , ppep_distrib_ctl = g_ppep_distrib_ctl
1406 3438 , ...
1407 cat("\n\n\n") 3439 ) {
1408 cat("\n\n\n") 3440 my_xlab_line <- g_ppep_distrib_ctl$xlab_line
3441 my_mai_bottom <- g_ppep_distrib_ctl$mai_bottom
3442 my_axis <- g_ppep_distrib_ctl$axis
3443
3444 if (print_trace_messages) {
3445 cat_variable(my_xlab_line, suffix = "; ")
3446 cat_variable(my_mai_bottom, suffix = "; ")
3447 cat_variable(my_axis, suffix = "\n\n")
3448 }
3449
3450 old_par <- par(
3451 mai = par("mai") + c(my_mai_bottom, 0, 0, 0),
3452 cex.axis = my_axis,
3453 cex.lab = 1.2
3454 )
3455 tryCatch(
3456 {
3457 # Vertical plot
3458 boxplot(
3459 x
3460 , las = 2
3461 , col = col
3462 , main = main
3463 , sub = NULL
3464 , ylab = ylab
3465 , xlab = NULL
3466 , notch = notch
3467 , varwidth = varwidth
3468 , ...
3469 )
3470 title(
3471 sub = sub
3472 , cex.sub = 1.0
3473 , line = my_xlab_line + 1
3474 )
3475 title(
3476 xlab = xlab
3477 , line = my_xlab_line
3478 )
3479 },
3480 finally = par(old_par)
3481 )
3482 }
3483
3484 my_ppep_distrib_bxp(
3485 x = quant_data_log
3486 , sample_name_grow = sample_name_grow
3487 , main = "Peptide intensities for each sample"
3488 , varwidth = boxplot_varwidth
3489 , sub = "Box widths reflect number of peptides for sample"
3490 , xlab = "Sample"
3491 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
3492 , col = const_boxplot_fill
3493 , notch = FALSE
3494 )
3495
3496 cat("\n\n\n\n")
1409 3497
1410 ``` 3498 ```
1411 3499
1412 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} 3500 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
1413 if (nrow(quant_data_log) > 1) { 3501 if (nrow(quant_data_log) > 1) {
1452 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"), 3540 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"),
1453 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") 3541 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)")
1454 ) 3542 )
1455 ``` 3543 ```
1456 3544
3545 # Characterization of Input Data
3546
1457 ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values 3547 ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values
1458 3548
1459 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} 3549 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
1460 # determine quantile 3550 # determine quantile
1461 q1 <- quantile(logvalues, probs = mean_percentile)[1] 3551 q1 <- quantile(logvalues, probs = mean_percentile)[1]
1462 3552
1463 # 1 = row of matrix (ie, phosphopeptide) 3553 # 1 = row of matrix (ie, phosphopeptide)
1464 sds <- apply(quant_data_log, 1, sd_finite) 3554 sds <- row_apply(quant_data_log, sd_finite)
1465 if (sum(!is.na(sds)) > 2) { 3555 if (sum(!is.na(sds)) > 2) {
1466 plot( 3556 plot(
1467 density(sds, na.rm = TRUE) 3557 density(sds, na.rm = TRUE)
1468 , main = "Smoothed estimated probability density vs. std. deviation" 3558 , main = "Smoothed estimated probability density vs. std. deviation"
1469 , sub = "(probability estimation made with Gaussian smoothing)" 3559 , sub = "(probability estimation made with Gaussian smoothing)"
1493 ```{r echo = FALSE} 3583 ```{r echo = FALSE}
1494 3584
1495 # prep for trt-median based imputation 3585 # prep for trt-median based imputation
1496 3586
1497 ``` 3587 ```
1498 # Impute Missing Values 3588 # Imputation of Missing Values
1499 3589
1500 ```{r echo = FALSE} 3590 ```{r echo = FALSE}
1501 3591
1502 imp_smry_pot_peptides_before <- nrow(quant_data_log) 3592 imp_smry_pot_peptides_before <- nrow(quant_data_log)
1503 imp_smry_missing_values_before <- number_to_impute 3593 imp_smry_missing_values_before <- number_to_impute
1524 quant_data_imp <- quant_data 3614 quant_data_imp <- quant_data
1525 imputation_method_description <- 3615 imputation_method_description <-
1526 paste("Substitute missing value with", 3616 paste("Substitute missing value with",
1527 "median peptide-intensity for sample group.\n" 3617 "median peptide-intensity for sample group.\n"
1528 ) 3618 )
1529 sample_level_integers <- as.integer(sample_treatment_levels)
1530 # Take the accurate ln(x+1) because the data are log-normally distributed 3619 # Take the accurate ln(x+1) because the data are log-normally distributed
1531 # and because median can involve an average of two measurements. 3620 # and because median can involve an average of two measurements.
1532 quant_data_imp <- log1p(quant_data_imp) 3621 quant_data_imp <- log1p(quant_data_imp)
1533 for (i in seq_len(length(levels(sample_treatment_levels)))) { 3622 for (i in seq_len(count_of_treatment_levels)) {
1534 # Determine the columns for this factor-level 3623 # Determine the columns for this factor-level
1535 level_cols <- i == sample_level_integers 3624 level_cols <- i == sample_level_integers
1536 # Extract those columns 3625 # Extract those columns
1537 lvlsbst <- quant_data_imp[, level_cols, drop = FALSE] 3626 lvlsbst <- quant_data_imp[, level_cols, drop = FALSE]
1538 # assign to ind the row-column pairs corresponding to each NA 3627 # assign to ind the row-column pairs corresponding to each NA
1539 ind <- which(is.na(lvlsbst), arr.ind = TRUE) 3628 ind <- which(is.na(lvlsbst), arr.ind = TRUE)
1540 # No group-median exists if there is only one sample 3629 # No group-median exists if there is only one sample
1541 # a given ppep has no measurement; otherwise, proceed. 3630 # a given ppep has no measurement; otherwise, proceed.
1542 if (ncol(lvlsbst) > 1) { 3631 if (ncol(lvlsbst) > 1) {
1543 the_centers <- 3632 the_centers <-
1544 apply(lvlsbst, 1, median, na.rm = TRUE) 3633 row_apply(lvlsbst, median, na.rm = TRUE)
1545 for (j in seq_len(nrow(lvlsbst))) { 3634 for (j in seq_len(nrow(lvlsbst))) {
1546 for (k in seq_len(ncol(lvlsbst))) { 3635 for (k in seq_len(ncol(lvlsbst))) {
1547 if (is.na(lvlsbst[j, k])) { 3636 if (is.na(lvlsbst[j, k])) {
1548 lvlsbst[j, k] <- the_centers[j] 3637 lvlsbst[j, k] <- the_centers[j]
1549 } 3638 }
1563 "median peptide-intensity across all sample classes.\n" 3652 "median peptide-intensity across all sample classes.\n"
1564 ) 3653 )
1565 # Take the accurate ln(x+1) because the data are log-normally distributed 3654 # Take the accurate ln(x+1) because the data are log-normally distributed
1566 # and because median can involve an average of two measurements. 3655 # and because median can involve an average of two measurements.
1567 quant_data_imp <- log1p(quant_data_imp) 3656 quant_data_imp <- log1p(quant_data_imp)
1568 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]] 3657 quant_data_imp[ind] <- row_apply(quant_data_imp, median, na.rm = TRUE)[ind[, 1]]
1569 # Take the accurate e^x - 1 to match scaling of original input. 3658 # Take the accurate e^x - 1 to match scaling of original input.
1570 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) 3659 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
1571 good_rows <- !is.nan(rowMeans(quant_data_imp)) 3660 good_rows <- !is.nan(rowMeans(quant_data_imp))
1572 } 3661 }
1573 , "mean" = { 3662 , "mean" = {
1579 # Take the accurate ln(x+1) because the data are log-normally distributed, 3668 # Take the accurate ln(x+1) because the data are log-normally distributed,
1580 # so arguments to mean should be previously transformed. 3669 # so arguments to mean should be previously transformed.
1581 # this will have to be 3670 # this will have to be
1582 quant_data_imp <- log1p(quant_data_imp) 3671 quant_data_imp <- log1p(quant_data_imp)
1583 # Assign to NA cells the mean for the row 3672 # Assign to NA cells the mean for the row
1584 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]] 3673 quant_data_imp[ind] <- row_apply(quant_data_imp, mean, na.rm = TRUE)[ind[, 1]]
1585 # Take the accurate e^x - 1 to match scaling of original input. 3674 # Take the accurate e^x - 1 to match scaling of original input.
1586 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) 3675 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
1587 good_rows <- !is.nan(rowMeans(quant_data_imp)) 3676 good_rows <- !is.nan(rowMeans(quant_data_imp))
1588 } 3677 }
1589 , "random" = { 3678 , "random" = {
1623 ```{r echo = FALSE} 3712 ```{r echo = FALSE}
1624 3713
1625 imp_smry_pot_peptides_after <- sum(good_rows) 3714 imp_smry_pot_peptides_after <- sum(good_rows)
1626 imp_smry_rejected_after <- sum(!good_rows) 3715 imp_smry_rejected_after <- sum(!good_rows)
1627 imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ])) 3716 imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ]))
3717
3718 # From ?`%in%`, %in% is currently defined as function(x, table) match(x, table, nomatch = 0) > 0
3719
3720 sink(stderr())
3721 print("`%in%`:")
3722 print(`%in%`)
3723 sink()
3724
3725 stock_in <-
3726 names(good_rows) %in%
3727 names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count])
3728 if (print_nb_messages) nbe(see_variable(stock_in), "\n")
3729
3730 explicit_in <-
3731 0 < match(
3732 names(good_rows),
3733 names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count])
3734 )
3735 if (print_nb_messages) nbe(see_variable(explicit_in), "\n")
3736
3737 great_enough_row_names <- good_rows[
3738 names(good_rows) %in%
3739 names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count])
3740 ]
3741 if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n")
3742 great_enough_row_names <- great_enough_row_names[great_enough_row_names]
3743 if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n")
1628 ``` 3744 ```
3745
1629 ```{r echo = FALSE, results = 'asis'} 3746 ```{r echo = FALSE, results = 'asis'}
1630 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf 3747 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf
1631 tabular_lines_fmt <- paste( 3748 tabular_lines_fmt <- paste(
1632 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) 3749 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page)
3750 " \\leavevmode",
1633 " \\caption{Imputation Results}", 3751 " \\caption{Imputation Results}",
1634 " \\centering", # \centering centers the table on the page 3752 " \\centering", # \centering centers the table on the page
1635 " \\begin{tabular}{l c c c}", 3753 " \\begin{tabular}{l c c c}",
1636 " \\hline\\hline", 3754 " \\hline\\hline",
1637 " \\ & potential peptides & missing values & rejected", 3755 " \\ & potential peptides & missing values & rejected",
1638 " peptides \\\\ [0.5ex]", 3756 " peptides \\\\ [0.5ex]",
1639 " \\hline", 3757 " \\hline",
1640 " before imputation & %d & %d (%d\\%s) & \\\\", 3758 " before imputation & %d & %d (%d\\%s) & \\\\",
1641 " after imputation & %d & %d & %d \\\\ [1ex]", 3759 " after imputation & %d & %d & %d \\\\",
3760 " after keep comparable & %d & & %d \\\\ [1ex]",
1642 " \\hline", 3761 " \\hline",
1643 " \\end{tabular}", 3762 " \\end{tabular}",
1644 #" \\label{table:nonlin}", # may be used to refer this table in the text 3763 #" \\label{table:nonlin}", # may be used to refer this table in the text
1645 "\\end{table}", 3764 "\\end{table}",
1646 sep = "\n" 3765 sep = "\n"
1652 imp_smry_missing_values_before, 3771 imp_smry_missing_values_before,
1653 imp_smry_pct_missing, 3772 imp_smry_pct_missing,
1654 "%", 3773 "%",
1655 imp_smry_pot_peptides_after, 3774 imp_smry_pot_peptides_after,
1656 imp_smry_missing_values_after, 3775 imp_smry_missing_values_after,
1657 imp_smry_rejected_after 3776 imp_smry_rejected_after,
3777 length(great_enough_row_names),
3778 imp_smry_pot_peptides_before -
3779 length(great_enough_row_names)
1658 ) 3780 )
1659 cat(tabular_lines) 3781 cat(tabular_lines)
1660 ``` 3782 ```
1661 ```{r echo = FALSE} 3783
1662 3784 ```{r filter_good_rows, echo = FALSE}
1663 3785
1664 # Zap rows where imputation was ineffective 3786 if (print_nb_messages) nbe("before name extraction, ", see_variable(length(good_rows)), " ", see_variable(good_rows), "\n")
3787 good_rows <- names(good_rows[names(great_enough_row_names)])
3788 if (print_nb_messages) nbe("after name extraction, ", see_variable(length(good_rows)), see_variable(good_rows), "\n")
3789
3790 #ACE min_group_obs_count <- min_group_obs_count[names(great_enough_row_names)]
3791 #ACE nbe("good_rows")
3792 #ACE nbe(see_variable(good_rows))
3793 #ACE nbe("names(min_group_obs_count) before filter for good rows")
3794 #ACE nbe(see_variable(names(min_group_obs_count)))
3795 min_group_obs_count <- min_group_obs_count[good_rows]
3796 #ACE nbe("min_group_obs_count after filter for good rows")
3797 #ACE nbe(see_variable(names(min_group_obs_count)))
3798
3799 # Zap rows where imputation was insufficiently effective
1665 full_data <- full_data [good_rows, ] 3800 full_data <- full_data [good_rows, ]
1666 quant_data <- quant_data [good_rows, ] 3801 quant_data <- quant_data [good_rows, ]
1667 3802 quant_data_log <- quant_data_log [good_rows, ]
3803
3804 if (print_nb_messages) nbe("before row filter, ", see_variable(nrow(quant_data_imp)), "\n")
1668 quant_data_imp <- quant_data_imp[good_rows, ] 3805 quant_data_imp <- quant_data_imp[good_rows, ]
3806 if (print_nb_messages) nbe("after row filter, ", see_variable(nrow(quant_data_imp)), "\n")
1669 write_debug_file(quant_data_imp) 3807 write_debug_file(quant_data_imp)
1670 quant_data_imp_good_rows <- quant_data_imp 3808 quant_data_imp_good_rows <- quant_data_imp
1671 3809
1672 write_debug_file(quant_data_imp_good_rows) 3810 write_debug_file(quant_data_imp_good_rows)
1673 ``` 3811 ```
1717 d_imputed <- d_combined 3855 d_imputed <- d_combined
1718 } 3856 }
1719 3857
1720 ``` 3858 ```
1721 3859
1722 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} 3860 ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'}
1723 zero_sd_rownames <- 3861 zero_sd_rownames <-
1724 rownames(quant_data_imp)[ 3862 rownames(quant_data_imp)[
1725 is.na((apply(quant_data_imp, 1, sd, na.rm = TRUE)) == 0) 3863 is.na((row_apply(quant_data_imp, sd, na.rm = TRUE)) == 0)
1726 ] 3864 ]
1727 3865
1728 if (length(zero_sd_rownames) >= nrow(quant_data_imp)) { 3866 if (length(zero_sd_rownames) >= nrow(quant_data_imp)) {
1729 stop("All peptides have zero standard deviation. Cannot continue.") 3867 cat("All peptides have zero standard deviation. Cannot continue.")
3868 param_df_exit()
3869 knitr::knit_exit()
1730 } 3870 }
1731 if (length(zero_sd_rownames) > 0) { 3871 if (length(zero_sd_rownames) > 0) {
1732 cat( 3872 cat(
1733 sprintf("%d peptides with zero variance were removed from statistical consideration", 3873 sprintf(
1734 length(zero_sd_rownames) 3874 "%d %s %s",
3875 length(zero_sd_rownames),
3876 "peptides with zero variance",
3877 "were removed from statistical consideration"
1735 ) 3878 )
1736 ) 3879 )
1737 zap_named_rows <- function(df, nms) { 3880 zap_named_rows <- function(df, nms) {
1738 return(df[!(row.names(df) %in% nms), ]) 3881 return(df[!(row.names(df) %in% nms), ])
1739 } 3882 }
1740 quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames) 3883 quant_data_imp <-
1741 quant_data <- zap_named_rows(quant_data, zero_sd_rownames) 3884 zap_named_rows(quant_data_imp, zero_sd_rownames)
1742 full_data <- zap_named_rows(full_data, zero_sd_rownames) 3885 quant_data <-
3886 zap_named_rows(quant_data, zero_sd_rownames)
3887 full_data <-
3888 zap_named_rows(full_data, zero_sd_rownames)
3889 min_group_obs_count <-
3890 min_group_obs_count[names(min_group_obs_count) %notin% zero_sd_rownames]
1743 } 3891 }
1744 3892
1745 if (sum(is.na(quant_data)) > 0) { 3893 if (sum(is.na(quant_data)) > 0) {
1746 cat("\\leavevmode\\newpage\n") 3894 cat("\\leavevmode\\newpage\n")
1747 # data visualization
1748 old_par <- par(
1749 mai = par("mai") + c(0.5, 0, 0, 0)
1750 )
1751 # Copy quant data to x 3895 # Copy quant data to x
1752 x <- quant_data 3896 x <- quant_data
1753 # x gets to have values of: 3897 # x gets to have values of:
1754 # - NA for observed values 3898 # - NA for observed values
1755 # - 1 for missing values 3899 # - 1 for missing values
1774 max(red_dots, blue_dots, na.rm = TRUE) 3918 max(red_dots, blue_dots, na.rm = TRUE)
1775 ) 3919 )
1776 show_stripchart <- 3920 show_stripchart <-
1777 50 > (count_red + count_blue) / length(sample_name_matches) 3921 50 > (count_red + count_blue) / length(sample_name_matches)
1778 if (show_stripchart) { 3922 if (show_stripchart) {
1779 boxplot_sub <- "Light blue = data before imputation; Red = imputed data" 3923 boxplot_sub <- "Light blue = data before imputation; Red = imputed data;"
1780 } else { 3924 } else {
1781 boxplot_sub <- "" 3925 boxplot_sub <- ""
1782 } 3926 }
1783 3927
1784 # Vertical plot 3928 # Vertical plot
1785 colnames(blue_dots) <- sample_name_matches 3929 colnames(blue_dots) <- sample_name_matches
1786 boxplot( 3930 my_ppep_distrib_bxp(
1787 blue_dots 3931 x = blue_dots
1788 , las = 1 # "always horizontal" 3932 , sample_name_grow = sample_name_grow
1789 , col = const_boxplot_fill
1790 , ylim = ylim
1791 , main = "Peptide intensities after eliminating unusable peptides" 3933 , main = "Peptide intensities after eliminating unusable peptides"
1792 , sub = boxplot_sub 3934 , varwidth = boxplot_varwidth
3935 , sub = paste(boxplot_sub, "Box widths reflect number of peptides for sample")
1793 , xlab = "Sample" 3936 , xlab = "Sample"
1794 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") 3937 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
3938 , col = const_boxplot_fill
3939 , notch = FALSE
3940 , ylim = ylim
1795 ) 3941 )
1796 3942
1797 if (show_stripchart) { 3943 if (show_stripchart) {
1798 # Points 3944 # Points
1799 # ref: https://r-charts.com/distribution/add-points-boxplot/ 3945 # ref: https://r-charts.com/distribution/add-points-boxplot/
1801 stripchart( 3947 stripchart(
1802 blue_dots, # Data 3948 blue_dots, # Data
1803 method = "jitter", # Random noise 3949 method = "jitter", # Random noise
1804 jitter = const_stripchart_jitter, 3950 jitter = const_stripchart_jitter,
1805 pch = 19, # Pch symbols 3951 pch = 19, # Pch symbols
1806 cex = const_stripsmall_cex, # Size of symbols reduced 3952 cex = const_stripchart_cex, # Size of symbols reduced
1807 col = "lightblue", # Color of the symbol 3953 col = "lightblue", # Color of the symbol
1808 vertical = TRUE, # Vertical mode 3954 vertical = TRUE, # Vertical mode
1809 add = TRUE # Add it over 3955 add = TRUE # Add it over
1810 ) 3956 )
1811 stripchart( 3957 stripchart(
1812 red_dots, # Data 3958 red_dots, # Data
1813 method = "jitter", # Random noise 3959 method = "jitter", # Random noise
1814 jitter = const_stripchart_jitter, 3960 jitter = const_stripchart_jitter,
1815 pch = 19, # Pch symbols 3961 pch = 19, # Pch symbols
1816 cex = const_stripsmall_cex, # Size of symbols reduced 3962 cex = const_stripchart_cex, # Size of symbols reduced
1817 col = "red", # Color of the symbol 3963 col = "red", # Color of the symbol
1818 vertical = TRUE, # Vertical mode 3964 vertical = TRUE, # Vertical mode
1819 add = TRUE # Add it over 3965 add = TRUE # Add it over
1820 ) 3966 )
1821 3967
1822 } 3968 }
1823 if (TRUE) { 3969 }
1824 # show measured values in blue on left half-violin plot 3970 ```
1825 cat("\\leavevmode\n\\quad\n\n\\quad\n\n") 3971
1826 vioplot::vioplot( 3972 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
1827 x = lapply(blue_dots, function(x) x[!is.na(x)]), 3973 if (sum(is.na(quant_data)) > 0) {
1828 col = "lightblue1", 3974 # show measured values in blue on left half-violin plot
1829 side = "left", 3975 cat("\\leavevmode\n\\quad\n\n\\quad\n\n")
1830 plotCentre = "line", 3976 old_par <- par(
1831 ylim = ylim_save, 3977 mai = par("mai") + c(g_ppep_distrib_ctl$mai_bottom, 0, 0, 0),
1832 main = "Distributions of observed and imputed data", 3978 cex.axis = g_ppep_distrib_ctl$axis,
1833 sub = "Light blue = observed data; Pink = imputed data", 3979 cex.lab = 1.2
1834 xlab = "Sample", 3980 )
1835 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") 3981 tryCatch(
1836 ) 3982 {
1837 red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) 3983 vioplot::vioplot(
1838 cols_to_delete <- c() 3984 x = lapply(blue_dots, function(x) x[!is.na(x)]),
1839 for (ix in seq_len(length(red_violins))) { 3985 col = "lightblue1",
1840 if (length(red_violins[[ix]]) < 1) { 3986 side = "left",
1841 cols_to_delete <- c(cols_to_delete, ix) 3987 plotCentre = "line",
3988 ylim = ylim_save,
3989 main = "Distributions of observed and imputed data",
3990 sub = NULL,
3991 las = 2,
3992 xlab = NULL,
3993 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
3994 )
3995 title(
3996 sub = "Light blue = observed data; Pink = imputed data",
3997 cex.sub = 1.0,
3998 line = g_ppep_distrib_ctl$xlab_line + 1
3999 )
4000 title(
4001 xlab = "Sample",
4002 line = g_ppep_distrib_ctl$xlab_line
4003 )
4004 red_violins <- lapply(red_dots, function(x) x[!is.na(x)])
4005 cols_to_delete <- c()
4006 for (ix in seq_len(length(red_violins))) {
4007 if (length(red_violins[[ix]]) < 1) {
4008 cols_to_delete <- c(cols_to_delete, ix)
4009 }
1842 } 4010 }
1843 } 4011 # destroy any unimputable columns
1844 # destroy any unimputable columns 4012 if (!is.null(cols_to_delete)) {
1845 if (!is.null(cols_to_delete)) { 4013 red_violins <- red_violins[-cols_to_delete]
1846 red_violins <- red_violins[-cols_to_delete] 4014 }
1847 } 4015 # plot imputed values in red on right half-violin plot
1848 # plot imputed values in red on right half-violin plot 4016 vioplot::vioplot(
1849 vioplot::vioplot( 4017 x = red_violins,
1850 x = red_violins, 4018 col = "lightpink1",
1851 col = "lightpink1", 4019 side = "right",
1852 side = "right", 4020 plotCentre = "line",
1853 plotCentre = "line", 4021 add = TRUE
1854 add = TRUE 4022 )
1855 ) 4023
1856 } 4024 },
1857 4025 finally = par(old_par)
1858 par(old_par) 4026 )
1859 4027
1860 # density plot 4028 # density plot
1861 cat("\\leavevmode\n\n\n\n\n\n\n") 4029 cat("\\leavevmode\n\n\n\n\n\n\n")
1862 if (can_plot_before_after_imp) { 4030 if (can_plot_before_after_imp) {
1863 ylim <- c( 4031 ylim <- c(
1891 } 4059 }
1892 cat("\\leavevmode\\newpage\n") 4060 cat("\\leavevmode\\newpage\n")
1893 } 4061 }
1894 ``` 4062 ```
1895 4063
1896 # Perform Quantile Normalization 4064 # Quantile Normalization
1897 4065
1898 The excellent `normalize.quantiles` function from 4066 The excellent `normalize.quantiles` function from
1899 *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)* 4067 *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)*
1900 performs "quantile normalization" as described Bolstad *et al.* (2003), 4068 performs "quantile normalization" as described Bolstad *et al.* (2003),
1901 DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)* 4069 DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)*
1902 and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*, 4070 and its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html),
1903 i.e., it assumes that the goal is to detect 4071 i.e., it assumes that the goal is to detect
1904 subtle differences among grossly similar samples (having similar distributions) 4072 subtle differences among grossly similar samples (having similar distributions)
1905 by equailzing intra-quantile quantitations. 4073 by equalizing intra-quantile quantitations^[Unfortunately,
1906 Unfortunately, one software library upon which it depends 4074 one software library upon which `preprocessCore` depends
1907 *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)* 4075 *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)*
1908 that requires that a specific, non-concurrent version of the library be 4076 that requires that a specific, non-concurrent version of the library (`openblas` version $0.3.3$) be
1909 installed. The installation command equivalent to what was used to install the library to produce the results presented here is: 4077 installed. The installation command equivalent to what was used to install the library to produce the results presented here is:
1910 ``` 4078 \linebreak
1911 conda install bioconductor-preprocesscore openblas=0.3.3 4079 ` conda install bioconductor-preprocesscore openblas=0.3.3`].
1912 ```
1913 4080
1914 4081
1915 <!-- 4082 <!--
1916 # Apply quantile normalization using preprocessCore::normalize.quantiles 4083 # Apply quantile normalization using preprocessCore::normalize.quantiles
1917 # --- 4084 # ---
1918 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html 4085 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html
1919 # except this: https://support.bioconductor.org/p/122925/#9135989 4086 # except this: https://support.bioconductor.org/p/122925/#9135989
1920 # says to install it like this: 4087 # says to install it like this:
1921 # ``` 4088 # ```
1922 # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) 4089 # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1])
1923 # ``` 4090 # ```
1924 # conda installation (necessary because of a bug in recent openblas): 4091 # conda installation (necessary because of a bug in recent openblas):
1925 # conda install bioconductor-preprocesscore openblas=0.3.3 4092 # conda install bioconductor-preprocesscore openblas=0.3.3
1926 # ... 4093 # ...
1927 # --- 4094 # ---
1928 # normalize.quantiles {preprocessCore} -- Quantile Normalization 4095 # normalize.quantiles {preprocessCore} -- Quantile Normalization
1929 # 4096 #
1930 # Description: 4097 # Description:
1931 # Using a normalization based upon quantiles, this function normalizes a 4098 # Using a normalization based upon quantiles, this function normalizes a
1932 # matrix of probe level intensities. 4099 # matrix of probe level intensities.
1933 # 4100 #
1934 # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the 4101 # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the
1935 # assumption that the data is missing at random. 4102 # assumption that the data is missing at random.
1936 # 4103 #
1937 # Usage: 4104 # Usage:
1938 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) 4105 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE)
1939 # 4106 #
1940 # Arguments: 4107 # Arguments:
1941 # 4108 #
1942 # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. 4109 # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe.
1943 # 4110 #
1944 # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, 4111 # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy,
1945 # but in certain situations not making a copy of the matrix, but instead normalizing 4112 # but in certain situations not making a copy of the matrix, but instead normalizing
1946 # it in place will be more memory friendly. 4113 # it in place will be more memory friendly.
1947 # 4114 #
1948 # - keep.names: Boolean option to preserve matrix row and column names in output. 4115 # - keep.names: Boolean option to preserve matrix row and column names in output.
1949 # 4116 #
1950 # Details: 4117 # Details:
1951 # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. 4118 # This method is based upon the concept of a quantile-quantile plot extended to n dimensions.
1952 # No special allowances are made for outliers. If you make use of quantile normalization 4119 # No special allowances are made for outliers. If you make use of quantile normalization
1953 # please cite Bolstad et al, Bioinformatics (2003). 4120 # please cite Bolstad et al, Bioinformatics (2003).
1954 # 4121 #
1955 # This functions will handle missing data (ie NA values), based on 4122 # This functions will handle missing data (ie NA values), based on
1956 # the assumption that the data is missing at random. 4123 # the assumption that the data is missing at random.
1957 # 4124 #
1958 # Note that the current implementation optimizes for better memory usage 4125 # Note that the current implementation optimizes for better memory usage
1959 # at the cost of some additional run-time. 4126 # at the cost of some additional run-time.
1960 # 4127 #
1961 # Value: A normalized matrix. 4128 # Value: A normalized matrix.
1962 # 4129 #
1963 # Author: Ben Bolstad, bmbolstad.com 4130 # Author: Ben Bolstad, bmbolstad.com
1964 # 4131 #
1965 # References 4132 # References
1966 # 4133 #
1967 # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide 4134 # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide
1968 # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf 4135 # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf
1969 # 4136 #
1970 # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of 4137 # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of
1971 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias 4138 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias
1972 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 4139 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185
1973 # http://bmbolstad.com/misc/normalize/normalize.html 4140 # http://bmbolstad.com/misc/normalize/normalize.html
1974 # ... 4141 # ...
1975 --> 4142 -->
1976 ```{r echo = FALSE, results = 'asis'} 4143 ```{r echo = FALSE, results = 'asis'}
1977 4144
4145 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp)), "\n")
1978 if (nrow(quant_data_imp) > 0) { 4146 if (nrow(quant_data_imp) > 0) {
1979 quant_data_imp_qn <- preprocessCore::normalize.quantiles( 4147 quant_data_imp_qn <- preprocessCore::normalize.quantiles(
1980 as.matrix(quant_data_imp), keep.names = TRUE 4148 as.matrix(quant_data_imp), keep.names = TRUE
1981 ) 4149 )
1982 } else { 4150 } else {
1983 quant_data_imp_qn <- as.matrix(quant_data_imp) 4151 quant_data_imp_qn <- as.matrix(quant_data_imp)
1984 } 4152 }
1985 4153
4154 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn)), "\n")
4155
1986 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) 4156 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn)
1987
1988 write_debug_file(quant_data_imp_qn) 4157 write_debug_file(quant_data_imp_qn)
1989 4158
1990 quant_data_imp_qn_log <- log10(quant_data_imp_qn) 4159 quant_data_imp_qn_log <- log10(quant_data_imp_qn)
1991
1992 write_debug_file(quant_data_imp_qn_log) 4160 write_debug_file(quant_data_imp_qn_log)
1993 4161
4162 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n")
4163 if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n")
4164
1994 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) 4165 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn))))
1995 4166
1996 sel <- apply(quant_data_imp_qn_ls, 1, any_nan) 4167 sel <- row_apply(quant_data_imp_qn_ls, any_nan)
1997 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls 4168 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls
1998 4169
1999 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ] 4170 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ]
2000 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) 4171 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2)
2001 4172
2006 4177
2007 # Create data.frame used by ANOVA analysis 4178 # Create data.frame used by ANOVA analysis
2008 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) 4179 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log)
2009 ``` 4180 ```
2010 4181
2011 <!-- ACE insertion begin -->
2012 ## Are normalized, imputed, log-transformed sample distributions similar? 4182 ## Are normalized, imputed, log-transformed sample distributions similar?
2013 4183
2014 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} 4184 ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'}
2015 4185
2016 # Save unimputed quant_data_log for plotting below 4186 # Save unimputed quant_data_log for plotting below
2017 unimputed_quant_data_log <- quant_data_log 4187 unimputed_quant_data_log <- quant_data_log
2018 4188
2019 # log10 transform (after preparing for zero values, 4189 # log10 transform (after preparing for zero values,
2032 ) 4202 )
2033 cat("\n\n\n") 4203 cat("\n\n\n")
2034 4204
2035 4205
2036 # data visualization 4206 # data visualization
4207 if (TRUE) {
4208
4209 my_ppep_distrib_bxp(
4210 x = quant_data_log
4211 , sample_name_grow = sample_name_grow
4212 , main = "Peptide intensities for each sample"
4213 , varwidth = boxplot_varwidth
4214 , sub = NULL
4215 , xlab = "Sample"
4216 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
4217 , col = const_boxplot_fill
4218 , notch = boxplot_notch
4219 )
4220
4221 } else {
4222
2037 old_par <- par( 4223 old_par <- par(
2038 mai = par("mai") + c(0.5, 0, 0, 0) 4224 mai = par("mai") + c(0.5, 0, 0, 0)
2039 , oma = par("oma") + c(0.5, 0, 0, 0) 4225 , oma = par("oma") + c(0.5, 0, 0, 0)
2040 ) 4226 )
2041 # ref: https://r-charts.com/distribution/add-points-boxplot/ 4227 # ref: https://r-charts.com/distribution/add-points-boxplot/
2042 # Vertical plot 4228 # Vertical plot
2043 colnames(quant_data_log) <- sample_name_matches 4229 colnames(quant_data_log) <- sample_name_matches
2044 boxplot( 4230 boxplot(
2045 quant_data_log 4231 quant_data_log
2046 , las = 1 4232 , las = 2
4233 , cex.axis = 0.9 * sample_name_grow
2047 , col = const_boxplot_fill 4234 , col = const_boxplot_fill
2048 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") 4235 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
2049 , xlab = "Sample" 4236 , xlab = "Sample"
4237 , notch = boxplot_notch
4238 , varwidth = boxplot_varwidth
2050 ) 4239 )
2051 par(old_par) 4240 par(old_par)
4241 }
2052 } else { 4242 } else {
2053 cat("There are no peptides to plot\n") 4243 cat("There are no peptides to plot\n")
2054 } 4244 }
2055 4245
2056 cat("\n\n\n") 4246 cat("\n\n\n")
2072 cat("\\leavevmode\\newpage\n") 4262 cat("\\leavevmode\\newpage\n")
2073 ``` 4263 ```
2074 4264
2075 # ANOVA Analysis 4265 # ANOVA Analysis
2076 4266
2077 ```{r, echo = FALSE} 4267 ## Assignment of $p$-value and quality score
4268
4269 For each phosphopeptide, ANOVA analysis was performed to compute a $p$-value representing the evidence against the "null hypothesis" ($H_0$) that the intensity does not vary significantly among sample groups.
4270
4271 However, because as more and more phosphopeptides are tested, there is increasd probability that, by random chance, a given peptide will have a $p$-value that appears to indicate significance. For this reason, the $p$-values were adjusted by applying the False Discovery Rate (FDR) correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x).
4272
4273 Furthermore, to give more weight to phosphopeptides having fewer missing values, an (arbitrarily defined) quality score was assigned to each, defined as:
4274
4275 $$
4276 \textit{quality}_j
4277 = \frac{1 + o_{j}}{v_{j}(1 + o_{j}) + 0.005}
4278 $$
4279
4280 where:
4281
4282 - $o_j$ is the minimum number of non-missing observations per sample group for substrate $j$ for all sample groups, and
4283 - $v_j$ is the FDR-adjusted ANOVA $p$-value for substrate $j$.
4284
4285
4286 ```{r, echo = FALSE, results = 'asis'}
4287 cat("\\newpage\n")
4288
2078 # Make new data frame containing only Phosphopeptides 4289 # Make new data frame containing only Phosphopeptides
2079 # to connect preANOVA to ANOVA (connect_df) 4290 # to connect preANOVA to ANOVA (connect_df)
2080 connect_df <- data.frame( 4291 connect_df <- data.frame(
2081 data_table_imp_qn_lt$Phosphopeptide 4292 data_table_imp_qn_lt$Phosphopeptide
2082 , data_table_imp_qn_lt[, first_data_column] 4293 , data_table_imp_qn_lt[, first_data_column]
2083 ) 4294 )
2084 colnames(connect_df) <- c("Phosphopeptide", "Intensity") 4295 colnames(connect_df) <- c("Phosphopeptide", "Intensity")
2085 ``` 4296 ```
2086 4297
2087 ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} 4298 ```{r anova, echo = FALSE, fig.dim = c(10, 12), results = 'asis'}
2088 count_of_treatment_levels <- length(levels(sample_treatment_levels)) 4299 g_can_run_ksea <- FALSE
4300 old_oma <- par("oma")
2089 if (count_of_treatment_levels < 2) { 4301 if (count_of_treatment_levels < 2) {
2090 nuke_control_sequences <-
2091 function(s) {
2092 s <- gsub("[\\]", "xyzzy_plugh", s)
2093 s <- gsub("[$]", "\\\\$", s)
2094 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s)
2095 return(s)
2096 }
2097 cat( 4302 cat(
2098 "ERROR!!!! Cannot perform ANOVA analysis", 4303 "ERROR!!!! Cannot perform ANOVA analysis",
2099 "(see next page)\\newpage\n" 4304 "(see next page)\\newpage\n"
2100 ) 4305 )
2101 cat( 4306 cat(
2107 cat("Unparsed sample names are:\n\n\n", 4312 cat("Unparsed sample names are:\n\n\n",
2108 "\\begin{quote}\n", 4313 "\\begin{quote}\n",
2109 paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"), 4314 paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"),
2110 "\n\\end{quote}\n\n") 4315 "\n\\end{quote}\n\n")
2111 4316
2112 regex_sample_names <- nuke_control_sequences(regex_sample_names) 4317 regex_sample_names <- latex_printable_control_seqs(regex_sample_names)
2113 4318
2114 cat("\\leavevmode\n\n\n") 4319 cat("\\leavevmode\n\n\n")
2115 cat("Parsing rule for SampleNames is", 4320 cat("Parsing rule for SampleNames is",
2116 "\n\n\n", 4321 "\n\n\n",
2117 "\\text{'", 4322 "\\text{'",
2123 cat("\nParsed sample names are:\n", 4328 cat("\nParsed sample names are:\n",
2124 "\\begin{quote}\n", 4329 "\\begin{quote}\n",
2125 paste(sample_name_matches, collapse = "\n\n\n"), 4330 paste(sample_name_matches, collapse = "\n\n\n"),
2126 "\n\\end{quote}\n\n") 4331 "\n\\end{quote}\n\n")
2127 4332
2128 regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping) 4333 regex_sample_grouping <- latex_printable_control_seqs(regex_sample_grouping)
2129 4334
2130 cat("\\leavevmode\n\n\n") 4335 cat("\\leavevmode\n\n\n")
2131 cat("Parsing rule for SampleGrouping is", 4336 cat("Parsing rule for SampleGrouping is",
2132 "\n\n\n", 4337 "\n\n\n",
2133 "\\text{'", 4338 "\\text{'",
2142 paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"), 4347 paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"),
2143 "\n\\end{quote}\n\n") 4348 "\n\\end{quote}\n\n")
2144 4349
2145 } else { 4350 } else {
2146 4351
2147 p_value_data_anova_ps <- 4352 if (print_nb_messages) nbe("computing p_value_data_anova_ps\n")
2148 apply( 4353 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n")
2149 quant_data_imp_qn_log, 4354 if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n")
2150 1, 4355 if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log[, ".NE.7C"]), "\n")
2151 anova_func, 4356 if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log), "\n")
2152 grouping_factor = sample_treatment_levels, 4357 if (print_nb_messages) nbe(see_variable(anova_func), "\n")
2153 one_way_f = one_way_all_categories 4358 if (print_nb_messages) nbe(see_variable(smpl_trt), "\n")
2154 ) 4359 if (print_nb_messages) nbe(see_variable(one_way_all_categories), "\n")
4360 tryCatch(
4361 {
4362 p_value_data_anova_ps <-
4363 row_apply(
4364 quant_data_imp_qn_log,
4365 anova_func,
4366 grouping_factor = smpl_trt,
4367 one_way_f = one_way_all_categories
4368 )
4369 },
4370 error = function(e) {
4371 mesg <- paste("Could not compute ANOVA because", e$message)
4372 cat("\n\n", mesg, "\n\n")
4373 param_df_noexit(e)
4374 sink(stderr())
4375 cat("\n\n", mesg, "\n\n")
4376 values <- paste(param_df$parameter, "=", param_df$value, collapse = "\n")
4377 cat(values)
4378 sink()
4379 knitr::knit_exit()
4380 exit(code = 1)
4381 }
4382 )
4383 if (print_nb_messages) nbe(see_variable(p_value_data_anova_ps), "\n")
2155 4384
2156 p_value_data_anova_ps_fdr <- 4385 p_value_data_anova_ps_fdr <-
2157 p.adjust(p_value_data_anova_ps, method = "fdr") 4386 p.adjust(p_value_data_anova_ps, method = "fdr")
4387 my_ppep_v <- full_data[, 1]
4388 p_value_data <- list(
4389 phosphopeptide = my_ppep_v,
4390 raw_anova_p = p_value_data_anova_ps,
4391 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr,
4392 missing_values = rowSums(is.na(quant_data)),
4393 min_group_obs_count = min_group_obs_count
4394 )
2158 p_value_data <- data.frame( 4395 p_value_data <- data.frame(
2159 phosphopeptide = full_data[, 1] 4396 phosphopeptide = my_ppep_v,
2160 , 4397 raw_anova_p = p_value_data_anova_ps,
2161 raw_anova_p = p_value_data_anova_ps 4398 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr,
2162 , 4399 missing_values = rowSums(is.na(quant_data)),
2163 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr 4400 min_group_obs_count = min_group_obs_count
2164 ) 4401 )
4402 p_value_data$quality <- 1.0 / with(
4403 p_value_data,
4404 fdr_adjusted_anova_p + 0.005 / (1 + min_group_obs_count)
4405 )
4406
4407 p_value_data$ranking <-
4408 with(
4409 p_value_data,
4410 switch(
4411 g_intensity_hm_criteria,
4412 "quality" = order(-quality),
4413 "na_count" = order(missing_values, fdr_adjusted_anova_p),
4414 # the default is "p_value"
4415 order(fdr_adjusted_anova_p)
4416 )
4417 )
4418 p_value_data <- p_value_data[p_value_data$ranking, , drop = FALSE]
4419
4420 write.table(
4421 p_value_data,
4422 file = "p_value_data.txt",
4423 sep = "\t",
4424 col.names = TRUE,
4425 row.names = FALSE,
4426 quote = FALSE
4427 )
4428
2165 4429
2166 # output ANOVA file to constructed filename, 4430 # output ANOVA file to constructed filename,
2167 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" 4431 # e.g. "Outputfile_pST_ANOVA_STEP5.txt"
2168 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" 4432 # becomes "Outputfile_pST_ANOVA_STEP5_FDR0.05.txt"
2169 4433
2170 # Re-output datasets to include p-values 4434 # Re-output datasets to include p-values
2171 metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3]) 4435 metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:ncol(p_value_data)])
2172 write.table( 4436 write.table(
2173 cbind(metadata_plus_p, quant_data_imp), 4437 cbind(metadata_plus_p, quant_data_imp),
2174 file = imputed_data_filename, 4438 file = imputed_data_filename,
2175 sep = "\t", 4439 sep = "\t",
2176 col.names = TRUE, 4440 col.names = TRUE,
2186 row.names = FALSE, 4450 row.names = FALSE,
2187 quote = FALSE 4451 quote = FALSE
2188 ) 4452 )
2189 4453
2190 4454
2191 p_value_data <-
2192 p_value_data[order(p_value_data$fdr_adjusted_anova_p), ]
2193
2194 first_page_suppress <- 1 4455 first_page_suppress <- 1
2195 number_of_peptides_found <- 0 4456 number_of_peptides_found <- 0
2196 cutoff <- val_fdr[1] 4457 cutoff <- val_fdr[1]
2197 for (cutoff in val_fdr) { 4458 for (cutoff in val_fdr) {
2198 if (number_of_peptides_found > 49) { 4459 #loop through FDR cutoffs
4460 if (number_of_peptides_found > g_intensity_hm_rows - 1) {
2199 cat("\\leavevmode\n\n\n") 4461 cat("\\leavevmode\n\n\n")
2200 break 4462 break
2201 } 4463 }
2202 4464
2203 #loop through FDR cutoffs 4465 bool_1 <- (p_value_data$fdr_adjusted_anova_p < cutoff)
4466 bool_2 <- (p_value_data$min_group_obs_count >= g_intensity_min_per_class)
4467 g_can_run_ksea <- g_can_run_ksea || (sum(bool_2) > 0)
4468 bool_4 <- (p_value_data$quality >= params$minQuality)
4469 bool_3 <- as.logical(
4470 as.integer(bool_1) *
4471 as.integer(bool_2) *
4472 as.integer(bool_4)
4473 )
4474 if (print_trace_messages) {
4475 if (length(bool_1) > 30) {
4476 cat_variable(bool_1, force_str = TRUE)
4477 cat_variable(bool_2, force_str = TRUE)
4478 cat_variable(bool_3, force_str = TRUE)
4479 } else {
4480 cat_variable(bool_1, suffix = "\n\n")
4481 cat_variable(bool_2, suffix = "\n\n")
4482 cat_variable(bool_3, suffix = "\n\n")
4483 }
4484 cat_variable(length(bool_3), digits = 0, suffix = "; ")
4485 cat_variable(sum(bool_3), digits = 0, suffix = "\n\n")
4486 }
2204 4487
2205 filtered_p <- 4488 filtered_p <-
2206 p_value_data[ 4489 p_value_data[bool_3, , drop = FALSE]
2207 which(p_value_data$fdr_adjusted_anova_p < cutoff), 4490 filtered_p <-
2208 , drop = FALSE 4491 filtered_p[!is.na(filtered_p$phosphopeptide), , drop = FALSE]
2209 ] 4492
4493 if (print_trace_messages)
4494 cat_variable(filtered_p, force_str = TRUE)
4495
4496 have_remaining_peptides <- sum(bool_3, na.rm = TRUE) > 0
4497 filter_result_string <-
4498 sprintf(
4499 "%s, %s of %0.0f peptides remained having both %s and %s.\n\n",
4500 "After filtering for ANOVA results",
4501 if (have_remaining_peptides)
4502 as.character(sum(bool_3, na.rm = TRUE))
4503 else
4504 "none",
4505 length(bool_3),
4506 sprintf("adjusted p-value < %s", as.character(signif(cutoff, 2))),
4507 sprintf(
4508 "more than %0.0f observations in some groups",
4509 max(0, g_intensity_min_per_class - 1)
4510 )
4511 )
4512
2210 filtered_data_filtered <- 4513 filtered_data_filtered <-
2211 quant_data_imp_qn_log[ 4514 quant_data_imp_qn_log[
2212 rownames(filtered_p), 4515 rownames(filtered_p),
2213 , drop = FALSE 4516 , drop = FALSE
2214 ] 4517 ]
4518 # order by p-value
2215 filtered_data_filtered <- 4519 filtered_data_filtered <-
2216 filtered_data_filtered[ 4520 filtered_data_filtered[
2217 order(filtered_p$fdr_adjusted_anova_p), 4521 order(filtered_p$fdr_adjusted_anova_p),
2218 , drop = FALSE 4522 , drop = FALSE
2219 ] 4523 ]
2220 4524
2221 # <!-- ACE insertion start --> 4525 if (have_remaining_peptides && nrow(filtered_p) > 0 && nrow(filtered_data_filtered) > 0) {
2222
2223 if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) {
2224 if (first_page_suppress == 1) { 4526 if (first_page_suppress == 1) {
2225 first_page_suppress <- 0 4527 first_page_suppress <- 0
2226 } else { 4528 } else {
2227 cat("\\newpage\n") 4529 cat("\\newpage\n")
2228 } 4530 }
2229 if (nrow(filtered_data_filtered) > 1) { 4531 latex_samepage({
2230 subsection_header(sprintf( 4532 cat(filter_result_string)
2231 "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n", 4533 if (nrow(filtered_data_filtered) > 1) {
2232 nrow(filtered_data_filtered), 4534 cat(subsection_header(sprintf(
2233 cutoff 4535 "Intensity distributions for %d phosphopeptides\n",
2234 )) 4536 nrow(filtered_data_filtered)
2235 } else { 4537 )))
2236 subsection_header(sprintf( 4538 } else {
2237 "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n", 4539 cat(subsection_header(sprintf(
2238 rownames(filtered_data_filtered)[1], 4540 "Intensity distribution for one phosphopeptide (%s)\n",
2239 cutoff 4541 rownames(filtered_data_filtered)[1]
2240 )) 4542 )))
2241 } 4543 }
2242 cat("\n\n\n") 4544 }) # end latex_samepage
2243 cat("\n\n\n") 4545
2244
2245 old_oma <- par("oma")
2246 old_par <- par( 4546 old_par <- par(
2247 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), 4547 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1),
2248 oma = old_oma * c(1, 1, 0.3, 1), 4548 oma = old_oma * c(1, 1, 0.3, 1),
2249 cex.main = 0.9, 4549 cex.main = 0.9,
2250 cex.axis = 0.7, 4550 cex.axis = 0.7,
2251 fin = c(9, 7.25) 4551 fin = c(9, 7.25)
2252 ) 4552 )
2253 # ref: https://r-charts.com/distribution/add-points-boxplot/
2254 # Vertical plot 4553 # Vertical plot
2255 colnames(filtered_data_filtered) <- sample_name_matches 4554 colnames(filtered_data_filtered) <- sample_name_matches
2256 tryCatch( 4555 tryCatch(
2257 boxplot( 4556 boxplot(
2258 filtered_data_filtered, 4557 filtered_data_filtered,
2259 main = "Imputed, normalized intensities", # no line plot 4558 main = "Imputed, normalized intensities", # no line plot
2260 las = 1, 4559 las = 2,
4560 cex.axis = 0.9 * sample_name_grow,
2261 col = const_boxplot_fill, 4561 col = const_boxplot_fill,
2262 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") 4562 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
4563 notch = FALSE,
4564 varwidth = boxplot_varwidth
2263 ), 4565 ),
2264 error = function(e) print(e) 4566 error = function(e) {
4567 print(e)
4568 cat_margins()
4569 }
4570
2265 ) 4571 )
2266 par(old_par) 4572 par(old_par)
2267 } else { 4573 } else {
2268 cat(sprintf( 4574 cat(sprintf(
2269 "%s < %0.2f\n\n\n\n\n", 4575 "%s < %0.2f\n\n\n\n\n",
2270 "No peptides were found to have cutoff adjusted p-value", 4576 "No peptides were found to have cutoff adjusted p-value",
2271 cutoff 4577 cutoff
2272 )) 4578 ))
2273 } 4579 }
2274 4580
2275 if (nrow(filtered_data_filtered) > 0) { 4581 if (have_remaining_peptides && nrow(filtered_data_filtered) > 0) {
2276 # Add Phosphopeptide column to anova_filtered table 4582 # Add Phosphopeptide column to anova_filtered table
2277 # The assumption here is that the first intensity is unique; 4583 # The assumption here is that the first intensity is unique;
2278 # this is a hokey assumption but almost definitely will 4584 # this is a hokey assumption but almost definitely will
2279 # be true in the real world unless there is a computation 4585 # be true in the real world unless there is a computation
2280 # error upstream. 4586 # error upstream.
2303 by.y = "Phosphopeptide" 4609 by.y = "Phosphopeptide"
2304 ) 4610 )
2305 4611
2306 # Produce heatmap to visualize significance and the effect of imputation 4612 # Produce heatmap to visualize significance and the effect of imputation
2307 4613
2308 anova_filtered_merge_format <- sapply(
2309 X = filtered_p$fdr_adjusted_anova_p
2310 ,
2311 FUN = function(x) {
2312 if (x > 0.0001)
2313 paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s")
2314 else
2315 paste0("(%0.4e) %s")
2316 }
2317 )
2318
2319 cat_hm_heading <- function(m, cutoff) { 4614 cat_hm_heading <- function(m, cutoff) {
2320 cat("\\newpage\n") 4615 if (nrow(m) > g_intensity_hm_rows) {
2321 if (nrow(m) > intensity_hm_rows) { 4616 cat("\\clearpage\n")
2322 subsection_header( 4617 cat(subsection_header(
2323 paste( 4618 paste(
2324 sprintf("Heatmap for the %d most-significant peptides", 4619 sprintf("Heatmap for the %d most-significant peptides",
2325 intensity_hm_rows), 4620 g_intensity_hm_rows),
2326 sprintf("whose adjusted p-value < %0.2f\n", cutoff) 4621 sprintf("whose adjusted p-value < %0.2f\n", cutoff)
2327 ) 4622 )
2328 ) 4623 ))
2329 } else { 4624 } else {
2330 if (nrow(m) == 1) { 4625 if (nrow(m) == 0) {
2331 return(FALSE) 4626 return(FALSE)
2332 } else { 4627 } else {
2333 subsection_header( 4628 cat(subsection_header(
2334 paste( 4629 paste(
2335 sprintf("Heatmap for %d usable peptides whose", nrow(m)), 4630 sprintf("Heatmap for %d usable peptide genes whose", nrow(m)),
2336 sprintf("adjusted p-value < %0.2f\n", cutoff) 4631 sprintf("adjusted p-value < %0.2f\n", cutoff)
2337 ) 4632 )
2338 ) 4633 ))
2339 } 4634 }
2340 } 4635 }
2341 cat("\n\n\n") 4636 cat("\n\n\n")
2342 cat("\n\n\n") 4637 cat("\n\n\n")
2343 return(TRUE) 4638 return(TRUE)
2344 } 4639 }
2345 4640
2346 # construct matrix with appropriate rownames 4641 # construct matrix with appropriate rownames
2347 m <- 4642 m <-
2348 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) 4643 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ])
2349 if (nrow(m) > 0) { 4644 nrow_m <- nrow(m)
4645 ncol_m <- ncol(m)
4646 if (nrow_m > 0) {
2350 rownames_m <- rownames(m) 4647 rownames_m <- rownames(m)
2351 rownames(m) <- sapply( 4648 q <- data.frame(pepname = rownames_m)
2352 X = seq_len(nrow(m)) 4649 g <- sqldf("
2353 , 4650 SELECT q.pepname, substr(met.Gene_Name, 1, 30) AS gene_name
4651 FROM q, metadata_plus_p AS met
4652 WHERE q.pepname = met.Phosphopeptide
4653 ORDER BY q.rowid
4654 ")
4655 tmp <- sapply(
4656 X = seq_len(nrow(g)),
4657 FUN = function(i) {
4658 pre <- strsplit(g$gene_name[i], "; ")[[1]]
4659 rslt <- paste(unique(pre), sep = "; ")
4660 return(rslt)
4661 }
4662 )
4663 tmp <- unlist(tmp)
4664 tmp <-
4665 make.names(tmp, unique = TRUE)
4666 tmp <- sub(
4667 "No_Gene_Name",
4668 "not_found",
4669 tmp,
4670 fixed = TRUE
4671 )
4672 ten_trunc_names <- trunc_ppep(rownames_m)
4673 tmp <- sapply(
4674 X = seq_len(nrow_m),
2354 FUN = function(i) { 4675 FUN = function(i) {
2355 sprintf( 4676 sprintf(
2356 anova_filtered_merge_format[i], 4677 "(%s) %s",
2357 filtered_p$fdr_adjusted_anova_p[i], 4678 tmp[i],
2358 rownames_m[i] 4679 ten_trunc_names[i]
2359 ) 4680 )
2360 } 4681 }
2361 ) 4682 )
4683 rownames(m) <- tmp
2362 } 4684 }
2363 # draw the heading and heatmap 4685 # draw the heading and heatmap
2364 if (nrow(m) > 0) { 4686 if (nrow_m > 0) {
2365 number_of_peptides_found <- 4687 number_of_peptides_found <-
2366 draw_intensity_heatmap( 4688 ppep_heatmap(
2367 m = m, 4689 m = m,
2368 cutoff = cutoff, 4690 cutoff = cutoff,
2369 hm_heading_function = cat_hm_heading, 4691 hm_heading_function = cat_hm_heading,
2370 hm_main_title = "Unimputed, unnormalized log(intensities)", 4692 hm_main_title =
2371 suppress_row_dendrogram = FALSE 4693 "log(intensities), row-scaled, unimputed, unnormalized",
4694 suppress_row_dendrogram = FALSE,
4695 master_cex = 0.35,
4696 sepcolor = "black",
4697 colsep = sample_colsep
2372 ) 4698 )
4699 if (number_of_peptides_found > 1) {
4700 cat("\\leavevmode\n")
4701 }
2373 } 4702 }
2374 } 4703 }
2375 } 4704 }
2376 } 4705 }
2377 cat("\\leavevmode\n\n\n") 4706 cat(filter_result_string)
4707 cat("\\leavevmode\n")
4708
4709 if (!g_can_run_ksea) {
4710 errmsg <- paste("Cannot proceed with KSEA analysis",
4711 "because too many values are missing.")
4712 if (FALSE) cat0(
4713 errmsg,
4714 "\\stepcounter{offset}\n",
4715 "\\stepcounter{offset}\n",
4716 "\\stepcounter{offset}\n",
4717 " in ",
4718 table_href(),
4719 ".\n\n"
4720 )
4721 if (FALSE) {
4722 if (print_nb_messages) nbe(see_variable(p_value_data))
4723 } else {
4724 if (print_nb_messages) nbe(see_variable(p_value_data))
4725
4726 display_p_value_data <- p_value_data
4727 display_p_value_data$raw_anova_p <-
4728 sprintf("%0.3g", display_p_value_data$raw_anova_p)
4729 display_p_value_data$fdr_adjusted_anova_p <-
4730 sprintf("%0.3g", display_p_value_data$fdr_adjusted_anova_p)
4731 display_p_value_data$quality <-
4732 sprintf("%0.3g", display_p_value_data$quality)
4733
4734 headers_1st_line <-
4735 c("", "Raw ANOVA", "FDR-adj.", "Missing", "Min. #", "", "")
4736 headers_2nd_line <-
4737 c("Phosphopeptide", "p-value", "p-value", "values", "group-obs", "Quality", "Ranking")
4738 data_frame_tabbing_latex(
4739 x = display_p_value_data,
4740 tabstops = c(2.75, 0.80, 0.80, 0.5, 0.6, 0.60),
4741 use_subsubsection_header = FALSE,
4742 headings = c(headers_1st_line, headers_2nd_line),
4743 caption = "ANOVA results"
4744
4745 )
4746 }
4747 data_frame_tabbing_latex(
4748 x = save_sample_treatment_df,
4749 tabstops = c(1.25, 1.25),
4750 caption = "Sample classes",
4751 use_subsubsection_header = FALSE
4752 )
4753 param_df_exit()
4754 knitr::knit_exit()
4755 return(invisible(-1))
4756 }
4757
2378 ``` 4758 ```
2379 4759
2380 ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} 4760 ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
2381 4761
2382 if (count_of_treatment_levels > 1) { 4762 if (g_can_run_ksea && count_of_treatment_levels > 1) {
2383 # Prepare two-way contrasts with adjusted p-values 4763 # Prepare two-way contrasts with adjusted p-values
2384 # Strategy: 4764 # Strategy:
2385 # - use imputed, log-transformed data: 4765 # - use imputed, log-transformed data:
2386 # - remember this when computing log2(fold-change) 4766 # - remember this when computing log2(fold-change)
2387 # - each contrast is between a combination of trt levels 4767 # - each contrast is between a combination of trt levels
2393 # - adjust p-value, assuming that 4773 # - adjust p-value, assuming that
2394 # (# of pppeps)*(# of contrasts) tests were performed 4774 # (# of pppeps)*(# of contrasts) tests were performed
2395 4775
2396 # Each contrast is between a combination of trt levels 4776 # Each contrast is between a combination of trt levels
2397 m2 <- combn( 4777 m2 <- combn(
2398 x = seq_len(length(levels(sample_treatment_levels))), 4778 x = seq_len(length(levels(smpl_trt))),
2399 m = 2, 4779 m = 2,
2400 simplify = TRUE 4780 simplify = TRUE
2401 ) 4781 )
2402 contrast_count <- ncol(m2) 4782 contrast_count <- ncol(m2)
2403 4783
2407 f_m2 <- 4787 f_m2 <-
2408 function(cntrst, lvl1, lvl2) { 4788 function(cntrst, lvl1, lvl2) {
2409 return( 4789 return(
2410 data.frame( 4790 data.frame(
2411 contrast = cntrst, 4791 contrast = cntrst,
2412 level = sample_treatment_levels[ 4792 level = smpl_trt[
2413 sample_treatment_levels %in% 4793 smpl_trt %in%
2414 levels(sample_treatment_levels)[c(lvl1, lvl2)] 4794 levels(smpl_trt)[c(lvl1, lvl2)]
2415 ], 4795 ],
2416 label = sample_name_matches[ 4796 label = sample_name_matches[
2417 sample_treatment_levels %in% 4797 smpl_trt %in%
2418 levels(sample_treatment_levels)[c(lvl1, lvl2)] 4798 levels(smpl_trt)[c(lvl1, lvl2)]
2419 ] 4799 ]
2420 ) 4800 )
2421 ) 4801 )
2422 } 4802 }
2423 # - compute a df for each contrast 4803 # - compute a df for each contrast
2593 ; 4973 ;
2594 " 4974 "
2595 ) 4975 )
2596 4976
2597 # - create contrast-metadata table 4977 # - create contrast-metadata table
4978 if (print_nb_messages) nbe("CREATE TABLE contrast_lvl_lvl_metadata")
2598 dml_no_rows_exec(db, " 4979 dml_no_rows_exec(db, "
2599 CREATE TABLE contrast_lvl_lvl_metadata 4980 CREATE TABLE contrast_lvl_lvl_metadata
2600 AS 4981 AS
2601 SELECT DISTINCT 4982 SELECT DISTINCT
2602 a.contrast AS ab_contrast, 4983 a.contrast AS ab_contrast,
2703 rownames(grouping_factor) <- grouping_factor$sample 5084 rownames(grouping_factor) <- grouping_factor$sample
2704 grouping_factor <- grouping_factor[, "level", drop = FALSE] 5085 grouping_factor <- grouping_factor[, "level", drop = FALSE]
2705 5086
2706 # - run the two-level (one-way) test 5087 # - run the two-level (one-way) test
2707 p_value_data_contrast_ps <- 5088 p_value_data_contrast_ps <-
2708 apply( 5089 row_apply(
2709 X = contrast_cast_data, 5090 x = contrast_cast_data,
2710 MARGIN = 1, # apply to rows 5091 fun = anova_func,
2711 FUN = anova_func,
2712 grouping_factor = 5092 grouping_factor =
2713 as.factor(as.numeric(grouping_factor$level)), # anova_func arg2 5093 as.factor(grouping_factor$level), # anova_func arg2
2714 one_way_f = one_way_two_categories, # anova_func arg3 5094 one_way_f = one_way_two_categories, # anova_func arg3
2715 simplify = TRUE # TRUE is the default for simplify 5095 simplify = TRUE # TRUE is the default for simplify
2716 ) 5096 )
2717 contrast_data_adj_p_values <- p.adjust( 5097 contrast_data_adj_p_values <- p.adjust(
2718 p = p_value_data_contrast_ps, 5098 p = p_value_data_contrast_ps,
2920 AND NOT m.`Gene` = 'No_Gene_Name' 5300 AND NOT m.`Gene` = 'No_Gene_Name'
2921 AND NOT v.log2_fc = 0 5301 AND NOT v.log2_fc = 0
2922 ; 5302 ;
2923 " 5303 "
2924 ) 5304 )
5305 # We are done with DDL and insertion
5306 RSQLite::dbDisconnect(db)
2925 } 5307 }
2926 ``` 5308 ```
2927 5309
2928 ```{r echo = FALSE, results = 'asis'} 5310 ```{r echo = FALSE, results = 'asis'}
2929 cat("\\newpage\n") 5311 cat("\\newpage\n")
2930 ``` 5312 ```
2931 5313
2932 # KSEA Analysis 5314 # KSEA Analysis Summaries
2933 5315
2934 Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package: 5316 Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package:
2935 5317
2936 - The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp 5318 - The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp
2937 - The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415). 5319 - The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415).
2938 - An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/). 5320 - An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/).
2939 5321
2940 For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as: 5322 For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as:
2941 5323
2942 $$ 5324 $$
2943 \text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j} 5325 \text{kinase enrichment }z\text{-score}_{j,i} = \frac{(\overline{`r sfc`}_{j,i} - \overline{`r pfc`}_j)\sqrt{m_{j,i}}}{\delta_j}
2944 $$ 5326 $$
2945 5327
2946 and fold-enrichment is computed as: 5328 and fold-enrichment is computed as:
2947 5329
2948 $$ 5330 $$
2949 \text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j} 5331 \text{Enrichment}_{j,i} = \frac{\overline{`r sfc`}_{j,i}}{\overline{`r pfc`}_j}
2950 $$ 5332 $$
2951 5333
2952 where: 5334 where:
2953 5335
2954 - $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change|})$ in intensities (for contrast $j$) of known substrates of the kinase $i$, 5336 - $\overline{`r sfc`}_{j,i}$ is the mean `r pfc_txt` in intensities of known substrates of the kinase $i$ in contrast $j$,
2955 - $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, and 5337 - $\overline{`r pfc`}_j$ is the mean `r pfc_txt` of all phosphosites identified in contrast $j$, and
2956 - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$, 5338 - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$,
2957 - $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset. 5339 - $\delta_j$ is the standard deviation of the $\log_2 (\text{fold-change})$ for contrast $j$ across all phosphosites in the dataset.
2958 - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score. 5340 - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score.
2959 5341
2960 $\text{FDR}_{j,i}$ is computed from the $p$-value for the z-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x) 5342 $\text{FDR}_{j,i}$ is the False Discovery Rate corrected kinase enrichment score.
2961 5343
2962 Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive). 5344 Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive).
2963 5345
2964 Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`. 5346 Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`.
2965 5347
2966 - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis. 5348 - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis.
2967 - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)). 5349 - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)).
2968 - Kinase-strate deata were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads). 5350 - Kinase-substrate data were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads).
2969 5351
2970 ```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} 5352 For each enriched kinase, a heatmap showing the intensities is presented for up to `r g_intensity_hm_rows` substrates, i.e., those substrates having the highest"quality".
5353
5354 Where possible, a heatmap of the correlations among these the selected substrates is also presented; if correlations cannot be computed (because of too many missing values), then the covariances are heatmapped for substrates having a variance greater than 1.
5355
5356 ```{r ksea, echo = FALSE, fig.dim = c(12, 14.5), results = 'asis'}
5357 cat("\\clearpage\n")
2971 5358
2972 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) 5359 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db)
2973 5360
2974 # -- eliminate the table that's about to be defined 5361 # -- eliminate the table that's about to be defined
2975 ddl_exec(db, " 5362 ddl_exec(db, "
3044 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] 5431 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
3045 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] 5432 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
3046 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) 5433 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
3047 contrast_longlabel <- ( 5434 contrast_longlabel <- (
3048 sprintf( 5435 sprintf(
3049 "Trt %s {%s} -> Trt %s {%s}", 5436 "Class %s -> Class %s",
3050 contrast_metadata_df[i_cntrst, "b_level"], 5437 contrast_metadata_df[i_cntrst, "b_level"],
3051 gsub( 5438 contrast_metadata_df[i_cntrst, "a_level"]
3052 pattern = ";",
3053 replacement = ", ",
3054 x = contrast_metadata_df[i_cntrst, "b_samples"],
3055 fixed = TRUE
3056 ),
3057 contrast_metadata_df[i_cntrst, "a_level"],
3058 gsub(
3059 pattern = ";",
3060 replacement = ", ",
3061 x = contrast_metadata_df[i_cntrst, "a_samples"],
3062 fixed = TRUE
3063 )
3064 ) 5439 )
3065 ) 5440 )
3066 kseaapp_input <- 5441 kseaapp_input <-
3067 sqldf::sqldf( 5442 sqldf::sqldf(
3068 x = sprintf(" 5443 x = sprintf("
3089 sub_title <- contrast_longlabel 5464 sub_title <- contrast_longlabel
3090 tryCatch( 5465 tryCatch(
3091 expr = { 5466 expr = {
3092 ksea_scores_rslt <- 5467 ksea_scores_rslt <-
3093 ksea_scores( 5468 ksea_scores(
3094 ksdata = pseudo_ksdata, # KSEAapp::KSData, 5469 ksdata = pseudo_ksdata,
3095 px = kseaapp_input, 5470 px = kseaapp_input,
3096 networkin = TRUE, 5471 networkin = TRUE,
3097 networkin_cutoff = 2 5472 networkin_cutoff = 2,
5473 minimum_substrate_count = ksea_min_substrate_count
3098 ) 5474 )
5475
5476 if (FALSE) {
5477 ksea_scores_rslt <-
5478 ksea_scores_rslt[
5479 ksea_scores_rslt$m >= ksea_min_substrate_count,
5480 ,
5481 drop = FALSE
5482 ]
5483 }
5484
5485 if (FALSE) {
5486 data_frame_tabbing_latex(
5487 x = ksea_scores_rslt,
5488 tabstops = c(0.8, 0.8, 0.8, 0.8, 0.8, 0.8),
5489 caption = paste("KSEA scores for contrast ",
5490 cntrst_b_level, "to", cntrst_a_level),
5491 use_subsubsection_header = FALSE
5492 )
5493 }
5494
5495 if (FALSE) {
5496 if (print_nb_messages) nbe("Output contents of `ksea_scores_rslt` table\n")
5497 cat_variable(ksea_scores_rslt)
5498 cat("\n\\clearpage\n")
5499 }
3099 5500
3100 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { 5501 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
3101 next_index <- 1 + next_index 5502 next_index <- 1 + next_index
3102 rslt$score_list[[next_index]] <- ksea_scores_rslt 5503 rslt$score_list[[next_index]] <- ksea_scores_rslt
3103 rslt$name_list[[next_index]] <- contrast_label 5504 rslt$name_list[[next_index]] <- contrast_label
3104 rslt$longname_list[[next_index]] <- contrast_longlabel 5505 rslt$longname_list[[next_index]] <- contrast_longlabel
3105 low_fdr_print( 5506 ksea_low_fdr_print(
3106 rslt = rslt, 5507 rslt = rslt,
3107 i_cntrst = i_cntrst, 5508 i_cntrst = i_cntrst,
3108 i = next_index, 5509 i = next_index,
3109 a_level = cntrst_a_level, 5510 a_level = cntrst_a_level,
3110 b_level = cntrst_b_level, 5511 b_level = cntrst_b_level,
3111 fold_change = cntrst_fold_change, 5512 fold_change = cntrst_fold_change,
3112 caption = contrast_longlabel 5513 caption = contrast_longlabel
3113 ) 5514 )
3114 } 5515 }
3115 }, 5516 },
3116 error = function(e) str(e) 5517 error = function(e) {
5518 str(e)
5519 cat_margins()
5520 }
3117 ) 5521 )
3118 } 5522 }
3119 5523
3120 plotted_kinases <- NULL 5524 plotted_kinases <- NULL
3121 if (length(rslt$score_list) > 1) { 5525 if (g_can_run_ksea && length(rslt$score_list) > 1) {
3122 for (i in seq_len(length(ksea_heatmap_titles))) { 5526 for (i in seq_len(length(ksea_heatmap_titles))) {
3123 hdr <- ksea_heatmap_titles[[i]] 5527 hdr <- ksea_heatmap_titles[[i]]
3124 which_kinases <- i 5528 which_kinases <- i
3125 5529
3126 cat("\\clearpage\n\\begin{center}\n") 5530 cat("\\clearpage\n\\begin{center}\n")
3127 if (i == const_ksea_astrsk_kinases) { 5531 if (i == const_ksea_astrsk_kinases) {
3128 subsection_header(hdr) 5532 cat(subsection_header(hdr))
3129 } else { 5533 } else {
3130 subsection_header(hdr) 5534 cat(subsection_header(hdr))
3131 } 5535 }
3132 cat("\\end{center}\n") 5536 cat("\\end{center}\n")
3133 5537
3134 plotted_kinases <- ksea_heatmap( 5538 plotted_kinases <- ksea_heatmap(
3135 # the data frame outputs from the KSEA.Scores() function, in list format 5539 # the data frame outputs from the KSEA.Scores() function, in list format
3144 # a numeric value between 0 and infinity indicating the min. number of 5548 # a numeric value between 0 and infinity indicating the min. number of
3145 # substrates a kinase must have to be included in the heatmap 5549 # substrates a kinase must have to be included in the heatmap
3146 m_cutoff = 1, 5550 m_cutoff = 1,
3147 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff 5551 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
3148 # for indicating significant kinases in the heatmap 5552 # for indicating significant kinases in the heatmap
3149 p_cutoff = 0.05, 5553 p_cutoff = params$kseaCutoffThreshold,
3150 # a binary input of TRUE or FALSE, indicating whether or not to perform 5554 # a binary input of TRUE or FALSE, indicating whether or not to perform
3151 # hierarchical clustering of the sample columns 5555 # hierarchical clustering of the sample columns
3152 sample_cluster = TRUE, 5556 sample_cluster = TRUE,
3153 # a binary input of TRUE or FALSE, indicating whether or not to export 5557 # a binary input of TRUE or FALSE, indicating whether or not to export
3154 # the heatmap as a .png image into the working directory 5558 # the heatmap as a .png image into the working directory
3161 ylab = "Kinase", 5565 ylab = "Kinase",
3162 # print which kinases: 5566 # print which kinases:
3163 # - 1 : all kinases 5567 # - 1 : all kinases
3164 # - 2 : significant kinases 5568 # - 2 : significant kinases
3165 # - 3 : non-significant kinases 5569 # - 3 : non-significant kinases
3166 which_kinases = which_kinases 5570 which_kinases = which_kinases,
3167 ) 5571 margins = c(7, 15)
3168 cat("\\begin{center}\n") 5572 )
3169 cat("Color intensities reflects $z$-score magnitudes; hue reflects $z$-score sign. Asterisks reflect significance.\n") 5573 if (!is.null(plotted_kinases)) {
3170 cat("\\end{center}\n") 5574 cat("\\begin{center}\n")
5575 if (which_kinases != const_ksea_nonastrsk_kinases)
5576 cat("Asterisks reflect significance.\n")
5577 cat("\\end{center}\n")
5578 }
3171 } # end for (i in ... 5579 } # end for (i in ...
3172 } # end if (length ... 5580 } # end if (length ...
3173 5581 ```
3174 for (i_cntrst in seq_len(length(rslt$score_list))) { 5582
3175 next_index <- i_cntrst 5583 ```{r kseabar_calc, echo = FALSE, fig.dim = c(9.5, 6), results = 'asis'}
3176 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] 5584 ksea_prints <- list()
3177 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] 5585 ksea_barplots <- list()
3178 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] 5586 for (i_cntrst in seq_len(length(rslt$score_list))) {
3179 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) 5587 next_index <- i_cntrst
3180 contrast_longlabel <- ( 5588 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
5589 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
5590 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
5591 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
5592 contrast_longlabel <- (
5593 sprintf(
5594 "Class %s -> Class %s",
5595 contrast_metadata_df[i_cntrst, "b_level"],
5596 contrast_metadata_df[i_cntrst, "a_level"]
5597 )
5598 )
5599 main_title <- (
5600 sprintf(
5601 "Change from treatment %s to treatment %s",
5602 contrast_metadata_df[i_cntrst, "b_level"],
5603 contrast_metadata_df[i_cntrst, "a_level"]
5604 )
5605 )
5606 sub_title <- contrast_longlabel
5607 tryCatch(
5608 expr = {
5609 ksea_scores_rslt <- rslt$score_list[[next_index]]
5610 if (print_nb_messages) nbe(see_variable(ksea_scores_rslt)) #ACE
5611
5612 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
5613 sink(deferred <- file())
5614 ksea_low_fdr_print(
5615 rslt = rslt,
5616 i_cntrst = i_cntrst,
5617 i = next_index,
5618 a_level = cntrst_a_level,
5619 b_level = cntrst_b_level,
5620 fold_change = cntrst_fold_change,
5621 caption = contrast_longlabel,
5622 write_db = FALSE,
5623 anchor = const_table_anchor_t
5624 )
5625 cat("\n")
5626 sink()
5627 lines <-
5628 paste(
5629 readLines(deferred, warn = FALSE),
5630 collapse = "\n"
5631 )
5632 close(deferred)
5633 sq_put(ksea_prints, lines)
5634 sink(stderr())
5635 cat("\n---\n")
5636 cat_variable(ksea_prints)
5637 barplot_closure <-
5638 ksea_low_fdr_barplot_factory(
5639 rslt = rslt,
5640 i_cntrst = i_cntrst,
5641 i = next_index,
5642 a_level = cntrst_a_level,
5643 b_level = cntrst_b_level,
5644 fold_change = cntrst_fold_change,
5645 caption = contrast_longlabel
5646 )
5647 if (rlang::is_closure(barplot_closure))
5648 sq_put(ksea_barplots, barplot_closure)
5649 else
5650 sq_put(ksea_barplots, no_op)
5651 str(ksea_barplots)
5652 cat("\n...\n")
5653 sink()
5654 }
5655 },
5656 error = function(e) {
5657 str(e)
5658 cat_margins()
5659 }
5660 )
5661 }
5662 ```
5663
5664 ```{r phosphoelm_kinase_upid_desc, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'}
5665
5666 have_kinase_descriptions <-
5667 if (!is.null(bzip2df(kinase_uprt_desc_lut, kinase_uprt_desc_lut_bz2)) &&
5668 !is.null(bzip2df(kinase_name_uprt_lut, kinase_name_uprt_lut_bz2))
5669 ) {
5670 rownames(kinase_uprt_desc_lut) <- kinase_uprt_desc_lut$UniProtID
5671 kinase_name_to_desc_uprt <- function(s) {
5672 rslt <- NULL
5673 tryCatch(
5674 {
5675 which_rows <- eval(s == kinase_name_uprt_lut$kinase)
5676 kinase_uprtid <-
5677 kinase_name_uprt_lut[which_rows, 2]
5678 # filter for first _HUMAN match if any
5679 grepl_human <- grepl("_HUMAN$", kinase_uprtid)
5680 if (0 < sum(grepl_human))
5681 kinase_uprtid <- kinase_uprtid[grepl_human]
5682 # filter for first match if any
5683 if (0 < length(kinase_uprtid)) {
5684 kinase_uprtid <- kinase_uprtid[1]
5685 kinase_desc <- kinase_uprt_desc_lut[kinase_uprtid, 2]
5686 if (!is.na(kinase_desc))
5687 rslt <- c(kinase_desc, kinase_uprtid)
5688 else
5689 rslt <- c(kinase_desc, "")
5690 }
5691 },
5692 warning = str
5693 )
5694 rslt
5695 }
5696 TRUE
5697 } else {
5698 kinase_name_to_desc_uprt <- function(s) NULL
5699 FALSE
5700 }
5701 ```
5702
5703 ```{r write_params, echo = FALSE, results = 'asis'}
5704 # perhaps this should be moved into the functions section, eventually ...
5705 write_params <- function(db) {
5706 # write parameters to report
5707
5708 # write parameters to SQLite output
5709
5710 mqppep_anova_script_param_df <- data.frame(
5711 script = "mqppep_anova_script.Rmd",
5712 parameter = names(param_unlist),
5713 value = param_unlist
5714 )
5715 ddl_exec(db, "
5716 DROP TABLE IF EXISTS script_parameter;
5717 "
5718 )
5719 ddl_exec(db, "
5720 CREATE TABLE IF NOT EXISTS script_parameter(
5721 script TEXT,
5722 parameter TEXT,
5723 value ANY,
5724 UNIQUE (script, parameter) ON CONFLICT REPLACE
5725 )
5726 ;
5727 "
5728 )
5729 RSQLite::dbWriteTable(
5730 conn = db,
5731 name = "script_parameter",
5732 value = mqppep_anova_script_param_df,
5733 append = TRUE
5734 )
5735
5736 loaded_packages_df <- sessioninfo::package_info("loaded")
5737 loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library)
5738 loaded_packages_df <- data.frame(
5739 package = loaded_packages_df$package,
5740 version = loaded_packages_df$loadedversion,
5741 date = loaded_packages_df$date
5742 )
5743 #ACE cat("\\clearpage\n\\section{R package versions}\n")
5744 #ACE data_frame_tabbing_latex(
5745 #ACE x = loaded_packages_df,
5746 #ACE tabstops = c(2.5, 1.25),
5747 #ACE caption = "R package versions"
5748 #ACE )
5749 cat("\\clearpage\n\\section{Input parameter settings}\n")
5750 data_frame_tabbing_latex(
5751 x = param_df,
5752 tabstops = c(1.75),
5753 underscore_whack = TRUE,
5754 caption = "Input parameters",
5755 verbatim = FALSE
5756 )
5757 }
5758
5759 if (!have_kinase_descriptions) {
5760 write_params(db)
5761 # We are done with output
5762 RSQLite::dbDisconnect(db)
5763 param_df_exit()
5764 knitr::knit_exit()
5765 }
5766 ```
5767
5768 ```{r kseabar, echo = FALSE, fig.dim = c(9.5, 12.3), results = 'asis'}
5769 if (have_kinase_descriptions) {
5770 my_section_header <-
3181 sprintf( 5771 sprintf(
3182 "Trt %s {%s} -> Trt %s {%s}", 5772 "inases whose KSEA %s < %s\n",
3183 contrast_metadata_df[i_cntrst, "b_level"], 5773 ksea_cutoff_statistic,
3184 gsub( 5774 signif(ksea_cutoff_threshold, 2)
3185 pattern = ";", 5775 )
3186 replacement = ", ", 5776
3187 x = contrast_metadata_df[i_cntrst, "b_samples"], 5777 # Use enriched kinases to find enriched kinase-substrate pairs
3188 fixed = TRUE 5778 enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash))
3189 ), 5779
3190 contrast_metadata_df[i_cntrst, "a_level"], 5780 enriched_kinase_descs <-
3191 gsub( 5781 Reduce(
3192 pattern = ";", 5782 f = function(l, r) {
3193 replacement = ", ", 5783 lkup <- kinase_name_to_desc_uprt(r)
3194 x = contrast_metadata_df[i_cntrst, "a_samples"], 5784 if (is.null(lkup)) l
3195 fixed = TRUE 5785 else r2 <- rbind(
3196 ) 5786 l,
3197 ) 5787 data.frame(
3198 ) 5788 kinase = r,
3199 main_title <- ( 5789 uniprot_id = lkup[2],
5790 description = lkup[1]
5791 )
5792 )
5793 },
5794 x = enriched_kinases$kinase,
5795 init = NULL
5796 )
5797
5798 if (length(enriched_kinase_descs) > 0 && nrow(enriched_kinase_descs) > 0) {
5799 cat("\n\\clearpage\n")
5800 data_frame_tabbing_latex(
5801 x = enriched_kinase_descs,
5802 tabstops = c(0.9, 1.3),
5803 headings = c("Kinase", "UniProt ID", "Description"),
5804 caption = paste0("Descriptions of k", my_section_header)
5805 )
5806 }
5807
5808 if (FALSE) {
5809 cat_variable(sqldf("SELECT kinase FROM enriched_kinases"))
5810 cat_variable(sqldf("
5811 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
5812 FROM pseudo_ksdata
5813 WHERE gene IN (SELECT kinase FROM enriched_kinases)
5814 "))
5815 data_frame_table_latex(
5816 x = sqldf("
5817 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
5818 FROM pseudo_ksdata
5819 WHERE gene IN (SELECT kinase FROM enriched_kinases)
5820 "),
5821 justification = "l l l",
5822 centered = TRUE,
5823 caption = "substrates of enriched kinases",
5824 anchor = c(const_table_anchor_p, const_table_anchor_t),
5825 underscore_whack = TRUE
5826 )
5827 data_frame_table_latex(
5828 x = sqldf("
5829 SELECT
5830 gene AS kinase,
5831 ppep,
5832 sub_gene,
5833 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label,
5834 fdr_adjusted_anova_p,
5835 quality,
5836 min_group_obs_count
5837 FROM (
5838 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
5839 FROM pseudo_ksdata
5840 WHERE gene IN (SELECT kinase FROM enriched_kinases)
5841 ),
5842 p_value_data
5843 WHERE ppep = phosphopeptide
5844 GROUP BY kinase, ppep
5845 ORDER BY kinase, ppep, p_value_data.quality DESC
5846 "),
5847 justification = "l l l l l l l",
5848 centered = TRUE,
5849 caption = "labeled substrates of enriched kinases",
5850 anchor = c(const_table_anchor_p, const_table_anchor_t),
5851 underscore_whack = TRUE
5852 )
5853 }
5854 all_enriched_substrates <- sqldf("
5855 SELECT
5856 gene AS kinase,
5857 ppep,
5858 sub_gene,
5859 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label,
5860 fdr_adjusted_anova_p,
5861 quality,
5862 min_group_obs_count
5863 FROM (
5864 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
5865 FROM pseudo_ksdata
5866 WHERE gene IN (SELECT kinase FROM enriched_kinases)
5867 ),
5868 p_value_data
5869 WHERE ppep = phosphopeptide
5870 GROUP BY kinase, ppep
5871 ORDER BY kinase, ppep, p_value_data.quality DESC
5872 ")
5873
5874 all_enriched_substrates <-
5875 all_enriched_substrates[
5876 all_enriched_substrates$quality >= params$minQuality,
5877 ,
5878 drop = FALSE
5879 ]
5880
5881 all_enriched_substrates$sub_gene <-
5882 sub(
5883 " ///.*",
5884 " ...",
5885 all_enriched_substrates$sub_gene
5886 )
5887
5888 all_enriched_substrates$label <-
5889 with(
5890 all_enriched_substrates,
3200 sprintf( 5891 sprintf(
3201 "Change from treatment %s to treatment %s", 5892 "(%s-%s) %s",
3202 contrast_metadata_df[i_cntrst, "b_level"], 5893 kinase,
3203 contrast_metadata_df[i_cntrst, "a_level"] 5894 trunc_subgene(sub_gene),
3204 ) 5895 ppep
3205 ) 5896 )
3206 sub_title <- contrast_longlabel 5897 )
3207 tryCatch( 5898
3208 expr = { 5899 # this global is set to TRUE by cat_enriched_heading immediately below
3209 ksea_scores_rslt <- rslt$score_list[[next_index]] 5900 g_neednewpage <- FALSE
3210 5901
3211 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { 5902 # helper used to label per-kinase substrate enrichment figure
3212 low_fdr_barplot( 5903 cat_enriched_heading <- function(m, cut_args) {
3213 rslt = rslt, 5904 cutoff <- cut_args$cutoff
3214 i_cntrst = i_cntrst, 5905 kinase <- cut_args$kinase
3215 i = next_index, 5906 if (g_neednewpage) cat("\\newpage\n")
3216 a_level = cntrst_a_level, 5907 g_neednewpage <- TRUE
3217 b_level = cntrst_b_level, 5908 if (nrow(m) > g_intensity_hm_rows) {
3218 fold_change = cntrst_fold_change, 5909 cat(subsection_header(
3219 caption = contrast_longlabel
3220 )
3221 }
3222 },
3223 error = function(e) str(e)
3224 )
3225 }
3226 ```
3227
3228 ```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
3229
3230 # Use enriched kinases to find enriched kinase-substrate pairs
3231 enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash))
3232 all_enriched_substrates <- sqldf("
3233 SELECT
3234 gene AS kinase,
3235 ppep,
3236 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label
3237 FROM (
3238 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
3239 FROM pseudo_ksdata
3240 WHERE GENE IN (SELECT kinase FROM enriched_kinases)
3241 )
3242 GROUP BY ppep
3243 ")
3244
3245 # helper used to label per-kinase substrate enrichment figure
3246 cat_enriched_heading <- function(m, cut_args) {
3247 cutoff <- cut_args$cutoff
3248 kinase <- cut_args$kinase
3249 statistic <- cut_args$statistic
3250 threshold <- cut_args$threshold
3251 cat("\\newpage\n")
3252 if (nrow(m) > intensity_hm_rows) {
3253 subsection_header(
3254 paste(
3255 sprintf( 5910 sprintf(
3256 "Lowest p-valued %d (of %d) enriched %s-substrates,", 5911 "Highest-quality %d (of %d) enriched %s-substrates",
3257 intensity_hm_rows, 5912 g_intensity_hm_rows,
3258 nrow(m), 5913 nrow(m),
3259 kinase 5914 kinase
3260 ), 5915 )
3261 sprintf(" KSEA %s < %0.2f\n", statistic, threshold) 5916 ))
3262 )
3263 )
3264 } else {
3265 if (nrow(m) == 1) {
3266 return(FALSE)
3267 } else { 5917 } else {
3268 subsection_header( 5918 if (nrow(m) == 0) {
3269 paste( 5919 return(FALSE)
5920 } else {
5921 nrow_m <- nrow(m)
5922 cat(subsection_header(
3270 sprintf( 5923 sprintf(
3271 "%d enriched %s-substrates,", 5924 "%d enriched %s-substrate%s",
3272 nrow(m), 5925 nrow_m,
3273 kinase 5926 kinase,
3274 ), 5927 if (nrow_m > 1) "s" else ""
3275 sprintf(
3276 " KSEA %s < %0.2f\n",
3277 statistic,
3278 threshold
3279 ) 5928 )
5929 ))
5930 }
5931 }
5932 cat("\n\n\n")
5933 cat("\n\n\n")
5934 return(TRUE)
5935 }
5936
5937 # --------------------------------
5938 # hack begin - show all substrates
5939 enriched_substrates <- all_enriched_substrates
5940 # add "FALSE &&" to prevent listing of substrates
5941 if (show_enriched_substrates && nrow(enriched_substrates) > 0) {
5942 short_row_names <- sub(
5943 "$FAILED_MATCH_GENE_NAME",
5944 "not_found",
5945 enriched_substrates$sub_gene,
5946 fixed = TRUE
5947 )
5948
5949 if (print_nb_messages) nbe(see_variable(enriched_substrates))
5950 substrates_df <- with(
5951 enriched_substrates,
5952 data.frame(
5953 kinase = kinase,
5954 substrate = sub(" ///*", "...", short_row_names),
5955 anova_p_value = signif(fdr_adjusted_anova_p, 2),
5956 min_group_obs_count = signif(min_group_obs_count, 0),
5957 quality = signif(quality, 3),
5958 sequence = trunc_n(30)(ppep)
3280 ) 5959 )
3281 ) 5960 )
3282 } 5961
3283 } 5962 substrates_df <- substrates_df[
3284 cat("\n\n\n") 5963 with(substrates_df, order(kinase, -quality)),
3285 cat("\n\n\n") 5964 ,
3286 return(TRUE) 5965 drop = FALSE
3287 } 5966 ]
3288 5967
3289 # Disabling heatmaps for substrates pending decision whether to eliminate them altogether 5968 if (print_nb_messages) nbe(see_variable(substrates_df))
3290 if (FALSE) 5969 if (nrow(substrates_df) < 1)
5970 substrates_df$sequence <- c()
5971 if (print_nb_messages) nbe(see_variable(substrates_df))
5972 names(substrates_df) <- headers_2nd_line <-
5973 c("Kinase", "Substrate", "p-value", "per group)", "quality", "Sequence")
5974 headers_1st_line <- c("", "", "ANOVA", "min(values", "", "")
5975 data_frame_tabbing_latex(
5976 x = substrates_df,
5977 tabstops = c(1.2, 0.8, 0.5, 0.65, 0.5),
5978 headings = c(headers_1st_line, headers_2nd_line),
5979 caption = "Details for all enriched substrates of enriched kinases"
5980 )
5981 rm(
5982 enriched_substrates,
5983 substrates_df,
5984 short_row_names,
5985 headers_1st_line,
5986 headers_2nd_line
5987 )
5988 }
5989 cat("\\clearpage\n")
5990 # hack end - show all substrates
5991 # --------------------------------
5992
5993 # print deferred tables and graphs for kinases from contrasts
5994 for (i_cntrst in seq_len(length(ksea_prints))) {
5995 #latex_samepage({
5996 cat(ksea_prints[[i_cntrst]])
5997 cat("\n")
5998 ksea_barplots[[i_cntrst]]()
5999 cat("\n")
6000 cat("\\clearpage\n")
6001 #})
6002 }
6003
6004 }
6005 ```
6006
6007 ```{r enriched, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'}
6008 if (g_can_run_ksea) {
6009 g_did_enriched_header <- FALSE
3291 for (kinase_name in sort(enriched_kinases$kinase)) { 6010 for (kinase_name in sort(enriched_kinases$kinase)) {
3292 enriched_substrates <- 6011 enriched_substrates <-
3293 all_enriched_substrates[ 6012 all_enriched_substrates[
3294 all_enriched_substrates$kinase == kinase_name, 6013 all_enriched_substrates$kinase == kinase_name,
3295 , 6014 ,
3296 drop = FALSE 6015 drop = FALSE
3297 ] 6016 ]
6017 ten_trunc_ppep <- trunc_enriched_substrate(enriched_substrates$ppep)
6018 enriched_substrates$label <- with(
6019 enriched_substrates,
6020 sprintf(
6021 "(%s) %s",
6022 make.names(
6023 sub("$FAILED_MATCH_GENE_NAME", "not_found", sub_gene, fixed = TRUE),
6024 unique = TRUE
6025 ),
6026 ten_trunc_ppep
6027 )
6028 )
3298 # Get the intensity values for the heatmap 6029 # Get the intensity values for the heatmap
3299 enriched_intensities <- 6030 enriched_intensities <-
3300 as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE]) 6031 as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE])
6032
3301 # Remove rows having too many NA values to be relevant 6033 # Remove rows having too many NA values to be relevant
3302 na_counter <- is.na(enriched_intensities) 6034 good_rows <- (rowSums(enriched_intensities, na.rm = TRUE) != 0)
3303 na_counts <- apply(na_counter, 1, sum) 6035 #ACE nbe(see_variable(good_rows), "\n")
3304 enriched_intensities <- 6036 enriched_substrates <- enriched_substrates[good_rows, , drop = FALSE]
3305 enriched_intensities[na_counts < ncol(enriched_intensities) / 2, , drop = FALSE] 6037 enriched_intensities <- enriched_intensities[good_rows, , drop = FALSE]
6038
3306 # Rename the rows with the display-name for the heatmap 6039 # Rename the rows with the display-name for the heatmap
3307 rownames(enriched_intensities) <- 6040 short_row_names <- sub(
6041 "$FAILED_MATCH_GENE_NAME",
6042 "not_found",
6043 enriched_substrates$sub_gene,
6044 fixed = TRUE
6045 )
6046 short_row_names <-
6047 make.names(short_row_names, unique = TRUE)
6048 long_row_names <-
3308 sapply( 6049 sapply(
3309 X = rownames(enriched_intensities), 6050 X = rownames(enriched_intensities),
3310 FUN = function(rn) { 6051 FUN = function(rn) {
3311 enriched_substrates[enriched_substrates$ppep == rn, "label"] 6052 enriched_substrates[enriched_substrates$ppep == rn, "label"]
3312 } 6053 }
3313 ) 6054 )
6055 rownames(enriched_intensities) <- long_row_names
3314 # Format as matrix for heatmap 6056 # Format as matrix for heatmap
3315 m <- as.matrix(enriched_intensities) 6057 m <- as.matrix(enriched_intensities)
6058 rownames(m) <- trunc_enriched_substrate(rownames(m))
6059
6060 #ACE nb("m with bad rows: ", see_variable(m), "\n")
6061 #ACE good_rows <- (rowSums(m, na.rm = TRUE) != 0)
6062 #ACE nb(see_variable(good_rows), "\n")
6063 #ACE m <- m[good_rows, , drop = FALSE]
6064 #ACE nb("m without(?) bad rows: ", see_variable(m), "\n")
6065 #ACE nb(see_variable(short_row_names), "\n")
6066 #ACE local_short_row_names <- short_row_names[good_rows]
6067 #ACE local_long_row_names <- long_row_names[good_rows]
6068 #ACE local_enriched_intensities <- enriched_intensities[local_long_row_names, ]
6069
3316 # Draw the heading and heatmap 6070 # Draw the heading and heatmap
3317 if (nrow(m) > 0) { 6071 nrow_m <- nrow(m)
6072 if (nrow_m > 0) {
6073 if (!g_did_enriched_header) {
6074 cat("\n\\clearpage\n")
6075 cat(section_header(paste0("K", my_section_header)))
6076 g_did_enriched_header <- TRUE
6077 }
6078 is_na_m <- is.na(m)
6079 cellnote_m <- is_na_m
6080 cellnote_m[!is_na_m] <- ""
6081 cellnote_m[is_na_m] <- "NA"
3318 cut_args <- new_env() 6082 cut_args <- new_env()
3319 cut_args$cutoff <- cutoff 6083 cut_args$cutoff <- cutoff
3320 cut_args$kinase <- kinase_name 6084 cut_args$kinase <- kinase_name
3321 cut_args$statistic <- ksea_cutoff_statistic 6085 cut_args$statistic <- ksea_cutoff_statistic
3322 cut_args$threshold <- ksea_cutoff_threshold 6086 cut_args$threshold <- ksea_cutoff_threshold
3323 number_of_peptides_found <- 6087 number_of_peptides_found <-
3324 draw_intensity_heatmap( 6088 ppep_heatmap(
3325 m = m, 6089 m = m,
6090 cellnote = cellnote_m,
3326 cutoff = cut_args, 6091 cutoff = cut_args,
3327 hm_heading_function = cat_enriched_heading, 6092 hm_heading_function = cat_enriched_heading,
3328 hm_main_title 6093 hm_main_title
3329 = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates", 6094 = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates",
3330 suppress_row_dendrogram = FALSE 6095 suppress_row_dendrogram = FALSE,
6096 master_cex = 0.35,
6097 sepcolor = "black",
6098 colsep = sample_colsep
3331 ) 6099 )
6100 if (number_of_peptides_found > 1) {
6101
6102 tryCatch(
6103 {
6104 rownames(m) <- short_row_names
6105 cov_heatmap(m, nrow_m > g_intensity_hm_rows)
6106 },
6107 error = function(e) {
6108 cat(
6109 sprintf(
6110 "ERROR: %s\n\\newline\n",
6111 mget("e")
6112 )
6113 )
6114 cat(
6115 sprintf(
6116 "message: %s\n\\newline\n",
6117 e$message
6118 )
6119 )
6120 cat_margins()
6121 }
6122 )
6123 }
6124 substrates_df <- with(
6125 enriched_substrates,
6126 data.frame(
6127 substrate = sub(" ///*", "...", short_row_names),
6128 sequence = trunc_long_ppep(ppep),
6129 anova_p_value = signif(fdr_adjusted_anova_p, 2),
6130 min_group_obs_count = signif(min_group_obs_count, 0),
6131 quality = signif(quality, 3)
6132 )
6133 )
6134 excess_substrates <- nrow(substrates_df) > g_intensity_hm_rows
6135 if (excess_substrates)
6136 substrates_df <- substrates_df[1:g_intensity_hm_rows, ]
6137 names(substrates_df) <- headers_2nd_line <-
6138 c("Substrate", "Sequence", "p-value", "per group)", "quality")
6139 headers_1st_line <- c("", "", "ANOVA", "min(values", "")
6140 if (1 < nrow(enriched_substrates))
6141 cat("\n\\newpage\n")
6142 cat(subsubsection_header(
6143 sprintf(
6144 "Details for %s%s-substrates",
6145 if (excess_substrates)
6146 sprintf(
6147 "%s \"highest quality\" ",
6148 g_intensity_hm_rows
6149 )
6150 else "",
6151 kinase_name
6152 )
6153 ))
6154 substrates_df <- substrates_df[order(-substrates_df$quality), ]
6155 data_frame_tabbing_latex(
6156 x = substrates_df,
6157 tabstops = c(0.8, 3.8, 0.6, 0.8),
6158 headings = c(headers_1st_line, headers_2nd_line)
6159 )
6160 } else {
6161 if (print_nb_messages) nbe(see_variable(nrow_m > 0), "\n")
3332 } 6162 }
3333 } 6163 if (print_nb_messages) nb("end kinase ", kinase_name, "\n")
3334 6164 }
3335 # Write output tabular files 6165
3336 6166 # Write output tabular files
3337 # get kinase, ppep, concat(kinase) tuples for enriched kinases 6167
3338 6168 # get kinase, ppep, concat(kinase) tuples for enriched kinases
3339 kinase_ppep_label <- sqldf(" 6169
3340 WITH 6170 if (print_nb_messages) nb("kinase_ppep_label <- ...\n")
3341 t(ppep, label) AS 6171 if (print_nb_messages) nbe("kinase_ppep_label <- ...\n")
3342 ( 6172 kinase_ppep_label <- sqldf("
3343 SELECT DISTINCT 6173 WITH
3344 SUB_MOD_RSD AS ppep, 6174 t(ppep, label) AS
3345 group_concat(gene, '; ') AS label 6175 (
6176 SELECT DISTINCT
6177 SUB_MOD_RSD AS ppep,
6178 group_concat(gene, '; ') AS label
6179 FROM pseudo_ksdata
6180 WHERE GENE IN (SELECT kinase FROM enriched_kinases)
6181 GROUP BY ppep
6182 ),
6183 k(kinase, ppep_join) AS
6184 (
6185 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join
3346 FROM pseudo_ksdata 6186 FROM pseudo_ksdata
3347 WHERE GENE IN (SELECT kinase FROM enriched_kinases) 6187 WHERE GENE IN (SELECT kinase FROM enriched_kinases)
3348 GROUP BY ppep 6188 )
3349 ), 6189 SELECT k.kinase, t.ppep, t.label
3350 k(kinase, ppep_join) AS 6190 FROM t, k
3351 ( 6191 WHERE t.ppep = k.ppep_join
3352 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join 6192 ORDER BY k.kinase, t.ppep
3353 FROM pseudo_ksdata 6193 ")
3354 WHERE GENE IN (SELECT kinase FROM enriched_kinases) 6194
3355 ) 6195
3356 SELECT k.kinase, t.ppep, t.label 6196 # extract what we need from full_data
3357 FROM t, k 6197 impish <- cbind(rownames(quant_data_imp), quant_data_imp)
3358 WHERE t.ppep = k.ppep_join 6198 colnames(impish)[1] <- "Phosphopeptide"
3359 ORDER BY k.kinase, t.ppep 6199 data_table_imputed_sql <- "
3360 ") 6200 SELECT
3361 6201 f.*,
3362 # extract what we need from full_data 6202 k.label AS KSEA_enrichments,
3363 impish <- cbind(rownames(quant_data_imp), quant_data_imp) 6203 q.*
3364 colnames(impish)[1] <- "Phosphopeptide" 6204 FROM
3365 data_table_imputed_sql <- " 6205 metadata_plus_p f
3366 SELECT 6206 LEFT JOIN kinase_ppep_label k
3367 f.*, 6207 ON f.Phosphopeptide = k.ppep,
3368 k.label AS KSEA_enrichments, 6208 impish q
3369 q.* 6209 WHERE
3370 FROM 6210 f.Phosphopeptide = q.Phosphopeptide
3371 metadata_plus_p f 6211 "
3372 LEFT JOIN kinase_ppep_label k 6212 data_table_imputed <- sqldf(data_table_imputed_sql)
3373 ON f.Phosphopeptide = k.ppep, 6213 # Zap the duplicated 'Phosphopeptide' column named 'ppep'
3374 impish q 6214 data_table_imputed <-
3375 WHERE 6215 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
3376 f.Phosphopeptide = q.Phosphopeptide 6216
3377 " 6217 # Output imputed, un-normalized data
3378 data_table_imputed <- sqldf(data_table_imputed_sql) 6218 if (print_nb_messages) nb("Output imputed, un-normalized data tabular file\n")
3379 # Zap the duplicated 'Phosphopeptide' column named 'ppep' 6219 if (print_nb_messages) nbe("Output imputed, un-normalized data tabular file\n")
3380 data_table_imputed <- 6220 write.table(
3381 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] 6221 data_table_imputed
3382 6222 , file = imputed_data_filename
3383 # Output with imputed, un-normalized data 6223 , sep = "\t"
3384 6224 , col.names = TRUE
3385 write.table( 6225 , row.names = FALSE
3386 data_table_imputed 6226 , quote = FALSE
3387 , file = imputed_data_filename 6227 )
3388 , sep = "\t" 6228
3389 , col.names = TRUE 6229
3390 , row.names = FALSE 6230 #output quantile normalized data
3391 , quote = FALSE 6231 impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log)
3392 ) 6232 colnames(impish)[1] <- "Phosphopeptide"
3393 6233 data_table_imputed <- sqldf(data_table_imputed_sql)
3394 6234 # Zap the duplicated 'Phosphopeptide' column named 'ppep'
3395 #output quantile normalized data 6235 data_table_imputed <-
3396 impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) 6236 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
3397 colnames(impish)[1] <- "Phosphopeptide" 6237 if (print_nb_messages) nb("Output quantile normalized data tabular file\n")
3398 data_table_imputed <- sqldf(data_table_imputed_sql) 6238 if (print_nb_messages) nbe("Output quantile normalized data tabular file\n")
3399 # Zap the duplicated 'Phosphopeptide' column named 'ppep' 6239 write.table(
3400 data_table_imputed <- 6240 data_table_imputed,
3401 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] 6241 file = imp_qn_lt_data_filenm,
3402 write.table( 6242 sep = "\t",
3403 data_table_imputed, 6243 col.names = TRUE,
3404 file = imp_qn_lt_data_filenm, 6244 row.names = FALSE,
3405 sep = "\t", 6245 quote = FALSE
3406 col.names = TRUE, 6246 )
3407 row.names = FALSE, 6247
3408 quote = FALSE 6248 ppep_kinase <- sqldf("
3409 ) 6249 SELECT DISTINCT k.ppep, k.kinase
3410 6250 FROM (
3411 ppep_kinase <- sqldf(" 6251 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep
3412 SELECT DISTINCT k.ppep, k.kinase 6252 FROM pseudo_ksdata
3413 FROM ( 6253 WHERE GENE IN (SELECT kinase FROM enriched_kinases)
3414 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep 6254 ) k
3415 FROM pseudo_ksdata 6255 ORDER BY k.ppep, k.kinase
3416 WHERE GENE IN (SELECT kinase FROM enriched_kinases) 6256 ")
3417 ) k 6257
3418 ORDER BY k.ppep, k.kinase 6258 RSQLite::dbWriteTable(
3419 ") 6259 conn = db,
3420 6260 name = "ksea_enriched_ks",
3421 RSQLite::dbWriteTable( 6261 value = ppep_kinase,
3422 conn = db, 6262 append = FALSE
3423 name = "ksea_enriched_ks", 6263 )
3424 value = ppep_kinase, 6264 }
3425 append = FALSE 6265
3426 ) 6266 if (print_nb_messages) nb("RSQLite::dbWriteTable anova_signif\n")
3427 6267
3428 RSQLite::dbWriteTable( 6268 RSQLite::dbWriteTable(
3429 conn = db, 6269 conn = db,
3430 name = "anova_signif", 6270 name = "anova_signif",
3431 value = p_value_data, 6271 value = p_value_data,
3451 ON m.phospho_peptide = kek.ppep 6291 ON m.phospho_peptide = kek.ppep
3452 ; 6292 ;
3453 " 6293 "
3454 ) 6294 )
3455 6295
6296 if (print_nb_messages) nb("Output contents of `stats_metadata_v` table to tabular file\n")
6297 if (print_nb_messages) nbe("Output contents of `stats_metadata_v` table to tabular file\n")
3456 write.table( 6298 write.table(
3457 dbReadTable(db, "stats_metadata_v"), 6299 dbReadTable(db, "stats_metadata_v"),
3458 file = anova_ksea_mtdt_file, 6300 file = anova_ksea_mtdt_file,
3459 sep = "\t", 6301 sep = "\t",
3460 col.names = TRUE, 6302 col.names = TRUE,
3461 row.names = FALSE, 6303 row.names = FALSE,
3462 quote = FALSE 6304 quote = FALSE
3463 ) 6305 )
3464 6306
6307 cat("\n\\clearpage\n")
3465 6308
3466 ``` 6309 ```
6310
6311 # Data-processing summary flowchart
6312
6313 ![Flowchart showing ANOVA and KSEA data-processing steps](KSEA_impl_flowchart.pdf)
3467 6314
3468 ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} 6315 ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
3469 cat("\\leavevmode\n\n\n") 6316 cat("\\leavevmode\n\n\n")
3470 6317
3471 # write parameters to report 6318 write_params(db)
3472
3473 param_unlist <- unlist(as.list(params))
3474 param_df <- data.frame(
3475 parameter = paste0("\\verb@", names(param_unlist), "@"),
3476 value = paste0("\\verb@", gsub("$", "\\$", param_unlist, fixed = TRUE), "@")
3477 )
3478
3479 data_frame_latex(
3480 x = param_df,
3481 justification = "p{0.35\\linewidth} p{0.6\\linewidth}",
3482 centered = TRUE,
3483 caption = "Input parameters",
3484 anchor = const_table_anchor_bp,
3485 underscore_whack = FALSE
3486 )
3487
3488 # write parameters to SQLite output
3489
3490 mqppep_anova_script_param_df <- data.frame(
3491 script = "mqppep_anova_script.Rmd",
3492 parameter = names(param_unlist),
3493 value = param_unlist
3494 )
3495 ddl_exec(db, "
3496 DROP TABLE IF EXISTS script_parameter;
3497 "
3498 )
3499 ddl_exec(db, "
3500 CREATE TABLE IF NOT EXISTS script_parameter(
3501 script TEXT,
3502 parameter TEXT,
3503 value ANY,
3504 UNIQUE (script, parameter) ON CONFLICT REPLACE
3505 )
3506 ;
3507 "
3508 )
3509 RSQLite::dbWriteTable(
3510 conn = db,
3511 name = "script_parameter",
3512 value = mqppep_anova_script_param_df,
3513 append = TRUE
3514 )
3515
3516 # We are done with output 6319 # We are done with output
3517 RSQLite::dbDisconnect(db) 6320 RSQLite::dbDisconnect(db)
6321
6322 cat("\\clearpage\n\\section{R package versions}\n")
6323 utils::toLatex(utils::sessionInfo())
3518 ``` 6324 ```
3519 <!--
3520 There's gotta be a better way...
3521
3522 loaded_packages_df <- sessioninfo::package_info("loaded")
3523 loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library)
3524 loaded_packages_df <- data.frame(
3525 package = loaded_packages_df$package,
3526 version = loaded_packages_df$loadedversion,
3527 date = loaded_packages_df$date
3528 )
3529 data_frame_latex(
3530 x = loaded_packages_df,
3531 justification = "l | l l",
3532 centered = FALSE,
3533 caption = "Loaded R packages",
3534 anchor = const_table_anchor_bp
3535 )
3536 -->