comparison mqppep_anova_script.Rmd @ 0:8dfd5d2b5903 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author galaxyp
date Mon, 11 Jul 2022 19:22:54 +0000
parents
children b76c75521d91
comparison
equal deleted inserted replaced
-1:000000000000 0:8dfd5d2b5903
1 ---
2 title: "MaxQuant Phosphoproteomic Enrichment Pipeline ANOVA/KSEA"
3 author:
4 - "Nick Graham^[ORCiD 0000-0002-6811-1941, University of Southern California: Los Angeles, CA, US]"
5 - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]"
6 - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]"
7 date:
8 - "May 28, 2018"
9 - "; revised June 23, 2022"
10 output:
11 pdf_document:
12 toc: true
13 toc_depth: 3
14 keep_tex: true
15 header-includes:
16 - \usepackage{longtable}
17 - \newcommand\T{\rule{0pt}{2.6ex}} % Top strut
18 - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut
19 params:
20 alphaFile: "test-data/alpha_levels.tabular"
21 inputFile: "test-data/test_input_for_anova.tabular"
22 preprocDb: "test-data/test_input_for_anova.sqlite"
23 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
24 show_toc: true
25 firstDataColumn: "^Intensity[^_]"
26 imputationMethod: !r c("group-median", "median", "mean", "random")[1]
27 meanPercentile: 1
28 sdPercentile: 1.0
29 regexSampleNames: "\\.\\d+[A-Z]$"
30 regexSampleGrouping: "\\d+"
31 imputedDataFilename: "test-data/limbo/imputedDataFilename.txt"
32 imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt"
33 anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt"
34 oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1]
35 oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3]
36 kseaCutoffStatistic: !r c("p.value", "FDR")[2]
37 kseaCutoffThreshold: !r c( 0.1, 0.05)[2]
38 kseaMinKinaseCount: 1
39 intensityHeatmapRows: 75
40 ---
41 <!--
42 kseaCutoffStatistic: !r c("p.value", "FDR")[2]
43 kseaCutoffThreshold: !r c(0.05, 0.1)[1]
44
45 alphaFile: "test-data/alpha_levels.tabular"
46 inputFile: "test-data/test_input_for_anova.tabular"
47 preprocDb: "test-data/test_input_for_anova.sqlite"
48 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
49
50 alphaFile: "test-data/alpha_levels.tabular"
51 inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular"
52 preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite"
53 kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2]
54
55 alphaFile: "test-data/alpha_levels.tabular"
56 inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular"
57 preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite"
58 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2]
59
60 alphaFile: "test-data/alpha_levels.tabular"
61 inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular"
62 preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite"
63 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2]
64
65 inputFile: "test-data/density_failure.preproc_tab.tabular"
66 kseaAppPrepDb: !r c(":memory:", "mqppep.sqlite")[2]
67 latex_document: default
68 -->
69 ```{r setup, include = FALSE}
70 #ref for debugging: https://yihui.org/tinytex/r/#debugging
71 options(tinytex.verbose = TRUE)
72
73 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
74 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355
75 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10))
76
77 # freeze the random number generator so the same results will be produced
78 # from run to run
79 set.seed(28571)
80
81 ### LIBRARIES
82 library(gplots)
83 library(DBI)
84 library(RSQLite)
85 # Suppress "Warning: no DISPLAY variable so Tk is not available"
86 suppressWarnings(suppressMessages(library(sqldf)))
87
88 # required but not added to search list:
89 # - DBI
90 # - RSQLite
91 # - ggplot2
92 # - knitr
93 # - latex2exp
94 # - preprocessCore
95 # - reshape2
96 # - vioplot
97
98 ### CONSTANTS
99
100 const_parfin <- par("fin")
101 const_boxplot_fill <- "grey94"
102 const_stripchart_cex <- 0.5
103 const_stripsmall_cex <-
104 sqrt(const_stripchart_cex * const_stripchart_cex / 2)
105 const_stripchart_jitter <- 0.3
106 const_write_debug_files <- FALSE
107 const_table_anchor_bp <- "bp"
108 const_table_anchor_ht <- "ht"
109 const_table_anchor_p <- "p"
110 const_table_anchor_tbp <- "tbp"
111
112
113 const_ksea_astrsk_kinases <- 1
114 const_ksea_nonastrsk_kinases <- 2
115 const_ksea_all_kinases <- 3
116
117 const_log10_e <- log10(exp(1))
118
119 ### FUNCTIONS
120
121 # from `demo(error.catching)`
122 ##' Catch *and* save both errors and warnings, and in the case of
123 ##' a warning, also keep the computed result.
124 ##'
125 ##' @title tryCatch both warnings (with value) and errors
126 ##' @param expr an \R expression to evaluate
127 ##' @return a list with 'value' and 'warning', where
128 ##' 'value' may be an error caught.
129 ##' @author Martin Maechler;
130 ##' Copyright (C) 2010-2012 The R Core Team
131 try_catch_w_e <- function(expr) {
132 wrn <- NULL
133 # warning handler
134 w_handler <- function(w) {
135 wrn <<- w
136 invokeRestart("muffleWarning")
137 }
138 list(
139 value = withCallingHandlers(
140 tryCatch(
141 expr,
142 error = function(e) e
143 ),
144 warning = w_handler
145 ),
146 warning = wrn
147 )
148 }
149
150
151 write_debug_file <- function(s) {
152 if (const_write_debug_files) {
153 s_path <- sprintf("test-data/%s.txt", deparse(substitute(s)))
154 print(sprintf("DEBUG writing file %s", spath))
155 write.table(
156 s,
157 file = s_path,
158 sep = "\t",
159 col.names = TRUE,
160 row.names = TRUE,
161 quote = FALSE
162 )
163 }
164 }
165
166 # ref: http://adv-r.had.co.nz/Environments.html
167 # "When creating your own environment, note that you should set its parent
168 # environment to be the empty environment. This ensures you don't
169 # accidentally inherit objects from somewhere else."
170 # Caution: this prevents `with(my_env, expr)` from working when `expr`
171 # contains anything from the global environment, even operators!
172 # Hence, `x <- 1; get("x", new_env())` fails by design.
173 new_env <- function() {
174 new.env(parent = emptyenv())
175 }
176
177 ### numerical/statistical helper functions
178
179 any_nan <- function(x) {
180 !any(x == "NaN")
181 }
182
183 # determine standard deviation of quantile to impute
184 sd_finite <- function(x) {
185 ok <- is.finite(x)
186 sd(x[ok])
187 }
188
189 anova_func <- function(x, grouping_factor, one_way_f) {
190 subject <- data.frame(
191 intensity = x
192 )
193 x_aov <-
194 one_way_f(
195 formula = intensity ~ grouping_factor,
196 data = subject
197 )
198 pvalue <-
199 if (identical(one_way_f, aov))
200 summary(x_aov)[[1]][["Pr(>F)"]][1]
201 else
202 pvalue <- x_aov$p.value
203 pvalue
204 }
205
206
207 ### LaTeX functions
208
209 latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) {
210 v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v
211 cat(
212 paste0(
213 v_sub,
214 collapse = collapse_string
215 )
216 )
217 }
218
219 latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
220 cat("\\begin{itemize}\n\\item ")
221 latex_collapsed_vector(collapse_string, v, underscore_whack)
222 cat("\n\\end{itemize}\n")
223 }
224
225 latex_itemized_list <- function(v, underscore_whack = TRUE) {
226 latex_itemized_collapsed("\n\\item ", v, underscore_whack)
227 }
228
229 latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
230 cat("\\begin{enumerate}\n\\item ")
231 latex_collapsed_vector(collapse_string, v, underscore_whack)
232 cat("\n\\end{enumerate}\n")
233 }
234
235 latex_enumerated_list <- function(v) {
236 latex_enumerated_collapsed("\n\\item ", v)
237 }
238
239 latex_table_row <- function(v, extra = "", underscore_whack = TRUE) {
240 latex_collapsed_vector(" & ", v, underscore_whack)
241 cat(extra)
242 cat(" \\\\\n")
243 }
244
245 # Use this like print.data.frame, from which it is adapted:
246 data_frame_latex <-
247 function(
248 x,
249 ...,
250 # digits to pass to format.data.frame
251 digits = NULL,
252 # TRUE -> right-justify columns; FALSE -> left-justify
253 right = TRUE,
254 # maximumn number of rows to print
255 max = NULL,
256 # string with justification of each column
257 justification = NULL,
258 # TRUE to center on page
259 centered = TRUE,
260 # optional caption
261 caption = NULL,
262 # h(inline); b(bottom); t (top) or p (separate page)
263 anchor = "h",
264 # set underscore_whack to TRUE to escape underscores
265 underscore_whack = TRUE
266 ) {
267 if (is.null(justification))
268 justification <-
269 Reduce(
270 f = paste,
271 x = rep_len(if (right) "r" else "l", length(colnames(x)))
272 )
273 n <- length(rownames(x))
274 if (length(x) == 0L) {
275 cat(
276 sprintf(
277 # if n is one, use singular 'row', else use plural 'rows'
278 ngettext(
279 n,
280 "data frame with 0 columns and %d row",
281 "data frame with 0 columns and %d rows"
282 ),
283 n
284 ),
285 "\n",
286 sep = ""
287 )
288 } else if (n == 0L) {
289 cat("0 rows for:\n")
290 latex_itemized_list(
291 v = names(x),
292 underscore_whack = underscore_whack
293 )
294 } else {
295 if (is.null(max))
296 max <- getOption("max.print", 99999L)
297 if (!is.finite(max))
298 stop("invalid 'max' / getOption(\"max.print\"): ",
299 max)
300 omit <- (n0 <- max %/% length(x)) < n
301 m <- as.matrix(
302 format.data.frame(
303 if (omit) x[seq_len(n0), , drop = FALSE] else x,
304 digits = digits,
305 na.encode = FALSE
306 )
307 )
308 cat(
309 # h(inline); b(bottom); t (top) or p (separate page)
310 paste0("\\begin{table}[", anchor, "]\n")
311 )
312 if (!is.null(caption))
313 cat(paste0(" \\caption{", caption, "}"))
314 if (centered) cat("\\centering\n")
315 cat(
316 paste(
317 " \\begin{tabular}{",
318 justification,
319 "}\n",
320 sep = ""
321 )
322 )
323 # ref: https://tex.stackexchange.com/a/50353
324 # Describes use of \rule{0pt}{3ex}
325 if (!is.null(caption))
326 cat("\\B \\\\ \\hline\\hline\n")
327 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355
328 latex_table_row(
329 v = colnames(m),
330 extra = "\\T\\B",
331 underscore_whack = underscore_whack
332 )
333 cat("\\hline\n")
334 for (i in seq_len(length(m[, 1]))) {
335 latex_table_row(
336 v = m[i, ],
337 underscore_whack = underscore_whack
338 )
339 }
340 cat(
341 paste(
342 " \\end{tabular}",
343 "\\end{table}",
344 sep = "\n"
345 )
346 )
347 if (omit)
348 cat(" [ reached 'max' / getOption(\"max.print\") -- omitted",
349 n - n0, "rows ]\n")
350 }
351 invisible(x)
352 }
353
354 hypersub <-
355 function(s) {
356 hyper <- tolower(s)
357 hyper <- gsub("[^a-z0-9]+", "-", hyper)
358 hyper <- gsub("[-]+", "-", hyper)
359 hyper <- sub("^[-]", "", hyper)
360 hyper <- sub("[-]$", "", hyper)
361 return(hyper)
362 }
363
364 subsection_header <-
365 function(s) {
366 hyper <- hypersub(s)
367 cat(
368 sprintf(
369 "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n",
370 hyper, s, hyper
371 )
372 )
373 }
374
375 subsubsection_header <-
376 function(s) {
377 hyper <- hypersub(s)
378 cat(
379 sprintf(
380 "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n",
381 hyper, s, hyper
382 )
383 )
384 }
385
386 ### SQLite functions
387
388 ddl_exec <- function(db, sql) {
389 discard <- DBI::dbExecute(conn = db, statement = sql)
390 if (FALSE && discard != 0) {
391 need_newpage <- TRUE
392 if (need_newpage) {
393 need_newpage <<- FALSE
394 cat("\\newpage\n")
395 }
396 o_file <- stdout()
397 cat("\n\\begin{verbatim}\n")
398 cat(sql, file = o_file)
399 cat(sprintf("\n%d rows affected by DDL\n", discard), file = o_file)
400 cat("\n\\end{verbatim}\n")
401 }
402 }
403
404 dml_no_rows_exec <- function(db, sql) {
405 discard <- DBI::dbExecute(conn = db, statement = sql)
406 if (discard != 0) {
407 need_newpage <- TRUE
408 if (need_newpage) {
409 need_newpage <<- FALSE
410 cat("\\newpage\n")
411 }
412 cat("\n\\begin{verbatim}\n")
413 o_file <- stdout()
414 cat(sql, file = o_file)
415 cat(sprintf("\n%d rows affected by DML\n", discard), file = o_file)
416 cat("\n\\end{verbatim}\n")
417 }
418 }
419
420 ### KSEA functions and helpers
421
422 # Adapted from KSEAapp::KSEA.Scores to allow retrieval of:
423 # - maximum log2(FC)
424 ksea_scores <- function(
425
426 # For human data, typically, ksdata = KSEAapp::ksdata
427 ksdata,
428
429 # Input data file having columns:
430 # - Protein : abbreviated protein name
431 # - Gene : HUGO gene name
432 # - Peptide : peptide sequence without indications of phosphorylation
433 # - Reside.Both : position(s) of phosphorylation within Gene sequence
434 # - First letter designates AA that is modified
435 # - Numbers indicate position within Gene
436 # - Multiple values are separated by semicolons
437 # - p : p-value
438 # - FC : fold-change
439 px,
440
441 # A binary input of TRUE or FALSE, indicating whether or not to include
442 # NetworKIN predictions
443 networkin,
444
445 # A numeric value between 1 and infinity setting the minimum NetworKIN
446 # score (can be left out if networkin = FALSE)
447 networkin_cutoff
448
449 ) {
450 if (length(grep(";", px$Residue.Both)) == 0) {
451 # There are no Residue.Both entries having semicolons, so new is
452 # simply px except two columns are renamed and a column is added
453 # for log2(abs(fold-change))
454 new <- px
455 colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
456 new$log2_fc <- log2(abs(as.numeric(as.character(new$FC))))
457 new <- new[complete.cases(new$log2_fc), ]
458 } else {
459 # Split each row having semicolons in Residue.Both into rows that are
460 # duplicated in all respects except that each row has a single
461 # member of the set "split-on-semicolon-Residue.Both"
462 px_double <- px[grep(";", px$Residue.Both), ]
463 residues <- as.character(px_double$Residue.Both)
464 residues <- as.matrix(residues, ncol = 1)
465 split <- strsplit(residues, split = ";")
466 # x gets count of residues in each row,
467 # i.e., 1 + count of semicolons
468 x <- sapply(split, length)
469 # Here is the set of split rows
470 px_single <- data.frame(
471 Protein = rep(px_double$Protein, x),
472 Gene = rep(px_double$Gene, x),
473 Peptide = rep(px_double$Peptide, x),
474 Residue.Both = unlist(split),
475 p = rep(px_double$p, x),
476 FC = rep(px_double$FC, x)
477 )
478 # new first gets the split rows
479 new <- px[-grep(";", px$Residue.Both), ]
480 # to new, append the rows that didn't need splitting in the first place
481 new <- rbind(new, px_single)
482 # map Gene to SUB_GENE
483 # map Residue.Both to SUB_MOD_RSD
484 colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
485 # Eliminate any non-positive values to prevent introduction of
486 # infinite or NaN values
487 new[(0 <= new$log2_fc), "log2_fc"] <- NA
488 # Because of preceding step, there is no need for abs in the next line
489 new$log2_fc <- log2(as.numeric(as.character(new$FC)))
490 # Convert any illegal values from NaN to NA
491 new[is.nan(new$log2_fc), "log2_fc"] <- NA
492 # Eliminate rows having missing values (e.g., non-imputed data)
493 new <- new[complete.cases(new$log2_fc), ]
494 }
495 if (networkin == TRUE) {
496 # When NetworKIN is true, filter on NetworKIN.cutoff which includes
497 # PhosphoSitePlus data *because its networkin_score is set to Inf*
498 ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ]
499 ksdata_filtered <- ksdata_filtered[
500 (ksdata_filtered$networkin_score >= networkin_cutoff), ]
501 } else {
502 # Otherwise, simply use PhosphSitePlus rows
503 ksdata_filtered <- ksdata[
504 grep("PhosphoSitePlus", ksdata$Source), ]
505 }
506 # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD
507 # colnames of ksdata_filtered:
508 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID"
509 # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID"
510 # "SITE_...7_AA" "networkin_score" "Source"
511 # colnames of new:
512 # "Protein" "SUB_GENE" "Peptide" "SUB_MOD_RSD" "p" "FC" "log2_fc"
513 # Equivalent to:
514 # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc
515 # FROM ksdata_filtered a
516 # INNER JOIN new b
517 # ON a.SUB_GENE = b.SUB_GENE
518 # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD
519 ksdata_dataset <- base::merge(ksdata_filtered, new)
520 # colnames of ksdata_dataset:
521 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE"
522 # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD"
523 # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein"
524 # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform)
525 # Re-order dataset; prior to accounting for isoforms
526 ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ]
527 # Extract non-isoform accession in UniProtKB
528 ksdata_dataset$uniprot_no_isoform <- sapply(
529 ksdata_dataset$KIN_ACC_ID,
530 function(x) unlist(strsplit(as.character(x), split = "-"))[1]
531 )
532 # Discard previous results while selecting interesting columns ...
533 ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)]
534 # Column names are now:
535 # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p"
536 # "FC" "log2_fc" "Source"
537 # Make column names human-readable
538 colnames(ksdata_dataset_abbrev) <- c(
539 "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p",
540 "FC", "log2FC", "Source"
541 )
542 # SELECT * FROM ksdata_dataset_abbrev
543 # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p
544 ksdata_dataset_abbrev <-
545 ksdata_dataset_abbrev[
546 order(
547 ksdata_dataset_abbrev$Kinase.Gene,
548 ksdata_dataset_abbrev$Substrate.Gene,
549 ksdata_dataset_abbrev$Substrate.Mod,
550 ksdata_dataset_abbrev$p),
551 ]
552 # First aggregation step to account for multiply phosphorylated peptides
553 # and differing peptide sequences; the goal here is to combine results
554 # for all measurements of the same substrate.
555 # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
556 # `Source`, avg(log2FC) AS log2FC
557 # FROM ksdata_dataset_abbrev
558 # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
559 # `Source`
560 # ORDER BY `Kinase.Gene`;
561 # in two steps:
562 # (1) compute average log_2(fold-change)
563 ksdata_dataset_abbrev <- aggregate(
564 log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source,
565 data = ksdata_dataset_abbrev,
566 FUN = mean
567 )
568 # (2) order by Kinase.Gene
569 ksdata_dataset_abbrev <-
570 ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ]
571 # SELECT `Kinase.Gene`, count(*)
572 # FROM ksdata_dataset_abbrev
573 # GROUP BY `Kinase.Gene`;
574 # in two steps:
575 # (1) Extract the list of Kinase.Gene names
576 kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene)
577 # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev,
578 # named by Kinase.Gene
579 kinase_list <- as.matrix(table(kinase_list))
580 # Second aggregation step to account for all substrates per kinase
581 # CREATE TABLE mean_fc
582 # AS
583 # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC
584 # FROM ksdata_dataset_abbrev
585 # GROUP BY `Kinase.Gene`
586 mean_fc <- aggregate(
587 log2FC ~ Kinase.Gene,
588 data = ksdata_dataset_abbrev,
589 FUN = mean
590 )
591 # mean_fc columns: "Kinase.Gene", "log2FC"
592 if (FALSE) {
593 # I need to re-think this; I was trying to find the most-represented
594 # peptide, but that horse has already left the barn
595 # SELECT `Kinase.Gene`, max(abs(log2FC)) AS log2FC
596 # FROM ksdata_dataset_abbrev
597 # GROUP BY `Kinase.Gene`
598 max_fc <- aggregate(
599 log2FC ~ Kinase.Gene,
600 data = ksdata_dataset_abbrev,
601 FUN = function(r) max(abs(r))
602 )
603 }
604
605 # Create column 3: mS
606 mean_fc$m_s <- mean_fc[, 2]
607 # Create column 4: Enrichment
608 mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE))
609 # Create column 5: m, count of substrates
610 mean_fc$m <- kinase_list
611 # Create column 6: z-score
612 mean_fc$z_score <- (
613 (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) *
614 sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE)
615 # Create column 7: p-value, deduced from z-score
616 mean_fc$p_value <- pnorm(-abs(mean_fc$z_score))
617 # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value
618 mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr")
619
620 # Remove log2FC column, which is duplicated as mS
621 mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2]
622 # Correct the column names which we had to hack because of the linter...
623 colnames(mean_fc) <- c(
624 "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
625 )
626 return(mean_fc)
627 }
628
629 low_fdr_barplot <- function(
630 rslt,
631 i_cntrst,
632 i,
633 a_level,
634 b_level,
635 fold_change,
636 caption
637 ) {
638 rslt_score_list_i <- rslt$score_list[[i]]
639 if (!is.null(rslt_score_list_i)) {
640 rslt_score_list_i_nrow <- nrow(rslt_score_list_i)
641 k <- data.frame(
642 contrast = as.integer(i_cntrst),
643 a_level = rep.int(a_level, rslt_score_list_i_nrow),
644 b_level = rep.int(b_level, rslt_score_list_i_nrow),
645 kinase_gene = rslt_score_list_i$Kinase.Gene,
646 mean_log2_fc = rslt_score_list_i$mS,
647 enrichment = rslt_score_list_i$Enrichment,
648 substrate_count = rslt_score_list_i$m,
649 z_score = rslt_score_list_i$z.score,
650 p_value = rslt_score_list_i$p.value,
651 fdr = rslt_score_list_i$FDR
652 )
653 selector <- switch(
654 ksea_cutoff_statistic,
655 "FDR" = {
656 k$fdr
657 },
658 "p.value" = {
659 k$p_value
660 },
661 stop(
662 sprintf(
663 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
664 ksea_cutoff_statistic
665 )
666 )
667 )
668
669 k <- k[selector < ksea_cutoff_threshold, ]
670
671 if (nrow(k) > 1) {
672 op <- par(mai = c(1, 1.5, 0.4, 0.4))
673 numeric_z_score <- as.numeric(k$z_score)
674 z_score_order <- order(numeric_z_score)
675 kinase_name <- k$kinase_gene
676 long_caption <-
677 sprintf(
678 "Kinase z-score, %s < %s, %s",
679 ksea_cutoff_statistic,
680 ksea_cutoff_threshold,
681 caption
682 )
683 my_cex_caption <- 65.0 / max(65.0, nchar(long_caption))
684 cat("\n\\clearpage\n")
685 barplot(
686 height = numeric_z_score[z_score_order],
687 border = NA,
688 xpd = FALSE,
689 cex.names = 1.0,
690 cex.axis = 1.0,
691 main = long_caption,
692 cex.main = my_cex_caption,
693 names.arg = kinase_name[z_score_order],
694 horiz = TRUE,
695 srt = 45,
696 las = 1)
697 par(op)
698 }
699 }
700 }
701
702 # note that this adds elements to the global variable `ksea_asterisk_hash`
703
704 low_fdr_print <- function(
705 rslt,
706 i_cntrst,
707 i,
708 a_level,
709 b_level,
710 fold_change,
711 caption
712 ) {
713 rslt_score_list_i <- rslt$score_list[[i]]
714 if (!is.null(rslt_score_list_i)) {
715 rslt_score_list_i_nrow <- nrow(rslt_score_list_i)
716 k <- contrast_ksea_scores <- data.frame(
717 contrast = as.integer(i_cntrst),
718 a_level = rep.int(a_level, rslt_score_list_i_nrow),
719 b_level = rep.int(b_level, rslt_score_list_i_nrow),
720 kinase_gene = rslt_score_list_i$Kinase.Gene,
721 mean_log2_fc = rslt_score_list_i$mS,
722 enrichment = rslt_score_list_i$Enrichment,
723 substrate_count = rslt_score_list_i$m,
724 z_score = rslt_score_list_i$z.score,
725 p_value = rslt_score_list_i$p.value,
726 fdr = rslt_score_list_i$FDR
727 )
728
729 selector <- switch(
730 ksea_cutoff_statistic,
731 "FDR" = {
732 k$fdr
733 },
734 "p.value" = {
735 k$p_value
736 },
737 stop(
738 sprintf(
739 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
740 ksea_cutoff_statistic
741 )
742 )
743 )
744
745 k <- k[selector < ksea_cutoff_threshold, ]
746 # save kinase names to ksea_asterisk_hash
747 for (kinase_name in k$kinase_gene) {
748 ksea_asterisk_hash[[kinase_name]] <- 1
749 }
750
751 db_write_table_overwrite <- (i_cntrst < 2)
752 db_write_table_append <- !db_write_table_overwrite
753 RSQLite::dbWriteTable(
754 conn = db,
755 name = "contrast_ksea_scores",
756 value = contrast_ksea_scores,
757 append = db_write_table_append
758 )
759 selector <- switch(
760 ksea_cutoff_statistic,
761 "FDR" = {
762 contrast_ksea_scores$fdr
763 },
764 "p.value" = {
765 contrast_ksea_scores$p_value
766 },
767 stop(
768 sprintf(
769 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
770 ksea_cutoff_statistic
771 )
772 )
773 )
774 output_df <- contrast_ksea_scores[
775 selector < ksea_cutoff_threshold,
776 c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count",
777 "z_score", "p_value", "fdr")
778 ]
779 output_order <- with(output_df, order(mean_log2_fc, kinase_gene))
780 output_df <- output_df[output_order, ]
781 colnames(output_df) <-
782 c(
783 colnames(output_df)[1],
784 colnames(output_df)[2],
785 "enrichment",
786 "m_s",
787 "z_score",
788 "p_value",
789 "fdr"
790 )
791 output_df$fdr <- sprintf("%0.4f", output_df$fdr)
792 output_df$p_value <- sprintf("%0.2e", output_df$p_value)
793 output_df$z_score <- sprintf("%0.2f", output_df$z_score)
794 output_df$m_s <- sprintf("%d", output_df$m_s)
795 output_df$enrichment <- sprintf("%0.2f", output_df$enrichment)
796 output_ncol <- ncol(output_df)
797 colnames(output_df) <-
798 c(
799 "Kinase",
800 "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)",
801 "Enrichment",
802 "Substrates",
803 "z-score",
804 "p-value",
805 "FDR"
806 )
807 selector <- switch(
808 ksea_cutoff_statistic,
809 "FDR" = {
810 rslt$score_list[[i]]$FDR
811 },
812 "p.value" = {
813 rslt$score_list[[i]]$p.value
814 },
815 stop(
816 sprintf(
817 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
818 ksea_cutoff_statistic
819 )
820 )
821 )
822 if (sum(selector < ksea_cutoff_threshold) > 0) {
823 math_caption <- gsub("{", "\\{", caption, fixed = TRUE)
824 math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE)
825 data_frame_latex(
826 x = output_df,
827 justification = "l c c c c c c",
828 centered = TRUE,
829 caption = sprintf(
830 "\\text{%s}, %s < %s",
831 math_caption,
832 ksea_cutoff_statistic,
833 ksea_cutoff_threshold
834 ),
835 anchor = const_table_anchor_p
836 )
837 } else {
838 cat(
839 sprintf(
840 "\\break
841 No kinases had
842 \\(\\text{%s}_\\text{enrichment} < %s\\)
843 for contrast %s\\hfill\\break\n",
844 ksea_cutoff_statistic,
845 ksea_cutoff_threshold,
846 caption
847 )
848 )
849 }
850 }
851 }
852
853 # create_breaks is a helper for ksea_heatmap
854 create_breaks <- function(merged_scores) {
855 if (min(merged_scores, na.rm = TRUE) < -1.6) {
856 breaks_neg <- seq(-1.6, 0, length.out = 30)
857 breaks_neg <-
858 append(
859 seq(min(merged_scores, na.rm = TRUE), -1.6, length.out = 10),
860 breaks_neg
861 )
862 breaks_neg <- sort(unique(breaks_neg))
863 } else {
864 breaks_neg <- seq(-1.6, 0, length.out = 30)
865 }
866 if (max(merged_scores, na.rm = TRUE) > 1.6) {
867 breaks_pos <- seq(0, 1.6, length.out = 30)
868 breaks_pos <-
869 append(
870 breaks_pos,
871 seq(1.6, max(merged_scores, na.rm = TRUE),
872 length.out = 10)
873 )
874 breaks_pos <- sort(unique(breaks_pos))
875 } else {
876 breaks_pos <- seq(0, 1.6, length.out = 30)
877 }
878 breaks_all <- unique(append(breaks_neg, breaks_pos))
879 mycol_neg <-
880 gplots::colorpanel(n = length(breaks_neg),
881 low = "blue",
882 high = "white")
883 mycol_pos <-
884 gplots::colorpanel(n = length(breaks_pos) - 1,
885 low = "white",
886 high = "red")
887 mycol <- unique(append(mycol_neg, mycol_pos))
888 color_breaks <- list(breaks_all, mycol)
889 return(color_breaks)
890 }
891
892 # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap
893 draw_kseaapp_summary_heatmap <- function(
894 x,
895 sample_cluster,
896 merged_asterisk,
897 my_cex_row,
898 color_breaks,
899 margins,
900 ...
901 ) {
902 merged_scores <- x
903 if (!is.matrix(x)) {
904 cat(
905 paste0(
906 "No plot because \\texttt{typeof(x)} is '",
907 typeof(x),
908 "' rather than 'matrix'.\n\n"
909 )
910 )
911 } else if (nrow(x) < 2) {
912 cat("No plot because matrix x has ", nrow(x), " rows.\n\n")
913 cat("\\begin{verbatim}\n")
914 str(x)
915 cat("\\end{verbatim}\n")
916 } else if (ncol(x) < 2) {
917 cat("No plot because matrix x has ", ncol(x), " columns.\n\n")
918 cat("\\begin{verbatim}\n")
919 str(x)
920 cat("\\end{verbatim}\n")
921 } else {
922 gplots::heatmap.2(
923 x = merged_scores,
924 Colv = sample_cluster,
925 scale = "none",
926 cellnote = merged_asterisk,
927 notecol = "white",
928 cexCol = 0.9,
929 # Heuristically assign size of row labels
930 cexRow = min(1.0, ((3 * my_cex_row) ^ 1.7) / 2.25),
931 srtCol = 45,
932 srtRow = 45,
933 notecex = 3 * my_cex_row,
934 col = color_breaks[[2]],
935 density.info = "none",
936 trace = "none",
937 breaks = color_breaks[[1]],
938 lmat = rbind(c(0, 3), c(2, 1), c(0, 4)),
939 lhei = c(0.4, 8.0, 1.1),
940 lwid = c(0.5, 3),
941 key = FALSE,
942 margins = margins,
943 ...
944 )
945 }
946 }
947
948 # Adapted from KSEAapp::KSEA.Heatmap
949 ksea_heatmap <- function(
950 # the data frame outputs from the KSEA.Scores() function, in list format
951 score_list,
952 # a character vector of all the sample names for heatmap annotation:
953 # - the names must be in the same order as the data in score_list
954 # - please avoid long names, as they may get cropped in the final image
955 sample_labels,
956 # character string of either "p.value" or "FDR" indicating the data column
957 # to use for marking statistically significant scores
958 stats,
959 # a numeric value between 0 and infinity indicating the min. number of
960 # substrates a kinase must have to be included in the heatmap
961 m_cutoff,
962 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
963 # for indicating significant kinases in the heatmap
964 p_cutoff =
965 stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"),
966 # a binary input of TRUE or FALSE, indicating whether or not to perform
967 # hierarchical clustering of the sample columns
968 sample_cluster,
969 # a binary input of TRUE or FALSE, indicating whether or not to export
970 # the heatmap as a .png image into the working directory
971 export = FALSE,
972 # bottom and right margins; adjust as needed if contrast names are too long
973 margins = c(6, 20),
974 # print which kinases?
975 # - Mandatory argument, must be one of const_ksea_.*_kinases
976 which_kinases,
977 # additional arguments to gplots::heatmap.2, such as:
978 # - main: main title of plot
979 # - xlab: x-axis label
980 # - ylab: y-axis label
981 ...
982 ) {
983 filter_m <- function(dataset, m_cutoff) {
984 filtered <- dataset[(dataset$m >= m_cutoff), ]
985 return(filtered)
986 }
987 score_list_m <- lapply(score_list, function(...) filter_m(..., m_cutoff))
988 for (i in seq_len(length(score_list_m))) {
989 names <- colnames(score_list_m[[i]])[c(2:7)]
990 colnames(score_list_m[[i]])[c(2:7)] <-
991 paste(names, i, sep = ".")
992 }
993 master <-
994 Reduce(
995 f = function(...) {
996 base::merge(..., by = "Kinase.Gene", all = FALSE)
997 },
998 x = score_list_m
999 )
1000
1001 row.names(master) <- master$Kinase.Gene
1002 columns <- as.character(colnames(master))
1003 merged_scores <- as.matrix(master[, grep("z.score", columns), drop = FALSE])
1004 colnames(merged_scores) <- sample_labels
1005 merged_stats <- as.matrix(master[, grep(stats, columns)])
1006 asterisk <- function(mtrx, p_cutoff) {
1007 new <- data.frame()
1008 for (i in seq_len(nrow(mtrx))) {
1009 for (j in seq_len(ncol(mtrx))) {
1010 my_value <- mtrx[i, j]
1011 if (!is.na(my_value) && my_value < p_cutoff) {
1012 new[i, j] <- "*"
1013 } else {
1014 new[i, j] <- ""
1015 }
1016 }
1017 }
1018 return(new)
1019 }
1020 merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff))
1021
1022 # begin hack to print only significant rows
1023 asterisk_rows <- rowSums(merged_asterisk == "*") > 0
1024 all_rows <- rownames(merged_stats)
1025 names(asterisk_rows) <- all_rows
1026 non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE])
1027 asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE])
1028 merged_scores_asterisk <- merged_scores[names(asterisk_rows), ]
1029 merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), ]
1030 # end hack to print only significant rows
1031
1032 row_list <- list()
1033 row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows
1034 row_list[[const_ksea_all_kinases]] <- all_rows
1035 row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows
1036
1037 i <- which_kinases
1038 my_row_names <- row_list[[i]]
1039 scrs <- merged_scores[my_row_names, ]
1040 stts <- merged_stats[my_row_names, ]
1041 merged_asterisk <- as.matrix(asterisk(stts, p_cutoff))
1042
1043 color_breaks <- create_breaks(scrs)
1044 plot_height <- nrow(scrs) ^ 0.55
1045 plot_width <- ncol(scrs) ^ 0.7
1046 my_cex_row <- 0.25 * 16 / plot_height
1047 if (export == "TRUE") {
1048 png(
1049 "KSEA.Merged.Heatmap.png",
1050 width = plot_width * 300,
1051 height = 2 * plot_height * 300,
1052 res = 300,
1053 pointsize = 14
1054 )
1055 }
1056 draw_kseaapp_summary_heatmap(
1057 x = scrs,
1058 sample_cluster = sample_cluster,
1059 merged_asterisk = merged_asterisk,
1060 my_cex_row = my_cex_row,
1061 color_breaks = color_breaks,
1062 margins = margins
1063 )
1064 if (export == "TRUE") {
1065 dev.off()
1066 }
1067 return(my_row_names)
1068 }
1069
1070 # helper for heatmaps of phosphopeptide intensities
1071
1072 draw_intensity_heatmap <-
1073 function(
1074 m, # matrix with rownames already formatted
1075 cutoff, # cutoff used by hm_heading_function
1076 hm_heading_function, # construct and cat heading from m and cutoff
1077 hm_main_title, # main title for plot (drawn below heading)
1078 suppress_row_dendrogram = TRUE, # set to false to show dendrogram
1079 max_peptide_count # experimental:
1080 = intensity_hm_rows, # values of 50 and 75 worked well
1081 ... # passthru parameters for heatmap
1082 ) {
1083 peptide_count <- 0
1084 # emit the heading for the heatmap
1085 if (hm_heading_function(m, cutoff)) {
1086 peptide_count <- min(max_peptide_count, nrow(m))
1087 if (nrow(m) > 1) {
1088 m_margin <- m[peptide_count:1, ]
1089 # Margin setting was heuristically derived
1090 margins <-
1091 c(0.5, # col
1092 max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row
1093 )
1094 }
1095 if (nrow(m) > 1) {
1096 tryCatch(
1097 {
1098 old_oma <- par("oma")
1099 par(cex.main = 0.6)
1100 # Heuristically determined character size adjustment formula
1101 char_contractor <-
1102 250000 / (
1103 max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows
1104 )
1105 heatmap(
1106 m[peptide_count:1, ],
1107 Rowv = if (suppress_row_dendrogram) NA else NULL,
1108 Colv = NA,
1109 cexRow = char_contractor,
1110 cexCol = char_contractor * 50 / max_peptide_count,
1111 scale = "row",
1112 margins = margins,
1113 main =
1114 "Unimputed, unnormalized log(intensities)",
1115 xlab = "",
1116 las = 1,
1117 ...
1118 )
1119 },
1120 error = function(e) {
1121 cat(
1122 sprintf(
1123 "\nCould not draw heatmap, possibly because of too many missing values. Internal message: %s\n",
1124 e$message
1125 )
1126 )
1127 },
1128 finally = par(old_oma)
1129 )
1130 }
1131 }
1132 return(peptide_count)
1133 }
1134 ```
1135
1136 ```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
1137 cat("\\listoftables\n")
1138 ```
1139 # Purpose
1140
1141 To perform for phosphopeptides:
1142
1143 - imputation of missing values,
1144 - quantile normalization,
1145 - ANOVA (using the R stats::`r params$oneWayManyCategories` function), and
1146 - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases:
1147 - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org)
1148 - The Human Proteome Database [http://hprd.org](http://hprd.org)
1149 - NetworKIN [http://networkin.science/](http://networkin.science/)
1150 - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx)
1151
1152 ```{r include = FALSE}
1153
1154 ### GLOBAL VARIABLES
1155
1156 # parameters for KSEA
1157
1158 ksea_cutoff_statistic <- params$kseaCutoffStatistic
1159 ksea_cutoff_threshold <- params$kseaCutoffThreshold
1160 ksea_min_kinase_count <- params$kseaMinKinaseCount
1161
1162 ksea_heatmap_titles <- list()
1163 ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <-
1164 sprintf(
1165 "Summary for all kinases enriched in one or more contrasts at %s < %s",
1166 ksea_cutoff_statistic,
1167 ksea_cutoff_threshold
1168 )
1169 ksea_heatmap_titles[[const_ksea_all_kinases]] <-
1170 "Summary figure for all contrasts and all kinases"
1171 ksea_heatmap_titles[[const_ksea_nonastrsk_kinases]] <-
1172 sprintf(
1173 "Summary for all kinases not enriched at %s < %s in any contrast",
1174 ksea_cutoff_statistic,
1175 ksea_cutoff_threshold
1176 )
1177 # hash to hold names of significantly enriched kinases
1178 ksea_asterisk_hash <- new_env()
1179
1180 # READ PARAMETERS (mostly)
1181
1182 intensity_hm_rows <- params$intensityHeatmapRows
1183 # Input Filename
1184 input_file <- params$inputFile
1185
1186 # First data column - ideally, this could be detected via regexSampleNames,
1187 # but for now leave it as is.
1188 first_data_column <- params$firstDataColumn
1189 fdc_is_integer <- is.integer(first_data_column)
1190 if (fdc_is_integer) {
1191 first_data_column <- as.integer(params$firstDataColumn)
1192 }
1193
1194 # False discovery rate adjustment for ANOVA
1195 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05
1196 val_fdr <-
1197 read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "")
1198
1199 if (
1200 ncol(val_fdr) != 1 ||
1201 sum(!is.numeric(val_fdr[, 1])) ||
1202 sum(val_fdr[, 1] < 0) ||
1203 sum(val_fdr[, 1] > 1)
1204 ) {
1205 stop("alphaFile should be one column of numbers within the range [0.0,1.0]")
1206 }
1207 val_fdr <- val_fdr[, 1]
1208
1209 #Imputed Data filename
1210 imputed_data_filename <- params$imputedDataFilename
1211 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
1212 anova_ksea_mtdt_file <- params$anovaKseaMetadata
1213
1214 ```
1215
1216 ```{r echo = FALSE}
1217 # Imputation method, should be one of
1218 # "random", "group-median", "median", or "mean"
1219 imputation_method <- params$imputationMethod
1220
1221 # Selection of percentile of logvalue data to set the mean for random number
1222 # generation when using random imputation
1223 mean_percentile <- params$meanPercentile / 100.0
1224
1225 # deviation adjustment-factor for random values; real number.
1226 sd_percentile <- params$sdPercentile
1227
1228 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
1229 regex_sample_names <- params$regexSampleNames
1230
1231 # Regular expression to extract Sample Grouping from Sample Name;
1232 # if error occurs, compare sample_treatment_levels vs. sample_name_matches
1233 # to see if groupings/pairs line up
1234 # e.g., "(\\d+)"
1235 regex_sample_grouping <- params$regexSampleGrouping
1236
1237 one_way_all_categories_fname <- params$oneWayManyCategories
1238 one_way_all_categories <- try_catch_w_e(
1239 match.fun(one_way_all_categories_fname))
1240 if (!is.function(one_way_all_categories$value)) {
1241 write("fatal error for parameter oneWayManyCategories:", stderr())
1242 write(one_way_all_categories$value$message, stderr())
1243 if (sys.nframe() > 0) quit(save = "no", status = 1)
1244 stop("Cannot continue. Goodbye.")
1245 }
1246 one_way_all_categories <- one_way_all_categories$value
1247
1248 one_way_two_categories_fname <- params$oneWayManyCategories
1249 one_way_two_categories <- try_catch_w_e(
1250 match.fun(one_way_two_categories_fname))
1251 if (!is.function(one_way_two_categories$value)) {
1252 cat("fatal error for parameter oneWayTwoCategories: \n")
1253 cat(one_way_two_categories$value$message, fill = TRUE)
1254 if (sys.nframe() > 0) quit(save = "no", status = 1)
1255 stop("Cannot continue. Goodbye.")
1256 }
1257 one_way_two_categories <- one_way_two_categories$value
1258
1259 preproc_db <- params$preprocDb
1260 ksea_app_prep_db <- params$kseaAppPrepDb
1261 result <- file.copy(
1262 from = preproc_db,
1263 to = ksea_app_prep_db,
1264 overwrite = TRUE
1265 )
1266 if (!result) {
1267 write(
1268 sprintf(
1269 "fatal error copying initial database '%s' to output '%s'",
1270 preproc_db,
1271 ksea_app_prep_db,
1272 ),
1273 stderr()
1274 )
1275 if (sys.nframe() > 0) quit(save = "no", status = 1)
1276 stop("Cannot continue. Goodbye.")
1277 }
1278 ```
1279
1280 ```{r echo = FALSE}
1281 ### READ DATA
1282
1283 # read.table reads a file in table format and creates a data frame from it.
1284 # - note that `quote = ""` means that quotation marks are treated literally.
1285 full_data <- read.table(
1286 file = input_file,
1287 sep = "\t",
1288 header = TRUE,
1289 quote = "",
1290 check.names = FALSE
1291 )
1292 ```
1293
1294 # Extract Sample Names and Treatment Levels
1295
1296 Column names parsed from input file are shown in Table 1; sample names and treatment levels, in Table 2.
1297
1298 ```{r echo = FALSE, results = 'asis'}
1299
1300 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE)
1301
1302 if (!fdc_is_integer) {
1303 if (length(data_column_indices) > 0) {
1304 first_data_column <- data_column_indices[1]
1305 } else {
1306 stop(paste("failed to convert firstDataColumn:", first_data_column))
1307 }
1308 }
1309
1310 cat(
1311 sprintf(
1312 paste(
1313 "\n\nThe input data file has peptide-intensity data for each sample",
1314 "in one of columns %d through %d.\n\n"
1315 ),
1316 min(data_column_indices),
1317 max(data_column_indices)
1318 )
1319 )
1320
1321 # Write column names as a LaTeX enumerated list.
1322 column_name_df <- data.frame(
1323 column = seq_len(length(colnames(full_data))),
1324 name = paste0("\\verb@", colnames(full_data), "@")
1325 )
1326 data_frame_latex(
1327 x = column_name_df,
1328 justification = "l l",
1329 centered = TRUE,
1330 caption = "Input data column names",
1331 anchor = const_table_anchor_bp,
1332 underscore_whack = FALSE
1333 )
1334
1335 ```
1336
1337 ```{r echo = FALSE, results = 'asis'}
1338 quant_data <- full_data[first_data_column:length(full_data)]
1339 quant_data[quant_data == 0] <- NA
1340 rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide
1341 # Extract factors and trt-replicates using regular expressions.
1342 # Typically:
1343 # regex_sample_names is "\\.\\d+[A-Z]$"
1344 # regex_sample_grouping is "\\d+"
1345 # This would distinguish trt-replicates by terminal letter [A-Z]
1346 # in sample names and group them into trts by the preceding digits.
1347 # e.g.:
1348 # group .1A .1B .1C into group 1;
1349 # group .2A .2B .2C, into group 2;
1350 # etc.
1351 m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE)
1352 sample_name_matches <- regmatches(names(quant_data), m)
1353 colnames(quant_data) <- sample_name_matches
1354
1355 write_debug_file(quant_data)
1356
1357 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
1358 sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match))
1359 number_of_samples <- length(sample_name_matches)
1360 sample_treatment_df <- data.frame(
1361 level = sample_treatment_levels,
1362 sample = sample_name_matches
1363 )
1364 data_frame_latex(
1365 x = sample_treatment_df,
1366 justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}",
1367 centered = TRUE,
1368 caption = "Treatment levels",
1369 anchor = const_table_anchor_tbp,
1370 underscore_whack = FALSE
1371 )
1372 ```
1373
1374 ```{r echo = FALSE, results = 'asis'}
1375 cat("\\newpage\n")
1376 ```
1377
1378 ## Are the log-transformed sample distributions similar?
1379
1380 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
1381
1382 quant_data[quant_data == 0] <- NA #replace 0 with NA
1383 quant_data_log <- log10(quant_data)
1384
1385 rownames(quant_data_log) <- rownames(quant_data)
1386 colnames(quant_data_log) <- sample_name_matches
1387
1388 write_debug_file(quant_data_log)
1389
1390 # data visualization
1391 old_par <- par(
1392 mai = par("mai") + c(0.5, 0, 0, 0)
1393 )
1394 # ref: https://r-charts.com/distribution/add-points-boxplot/
1395 # Vertical plot
1396 boxplot(
1397 quant_data_log
1398 , las = 1
1399 , col = const_boxplot_fill
1400 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
1401 , xlab = "Sample"
1402 )
1403 par(old_par)
1404
1405
1406
1407 cat("\n\n\n")
1408 cat("\n\n\n")
1409
1410 ```
1411
1412 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
1413 if (nrow(quant_data_log) > 1) {
1414 quant_data_log_stack <- stack(quant_data_log)
1415 ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) +
1416 ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
1417 ggplot2::ylab("Probability density") +
1418 ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE)
1419 } else {
1420 cat("No density plot because there are too few peptides.\n\n")
1421 }
1422 ```
1423
1424 ## Globally, are peptide intensities are approximately unimodal?
1425
1426 <!--
1427 # bquote could be used as an alternative to latex2exp::TeX below particularly
1428 # and when plotting math expressions generally, at the expense of mastering
1429 # another syntax, which hardly seems worthwhile when I need to use TeX
1430 # elsewhere; here's an introduction to bquote:
1431 # https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/
1432 -->
1433 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
1434
1435 # identify the location of missing values
1436 fin <- is.finite(as.numeric(as.matrix(quant_data_log)))
1437
1438 logvalues <- as.numeric(as.matrix(quant_data_log))[fin]
1439 logvalues_density <- density(logvalues)
1440 plot(
1441 x = logvalues_density,
1442 main = latex2exp::TeX(
1443 "Smoothed estimated probability density vs. $log_{10}$(peptide intensity)"
1444 ),
1445 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
1446 ylab = "Probability density"
1447 )
1448 hist(
1449 x = as.numeric(as.matrix(quant_data_log)),
1450 xlim = c(min(logvalues_density$x), max(logvalues_density$x)),
1451 breaks = 100,
1452 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"),
1453 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)")
1454 )
1455 ```
1456
1457 ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values
1458
1459 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
1460 # determine quantile
1461 q1 <- quantile(logvalues, probs = mean_percentile)[1]
1462
1463 # 1 = row of matrix (ie, phosphopeptide)
1464 sds <- apply(quant_data_log, 1, sd_finite)
1465 if (sum(!is.na(sds)) > 2) {
1466 plot(
1467 density(sds, na.rm = TRUE)
1468 , main = "Smoothed estimated probability density vs. std. deviation"
1469 , sub = "(probability estimation made with Gaussian smoothing)"
1470 , ylab = "Probability density"
1471 )
1472 } else {
1473 cat(
1474 "At least two non-missing values are required to plot",
1475 "probability density.\n\n"
1476 )
1477 }
1478
1479 ```
1480
1481 ```{r echo = FALSE}
1482 # Determine number of cells to impute
1483 temp <- quant_data[is.na(quant_data)]
1484
1485 # Determine number of values to impute
1486 number_to_impute <- length(temp)
1487
1488 # Determine percent of missing values
1489 pct_missing_values <-
1490 round(length(temp) / (length(logvalues) + length(temp)) * 100)
1491 ```
1492
1493 ```{r echo = FALSE}
1494
1495 # prep for trt-median based imputation
1496
1497 ```
1498 # Impute Missing Values
1499
1500 ```{r echo = FALSE}
1501
1502 imp_smry_pot_peptides_before <- nrow(quant_data_log)
1503 imp_smry_missing_values_before <- number_to_impute
1504 imp_smry_pct_missing <- pct_missing_values
1505
1506 ```
1507
1508 ```{r echo = FALSE}
1509 #Determine number of cells to impute
1510
1511 ```
1512 ```{r echo = FALSE}
1513
1514 # Identify which values are missing and need to be imputed
1515 ind <- which(is.na(quant_data), arr.ind = TRUE)
1516
1517 ```
1518 ```{r echo = FALSE, results = 'asis'}
1519
1520 # Apply imputation
1521 switch(
1522 imputation_method
1523 , "group-median" = {
1524 quant_data_imp <- quant_data
1525 imputation_method_description <-
1526 paste("Substitute missing value with",
1527 "median peptide-intensity for sample group.\n"
1528 )
1529 sample_level_integers <- as.integer(sample_treatment_levels)
1530 # Take the accurate ln(x+1) because the data are log-normally distributed
1531 # and because median can involve an average of two measurements.
1532 quant_data_imp <- log1p(quant_data_imp)
1533 for (i in seq_len(length(levels(sample_treatment_levels)))) {
1534 # Determine the columns for this factor-level
1535 level_cols <- i == sample_level_integers
1536 # Extract those columns
1537 lvlsbst <- quant_data_imp[, level_cols, drop = FALSE]
1538 # assign to ind the row-column pairs corresponding to each NA
1539 ind <- which(is.na(lvlsbst), arr.ind = TRUE)
1540 # No group-median exists if there is only one sample
1541 # a given ppep has no measurement; otherwise, proceed.
1542 if (ncol(lvlsbst) > 1) {
1543 the_centers <-
1544 apply(lvlsbst, 1, median, na.rm = TRUE)
1545 for (j in seq_len(nrow(lvlsbst))) {
1546 for (k in seq_len(ncol(lvlsbst))) {
1547 if (is.na(lvlsbst[j, k])) {
1548 lvlsbst[j, k] <- the_centers[j]
1549 }
1550 }
1551 }
1552 quant_data_imp[, level_cols] <- lvlsbst
1553 }
1554 }
1555 # Take the accurate e^x - 1 to match scaling of original input.
1556 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
1557 good_rows <- !is.na(rowMeans(quant_data_imp))
1558 }
1559 , "median" = {
1560 quant_data_imp <- quant_data
1561 imputation_method_description <-
1562 paste("Substitute missing value with",
1563 "median peptide-intensity across all sample classes.\n"
1564 )
1565 # Take the accurate ln(x+1) because the data are log-normally distributed
1566 # and because median can involve an average of two measurements.
1567 quant_data_imp <- log1p(quant_data_imp)
1568 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]]
1569 # Take the accurate e^x - 1 to match scaling of original input.
1570 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
1571 good_rows <- !is.nan(rowMeans(quant_data_imp))
1572 }
1573 , "mean" = {
1574 quant_data_imp <- quant_data
1575 imputation_method_description <-
1576 paste("Substitute missing value with",
1577 "geometric-mean peptide intensity across all sample classes.\n"
1578 )
1579 # Take the accurate ln(x+1) because the data are log-normally distributed,
1580 # so arguments to mean should be previously transformed.
1581 # this will have to be
1582 quant_data_imp <- log1p(quant_data_imp)
1583 # Assign to NA cells the mean for the row
1584 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]]
1585 # Take the accurate e^x - 1 to match scaling of original input.
1586 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
1587 good_rows <- !is.nan(rowMeans(quant_data_imp))
1588 }
1589 , "random" = {
1590 quant_data_imp <- quant_data
1591 m1 <- median(sds, na.rm = TRUE) * sd_percentile #sd to be used is the median sd
1592 # If you want results to be reproducible, you will want to call
1593 # base::set.seed before calling stats::rnorm
1594 imputation_method_description <-
1595 paste("Substitute each missing value with random intensity",
1596 sprintf(
1597 "random intensity $N \\sim (%0.2f, %0.2f)$.\n",
1598 q1, m1
1599 )
1600 )
1601 cat(sprintf("mean_percentile (from input parameter) is %2.0f\n\n",
1602 100 * mean_percentile))
1603 cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n",
1604 sd_percentile))
1605 quant_data_imp[ind] <-
1606 10 ^ rnorm(number_to_impute, mean = q1, sd = m1)
1607 quant_data_imp_ln <- log1p(quant_data_imp)
1608 good_rows <- !is.nan(rowMeans(quant_data_imp))
1609 }
1610 )
1611 quant_data_imp_log10 <- quant_data_imp_ln * const_log10_e
1612
1613 if (length(good_rows) < 1) {
1614 print("ERROR: Cannot impute data; there are no good rows!")
1615 return(-1)
1616 }
1617 ```
1618
1619 ```{r echo = FALSE, results = 'asis'}
1620 cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description)
1621 ```
1622
1623 ```{r echo = FALSE}
1624
1625 imp_smry_pot_peptides_after <- sum(good_rows)
1626 imp_smry_rejected_after <- sum(!good_rows)
1627 imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ]))
1628 ```
1629 ```{r echo = FALSE, results = 'asis'}
1630 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf
1631 tabular_lines_fmt <- paste(
1632 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page)
1633 " \\caption{Imputation Results}",
1634 " \\centering", # \centering centers the table on the page
1635 " \\begin{tabular}{l c c c}",
1636 " \\hline\\hline",
1637 " \\ & potential peptides & missing values & rejected",
1638 " peptides \\\\ [0.5ex]",
1639 " \\hline",
1640 " before imputation & %d & %d (%d\\%s) & \\\\",
1641 " after imputation & %d & %d & %d \\\\ [1ex]",
1642 " \\hline",
1643 " \\end{tabular}",
1644 #" \\label{table:nonlin}", # may be used to refer this table in the text
1645 "\\end{table}",
1646 sep = "\n"
1647 )
1648 tabular_lines <-
1649 sprintf(
1650 tabular_lines_fmt,
1651 imp_smry_pot_peptides_before,
1652 imp_smry_missing_values_before,
1653 imp_smry_pct_missing,
1654 "%",
1655 imp_smry_pot_peptides_after,
1656 imp_smry_missing_values_after,
1657 imp_smry_rejected_after
1658 )
1659 cat(tabular_lines)
1660 ```
1661 ```{r echo = FALSE}
1662
1663
1664 # Zap rows where imputation was ineffective
1665 full_data <- full_data [good_rows, ]
1666 quant_data <- quant_data [good_rows, ]
1667
1668 quant_data_imp <- quant_data_imp[good_rows, ]
1669 write_debug_file(quant_data_imp)
1670 quant_data_imp_good_rows <- quant_data_imp
1671
1672 write_debug_file(quant_data_imp_good_rows)
1673 ```
1674
1675 ```{r echo = FALSE, results = 'asis'}
1676
1677 can_plot_before_after_imp <- TRUE
1678 d_combined <-
1679 as.numeric(
1680 as.matrix(
1681 log10(quant_data_imp)
1682 )
1683 )
1684 d_original <-
1685 as.numeric(
1686 as.matrix(
1687 log10(quant_data_imp[!is.na(quant_data)])
1688 )
1689 )
1690
1691 if (sum(!is.na(d_original)) > 2) {
1692 d_original <- density(d_original)
1693 } else {
1694 can_plot_before_after_imp <- FALSE
1695 }
1696 if (can_plot_before_after_imp && sum(is.na(d_combined)) < 1) {
1697 d_combined <- density(d_combined)
1698 } else {
1699 can_plot_before_after_imp <- FALSE
1700 }
1701
1702 if (sum(is.na(quant_data)) > 0) {
1703 # There ARE missing values
1704 d_imputed <-
1705 as.numeric(
1706 as.matrix(
1707 log10(quant_data_imp[is.na(quant_data)])
1708 )
1709 )
1710 if (can_plot_before_after_imp && sum(is.na(d_imputed)) < 1) {
1711 d_imputed <- (density(d_imputed))
1712 } else {
1713 can_plot_before_after_imp <- FALSE
1714 }
1715 } else {
1716 # There are NO missing values
1717 d_imputed <- d_combined
1718 }
1719
1720 ```
1721
1722 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
1723 zero_sd_rownames <-
1724 rownames(quant_data_imp)[
1725 is.na((apply(quant_data_imp, 1, sd, na.rm = TRUE)) == 0)
1726 ]
1727
1728 if (length(zero_sd_rownames) >= nrow(quant_data_imp)) {
1729 stop("All peptides have zero standard deviation. Cannot continue.")
1730 }
1731 if (length(zero_sd_rownames) > 0) {
1732 cat(
1733 sprintf("%d peptides with zero variance were removed from statistical consideration",
1734 length(zero_sd_rownames)
1735 )
1736 )
1737 zap_named_rows <- function(df, nms) {
1738 return(df[!(row.names(df) %in% nms), ])
1739 }
1740 quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames)
1741 quant_data <- zap_named_rows(quant_data, zero_sd_rownames)
1742 full_data <- zap_named_rows(full_data, zero_sd_rownames)
1743 }
1744
1745 if (sum(is.na(quant_data)) > 0) {
1746 cat("\\leavevmode\\newpage\n")
1747 # data visualization
1748 old_par <- par(
1749 mai = par("mai") + c(0.5, 0, 0, 0)
1750 )
1751 # Copy quant data to x
1752 x <- quant_data
1753 # x gets to have values of:
1754 # - NA for observed values
1755 # - 1 for missing values
1756 x[is.na(x)] <- 0
1757 x[x > 1] <- NA
1758 x[x == 0] <- 1
1759 # Log-transform imputed data
1760 # update variable because rows may have been eliminated from quant_data_imp
1761 quant_data_imp_log10 <- log10(quant_data_imp)
1762
1763 write_debug_file(quant_data_imp_log10)
1764
1765 # Set blue_dots to log of quant data or NA for NA quant data
1766 blue_dots <- log10(quant_data)
1767 # Set red_dots to log of imputed data or NA for observed quant data
1768 red_dots <- quant_data_imp_log10 * x
1769
1770 count_red <- sum(!is.na(red_dots))
1771 count_blue <- sum(!is.na(blue_dots))
1772 ylim_save <- ylim <- c(
1773 min(red_dots, blue_dots, na.rm = TRUE),
1774 max(red_dots, blue_dots, na.rm = TRUE)
1775 )
1776 show_stripchart <-
1777 50 > (count_red + count_blue) / length(sample_name_matches)
1778 if (show_stripchart) {
1779 boxplot_sub <- "Light blue = data before imputation; Red = imputed data"
1780 } else {
1781 boxplot_sub <- ""
1782 }
1783
1784 # Vertical plot
1785 colnames(blue_dots) <- sample_name_matches
1786 boxplot(
1787 blue_dots
1788 , las = 1 # "always horizontal"
1789 , col = const_boxplot_fill
1790 , ylim = ylim
1791 , main = "Peptide intensities after eliminating unusable peptides"
1792 , sub = boxplot_sub
1793 , xlab = "Sample"
1794 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
1795 )
1796
1797 if (show_stripchart) {
1798 # Points
1799 # ref: https://r-charts.com/distribution/add-points-boxplot/
1800 # NA values are not plotted
1801 stripchart(
1802 blue_dots, # Data
1803 method = "jitter", # Random noise
1804 jitter = const_stripchart_jitter,
1805 pch = 19, # Pch symbols
1806 cex = const_stripsmall_cex, # Size of symbols reduced
1807 col = "lightblue", # Color of the symbol
1808 vertical = TRUE, # Vertical mode
1809 add = TRUE # Add it over
1810 )
1811 stripchart(
1812 red_dots, # Data
1813 method = "jitter", # Random noise
1814 jitter = const_stripchart_jitter,
1815 pch = 19, # Pch symbols
1816 cex = const_stripsmall_cex, # Size of symbols reduced
1817 col = "red", # Color of the symbol
1818 vertical = TRUE, # Vertical mode
1819 add = TRUE # Add it over
1820 )
1821
1822 }
1823 if (TRUE) {
1824 # show measured values in blue on left half-violin plot
1825 cat("\\leavevmode\n\\quad\n\n\\quad\n\n")
1826 vioplot::vioplot(
1827 x = lapply(blue_dots, function(x) x[!is.na(x)]),
1828 col = "lightblue1",
1829 side = "left",
1830 plotCentre = "line",
1831 ylim = ylim_save,
1832 main = "Distributions of observed and imputed data",
1833 sub = "Light blue = observed data; Pink = imputed data",
1834 xlab = "Sample",
1835 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
1836 )
1837 red_violins <- lapply(red_dots, function(x) x[!is.na(x)])
1838 cols_to_delete <- c()
1839 for (ix in seq_len(length(red_violins))) {
1840 if (length(red_violins[[ix]]) < 1) {
1841 cols_to_delete <- c(cols_to_delete, ix)
1842 }
1843 }
1844 # destroy any unimputable columns
1845 if (!is.null(cols_to_delete)) {
1846 red_violins <- red_violins[-cols_to_delete]
1847 }
1848 # plot imputed values in red on right half-violin plot
1849 vioplot::vioplot(
1850 x = red_violins,
1851 col = "lightpink1",
1852 side = "right",
1853 plotCentre = "line",
1854 add = TRUE
1855 )
1856 }
1857
1858 par(old_par)
1859
1860 # density plot
1861 cat("\\leavevmode\n\n\n\n\n\n\n")
1862 if (can_plot_before_after_imp) {
1863 ylim <- c(
1864 0,
1865 max(
1866 if (is.list(d_combined)) d_combined$y else d_combined,
1867 if (is.list(d_original)) d_original$y else d_original,
1868 if (is.list(d_imputed)) d_imputed$y else d_imputed,
1869 na.rm = TRUE
1870 )
1871 )
1872 plot(
1873 d_combined,
1874 ylim = ylim,
1875 sub =
1876 paste(
1877 "Blue = data before imputation; Red = imputed data;",
1878 "Black = combined"
1879 ),
1880 main = "Density of peptide intensity before and after imputation",
1881 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
1882 ylab = "Probability density"
1883 )
1884 lines(d_original, col = "blue")
1885 lines(d_imputed, col = "red")
1886 } else {
1887 cat(
1888 "There are too few points to plot the density of peptide intensity",
1889 "before and after imputation."
1890 )
1891 }
1892 cat("\\leavevmode\\newpage\n")
1893 }
1894 ```
1895
1896 # Perform Quantile Normalization
1897
1898 The excellent `normalize.quantiles` function from
1899 *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)*
1900 performs "quantile normalization" as described Bolstad *et al.* (2003),
1901 DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)*
1902 and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*,
1903 i.e., it assumes that the goal is to detect
1904 subtle differences among grossly similar samples (having similar distributions)
1905 by equailzing intra-quantile quantitations.
1906 Unfortunately, one software library upon which it depends
1907 *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)*
1908 that requires that a specific, non-concurrent version of the library be
1909 installed. The installation command equivalent to what was used to install the library to produce the results presented here is:
1910 ```
1911 conda install bioconductor-preprocesscore openblas=0.3.3
1912 ```
1913
1914
1915 <!--
1916 # Apply quantile normalization using preprocessCore::normalize.quantiles
1917 # ---
1918 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html
1919 # except this: https://support.bioconductor.org/p/122925/#9135989
1920 # says to install it like this:
1921 # ```
1922 # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1])
1923 # ```
1924 # conda installation (necessary because of a bug in recent openblas):
1925 # conda install bioconductor-preprocesscore openblas=0.3.3
1926 # ...
1927 # ---
1928 # normalize.quantiles {preprocessCore} -- Quantile Normalization
1929 #
1930 # Description:
1931 # Using a normalization based upon quantiles, this function normalizes a
1932 # matrix of probe level intensities.
1933 #
1934 # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the
1935 # assumption that the data is missing at random.
1936 #
1937 # Usage:
1938 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE)
1939 #
1940 # Arguments:
1941 #
1942 # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe.
1943 #
1944 # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy,
1945 # but in certain situations not making a copy of the matrix, but instead normalizing
1946 # it in place will be more memory friendly.
1947 #
1948 # - keep.names: Boolean option to preserve matrix row and column names in output.
1949 #
1950 # Details:
1951 # This method is based upon the concept of a quantile-quantile plot extended to n dimensions.
1952 # No special allowances are made for outliers. If you make use of quantile normalization
1953 # please cite Bolstad et al, Bioinformatics (2003).
1954 #
1955 # This functions will handle missing data (ie NA values), based on
1956 # the assumption that the data is missing at random.
1957 #
1958 # Note that the current implementation optimizes for better memory usage
1959 # at the cost of some additional run-time.
1960 #
1961 # Value: A normalized matrix.
1962 #
1963 # Author: Ben Bolstad, bmbolstad.com
1964 #
1965 # References
1966 #
1967 # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide
1968 # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf
1969 #
1970 # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of
1971 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias
1972 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185
1973 # http://bmbolstad.com/misc/normalize/normalize.html
1974 # ...
1975 -->
1976 ```{r echo = FALSE, results = 'asis'}
1977
1978 if (nrow(quant_data_imp) > 0) {
1979 quant_data_imp_qn <- preprocessCore::normalize.quantiles(
1980 as.matrix(quant_data_imp), keep.names = TRUE
1981 )
1982 } else {
1983 quant_data_imp_qn <- as.matrix(quant_data_imp)
1984 }
1985
1986 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn)
1987
1988 write_debug_file(quant_data_imp_qn)
1989
1990 quant_data_imp_qn_log <- log10(quant_data_imp_qn)
1991
1992 write_debug_file(quant_data_imp_qn_log)
1993
1994 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn))))
1995
1996 sel <- apply(quant_data_imp_qn_ls, 1, any_nan)
1997 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls
1998
1999 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ]
2000 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2)
2001
2002 quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls)
2003
2004 write_debug_file(quant_data_imp_qn_ls)
2005 write_debug_file(quant_data_imp_qn_ls2)
2006
2007 # Create data.frame used by ANOVA analysis
2008 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log)
2009 ```
2010
2011 <!-- ACE insertion begin -->
2012 ## Are normalized, imputed, log-transformed sample distributions similar?
2013
2014 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
2015
2016 # Save unimputed quant_data_log for plotting below
2017 unimputed_quant_data_log <- quant_data_log
2018
2019 # log10 transform (after preparing for zero values,
2020 # which should never happen...)
2021 quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001
2022 quant_data_log <- log10(quant_data_imp_qn)
2023
2024 how_many_peptides <- nrow(quant_data_log)
2025
2026 if ((how_many_peptides) > 0) {
2027 cat(
2028 sprintf(
2029 "Intensities for %d peptides:\n\n\n",
2030 how_many_peptides
2031 )
2032 )
2033 cat("\n\n\n")
2034
2035
2036 # data visualization
2037 old_par <- par(
2038 mai = par("mai") + c(0.5, 0, 0, 0)
2039 , oma = par("oma") + c(0.5, 0, 0, 0)
2040 )
2041 # ref: https://r-charts.com/distribution/add-points-boxplot/
2042 # Vertical plot
2043 colnames(quant_data_log) <- sample_name_matches
2044 boxplot(
2045 quant_data_log
2046 , las = 1
2047 , col = const_boxplot_fill
2048 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
2049 , xlab = "Sample"
2050 )
2051 par(old_par)
2052 } else {
2053 cat("There are no peptides to plot\n")
2054 }
2055
2056 cat("\n\n\n")
2057
2058 ```
2059
2060 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
2061 if (nrow(quant_data_log) > 1) {
2062 quant_data_log_stack <- stack(quant_data_log)
2063 ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) +
2064 ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
2065 ggplot2::ylab("Probability density") +
2066 ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE)
2067 } else {
2068 cat("No density plot because there are fewer than two peptides to plot.\n\n")
2069 }
2070 ```
2071 ```{r echo = FALSE, results = 'asis'}
2072 cat("\\leavevmode\\newpage\n")
2073 ```
2074
2075 # ANOVA Analysis
2076
2077 ```{r, echo = FALSE}
2078 # Make new data frame containing only Phosphopeptides
2079 # to connect preANOVA to ANOVA (connect_df)
2080 connect_df <- data.frame(
2081 data_table_imp_qn_lt$Phosphopeptide
2082 , data_table_imp_qn_lt[, first_data_column]
2083 )
2084 colnames(connect_df) <- c("Phosphopeptide", "Intensity")
2085 ```
2086
2087 ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
2088 count_of_treatment_levels <- length(levels(sample_treatment_levels))
2089 if (count_of_treatment_levels < 2) {
2090 nuke_control_sequences <-
2091 function(s) {
2092 s <- gsub("[\\]", "xyzzy_plugh", s)
2093 s <- gsub("[$]", "\\\\$", s)
2094 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s)
2095 return(s)
2096 }
2097 cat(
2098 "ERROR!!!! Cannot perform ANOVA analysis",
2099 "(see next page)\\newpage\n"
2100 )
2101 cat(
2102 "ERROR: ANOVA analysis",
2103 "requires two or more factor levels!\n\n\n"
2104 )
2105
2106 cat("\n\n\n\n\n")
2107 cat("Unparsed sample names are:\n\n\n",
2108 "\\begin{quote}\n",
2109 paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"),
2110 "\n\\end{quote}\n\n")
2111
2112 regex_sample_names <- nuke_control_sequences(regex_sample_names)
2113
2114 cat("\\leavevmode\n\n\n")
2115 cat("Parsing rule for SampleNames is",
2116 "\n\n\n",
2117 "\\text{'",
2118 regex_sample_names,
2119 "'}\n\n\n",
2120 sep = ""
2121 )
2122
2123 cat("\nParsed sample names are:\n",
2124 "\\begin{quote}\n",
2125 paste(sample_name_matches, collapse = "\n\n\n"),
2126 "\n\\end{quote}\n\n")
2127
2128 regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping)
2129
2130 cat("\\leavevmode\n\n\n")
2131 cat("Parsing rule for SampleGrouping is",
2132 "\n\n\n",
2133 "\\text{'",
2134 regex_sample_grouping,
2135 "'}\n\n\n",
2136 sep = ""
2137 )
2138
2139 cat("\n\n\n")
2140 cat("Sample group assignments are:\n",
2141 "\\begin{quote}\n",
2142 paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"),
2143 "\n\\end{quote}\n\n")
2144
2145 } else {
2146
2147 p_value_data_anova_ps <-
2148 apply(
2149 quant_data_imp_qn_log,
2150 1,
2151 anova_func,
2152 grouping_factor = sample_treatment_levels,
2153 one_way_f = one_way_all_categories
2154 )
2155
2156 p_value_data_anova_ps_fdr <-
2157 p.adjust(p_value_data_anova_ps, method = "fdr")
2158 p_value_data <- data.frame(
2159 phosphopeptide = full_data[, 1]
2160 ,
2161 raw_anova_p = p_value_data_anova_ps
2162 ,
2163 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr
2164 )
2165
2166 # output ANOVA file to constructed filename,
2167 # e.g. "Outputfile_pST_ANOVA_STEP5.txt"
2168 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt"
2169
2170 # Re-output datasets to include p-values
2171 metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3])
2172 write.table(
2173 cbind(metadata_plus_p, quant_data_imp),
2174 file = imputed_data_filename,
2175 sep = "\t",
2176 col.names = TRUE,
2177 row.names = FALSE,
2178 quote = FALSE
2179 )
2180
2181 write.table(
2182 cbind(metadata_plus_p, quant_data_imp_qn_log),
2183 file = imp_qn_lt_data_filenm,
2184 sep = "\t",
2185 col.names = TRUE,
2186 row.names = FALSE,
2187 quote = FALSE
2188 )
2189
2190
2191 p_value_data <-
2192 p_value_data[order(p_value_data$fdr_adjusted_anova_p), ]
2193
2194 first_page_suppress <- 1
2195 number_of_peptides_found <- 0
2196 cutoff <- val_fdr[1]
2197 for (cutoff in val_fdr) {
2198 if (number_of_peptides_found > 49) {
2199 cat("\\leavevmode\n\n\n")
2200 break
2201 }
2202
2203 #loop through FDR cutoffs
2204
2205 filtered_p <-
2206 p_value_data[
2207 which(p_value_data$fdr_adjusted_anova_p < cutoff),
2208 , drop = FALSE
2209 ]
2210 filtered_data_filtered <-
2211 quant_data_imp_qn_log[
2212 rownames(filtered_p),
2213 , drop = FALSE
2214 ]
2215 filtered_data_filtered <-
2216 filtered_data_filtered[
2217 order(filtered_p$fdr_adjusted_anova_p),
2218 , drop = FALSE
2219 ]
2220
2221 # <!-- ACE insertion start -->
2222
2223 if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) {
2224 if (first_page_suppress == 1) {
2225 first_page_suppress <- 0
2226 } else {
2227 cat("\\newpage\n")
2228 }
2229 if (nrow(filtered_data_filtered) > 1) {
2230 subsection_header(sprintf(
2231 "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n",
2232 nrow(filtered_data_filtered),
2233 cutoff
2234 ))
2235 } else {
2236 subsection_header(sprintf(
2237 "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n",
2238 rownames(filtered_data_filtered)[1],
2239 cutoff
2240 ))
2241 }
2242 cat("\n\n\n")
2243 cat("\n\n\n")
2244
2245 old_oma <- par("oma")
2246 old_par <- par(
2247 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1),
2248 oma = old_oma * c(1, 1, 0.3, 1),
2249 cex.main = 0.9,
2250 cex.axis = 0.7,
2251 fin = c(9, 7.25)
2252 )
2253 # ref: https://r-charts.com/distribution/add-points-boxplot/
2254 # Vertical plot
2255 colnames(filtered_data_filtered) <- sample_name_matches
2256 tryCatch(
2257 boxplot(
2258 filtered_data_filtered,
2259 main = "Imputed, normalized intensities", # no line plot
2260 las = 1,
2261 col = const_boxplot_fill,
2262 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
2263 ),
2264 error = function(e) print(e)
2265 )
2266 par(old_par)
2267 } else {
2268 cat(sprintf(
2269 "%s < %0.2f\n\n\n\n\n",
2270 "No peptides were found to have cutoff adjusted p-value",
2271 cutoff
2272 ))
2273 }
2274
2275 if (nrow(filtered_data_filtered) > 0) {
2276 # Add Phosphopeptide column to anova_filtered table
2277 # The assumption here is that the first intensity is unique;
2278 # this is a hokey assumption but almost definitely will
2279 # be true in the real world unless there is a computation
2280 # error upstream.
2281 anova_filtered_merge <- base::merge(
2282 x = connect_df,
2283 y = filtered_data_filtered,
2284 by.x = "Intensity",
2285 by.y = 1
2286 )
2287 anova_filtered_merge_order <- rownames(filtered_p)
2288
2289 anova_filtered <- data.frame(
2290 ppep = anova_filtered_merge$Phosphopeptide,
2291 intense = anova_filtered_merge$Intensity,
2292 data = anova_filtered_merge[, 2:number_of_samples + 1]
2293 )
2294 colnames(anova_filtered) <-
2295 c("Phosphopeptide", colnames(filtered_data_filtered))
2296
2297 # Merge qualitative columns into the ANOVA data
2298 output_table <- data.frame(anova_filtered$Phosphopeptide)
2299 output_table <- base::merge(
2300 x = output_table,
2301 y = data_table_imp_qn_lt,
2302 by.x = "anova_filtered.Phosphopeptide",
2303 by.y = "Phosphopeptide"
2304 )
2305
2306 # Produce heatmap to visualize significance and the effect of imputation
2307
2308 anova_filtered_merge_format <- sapply(
2309 X = filtered_p$fdr_adjusted_anova_p
2310 ,
2311 FUN = function(x) {
2312 if (x > 0.0001)
2313 paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s")
2314 else
2315 paste0("(%0.4e) %s")
2316 }
2317 )
2318
2319 cat_hm_heading <- function(m, cutoff) {
2320 cat("\\newpage\n")
2321 if (nrow(m) > intensity_hm_rows) {
2322 subsection_header(
2323 paste(
2324 sprintf("Heatmap for the %d most-significant peptides",
2325 intensity_hm_rows),
2326 sprintf("whose adjusted p-value < %0.2f\n", cutoff)
2327 )
2328 )
2329 } else {
2330 if (nrow(m) == 1) {
2331 return(FALSE)
2332 } else {
2333 subsection_header(
2334 paste(
2335 sprintf("Heatmap for %d usable peptides whose", nrow(m)),
2336 sprintf("adjusted p-value < %0.2f\n", cutoff)
2337 )
2338 )
2339 }
2340 }
2341 cat("\n\n\n")
2342 cat("\n\n\n")
2343 return(TRUE)
2344 }
2345
2346 # construct matrix with appropriate rownames
2347 m <-
2348 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ])
2349 if (nrow(m) > 0) {
2350 rownames_m <- rownames(m)
2351 rownames(m) <- sapply(
2352 X = seq_len(nrow(m))
2353 ,
2354 FUN = function(i) {
2355 sprintf(
2356 anova_filtered_merge_format[i],
2357 filtered_p$fdr_adjusted_anova_p[i],
2358 rownames_m[i]
2359 )
2360 }
2361 )
2362 }
2363 # draw the heading and heatmap
2364 if (nrow(m) > 0) {
2365 number_of_peptides_found <-
2366 draw_intensity_heatmap(
2367 m = m,
2368 cutoff = cutoff,
2369 hm_heading_function = cat_hm_heading,
2370 hm_main_title = "Unimputed, unnormalized log(intensities)",
2371 suppress_row_dendrogram = FALSE
2372 )
2373 }
2374 }
2375 }
2376 }
2377 cat("\\leavevmode\n\n\n")
2378 ```
2379
2380 ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
2381
2382 if (count_of_treatment_levels > 1) {
2383 # Prepare two-way contrasts with adjusted p-values
2384 # Strategy:
2385 # - use imputed, log-transformed data:
2386 # - remember this when computing log2(fold-change)
2387 # - each contrast is between a combination of trt levels
2388 # - for each contrast, compute samples that are members
2389 # - compute one-way test:
2390 # - use `oneway.test` (Welch test) if numbers of samples
2391 # are not equivalent between trt levels
2392 # - otherwise, aov is fine but offers no advantage
2393 # - adjust p-value, assuming that
2394 # (# of pppeps)*(# of contrasts) tests were performed
2395
2396 # Each contrast is between a combination of trt levels
2397 m2 <- combn(
2398 x = seq_len(length(levels(sample_treatment_levels))),
2399 m = 2,
2400 simplify = TRUE
2401 )
2402 contrast_count <- ncol(m2)
2403
2404 # For each contrast, compute samples that are members
2405 # - local function to construct a data.frame for each contrast
2406 # - the contrast in the first "column"
2407 f_m2 <-
2408 function(cntrst, lvl1, lvl2) {
2409 return(
2410 data.frame(
2411 contrast = cntrst,
2412 level = sample_treatment_levels[
2413 sample_treatment_levels %in%
2414 levels(sample_treatment_levels)[c(lvl1, lvl2)]
2415 ],
2416 label = sample_name_matches[
2417 sample_treatment_levels %in%
2418 levels(sample_treatment_levels)[c(lvl1, lvl2)]
2419 ]
2420 )
2421 )
2422 }
2423 # - compute a df for each contrast
2424 sample_level_dfs <- lapply(
2425 X = 1:contrast_count,
2426 FUN = function(i) f_m2(i, m2[1, i], m2[2, i])
2427 )
2428 # - compute a single df for all contrasts
2429 combined_contrast_df <- Reduce(f = rbind, x = sample_level_dfs)
2430
2431 # - dispose objects to free resources
2432 rm(sample_level_dfs)
2433
2434 # - write the df to a DB for later join-per-contrast
2435 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db)
2436
2437 RSQLite::dbWriteTable(
2438 conn = db,
2439 name = "contrast",
2440 value = combined_contrast_df,
2441 overwrite = TRUE
2442 )
2443
2444 # Create UK for insert
2445 ddl_exec(db, "
2446 CREATE UNIQUE INDEX IF NOT EXISTS contrast__uk__idx
2447 ON contrast(contrast, label);
2448 "
2449 )
2450 # Create indexes for join
2451 ddl_exec(db, "
2452 -- index for join in contrast_ppep_smpl_qnlt on a.label < b.label
2453 CREATE INDEX IF NOT EXISTS contrast__label__idx
2454 ON contrast(label);
2455 "
2456 )
2457 ddl_exec(db, "
2458 -- index for joining two contrast_lvl_ppep_avg_quant on contrast
2459 CREATE INDEX IF NOT EXISTS contrast__contrast__idx
2460 ON contrast(contrast);
2461 "
2462 )
2463 ddl_exec(db, "
2464 -- index for joining two contrast_lvl_ppep_avg_quant on phophospep
2465 CREATE INDEX IF NOT EXISTS contrast__level__idx
2466 ON contrast(level);
2467 "
2468 )
2469 # - dispose objects to free resources
2470 rm(combined_contrast_df)
2471
2472 # Use imputed, log-transformed data
2473 # - remember that this was donoe when computing log2(fold-change)
2474 # - melt data matrix for use in later join-per-contrast
2475 casted <- cbind(
2476 data.frame(vrbl = rownames(quant_data_imp_qn_log)),
2477 quant_data_imp_qn_log
2478 )
2479 quant_data_imp_qn_log_melted <- reshape2::melt(
2480 casted,
2481 id.vars = "vrbl"
2482 )
2483 colnames(quant_data_imp_qn_log_melted) <-
2484 c("phosphopep", "sample", "quant")
2485 # - dispose objects to free resources
2486 rm(casted)
2487
2488 # - write the df to a DB for use in later join-per-contrast
2489 RSQLite::dbWriteTable(
2490 conn = db,
2491 name = "ppep_smpl_qnlt",
2492 value = quant_data_imp_qn_log_melted,
2493 overwrite = TRUE
2494 )
2495 # Create UK for insert
2496 ddl_exec(db, "
2497 CREATE UNIQUE INDEX IF NOT EXISTS ppep_smpl_qnlt__uk__idx
2498 ON ppep_smpl_qnlt(phosphopep, sample);
2499 "
2500 )
2501 # Create index for join
2502 ddl_exec(db, "
2503 -- index for join in contrast_ppep_smpl_qnlt
2504 CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__sample__idx
2505 ON ppep_smpl_qnlt(sample);
2506 "
2507 )
2508 ddl_exec(db, "
2509 -- index for joining two contrast_lvl_ppep_avg_quant on phopho.pep
2510 CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__phosphopep__idx
2511 ON ppep_smpl_qnlt(phosphopep);
2512 "
2513 )
2514 # - dispose objects to free resources
2515 rm(quant_data_imp_qn_log_melted)
2516
2517 # - drop views if exist
2518 ddl_exec(db, "
2519 -- drop view dependent on contrast_lvl_ppep_avg_quant
2520 DROP VIEW IF EXISTS v_contrast_log2_fc;
2521 "
2522 )
2523 ddl_exec(db, "
2524 -- drop table dependent on contrast_ppep_smpl_qnlt
2525 DROP TABLE IF EXISTS contrast_lvl_ppep_avg_quant;
2526 "
2527 )
2528 ddl_exec(db, "
2529 DROP TABLE IF EXISTS contrast_lvl_lvl_metadata;
2530 "
2531 )
2532 ddl_exec(db, "
2533 DROP VIEW IF EXISTS v_contrast_lvl_metadata;
2534 "
2535 )
2536 ddl_exec(db, "
2537 -- drop view dependent on contrast_ppep_smpl_qnlt
2538 DROP VIEW IF EXISTS v_contrast_lvl_ppep_avg_quant;
2539 "
2540 )
2541 ddl_exec(db, "
2542 DROP VIEW IF EXISTS v_contrast_lvl_lvl;
2543 "
2544 )
2545 ddl_exec(db, "
2546 -- drop view upon which other views depend
2547 DROP VIEW IF EXISTS contrast_ppep_smpl_qnlt;
2548 "
2549 )
2550 # - create view
2551 dml_no_rows_exec(db, "
2552 -- view contrast_ppep_smpl_qnlt is used for each phopshopep to
2553 -- compute p-value for test of trt effect for two trt levels
2554 CREATE VIEW contrast_ppep_smpl_qnlt
2555 AS
2556 SELECT contrast,
2557 level,
2558 phosphopep,
2559 sample,
2560 quant
2561 FROM contrast AS c,
2562 ppep_smpl_qnlt AS q
2563 WHERE q.sample = c.label
2564 ORDER BY contrast, level, phosphopep
2565 ;
2566 "
2567 )
2568 # - create simplification views
2569 dml_no_rows_exec(db, "
2570 CREATE VIEW v_contrast_lvl_metadata
2571 AS
2572 SELECT contrast,
2573 level,
2574 group_concat(label, ';') AS samples
2575 FROM contrast
2576 GROUP BY contrast, level
2577 /* view v_contrast_lvl_metadata is used
2578 to simplify creation of table contrast_lvl_lvl_metadata */
2579 ;
2580 "
2581 )
2582 dml_no_rows_exec(db, "
2583 CREATE VIEW v_contrast_lvl_ppep_avg_quant
2584 AS
2585 SELECT contrast,
2586 level,
2587 phosphopep,
2588 avg(quant) AS avg_quant
2589 FROM contrast_ppep_smpl_qnlt
2590 GROUP BY contrast, level, phosphopep
2591 /* view v_contrast_lvl_ppep_avg_quant is used
2592 to simplify view v_contrast_log2_fc */
2593 ;
2594 "
2595 )
2596
2597 # - create contrast-metadata table
2598 dml_no_rows_exec(db, "
2599 CREATE TABLE contrast_lvl_lvl_metadata
2600 AS
2601 SELECT DISTINCT
2602 a.contrast AS ab_contrast,
2603 a.level AS a_level,
2604 b.level AS b_level,
2605 a.samples AS a_samples,
2606 b.samples AS b_samples,
2607 'log2(level_'||a.level||'/level_'||b.level||')'
2608 AS fc_description
2609 FROM v_contrast_lvl_metadata AS a,
2610 v_contrast_lvl_metadata AS b
2611 WHERE a.contrast = b.contrast
2612 AND a.level > b.level
2613 /* view v_contrast_lvl_lvl is used
2614 to simplify view v_contrast_log2_fc */
2615 ;
2616 "
2617 )
2618 # - create pseudo-materialized view table
2619 dml_no_rows_exec(db, "
2620 CREATE VIEW v_contrast_lvl_lvl
2621 AS
2622 SELECT DISTINCT
2623 a.contrast AS ab_contrast,
2624 a.level AS a_level,
2625 b.level AS b_level
2626 FROM contrast AS a,
2627 contrast AS b
2628 WHERE a.contrast = b.contrast
2629 AND a.level > b.level
2630 /* view v_contrast_lvl_lvl is used
2631 to simplify view v_contrast_log2_fc */
2632 ;
2633 "
2634 )
2635
2636 # - create view to compute log2(fold-change)
2637 dml_no_rows_exec(db, "
2638 CREATE VIEW v_contrast_log2_fc
2639 AS
2640 SELECT ab.ab_contrast AS contrast,
2641 m.a_level AS a_level,
2642 c.avg_quant AS a_quant,
2643 m.a_samples AS a_samples,
2644 ab.b_level AS b_level,
2645 d.avg_quant AS b_quant,
2646 m.b_samples AS b_samples,
2647 m.fc_description AS fc_description,
2648 3.32193 * ( d.avg_quant - c.avg_quant) AS log2_fc,
2649 d.phosphopep AS phosphopep
2650 FROM contrast_lvl_lvl_metadata AS m,
2651 v_contrast_lvl_ppep_avg_quant AS d,
2652 v_contrast_lvl_lvl AS ab
2653 INNER JOIN v_contrast_lvl_ppep_avg_quant AS c
2654 ON c.contrast = ab.ab_contrast
2655 AND c.level = ab.a_level
2656 WHERE d.contrast = ab.ab_contrast
2657 AND m.ab_contrast = ab.ab_contrast
2658 AND d.level = ab.b_level
2659 AND d.phosphopep = c.phosphopep
2660 /* view to compute log2(fold-change) */
2661 ;
2662 "
2663 )
2664
2665 # For each contrast, compute samples that are members
2666 # compute one-way test:
2667 # - use `oneway.test` (Welch test) if numbers of samples
2668 # are not equivalent between trt levels
2669 # - otherwise, aov is fine but offers no advantage
2670 for (contrast in contrast_count:2) {
2671 invisible(contrast)
2672 }
2673 for (contrast in 1:contrast_count) {
2674 contrast_df <- sqldf::sqldf(
2675 x = paste0("
2676 SELECT level, phosphopep, sample, quant
2677 FROM contrast_ppep_smpl_qnlt
2678 WHERE contrast = ", contrast, "
2679 ORDER BY phosphopep, level, sample
2680 "),
2681 connection = db
2682 )
2683 contrast_cast <- reshape2::dcast(
2684 data = contrast_df,
2685 formula = phosphopep ~ sample,
2686 value.var = "quant"
2687 )
2688 contrast_cast_ncol <- ncol(contrast_cast)
2689 contrast_cast_data <- contrast_cast[, 2:contrast_cast_ncol]
2690 contrast_cast_samples <- colnames(contrast_cast_data)
2691
2692 # - order grouping_factor by order of sample columns of contrast_cast_data
2693 grouping_factor <- sqldf::sqldf(
2694 x = paste0("
2695 SELECT sample, level
2696 FROM contrast_ppep_smpl_qnlt
2697 WHERE contrast = ", contrast, "
2698 ORDER BY phosphopep, level, sample
2699 LIMIT ", contrast_cast_ncol - 1
2700 ),
2701 connection = db
2702 )
2703 rownames(grouping_factor) <- grouping_factor$sample
2704 grouping_factor <- grouping_factor[, "level", drop = FALSE]
2705
2706 # - run the two-level (one-way) test
2707 p_value_data_contrast_ps <-
2708 apply(
2709 X = contrast_cast_data,
2710 MARGIN = 1, # apply to rows
2711 FUN = anova_func,
2712 grouping_factor =
2713 as.factor(as.numeric(grouping_factor$level)), # anova_func arg2
2714 one_way_f = one_way_two_categories, # anova_func arg3
2715 simplify = TRUE # TRUE is the default for simplify
2716 )
2717 contrast_data_adj_p_values <- p.adjust(
2718 p = p_value_data_contrast_ps,
2719 method = "fdr",
2720 n = length(p_value_data_contrast_ps) # this is the default, length(p)
2721 )
2722 # - compute the fold-change
2723 contrast_p_df <-
2724 data.frame(
2725 contrast = contrast,
2726 phosphopep = contrast_cast$phosphopep,
2727 p_value_raw = p_value_data_contrast_ps,
2728 p_value_adj = contrast_data_adj_p_values
2729 )
2730 db_write_table_overwrite <- (contrast < 2)
2731 db_write_table_append <- !db_write_table_overwrite
2732 RSQLite::dbWriteTable(
2733 conn = db,
2734 name = "contrast_ppep_p_val",
2735 value = contrast_p_df,
2736 append = db_write_table_append
2737 )
2738 # Create UK for insert
2739 ddl_exec(db, "
2740 CREATE UNIQUE INDEX IF NOT EXISTS contrast_ppep_p_val__uk__idx
2741 ON contrast_ppep_p_val(phosphopep, contrast);
2742 "
2743 )
2744 }
2745 # Perhaps this could be done more elegantly using unique keys
2746 # or creating the tables before saving data to them, but this
2747 # is fast and, if the database exists on disk rather than in
2748 # memory, it doesn't stress memory.
2749 dml_no_rows_exec(db, "
2750 CREATE TEMP table contrast_log2_fc
2751 AS
2752 SELECT *
2753 FROM v_contrast_log2_fc
2754 ORDER BY contrast, phosphopep
2755 ;
2756 "
2757 )
2758 dml_no_rows_exec(db, "
2759 CREATE TEMP table ppep_p_val
2760 AS
2761 SELECT p_value_raw,
2762 p_value_adj,
2763 contrast AS p_val_contrast,
2764 phosphopep AS p_val_ppep
2765 FROM contrast_ppep_p_val
2766 ORDER BY contrast, phosphopep
2767 ;
2768 "
2769 )
2770 dml_no_rows_exec(db, "
2771 DROP TABLE IF EXISTS contrast_log2_fc_p_val
2772 ;
2773 "
2774 )
2775 dml_no_rows_exec(db, "
2776 CREATE TABLE contrast_log2_fc_p_val
2777 AS
2778 SELECT a.*,
2779 b.p_value_raw,
2780 b.p_value_adj,
2781 b.p_val_contrast,
2782 b.p_val_ppep
2783 FROM contrast_log2_fc a, ppep_p_val b
2784 WHERE a.rowid = b.rowid
2785 AND a.phosphopep = b.p_val_ppep
2786 ;
2787 "
2788 )
2789 # Create UK
2790 ddl_exec(db, "
2791 CREATE UNIQUE INDEX IF NOT EXISTS contrast_log2_fc_p_val__uk__idx
2792 ON contrast_log2_fc_p_val(phosphopep, contrast);
2793 "
2794 )
2795 # Create indices for future queries
2796 ddl_exec(db, "
2797 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__contrast__idx
2798 ON contrast_log2_fc_p_val(contrast);
2799 "
2800 )
2801 ddl_exec(db, "
2802 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__phosphopep__idx
2803 ON contrast_log2_fc_p_val(phosphopep);
2804 "
2805 )
2806 ddl_exec(db, "
2807 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_raw__idx
2808 ON contrast_log2_fc_p_val(p_value_raw);
2809 "
2810 )
2811 ddl_exec(db, "
2812 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_adj__idx
2813 ON contrast_log2_fc_p_val(p_value_adj);
2814 "
2815 )
2816 dml_no_rows_exec(db, "
2817 DROP VIEW IF EXISTS v_contrast_log2_fc_p_val
2818 ;
2819 "
2820 )
2821 dml_no_rows_exec(db, "
2822 CREATE VIEW v_contrast_log2_fc_p_val
2823 AS
2824 SELECT contrast,
2825 a_level,
2826 a_samples,
2827 b_level,
2828 b_samples,
2829 a_quant,
2830 b_quant,
2831 fc_description,
2832 log2_fc,
2833 p_value_raw,
2834 p_value_adj,
2835 phosphopep
2836 FROM contrast_log2_fc_p_val
2837 ORDER BY contrast, phosphopep
2838 ;
2839 "
2840 )
2841 ddl_exec(db, "
2842 DROP TABLE IF EXISTS kseaapp_metadata
2843 ;
2844 "
2845 )
2846 dml_no_rows_exec(db, "
2847 CREATE TABLE kseaapp_metadata
2848 AS
2849 WITH extended(deppep, ppep, gene_name, uniprot_id, phosphoresidue) AS (
2850 SELECT DISTINCT
2851 deppep.seq,
2852 ppep.seq,
2853 GeneName||';',
2854 UniProtID||';',
2855 PhosphoResidue||';'
2856 FROM
2857 ppep, deppep, mrgfltr_metadata
2858 WHERE
2859 mrgfltr_metadata.ppep_id = ppep.id
2860 AND
2861 ppep.deppep_id = deppep.id
2862 )
2863 SELECT
2864 ppep AS `ppep`,
2865 SUBSTR(uniprot_id, 1, INSTR(uniprot_id,';') - 1 ) AS `Protein`,
2866 SUBSTR(gene_name, 1, INSTR(gene_name,';') - 1 ) AS `Gene`,
2867 deppep AS `Peptide`,
2868 REPLACE(
2869 REPLACE(
2870 SUBSTR(phosphoresidue, 1, INSTR(phosphoresidue,';') - 1 ),
2871 'p',
2872 ''
2873 ),
2874 ', ',
2875 ';'
2876 ) AS `Residue.Both`
2877 FROM extended
2878 ;
2879 "
2880 )
2881 # Create indexes for join
2882 ddl_exec(db, "
2883 CREATE INDEX IF NOT EXISTS kseaapp_metadata__ppep__idx
2884 ON kseaapp_metadata(ppep);
2885 "
2886 )
2887 ddl_exec(db, "
2888 DROP VIEW IF EXISTS v_kseaapp_contrast
2889 ;
2890 "
2891 )
2892 dml_no_rows_exec(db, "
2893 CREATE VIEW v_kseaapp_contrast
2894 AS
2895 SELECT a.*, b.Protein, b.Gene, b.Peptide, b.`Residue.Both`
2896 FROM v_contrast_log2_fc_p_val a, kseaapp_metadata b
2897 WHERE b.ppep = a.phosphopep
2898 ;
2899 "
2900 )
2901 ddl_exec(db, "
2902 DROP VIEW IF EXISTS v_kseaapp_input
2903 ;
2904 "
2905 )
2906 dml_no_rows_exec(db, "
2907 CREATE VIEW v_kseaapp_input
2908 AS
2909 SELECT v.contrast,
2910 v.phosphopep,
2911 m.`Protein`,
2912 m.`Gene`,
2913 m.`Peptide`,
2914 m.`Residue.Both`,
2915 v.p_value_raw AS `p`,
2916 v.log2_fc AS `FC`
2917 FROM kseaapp_metadata AS m,
2918 v_contrast_log2_fc_p_val AS v
2919 WHERE m.ppep = v.phosphopep
2920 AND NOT m.`Gene` = 'No_Gene_Name'
2921 AND NOT v.log2_fc = 0
2922 ;
2923 "
2924 )
2925 }
2926 ```
2927
2928 ```{r echo = FALSE, results = 'asis'}
2929 cat("\\newpage\n")
2930 ```
2931
2932 # KSEA Analysis
2933
2934 Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package:
2935
2936 - The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp
2937 - The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415).
2938 - An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/).
2939
2940 For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as:
2941
2942 $$
2943 \text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j}
2944 $$
2945
2946 and fold-enrichment is computed as:
2947
2948 $$
2949 \text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j}
2950 $$
2951
2952 where:
2953
2954 - $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change|})$ in intensities (for contrast $j$) of known substrates of the kinase $i$,
2955 - $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, and
2956 - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$,
2957 - $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset.
2958 - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score.
2959
2960 $\text{FDR}_{j,i}$ is computed from the $p$-value for the z-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x)
2961
2962 Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive).
2963
2964 Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`.
2965
2966 - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis.
2967 - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)).
2968 - Kinase-strate deata were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads).
2969
2970 ```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
2971
2972 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db)
2973
2974 # -- eliminate the table that's about to be defined
2975 ddl_exec(db, "
2976 DROP TABLE IF EXISTS site_metadata;
2977 ")
2978
2979 # -- define the site_metadata table
2980 ddl_exec(db, "
2981 CREATE TABLE site_metadata(
2982 id INTEGER PRIMARY KEY
2983 , site_type_id INTEGER REFERENCES site_type(id)
2984 , full TEXT UNIQUE ON CONFLICT IGNORE
2985 , abbrev TEXT
2986 , pattern TEXT
2987 , motif TEXT
2988 );
2989 ")
2990
2991 # -- populate the table with initial values
2992 ddl_exec(db, "
2993 INSERT INTO site_metadata(full, abbrev, site_type_id)
2994 SELECT DISTINCT kinase_map, kinase_map, site_type_id
2995 FROM ppep_gene_site
2996 ORDER BY kinase_map;
2997 ")
2998
2999 # -- drop bogus KSData view if exists
3000 ddl_exec(db, "
3001 DROP VIEW IF EXISTS ks_data_v;
3002 ")
3003
3004 # -- create view to serve as an impostor for KSEAapp::KSData
3005 ddl_exec(db, "
3006 CREATE VIEW IF NOT EXISTS ks_data_v
3007 AS
3008 SELECT
3009 'NA' AS KINASE,
3010 'NA' AS KIN_ACC_ID,
3011 kinase_map AS GENE,
3012 'NA' AS KIN_ORGANISM,
3013 'NA' AS SUBSTRATE,
3014 0 AS SUB_GENE_ID,
3015 'NA' AS SUB_ACC_ID,
3016 gene_names AS SUB_GENE,
3017 'NA' AS SUB_ORGANISM,
3018 phospho_peptide AS SUB_MOD_RSD,
3019 0 AS SITE_GROUP_ID,
3020 'NA' AS 'SITE_7AA',
3021 2 AS networkin_score,
3022 type_name AS Source
3023 FROM ppep_gene_site_view;
3024 ")
3025
3026 contrast_metadata_df <-
3027 sqldf::sqldf("select * from contrast_lvl_lvl_metadata", connection = db)
3028 rslt <- new_env()
3029 rslt$score_list <- list()
3030 rslt$name_list <- list()
3031 rslt$longname_list <- list()
3032
3033 ddl_exec(db, "
3034 DROP TABLE IF EXISTS contrast_ksea_scores;
3035 "
3036 )
3037
3038 next_index <- 0
3039 err_na_subscr_df_const <-
3040 "missing values are not allowed in subscripted assignments of data frames"
3041
3042 for (i_cntrst in seq_len(nrow(contrast_metadata_df))) {
3043 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
3044 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
3045 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
3046 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
3047 contrast_longlabel <- (
3048 sprintf(
3049 "Trt %s {%s} -> Trt %s {%s}",
3050 contrast_metadata_df[i_cntrst, "b_level"],
3051 gsub(
3052 pattern = ";",
3053 replacement = ", ",
3054 x = contrast_metadata_df[i_cntrst, "b_samples"],
3055 fixed = TRUE
3056 ),
3057 contrast_metadata_df[i_cntrst, "a_level"],
3058 gsub(
3059 pattern = ";",
3060 replacement = ", ",
3061 x = contrast_metadata_df[i_cntrst, "a_samples"],
3062 fixed = TRUE
3063 )
3064 )
3065 )
3066 kseaapp_input <-
3067 sqldf::sqldf(
3068 x = sprintf("
3069 SELECT `Protein`, `Gene`, `Peptide`, phosphopep AS `Residue.Both`, `p`, `FC`
3070 FROM v_kseaapp_input
3071 WHERE contrast = %d
3072 ",
3073 i_cntrst
3074 ),
3075 connection = db
3076 )
3077
3078 pseudo_ksdata <- dbReadTable(db, "ks_data_v")
3079
3080 # This hack is because SQL table has the log2-transformed values
3081 kseaapp_input[, "FC"] <- 2 ** kseaapp_input[, "FC", drop = TRUE]
3082 main_title <- (
3083 sprintf(
3084 "Change from treatment %s to treatment %s",
3085 contrast_metadata_df[i_cntrst, "b_level"],
3086 contrast_metadata_df[i_cntrst, "a_level"]
3087 )
3088 )
3089 sub_title <- contrast_longlabel
3090 tryCatch(
3091 expr = {
3092 ksea_scores_rslt <-
3093 ksea_scores(
3094 ksdata = pseudo_ksdata, # KSEAapp::KSData,
3095 px = kseaapp_input,
3096 networkin = TRUE,
3097 networkin_cutoff = 2
3098 )
3099
3100 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
3101 next_index <- 1 + next_index
3102 rslt$score_list[[next_index]] <- ksea_scores_rslt
3103 rslt$name_list[[next_index]] <- contrast_label
3104 rslt$longname_list[[next_index]] <- contrast_longlabel
3105 low_fdr_print(
3106 rslt = rslt,
3107 i_cntrst = i_cntrst,
3108 i = next_index,
3109 a_level = cntrst_a_level,
3110 b_level = cntrst_b_level,
3111 fold_change = cntrst_fold_change,
3112 caption = contrast_longlabel
3113 )
3114 }
3115 },
3116 error = function(e) str(e)
3117 )
3118 }
3119
3120 plotted_kinases <- NULL
3121 if (length(rslt$score_list) > 1) {
3122 for (i in seq_len(length(ksea_heatmap_titles))) {
3123 hdr <- ksea_heatmap_titles[[i]]
3124 which_kinases <- i
3125
3126 cat("\\clearpage\n\\begin{center}\n")
3127 if (i == const_ksea_astrsk_kinases) {
3128 subsection_header(hdr)
3129 } else {
3130 subsection_header(hdr)
3131 }
3132 cat("\\end{center}\n")
3133
3134 plotted_kinases <- ksea_heatmap(
3135 # the data frame outputs from the KSEA.Scores() function, in list format
3136 score_list = rslt$score_list,
3137 # a character vector of all the sample names for heatmap annotation:
3138 # - the names must be in the same order as the data in score_list
3139 # - please avoid long names, as they may get cropped in the final image
3140 sample_labels = rslt$name_list,
3141 # character string of either "p.value" or "FDR" indicating the data column
3142 # to use for marking statistically significant scores
3143 stats = c("p.value", "FDR")[2],
3144 # a numeric value between 0 and infinity indicating the min. number of
3145 # substrates a kinase must have to be included in the heatmap
3146 m_cutoff = 1,
3147 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
3148 # for indicating significant kinases in the heatmap
3149 p_cutoff = 0.05,
3150 # a binary input of TRUE or FALSE, indicating whether or not to perform
3151 # hierarchical clustering of the sample columns
3152 sample_cluster = TRUE,
3153 # a binary input of TRUE or FALSE, indicating whether or not to export
3154 # the heatmap as a .png image into the working directory
3155 export = FALSE,
3156 # additional arguments to gplots::heatmap.2, such as:
3157 # - main: main title of plot
3158 # - xlab: x-axis label
3159 # - ylab: y-axis label
3160 xlab = "Contrast",
3161 ylab = "Kinase",
3162 # print which kinases:
3163 # - 1 : all kinases
3164 # - 2 : significant kinases
3165 # - 3 : non-significant kinases
3166 which_kinases = which_kinases
3167 )
3168 cat("\\begin{center}\n")
3169 cat("Color intensities reflects $z$-score magnitudes; hue reflects $z$-score sign. Asterisks reflect significance.\n")
3170 cat("\\end{center}\n")
3171 } # end for (i in ...
3172 } # end if (length ...
3173
3174 for (i_cntrst in seq_len(length(rslt$score_list))) {
3175 next_index <- i_cntrst
3176 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
3177 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
3178 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
3179 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
3180 contrast_longlabel <- (
3181 sprintf(
3182 "Trt %s {%s} -> Trt %s {%s}",
3183 contrast_metadata_df[i_cntrst, "b_level"],
3184 gsub(
3185 pattern = ";",
3186 replacement = ", ",
3187 x = contrast_metadata_df[i_cntrst, "b_samples"],
3188 fixed = TRUE
3189 ),
3190 contrast_metadata_df[i_cntrst, "a_level"],
3191 gsub(
3192 pattern = ";",
3193 replacement = ", ",
3194 x = contrast_metadata_df[i_cntrst, "a_samples"],
3195 fixed = TRUE
3196 )
3197 )
3198 )
3199 main_title <- (
3200 sprintf(
3201 "Change from treatment %s to treatment %s",
3202 contrast_metadata_df[i_cntrst, "b_level"],
3203 contrast_metadata_df[i_cntrst, "a_level"]
3204 )
3205 )
3206 sub_title <- contrast_longlabel
3207 tryCatch(
3208 expr = {
3209 ksea_scores_rslt <- rslt$score_list[[next_index]]
3210
3211 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
3212 low_fdr_barplot(
3213 rslt = rslt,
3214 i_cntrst = i_cntrst,
3215 i = next_index,
3216 a_level = cntrst_a_level,
3217 b_level = cntrst_b_level,
3218 fold_change = cntrst_fold_change,
3219 caption = contrast_longlabel
3220 )
3221 }
3222 },
3223 error = function(e) str(e)
3224 )
3225 }
3226 ```
3227
3228 ```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
3229
3230 # Use enriched kinases to find enriched kinase-substrate pairs
3231 enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash))
3232 all_enriched_substrates <- sqldf("
3233 SELECT
3234 gene AS kinase,
3235 ppep,
3236 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label
3237 FROM (
3238 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
3239 FROM pseudo_ksdata
3240 WHERE GENE IN (SELECT kinase FROM enriched_kinases)
3241 )
3242 GROUP BY ppep
3243 ")
3244
3245 # helper used to label per-kinase substrate enrichment figure
3246 cat_enriched_heading <- function(m, cut_args) {
3247 cutoff <- cut_args$cutoff
3248 kinase <- cut_args$kinase
3249 statistic <- cut_args$statistic
3250 threshold <- cut_args$threshold
3251 cat("\\newpage\n")
3252 if (nrow(m) > intensity_hm_rows) {
3253 subsection_header(
3254 paste(
3255 sprintf(
3256 "Lowest p-valued %d (of %d) enriched %s-substrates,",
3257 intensity_hm_rows,
3258 nrow(m),
3259 kinase
3260 ),
3261 sprintf(" KSEA %s < %0.2f\n", statistic, threshold)
3262 )
3263 )
3264 } else {
3265 if (nrow(m) == 1) {
3266 return(FALSE)
3267 } else {
3268 subsection_header(
3269 paste(
3270 sprintf(
3271 "%d enriched %s-substrates,",
3272 nrow(m),
3273 kinase
3274 ),
3275 sprintf(
3276 " KSEA %s < %0.2f\n",
3277 statistic,
3278 threshold
3279 )
3280 )
3281 )
3282 }
3283 }
3284 cat("\n\n\n")
3285 cat("\n\n\n")
3286 return(TRUE)
3287 }
3288
3289 # Disabling heatmaps for substrates pending decision whether to eliminate them altogether
3290 if (FALSE)
3291 for (kinase_name in sort(enriched_kinases$kinase)) {
3292 enriched_substrates <-
3293 all_enriched_substrates[
3294 all_enriched_substrates$kinase == kinase_name,
3295 ,
3296 drop = FALSE
3297 ]
3298 # Get the intensity values for the heatmap
3299 enriched_intensities <-
3300 as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE])
3301 # Remove rows having too many NA values to be relevant
3302 na_counter <- is.na(enriched_intensities)
3303 na_counts <- apply(na_counter, 1, sum)
3304 enriched_intensities <-
3305 enriched_intensities[na_counts < ncol(enriched_intensities) / 2, , drop = FALSE]
3306 # Rename the rows with the display-name for the heatmap
3307 rownames(enriched_intensities) <-
3308 sapply(
3309 X = rownames(enriched_intensities),
3310 FUN = function(rn) {
3311 enriched_substrates[enriched_substrates$ppep == rn, "label"]
3312 }
3313 )
3314 # Format as matrix for heatmap
3315 m <- as.matrix(enriched_intensities)
3316 # Draw the heading and heatmap
3317 if (nrow(m) > 0) {
3318 cut_args <- new_env()
3319 cut_args$cutoff <- cutoff
3320 cut_args$kinase <- kinase_name
3321 cut_args$statistic <- ksea_cutoff_statistic
3322 cut_args$threshold <- ksea_cutoff_threshold
3323 number_of_peptides_found <-
3324 draw_intensity_heatmap(
3325 m = m,
3326 cutoff = cut_args,
3327 hm_heading_function = cat_enriched_heading,
3328 hm_main_title
3329 = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates",
3330 suppress_row_dendrogram = FALSE
3331 )
3332 }
3333 }
3334
3335 # Write output tabular files
3336
3337 # get kinase, ppep, concat(kinase) tuples for enriched kinases
3338
3339 kinase_ppep_label <- sqldf("
3340 WITH
3341 t(ppep, label) AS
3342 (
3343 SELECT DISTINCT
3344 SUB_MOD_RSD AS ppep,
3345 group_concat(gene, '; ') AS label
3346 FROM pseudo_ksdata
3347 WHERE GENE IN (SELECT kinase FROM enriched_kinases)
3348 GROUP BY ppep
3349 ),
3350 k(kinase, ppep_join) AS
3351 (
3352 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join
3353 FROM pseudo_ksdata
3354 WHERE GENE IN (SELECT kinase FROM enriched_kinases)
3355 )
3356 SELECT k.kinase, t.ppep, t.label
3357 FROM t, k
3358 WHERE t.ppep = k.ppep_join
3359 ORDER BY k.kinase, t.ppep
3360 ")
3361
3362 # extract what we need from full_data
3363 impish <- cbind(rownames(quant_data_imp), quant_data_imp)
3364 colnames(impish)[1] <- "Phosphopeptide"
3365 data_table_imputed_sql <- "
3366 SELECT
3367 f.*,
3368 k.label AS KSEA_enrichments,
3369 q.*
3370 FROM
3371 metadata_plus_p f
3372 LEFT JOIN kinase_ppep_label k
3373 ON f.Phosphopeptide = k.ppep,
3374 impish q
3375 WHERE
3376 f.Phosphopeptide = q.Phosphopeptide
3377 "
3378 data_table_imputed <- sqldf(data_table_imputed_sql)
3379 # Zap the duplicated 'Phosphopeptide' column named 'ppep'
3380 data_table_imputed <-
3381 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
3382
3383 # Output with imputed, un-normalized data
3384
3385 write.table(
3386 data_table_imputed
3387 , file = imputed_data_filename
3388 , sep = "\t"
3389 , col.names = TRUE
3390 , row.names = FALSE
3391 , quote = FALSE
3392 )
3393
3394
3395 #output quantile normalized data
3396 impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log)
3397 colnames(impish)[1] <- "Phosphopeptide"
3398 data_table_imputed <- sqldf(data_table_imputed_sql)
3399 # Zap the duplicated 'Phosphopeptide' column named 'ppep'
3400 data_table_imputed <-
3401 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
3402 write.table(
3403 data_table_imputed,
3404 file = imp_qn_lt_data_filenm,
3405 sep = "\t",
3406 col.names = TRUE,
3407 row.names = FALSE,
3408 quote = FALSE
3409 )
3410
3411 ppep_kinase <- sqldf("
3412 SELECT DISTINCT k.ppep, k.kinase
3413 FROM (
3414 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep
3415 FROM pseudo_ksdata
3416 WHERE GENE IN (SELECT kinase FROM enriched_kinases)
3417 ) k
3418 ORDER BY k.ppep, k.kinase
3419 ")
3420
3421 RSQLite::dbWriteTable(
3422 conn = db,
3423 name = "ksea_enriched_ks",
3424 value = ppep_kinase,
3425 append = FALSE
3426 )
3427
3428 RSQLite::dbWriteTable(
3429 conn = db,
3430 name = "anova_signif",
3431 value = p_value_data,
3432 append = FALSE
3433 )
3434
3435 ddl_exec(db, "
3436 DROP VIEW IF EXISTS stats_metadata_v;
3437 "
3438 )
3439 dml_no_rows_exec(db, "
3440 CREATE VIEW stats_metadata_v
3441 AS
3442 SELECT DISTINCT m.*,
3443 p.raw_anova_p,
3444 p.fdr_adjusted_anova_p,
3445 kek.kinase AS ksea_enrichments
3446 FROM
3447 mrgfltr_metadata_view m
3448 LEFT JOIN anova_signif p
3449 ON m.phospho_peptide = p.phosphopeptide
3450 LEFT JOIN ksea_enriched_ks kek
3451 ON m.phospho_peptide = kek.ppep
3452 ;
3453 "
3454 )
3455
3456 write.table(
3457 dbReadTable(db, "stats_metadata_v"),
3458 file = anova_ksea_mtdt_file,
3459 sep = "\t",
3460 col.names = TRUE,
3461 row.names = FALSE,
3462 quote = FALSE
3463 )
3464
3465
3466 ```
3467
3468 ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
3469 cat("\\leavevmode\n\n\n")
3470
3471 # write parameters to report
3472
3473 param_unlist <- unlist(as.list(params))
3474 param_df <- data.frame(
3475 parameter = paste0("\\verb@", names(param_unlist), "@"),
3476 value = paste0("\\verb@", gsub("$", "\\$", param_unlist, fixed = TRUE), "@")
3477 )
3478
3479 data_frame_latex(
3480 x = param_df,
3481 justification = "p{0.35\\linewidth} p{0.6\\linewidth}",
3482 centered = TRUE,
3483 caption = "Input parameters",
3484 anchor = const_table_anchor_bp,
3485 underscore_whack = FALSE
3486 )
3487
3488 # write parameters to SQLite output
3489
3490 mqppep_anova_script_param_df <- data.frame(
3491 script = "mqppep_anova_script.Rmd",
3492 parameter = names(param_unlist),
3493 value = param_unlist
3494 )
3495 ddl_exec(db, "
3496 DROP TABLE IF EXISTS script_parameter;
3497 "
3498 )
3499 ddl_exec(db, "
3500 CREATE TABLE IF NOT EXISTS script_parameter(
3501 script TEXT,
3502 parameter TEXT,
3503 value ANY,
3504 UNIQUE (script, parameter) ON CONFLICT REPLACE
3505 )
3506 ;
3507 "
3508 )
3509 RSQLite::dbWriteTable(
3510 conn = db,
3511 name = "script_parameter",
3512 value = mqppep_anova_script_param_df,
3513 append = TRUE
3514 )
3515
3516 # We are done with output
3517 RSQLite::dbDisconnect(db)
3518 ```
3519 <!--
3520 There's gotta be a better way...
3521
3522 loaded_packages_df <- sessioninfo::package_info("loaded")
3523 loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library)
3524 loaded_packages_df <- data.frame(
3525 package = loaded_packages_df$package,
3526 version = loaded_packages_df$loadedversion,
3527 date = loaded_packages_df$date
3528 )
3529 data_frame_latex(
3530 x = loaded_packages_df,
3531 justification = "l | l l",
3532 centered = FALSE,
3533 caption = "Loaded R packages",
3534 anchor = const_table_anchor_bp
3535 )
3536 -->