Mercurial > repos > galaxyp > mqppep_preproc
comparison mqppep_anova_script.Rmd @ 0:8dfd5d2b5903 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author | galaxyp |
---|---|
date | Mon, 11 Jul 2022 19:22:54 +0000 |
parents | |
children | b76c75521d91 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8dfd5d2b5903 |
---|---|
1 --- | |
2 title: "MaxQuant Phosphoproteomic Enrichment Pipeline ANOVA/KSEA" | |
3 author: | |
4 - "Nick Graham^[ORCiD 0000-0002-6811-1941, University of Southern California: Los Angeles, CA, US]" | |
5 - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]" | |
6 - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]" | |
7 date: | |
8 - "May 28, 2018" | |
9 - "; revised June 23, 2022" | |
10 output: | |
11 pdf_document: | |
12 toc: true | |
13 toc_depth: 3 | |
14 keep_tex: true | |
15 header-includes: | |
16 - \usepackage{longtable} | |
17 - \newcommand\T{\rule{0pt}{2.6ex}} % Top strut | |
18 - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut | |
19 params: | |
20 alphaFile: "test-data/alpha_levels.tabular" | |
21 inputFile: "test-data/test_input_for_anova.tabular" | |
22 preprocDb: "test-data/test_input_for_anova.sqlite" | |
23 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | |
24 show_toc: true | |
25 firstDataColumn: "^Intensity[^_]" | |
26 imputationMethod: !r c("group-median", "median", "mean", "random")[1] | |
27 meanPercentile: 1 | |
28 sdPercentile: 1.0 | |
29 regexSampleNames: "\\.\\d+[A-Z]$" | |
30 regexSampleGrouping: "\\d+" | |
31 imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" | |
32 imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" | |
33 anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt" | |
34 oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1] | |
35 oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3] | |
36 kseaCutoffStatistic: !r c("p.value", "FDR")[2] | |
37 kseaCutoffThreshold: !r c( 0.1, 0.05)[2] | |
38 kseaMinKinaseCount: 1 | |
39 intensityHeatmapRows: 75 | |
40 --- | |
41 <!-- | |
42 kseaCutoffStatistic: !r c("p.value", "FDR")[2] | |
43 kseaCutoffThreshold: !r c(0.05, 0.1)[1] | |
44 | |
45 alphaFile: "test-data/alpha_levels.tabular" | |
46 inputFile: "test-data/test_input_for_anova.tabular" | |
47 preprocDb: "test-data/test_input_for_anova.sqlite" | |
48 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | |
49 | |
50 alphaFile: "test-data/alpha_levels.tabular" | |
51 inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular" | |
52 preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite" | |
53 kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2] | |
54 | |
55 alphaFile: "test-data/alpha_levels.tabular" | |
56 inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular" | |
57 preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite" | |
58 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] | |
59 | |
60 alphaFile: "test-data/alpha_levels.tabular" | |
61 inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular" | |
62 preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite" | |
63 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] | |
64 | |
65 inputFile: "test-data/density_failure.preproc_tab.tabular" | |
66 kseaAppPrepDb: !r c(":memory:", "mqppep.sqlite")[2] | |
67 latex_document: default | |
68 --> | |
69 ```{r setup, include = FALSE} | |
70 #ref for debugging: https://yihui.org/tinytex/r/#debugging | |
71 options(tinytex.verbose = TRUE) | |
72 | |
73 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 | |
74 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 | |
75 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) | |
76 | |
77 # freeze the random number generator so the same results will be produced | |
78 # from run to run | |
79 set.seed(28571) | |
80 | |
81 ### LIBRARIES | |
82 library(gplots) | |
83 library(DBI) | |
84 library(RSQLite) | |
85 # Suppress "Warning: no DISPLAY variable so Tk is not available" | |
86 suppressWarnings(suppressMessages(library(sqldf))) | |
87 | |
88 # required but not added to search list: | |
89 # - DBI | |
90 # - RSQLite | |
91 # - ggplot2 | |
92 # - knitr | |
93 # - latex2exp | |
94 # - preprocessCore | |
95 # - reshape2 | |
96 # - vioplot | |
97 | |
98 ### CONSTANTS | |
99 | |
100 const_parfin <- par("fin") | |
101 const_boxplot_fill <- "grey94" | |
102 const_stripchart_cex <- 0.5 | |
103 const_stripsmall_cex <- | |
104 sqrt(const_stripchart_cex * const_stripchart_cex / 2) | |
105 const_stripchart_jitter <- 0.3 | |
106 const_write_debug_files <- FALSE | |
107 const_table_anchor_bp <- "bp" | |
108 const_table_anchor_ht <- "ht" | |
109 const_table_anchor_p <- "p" | |
110 const_table_anchor_tbp <- "tbp" | |
111 | |
112 | |
113 const_ksea_astrsk_kinases <- 1 | |
114 const_ksea_nonastrsk_kinases <- 2 | |
115 const_ksea_all_kinases <- 3 | |
116 | |
117 const_log10_e <- log10(exp(1)) | |
118 | |
119 ### FUNCTIONS | |
120 | |
121 # from `demo(error.catching)` | |
122 ##' Catch *and* save both errors and warnings, and in the case of | |
123 ##' a warning, also keep the computed result. | |
124 ##' | |
125 ##' @title tryCatch both warnings (with value) and errors | |
126 ##' @param expr an \R expression to evaluate | |
127 ##' @return a list with 'value' and 'warning', where | |
128 ##' 'value' may be an error caught. | |
129 ##' @author Martin Maechler; | |
130 ##' Copyright (C) 2010-2012 The R Core Team | |
131 try_catch_w_e <- function(expr) { | |
132 wrn <- NULL | |
133 # warning handler | |
134 w_handler <- function(w) { | |
135 wrn <<- w | |
136 invokeRestart("muffleWarning") | |
137 } | |
138 list( | |
139 value = withCallingHandlers( | |
140 tryCatch( | |
141 expr, | |
142 error = function(e) e | |
143 ), | |
144 warning = w_handler | |
145 ), | |
146 warning = wrn | |
147 ) | |
148 } | |
149 | |
150 | |
151 write_debug_file <- function(s) { | |
152 if (const_write_debug_files) { | |
153 s_path <- sprintf("test-data/%s.txt", deparse(substitute(s))) | |
154 print(sprintf("DEBUG writing file %s", spath)) | |
155 write.table( | |
156 s, | |
157 file = s_path, | |
158 sep = "\t", | |
159 col.names = TRUE, | |
160 row.names = TRUE, | |
161 quote = FALSE | |
162 ) | |
163 } | |
164 } | |
165 | |
166 # ref: http://adv-r.had.co.nz/Environments.html | |
167 # "When creating your own environment, note that you should set its parent | |
168 # environment to be the empty environment. This ensures you don't | |
169 # accidentally inherit objects from somewhere else." | |
170 # Caution: this prevents `with(my_env, expr)` from working when `expr` | |
171 # contains anything from the global environment, even operators! | |
172 # Hence, `x <- 1; get("x", new_env())` fails by design. | |
173 new_env <- function() { | |
174 new.env(parent = emptyenv()) | |
175 } | |
176 | |
177 ### numerical/statistical helper functions | |
178 | |
179 any_nan <- function(x) { | |
180 !any(x == "NaN") | |
181 } | |
182 | |
183 # determine standard deviation of quantile to impute | |
184 sd_finite <- function(x) { | |
185 ok <- is.finite(x) | |
186 sd(x[ok]) | |
187 } | |
188 | |
189 anova_func <- function(x, grouping_factor, one_way_f) { | |
190 subject <- data.frame( | |
191 intensity = x | |
192 ) | |
193 x_aov <- | |
194 one_way_f( | |
195 formula = intensity ~ grouping_factor, | |
196 data = subject | |
197 ) | |
198 pvalue <- | |
199 if (identical(one_way_f, aov)) | |
200 summary(x_aov)[[1]][["Pr(>F)"]][1] | |
201 else | |
202 pvalue <- x_aov$p.value | |
203 pvalue | |
204 } | |
205 | |
206 | |
207 ### LaTeX functions | |
208 | |
209 latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) { | |
210 v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v | |
211 cat( | |
212 paste0( | |
213 v_sub, | |
214 collapse = collapse_string | |
215 ) | |
216 ) | |
217 } | |
218 | |
219 latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { | |
220 cat("\\begin{itemize}\n\\item ") | |
221 latex_collapsed_vector(collapse_string, v, underscore_whack) | |
222 cat("\n\\end{itemize}\n") | |
223 } | |
224 | |
225 latex_itemized_list <- function(v, underscore_whack = TRUE) { | |
226 latex_itemized_collapsed("\n\\item ", v, underscore_whack) | |
227 } | |
228 | |
229 latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { | |
230 cat("\\begin{enumerate}\n\\item ") | |
231 latex_collapsed_vector(collapse_string, v, underscore_whack) | |
232 cat("\n\\end{enumerate}\n") | |
233 } | |
234 | |
235 latex_enumerated_list <- function(v) { | |
236 latex_enumerated_collapsed("\n\\item ", v) | |
237 } | |
238 | |
239 latex_table_row <- function(v, extra = "", underscore_whack = TRUE) { | |
240 latex_collapsed_vector(" & ", v, underscore_whack) | |
241 cat(extra) | |
242 cat(" \\\\\n") | |
243 } | |
244 | |
245 # Use this like print.data.frame, from which it is adapted: | |
246 data_frame_latex <- | |
247 function( | |
248 x, | |
249 ..., | |
250 # digits to pass to format.data.frame | |
251 digits = NULL, | |
252 # TRUE -> right-justify columns; FALSE -> left-justify | |
253 right = TRUE, | |
254 # maximumn number of rows to print | |
255 max = NULL, | |
256 # string with justification of each column | |
257 justification = NULL, | |
258 # TRUE to center on page | |
259 centered = TRUE, | |
260 # optional caption | |
261 caption = NULL, | |
262 # h(inline); b(bottom); t (top) or p (separate page) | |
263 anchor = "h", | |
264 # set underscore_whack to TRUE to escape underscores | |
265 underscore_whack = TRUE | |
266 ) { | |
267 if (is.null(justification)) | |
268 justification <- | |
269 Reduce( | |
270 f = paste, | |
271 x = rep_len(if (right) "r" else "l", length(colnames(x))) | |
272 ) | |
273 n <- length(rownames(x)) | |
274 if (length(x) == 0L) { | |
275 cat( | |
276 sprintf( | |
277 # if n is one, use singular 'row', else use plural 'rows' | |
278 ngettext( | |
279 n, | |
280 "data frame with 0 columns and %d row", | |
281 "data frame with 0 columns and %d rows" | |
282 ), | |
283 n | |
284 ), | |
285 "\n", | |
286 sep = "" | |
287 ) | |
288 } else if (n == 0L) { | |
289 cat("0 rows for:\n") | |
290 latex_itemized_list( | |
291 v = names(x), | |
292 underscore_whack = underscore_whack | |
293 ) | |
294 } else { | |
295 if (is.null(max)) | |
296 max <- getOption("max.print", 99999L) | |
297 if (!is.finite(max)) | |
298 stop("invalid 'max' / getOption(\"max.print\"): ", | |
299 max) | |
300 omit <- (n0 <- max %/% length(x)) < n | |
301 m <- as.matrix( | |
302 format.data.frame( | |
303 if (omit) x[seq_len(n0), , drop = FALSE] else x, | |
304 digits = digits, | |
305 na.encode = FALSE | |
306 ) | |
307 ) | |
308 cat( | |
309 # h(inline); b(bottom); t (top) or p (separate page) | |
310 paste0("\\begin{table}[", anchor, "]\n") | |
311 ) | |
312 if (!is.null(caption)) | |
313 cat(paste0(" \\caption{", caption, "}")) | |
314 if (centered) cat("\\centering\n") | |
315 cat( | |
316 paste( | |
317 " \\begin{tabular}{", | |
318 justification, | |
319 "}\n", | |
320 sep = "" | |
321 ) | |
322 ) | |
323 # ref: https://tex.stackexchange.com/a/50353 | |
324 # Describes use of \rule{0pt}{3ex} | |
325 if (!is.null(caption)) | |
326 cat("\\B \\\\ \\hline\\hline\n") | |
327 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 | |
328 latex_table_row( | |
329 v = colnames(m), | |
330 extra = "\\T\\B", | |
331 underscore_whack = underscore_whack | |
332 ) | |
333 cat("\\hline\n") | |
334 for (i in seq_len(length(m[, 1]))) { | |
335 latex_table_row( | |
336 v = m[i, ], | |
337 underscore_whack = underscore_whack | |
338 ) | |
339 } | |
340 cat( | |
341 paste( | |
342 " \\end{tabular}", | |
343 "\\end{table}", | |
344 sep = "\n" | |
345 ) | |
346 ) | |
347 if (omit) | |
348 cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", | |
349 n - n0, "rows ]\n") | |
350 } | |
351 invisible(x) | |
352 } | |
353 | |
354 hypersub <- | |
355 function(s) { | |
356 hyper <- tolower(s) | |
357 hyper <- gsub("[^a-z0-9]+", "-", hyper) | |
358 hyper <- gsub("[-]+", "-", hyper) | |
359 hyper <- sub("^[-]", "", hyper) | |
360 hyper <- sub("[-]$", "", hyper) | |
361 return(hyper) | |
362 } | |
363 | |
364 subsection_header <- | |
365 function(s) { | |
366 hyper <- hypersub(s) | |
367 cat( | |
368 sprintf( | |
369 "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n", | |
370 hyper, s, hyper | |
371 ) | |
372 ) | |
373 } | |
374 | |
375 subsubsection_header <- | |
376 function(s) { | |
377 hyper <- hypersub(s) | |
378 cat( | |
379 sprintf( | |
380 "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n", | |
381 hyper, s, hyper | |
382 ) | |
383 ) | |
384 } | |
385 | |
386 ### SQLite functions | |
387 | |
388 ddl_exec <- function(db, sql) { | |
389 discard <- DBI::dbExecute(conn = db, statement = sql) | |
390 if (FALSE && discard != 0) { | |
391 need_newpage <- TRUE | |
392 if (need_newpage) { | |
393 need_newpage <<- FALSE | |
394 cat("\\newpage\n") | |
395 } | |
396 o_file <- stdout() | |
397 cat("\n\\begin{verbatim}\n") | |
398 cat(sql, file = o_file) | |
399 cat(sprintf("\n%d rows affected by DDL\n", discard), file = o_file) | |
400 cat("\n\\end{verbatim}\n") | |
401 } | |
402 } | |
403 | |
404 dml_no_rows_exec <- function(db, sql) { | |
405 discard <- DBI::dbExecute(conn = db, statement = sql) | |
406 if (discard != 0) { | |
407 need_newpage <- TRUE | |
408 if (need_newpage) { | |
409 need_newpage <<- FALSE | |
410 cat("\\newpage\n") | |
411 } | |
412 cat("\n\\begin{verbatim}\n") | |
413 o_file <- stdout() | |
414 cat(sql, file = o_file) | |
415 cat(sprintf("\n%d rows affected by DML\n", discard), file = o_file) | |
416 cat("\n\\end{verbatim}\n") | |
417 } | |
418 } | |
419 | |
420 ### KSEA functions and helpers | |
421 | |
422 # Adapted from KSEAapp::KSEA.Scores to allow retrieval of: | |
423 # - maximum log2(FC) | |
424 ksea_scores <- function( | |
425 | |
426 # For human data, typically, ksdata = KSEAapp::ksdata | |
427 ksdata, | |
428 | |
429 # Input data file having columns: | |
430 # - Protein : abbreviated protein name | |
431 # - Gene : HUGO gene name | |
432 # - Peptide : peptide sequence without indications of phosphorylation | |
433 # - Reside.Both : position(s) of phosphorylation within Gene sequence | |
434 # - First letter designates AA that is modified | |
435 # - Numbers indicate position within Gene | |
436 # - Multiple values are separated by semicolons | |
437 # - p : p-value | |
438 # - FC : fold-change | |
439 px, | |
440 | |
441 # A binary input of TRUE or FALSE, indicating whether or not to include | |
442 # NetworKIN predictions | |
443 networkin, | |
444 | |
445 # A numeric value between 1 and infinity setting the minimum NetworKIN | |
446 # score (can be left out if networkin = FALSE) | |
447 networkin_cutoff | |
448 | |
449 ) { | |
450 if (length(grep(";", px$Residue.Both)) == 0) { | |
451 # There are no Residue.Both entries having semicolons, so new is | |
452 # simply px except two columns are renamed and a column is added | |
453 # for log2(abs(fold-change)) | |
454 new <- px | |
455 colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD") | |
456 new$log2_fc <- log2(abs(as.numeric(as.character(new$FC)))) | |
457 new <- new[complete.cases(new$log2_fc), ] | |
458 } else { | |
459 # Split each row having semicolons in Residue.Both into rows that are | |
460 # duplicated in all respects except that each row has a single | |
461 # member of the set "split-on-semicolon-Residue.Both" | |
462 px_double <- px[grep(";", px$Residue.Both), ] | |
463 residues <- as.character(px_double$Residue.Both) | |
464 residues <- as.matrix(residues, ncol = 1) | |
465 split <- strsplit(residues, split = ";") | |
466 # x gets count of residues in each row, | |
467 # i.e., 1 + count of semicolons | |
468 x <- sapply(split, length) | |
469 # Here is the set of split rows | |
470 px_single <- data.frame( | |
471 Protein = rep(px_double$Protein, x), | |
472 Gene = rep(px_double$Gene, x), | |
473 Peptide = rep(px_double$Peptide, x), | |
474 Residue.Both = unlist(split), | |
475 p = rep(px_double$p, x), | |
476 FC = rep(px_double$FC, x) | |
477 ) | |
478 # new first gets the split rows | |
479 new <- px[-grep(";", px$Residue.Both), ] | |
480 # to new, append the rows that didn't need splitting in the first place | |
481 new <- rbind(new, px_single) | |
482 # map Gene to SUB_GENE | |
483 # map Residue.Both to SUB_MOD_RSD | |
484 colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD") | |
485 # Eliminate any non-positive values to prevent introduction of | |
486 # infinite or NaN values | |
487 new[(0 <= new$log2_fc), "log2_fc"] <- NA | |
488 # Because of preceding step, there is no need for abs in the next line | |
489 new$log2_fc <- log2(as.numeric(as.character(new$FC))) | |
490 # Convert any illegal values from NaN to NA | |
491 new[is.nan(new$log2_fc), "log2_fc"] <- NA | |
492 # Eliminate rows having missing values (e.g., non-imputed data) | |
493 new <- new[complete.cases(new$log2_fc), ] | |
494 } | |
495 if (networkin == TRUE) { | |
496 # When NetworKIN is true, filter on NetworKIN.cutoff which includes | |
497 # PhosphoSitePlus data *because its networkin_score is set to Inf* | |
498 ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ] | |
499 ksdata_filtered <- ksdata_filtered[ | |
500 (ksdata_filtered$networkin_score >= networkin_cutoff), ] | |
501 } else { | |
502 # Otherwise, simply use PhosphSitePlus rows | |
503 ksdata_filtered <- ksdata[ | |
504 grep("PhosphoSitePlus", ksdata$Source), ] | |
505 } | |
506 # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD | |
507 # colnames of ksdata_filtered: | |
508 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID" | |
509 # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID" | |
510 # "SITE_...7_AA" "networkin_score" "Source" | |
511 # colnames of new: | |
512 # "Protein" "SUB_GENE" "Peptide" "SUB_MOD_RSD" "p" "FC" "log2_fc" | |
513 # Equivalent to: | |
514 # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc | |
515 # FROM ksdata_filtered a | |
516 # INNER JOIN new b | |
517 # ON a.SUB_GENE = b.SUB_GENE | |
518 # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD | |
519 ksdata_dataset <- base::merge(ksdata_filtered, new) | |
520 # colnames of ksdata_dataset: | |
521 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" | |
522 # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" | |
523 # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein" | |
524 # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform) | |
525 # Re-order dataset; prior to accounting for isoforms | |
526 ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] | |
527 # Extract non-isoform accession in UniProtKB | |
528 ksdata_dataset$uniprot_no_isoform <- sapply( | |
529 ksdata_dataset$KIN_ACC_ID, | |
530 function(x) unlist(strsplit(as.character(x), split = "-"))[1] | |
531 ) | |
532 # Discard previous results while selecting interesting columns ... | |
533 ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] | |
534 # Column names are now: | |
535 # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p" | |
536 # "FC" "log2_fc" "Source" | |
537 # Make column names human-readable | |
538 colnames(ksdata_dataset_abbrev) <- c( | |
539 "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p", | |
540 "FC", "log2FC", "Source" | |
541 ) | |
542 # SELECT * FROM ksdata_dataset_abbrev | |
543 # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p | |
544 ksdata_dataset_abbrev <- | |
545 ksdata_dataset_abbrev[ | |
546 order( | |
547 ksdata_dataset_abbrev$Kinase.Gene, | |
548 ksdata_dataset_abbrev$Substrate.Gene, | |
549 ksdata_dataset_abbrev$Substrate.Mod, | |
550 ksdata_dataset_abbrev$p), | |
551 ] | |
552 # First aggregation step to account for multiply phosphorylated peptides | |
553 # and differing peptide sequences; the goal here is to combine results | |
554 # for all measurements of the same substrate. | |
555 # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, | |
556 # `Source`, avg(log2FC) AS log2FC | |
557 # FROM ksdata_dataset_abbrev | |
558 # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, | |
559 # `Source` | |
560 # ORDER BY `Kinase.Gene`; | |
561 # in two steps: | |
562 # (1) compute average log_2(fold-change) | |
563 ksdata_dataset_abbrev <- aggregate( | |
564 log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source, | |
565 data = ksdata_dataset_abbrev, | |
566 FUN = mean | |
567 ) | |
568 # (2) order by Kinase.Gene | |
569 ksdata_dataset_abbrev <- | |
570 ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] | |
571 # SELECT `Kinase.Gene`, count(*) | |
572 # FROM ksdata_dataset_abbrev | |
573 # GROUP BY `Kinase.Gene`; | |
574 # in two steps: | |
575 # (1) Extract the list of Kinase.Gene names | |
576 kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) | |
577 # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev, | |
578 # named by Kinase.Gene | |
579 kinase_list <- as.matrix(table(kinase_list)) | |
580 # Second aggregation step to account for all substrates per kinase | |
581 # CREATE TABLE mean_fc | |
582 # AS | |
583 # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC | |
584 # FROM ksdata_dataset_abbrev | |
585 # GROUP BY `Kinase.Gene` | |
586 mean_fc <- aggregate( | |
587 log2FC ~ Kinase.Gene, | |
588 data = ksdata_dataset_abbrev, | |
589 FUN = mean | |
590 ) | |
591 # mean_fc columns: "Kinase.Gene", "log2FC" | |
592 if (FALSE) { | |
593 # I need to re-think this; I was trying to find the most-represented | |
594 # peptide, but that horse has already left the barn | |
595 # SELECT `Kinase.Gene`, max(abs(log2FC)) AS log2FC | |
596 # FROM ksdata_dataset_abbrev | |
597 # GROUP BY `Kinase.Gene` | |
598 max_fc <- aggregate( | |
599 log2FC ~ Kinase.Gene, | |
600 data = ksdata_dataset_abbrev, | |
601 FUN = function(r) max(abs(r)) | |
602 ) | |
603 } | |
604 | |
605 # Create column 3: mS | |
606 mean_fc$m_s <- mean_fc[, 2] | |
607 # Create column 4: Enrichment | |
608 mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE)) | |
609 # Create column 5: m, count of substrates | |
610 mean_fc$m <- kinase_list | |
611 # Create column 6: z-score | |
612 mean_fc$z_score <- ( | |
613 (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) * | |
614 sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE) | |
615 # Create column 7: p-value, deduced from z-score | |
616 mean_fc$p_value <- pnorm(-abs(mean_fc$z_score)) | |
617 # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value | |
618 mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr") | |
619 | |
620 # Remove log2FC column, which is duplicated as mS | |
621 mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2] | |
622 # Correct the column names which we had to hack because of the linter... | |
623 colnames(mean_fc) <- c( | |
624 "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" | |
625 ) | |
626 return(mean_fc) | |
627 } | |
628 | |
629 low_fdr_barplot <- function( | |
630 rslt, | |
631 i_cntrst, | |
632 i, | |
633 a_level, | |
634 b_level, | |
635 fold_change, | |
636 caption | |
637 ) { | |
638 rslt_score_list_i <- rslt$score_list[[i]] | |
639 if (!is.null(rslt_score_list_i)) { | |
640 rslt_score_list_i_nrow <- nrow(rslt_score_list_i) | |
641 k <- data.frame( | |
642 contrast = as.integer(i_cntrst), | |
643 a_level = rep.int(a_level, rslt_score_list_i_nrow), | |
644 b_level = rep.int(b_level, rslt_score_list_i_nrow), | |
645 kinase_gene = rslt_score_list_i$Kinase.Gene, | |
646 mean_log2_fc = rslt_score_list_i$mS, | |
647 enrichment = rslt_score_list_i$Enrichment, | |
648 substrate_count = rslt_score_list_i$m, | |
649 z_score = rslt_score_list_i$z.score, | |
650 p_value = rslt_score_list_i$p.value, | |
651 fdr = rslt_score_list_i$FDR | |
652 ) | |
653 selector <- switch( | |
654 ksea_cutoff_statistic, | |
655 "FDR" = { | |
656 k$fdr | |
657 }, | |
658 "p.value" = { | |
659 k$p_value | |
660 }, | |
661 stop( | |
662 sprintf( | |
663 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
664 ksea_cutoff_statistic | |
665 ) | |
666 ) | |
667 ) | |
668 | |
669 k <- k[selector < ksea_cutoff_threshold, ] | |
670 | |
671 if (nrow(k) > 1) { | |
672 op <- par(mai = c(1, 1.5, 0.4, 0.4)) | |
673 numeric_z_score <- as.numeric(k$z_score) | |
674 z_score_order <- order(numeric_z_score) | |
675 kinase_name <- k$kinase_gene | |
676 long_caption <- | |
677 sprintf( | |
678 "Kinase z-score, %s < %s, %s", | |
679 ksea_cutoff_statistic, | |
680 ksea_cutoff_threshold, | |
681 caption | |
682 ) | |
683 my_cex_caption <- 65.0 / max(65.0, nchar(long_caption)) | |
684 cat("\n\\clearpage\n") | |
685 barplot( | |
686 height = numeric_z_score[z_score_order], | |
687 border = NA, | |
688 xpd = FALSE, | |
689 cex.names = 1.0, | |
690 cex.axis = 1.0, | |
691 main = long_caption, | |
692 cex.main = my_cex_caption, | |
693 names.arg = kinase_name[z_score_order], | |
694 horiz = TRUE, | |
695 srt = 45, | |
696 las = 1) | |
697 par(op) | |
698 } | |
699 } | |
700 } | |
701 | |
702 # note that this adds elements to the global variable `ksea_asterisk_hash` | |
703 | |
704 low_fdr_print <- function( | |
705 rslt, | |
706 i_cntrst, | |
707 i, | |
708 a_level, | |
709 b_level, | |
710 fold_change, | |
711 caption | |
712 ) { | |
713 rslt_score_list_i <- rslt$score_list[[i]] | |
714 if (!is.null(rslt_score_list_i)) { | |
715 rslt_score_list_i_nrow <- nrow(rslt_score_list_i) | |
716 k <- contrast_ksea_scores <- data.frame( | |
717 contrast = as.integer(i_cntrst), | |
718 a_level = rep.int(a_level, rslt_score_list_i_nrow), | |
719 b_level = rep.int(b_level, rslt_score_list_i_nrow), | |
720 kinase_gene = rslt_score_list_i$Kinase.Gene, | |
721 mean_log2_fc = rslt_score_list_i$mS, | |
722 enrichment = rslt_score_list_i$Enrichment, | |
723 substrate_count = rslt_score_list_i$m, | |
724 z_score = rslt_score_list_i$z.score, | |
725 p_value = rslt_score_list_i$p.value, | |
726 fdr = rslt_score_list_i$FDR | |
727 ) | |
728 | |
729 selector <- switch( | |
730 ksea_cutoff_statistic, | |
731 "FDR" = { | |
732 k$fdr | |
733 }, | |
734 "p.value" = { | |
735 k$p_value | |
736 }, | |
737 stop( | |
738 sprintf( | |
739 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
740 ksea_cutoff_statistic | |
741 ) | |
742 ) | |
743 ) | |
744 | |
745 k <- k[selector < ksea_cutoff_threshold, ] | |
746 # save kinase names to ksea_asterisk_hash | |
747 for (kinase_name in k$kinase_gene) { | |
748 ksea_asterisk_hash[[kinase_name]] <- 1 | |
749 } | |
750 | |
751 db_write_table_overwrite <- (i_cntrst < 2) | |
752 db_write_table_append <- !db_write_table_overwrite | |
753 RSQLite::dbWriteTable( | |
754 conn = db, | |
755 name = "contrast_ksea_scores", | |
756 value = contrast_ksea_scores, | |
757 append = db_write_table_append | |
758 ) | |
759 selector <- switch( | |
760 ksea_cutoff_statistic, | |
761 "FDR" = { | |
762 contrast_ksea_scores$fdr | |
763 }, | |
764 "p.value" = { | |
765 contrast_ksea_scores$p_value | |
766 }, | |
767 stop( | |
768 sprintf( | |
769 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
770 ksea_cutoff_statistic | |
771 ) | |
772 ) | |
773 ) | |
774 output_df <- contrast_ksea_scores[ | |
775 selector < ksea_cutoff_threshold, | |
776 c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count", | |
777 "z_score", "p_value", "fdr") | |
778 ] | |
779 output_order <- with(output_df, order(mean_log2_fc, kinase_gene)) | |
780 output_df <- output_df[output_order, ] | |
781 colnames(output_df) <- | |
782 c( | |
783 colnames(output_df)[1], | |
784 colnames(output_df)[2], | |
785 "enrichment", | |
786 "m_s", | |
787 "z_score", | |
788 "p_value", | |
789 "fdr" | |
790 ) | |
791 output_df$fdr <- sprintf("%0.4f", output_df$fdr) | |
792 output_df$p_value <- sprintf("%0.2e", output_df$p_value) | |
793 output_df$z_score <- sprintf("%0.2f", output_df$z_score) | |
794 output_df$m_s <- sprintf("%d", output_df$m_s) | |
795 output_df$enrichment <- sprintf("%0.2f", output_df$enrichment) | |
796 output_ncol <- ncol(output_df) | |
797 colnames(output_df) <- | |
798 c( | |
799 "Kinase", | |
800 "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)", | |
801 "Enrichment", | |
802 "Substrates", | |
803 "z-score", | |
804 "p-value", | |
805 "FDR" | |
806 ) | |
807 selector <- switch( | |
808 ksea_cutoff_statistic, | |
809 "FDR" = { | |
810 rslt$score_list[[i]]$FDR | |
811 }, | |
812 "p.value" = { | |
813 rslt$score_list[[i]]$p.value | |
814 }, | |
815 stop( | |
816 sprintf( | |
817 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
818 ksea_cutoff_statistic | |
819 ) | |
820 ) | |
821 ) | |
822 if (sum(selector < ksea_cutoff_threshold) > 0) { | |
823 math_caption <- gsub("{", "\\{", caption, fixed = TRUE) | |
824 math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE) | |
825 data_frame_latex( | |
826 x = output_df, | |
827 justification = "l c c c c c c", | |
828 centered = TRUE, | |
829 caption = sprintf( | |
830 "\\text{%s}, %s < %s", | |
831 math_caption, | |
832 ksea_cutoff_statistic, | |
833 ksea_cutoff_threshold | |
834 ), | |
835 anchor = const_table_anchor_p | |
836 ) | |
837 } else { | |
838 cat( | |
839 sprintf( | |
840 "\\break | |
841 No kinases had | |
842 \\(\\text{%s}_\\text{enrichment} < %s\\) | |
843 for contrast %s\\hfill\\break\n", | |
844 ksea_cutoff_statistic, | |
845 ksea_cutoff_threshold, | |
846 caption | |
847 ) | |
848 ) | |
849 } | |
850 } | |
851 } | |
852 | |
853 # create_breaks is a helper for ksea_heatmap | |
854 create_breaks <- function(merged_scores) { | |
855 if (min(merged_scores, na.rm = TRUE) < -1.6) { | |
856 breaks_neg <- seq(-1.6, 0, length.out = 30) | |
857 breaks_neg <- | |
858 append( | |
859 seq(min(merged_scores, na.rm = TRUE), -1.6, length.out = 10), | |
860 breaks_neg | |
861 ) | |
862 breaks_neg <- sort(unique(breaks_neg)) | |
863 } else { | |
864 breaks_neg <- seq(-1.6, 0, length.out = 30) | |
865 } | |
866 if (max(merged_scores, na.rm = TRUE) > 1.6) { | |
867 breaks_pos <- seq(0, 1.6, length.out = 30) | |
868 breaks_pos <- | |
869 append( | |
870 breaks_pos, | |
871 seq(1.6, max(merged_scores, na.rm = TRUE), | |
872 length.out = 10) | |
873 ) | |
874 breaks_pos <- sort(unique(breaks_pos)) | |
875 } else { | |
876 breaks_pos <- seq(0, 1.6, length.out = 30) | |
877 } | |
878 breaks_all <- unique(append(breaks_neg, breaks_pos)) | |
879 mycol_neg <- | |
880 gplots::colorpanel(n = length(breaks_neg), | |
881 low = "blue", | |
882 high = "white") | |
883 mycol_pos <- | |
884 gplots::colorpanel(n = length(breaks_pos) - 1, | |
885 low = "white", | |
886 high = "red") | |
887 mycol <- unique(append(mycol_neg, mycol_pos)) | |
888 color_breaks <- list(breaks_all, mycol) | |
889 return(color_breaks) | |
890 } | |
891 | |
892 # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap | |
893 draw_kseaapp_summary_heatmap <- function( | |
894 x, | |
895 sample_cluster, | |
896 merged_asterisk, | |
897 my_cex_row, | |
898 color_breaks, | |
899 margins, | |
900 ... | |
901 ) { | |
902 merged_scores <- x | |
903 if (!is.matrix(x)) { | |
904 cat( | |
905 paste0( | |
906 "No plot because \\texttt{typeof(x)} is '", | |
907 typeof(x), | |
908 "' rather than 'matrix'.\n\n" | |
909 ) | |
910 ) | |
911 } else if (nrow(x) < 2) { | |
912 cat("No plot because matrix x has ", nrow(x), " rows.\n\n") | |
913 cat("\\begin{verbatim}\n") | |
914 str(x) | |
915 cat("\\end{verbatim}\n") | |
916 } else if (ncol(x) < 2) { | |
917 cat("No plot because matrix x has ", ncol(x), " columns.\n\n") | |
918 cat("\\begin{verbatim}\n") | |
919 str(x) | |
920 cat("\\end{verbatim}\n") | |
921 } else { | |
922 gplots::heatmap.2( | |
923 x = merged_scores, | |
924 Colv = sample_cluster, | |
925 scale = "none", | |
926 cellnote = merged_asterisk, | |
927 notecol = "white", | |
928 cexCol = 0.9, | |
929 # Heuristically assign size of row labels | |
930 cexRow = min(1.0, ((3 * my_cex_row) ^ 1.7) / 2.25), | |
931 srtCol = 45, | |
932 srtRow = 45, | |
933 notecex = 3 * my_cex_row, | |
934 col = color_breaks[[2]], | |
935 density.info = "none", | |
936 trace = "none", | |
937 breaks = color_breaks[[1]], | |
938 lmat = rbind(c(0, 3), c(2, 1), c(0, 4)), | |
939 lhei = c(0.4, 8.0, 1.1), | |
940 lwid = c(0.5, 3), | |
941 key = FALSE, | |
942 margins = margins, | |
943 ... | |
944 ) | |
945 } | |
946 } | |
947 | |
948 # Adapted from KSEAapp::KSEA.Heatmap | |
949 ksea_heatmap <- function( | |
950 # the data frame outputs from the KSEA.Scores() function, in list format | |
951 score_list, | |
952 # a character vector of all the sample names for heatmap annotation: | |
953 # - the names must be in the same order as the data in score_list | |
954 # - please avoid long names, as they may get cropped in the final image | |
955 sample_labels, | |
956 # character string of either "p.value" or "FDR" indicating the data column | |
957 # to use for marking statistically significant scores | |
958 stats, | |
959 # a numeric value between 0 and infinity indicating the min. number of | |
960 # substrates a kinase must have to be included in the heatmap | |
961 m_cutoff, | |
962 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff | |
963 # for indicating significant kinases in the heatmap | |
964 p_cutoff = | |
965 stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"), | |
966 # a binary input of TRUE or FALSE, indicating whether or not to perform | |
967 # hierarchical clustering of the sample columns | |
968 sample_cluster, | |
969 # a binary input of TRUE or FALSE, indicating whether or not to export | |
970 # the heatmap as a .png image into the working directory | |
971 export = FALSE, | |
972 # bottom and right margins; adjust as needed if contrast names are too long | |
973 margins = c(6, 20), | |
974 # print which kinases? | |
975 # - Mandatory argument, must be one of const_ksea_.*_kinases | |
976 which_kinases, | |
977 # additional arguments to gplots::heatmap.2, such as: | |
978 # - main: main title of plot | |
979 # - xlab: x-axis label | |
980 # - ylab: y-axis label | |
981 ... | |
982 ) { | |
983 filter_m <- function(dataset, m_cutoff) { | |
984 filtered <- dataset[(dataset$m >= m_cutoff), ] | |
985 return(filtered) | |
986 } | |
987 score_list_m <- lapply(score_list, function(...) filter_m(..., m_cutoff)) | |
988 for (i in seq_len(length(score_list_m))) { | |
989 names <- colnames(score_list_m[[i]])[c(2:7)] | |
990 colnames(score_list_m[[i]])[c(2:7)] <- | |
991 paste(names, i, sep = ".") | |
992 } | |
993 master <- | |
994 Reduce( | |
995 f = function(...) { | |
996 base::merge(..., by = "Kinase.Gene", all = FALSE) | |
997 }, | |
998 x = score_list_m | |
999 ) | |
1000 | |
1001 row.names(master) <- master$Kinase.Gene | |
1002 columns <- as.character(colnames(master)) | |
1003 merged_scores <- as.matrix(master[, grep("z.score", columns), drop = FALSE]) | |
1004 colnames(merged_scores) <- sample_labels | |
1005 merged_stats <- as.matrix(master[, grep(stats, columns)]) | |
1006 asterisk <- function(mtrx, p_cutoff) { | |
1007 new <- data.frame() | |
1008 for (i in seq_len(nrow(mtrx))) { | |
1009 for (j in seq_len(ncol(mtrx))) { | |
1010 my_value <- mtrx[i, j] | |
1011 if (!is.na(my_value) && my_value < p_cutoff) { | |
1012 new[i, j] <- "*" | |
1013 } else { | |
1014 new[i, j] <- "" | |
1015 } | |
1016 } | |
1017 } | |
1018 return(new) | |
1019 } | |
1020 merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff)) | |
1021 | |
1022 # begin hack to print only significant rows | |
1023 asterisk_rows <- rowSums(merged_asterisk == "*") > 0 | |
1024 all_rows <- rownames(merged_stats) | |
1025 names(asterisk_rows) <- all_rows | |
1026 non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE]) | |
1027 asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE]) | |
1028 merged_scores_asterisk <- merged_scores[names(asterisk_rows), ] | |
1029 merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), ] | |
1030 # end hack to print only significant rows | |
1031 | |
1032 row_list <- list() | |
1033 row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows | |
1034 row_list[[const_ksea_all_kinases]] <- all_rows | |
1035 row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows | |
1036 | |
1037 i <- which_kinases | |
1038 my_row_names <- row_list[[i]] | |
1039 scrs <- merged_scores[my_row_names, ] | |
1040 stts <- merged_stats[my_row_names, ] | |
1041 merged_asterisk <- as.matrix(asterisk(stts, p_cutoff)) | |
1042 | |
1043 color_breaks <- create_breaks(scrs) | |
1044 plot_height <- nrow(scrs) ^ 0.55 | |
1045 plot_width <- ncol(scrs) ^ 0.7 | |
1046 my_cex_row <- 0.25 * 16 / plot_height | |
1047 if (export == "TRUE") { | |
1048 png( | |
1049 "KSEA.Merged.Heatmap.png", | |
1050 width = plot_width * 300, | |
1051 height = 2 * plot_height * 300, | |
1052 res = 300, | |
1053 pointsize = 14 | |
1054 ) | |
1055 } | |
1056 draw_kseaapp_summary_heatmap( | |
1057 x = scrs, | |
1058 sample_cluster = sample_cluster, | |
1059 merged_asterisk = merged_asterisk, | |
1060 my_cex_row = my_cex_row, | |
1061 color_breaks = color_breaks, | |
1062 margins = margins | |
1063 ) | |
1064 if (export == "TRUE") { | |
1065 dev.off() | |
1066 } | |
1067 return(my_row_names) | |
1068 } | |
1069 | |
1070 # helper for heatmaps of phosphopeptide intensities | |
1071 | |
1072 draw_intensity_heatmap <- | |
1073 function( | |
1074 m, # matrix with rownames already formatted | |
1075 cutoff, # cutoff used by hm_heading_function | |
1076 hm_heading_function, # construct and cat heading from m and cutoff | |
1077 hm_main_title, # main title for plot (drawn below heading) | |
1078 suppress_row_dendrogram = TRUE, # set to false to show dendrogram | |
1079 max_peptide_count # experimental: | |
1080 = intensity_hm_rows, # values of 50 and 75 worked well | |
1081 ... # passthru parameters for heatmap | |
1082 ) { | |
1083 peptide_count <- 0 | |
1084 # emit the heading for the heatmap | |
1085 if (hm_heading_function(m, cutoff)) { | |
1086 peptide_count <- min(max_peptide_count, nrow(m)) | |
1087 if (nrow(m) > 1) { | |
1088 m_margin <- m[peptide_count:1, ] | |
1089 # Margin setting was heuristically derived | |
1090 margins <- | |
1091 c(0.5, # col | |
1092 max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row | |
1093 ) | |
1094 } | |
1095 if (nrow(m) > 1) { | |
1096 tryCatch( | |
1097 { | |
1098 old_oma <- par("oma") | |
1099 par(cex.main = 0.6) | |
1100 # Heuristically determined character size adjustment formula | |
1101 char_contractor <- | |
1102 250000 / ( | |
1103 max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows | |
1104 ) | |
1105 heatmap( | |
1106 m[peptide_count:1, ], | |
1107 Rowv = if (suppress_row_dendrogram) NA else NULL, | |
1108 Colv = NA, | |
1109 cexRow = char_contractor, | |
1110 cexCol = char_contractor * 50 / max_peptide_count, | |
1111 scale = "row", | |
1112 margins = margins, | |
1113 main = | |
1114 "Unimputed, unnormalized log(intensities)", | |
1115 xlab = "", | |
1116 las = 1, | |
1117 ... | |
1118 ) | |
1119 }, | |
1120 error = function(e) { | |
1121 cat( | |
1122 sprintf( | |
1123 "\nCould not draw heatmap, possibly because of too many missing values. Internal message: %s\n", | |
1124 e$message | |
1125 ) | |
1126 ) | |
1127 }, | |
1128 finally = par(old_oma) | |
1129 ) | |
1130 } | |
1131 } | |
1132 return(peptide_count) | |
1133 } | |
1134 ``` | |
1135 | |
1136 ```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
1137 cat("\\listoftables\n") | |
1138 ``` | |
1139 # Purpose | |
1140 | |
1141 To perform for phosphopeptides: | |
1142 | |
1143 - imputation of missing values, | |
1144 - quantile normalization, | |
1145 - ANOVA (using the R stats::`r params$oneWayManyCategories` function), and | |
1146 - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases: | |
1147 - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org) | |
1148 - The Human Proteome Database [http://hprd.org](http://hprd.org) | |
1149 - NetworKIN [http://networkin.science/](http://networkin.science/) | |
1150 - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx) | |
1151 | |
1152 ```{r include = FALSE} | |
1153 | |
1154 ### GLOBAL VARIABLES | |
1155 | |
1156 # parameters for KSEA | |
1157 | |
1158 ksea_cutoff_statistic <- params$kseaCutoffStatistic | |
1159 ksea_cutoff_threshold <- params$kseaCutoffThreshold | |
1160 ksea_min_kinase_count <- params$kseaMinKinaseCount | |
1161 | |
1162 ksea_heatmap_titles <- list() | |
1163 ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <- | |
1164 sprintf( | |
1165 "Summary for all kinases enriched in one or more contrasts at %s < %s", | |
1166 ksea_cutoff_statistic, | |
1167 ksea_cutoff_threshold | |
1168 ) | |
1169 ksea_heatmap_titles[[const_ksea_all_kinases]] <- | |
1170 "Summary figure for all contrasts and all kinases" | |
1171 ksea_heatmap_titles[[const_ksea_nonastrsk_kinases]] <- | |
1172 sprintf( | |
1173 "Summary for all kinases not enriched at %s < %s in any contrast", | |
1174 ksea_cutoff_statistic, | |
1175 ksea_cutoff_threshold | |
1176 ) | |
1177 # hash to hold names of significantly enriched kinases | |
1178 ksea_asterisk_hash <- new_env() | |
1179 | |
1180 # READ PARAMETERS (mostly) | |
1181 | |
1182 intensity_hm_rows <- params$intensityHeatmapRows | |
1183 # Input Filename | |
1184 input_file <- params$inputFile | |
1185 | |
1186 # First data column - ideally, this could be detected via regexSampleNames, | |
1187 # but for now leave it as is. | |
1188 first_data_column <- params$firstDataColumn | |
1189 fdc_is_integer <- is.integer(first_data_column) | |
1190 if (fdc_is_integer) { | |
1191 first_data_column <- as.integer(params$firstDataColumn) | |
1192 } | |
1193 | |
1194 # False discovery rate adjustment for ANOVA | |
1195 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 | |
1196 val_fdr <- | |
1197 read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "") | |
1198 | |
1199 if ( | |
1200 ncol(val_fdr) != 1 || | |
1201 sum(!is.numeric(val_fdr[, 1])) || | |
1202 sum(val_fdr[, 1] < 0) || | |
1203 sum(val_fdr[, 1] > 1) | |
1204 ) { | |
1205 stop("alphaFile should be one column of numbers within the range [0.0,1.0]") | |
1206 } | |
1207 val_fdr <- val_fdr[, 1] | |
1208 | |
1209 #Imputed Data filename | |
1210 imputed_data_filename <- params$imputedDataFilename | |
1211 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile | |
1212 anova_ksea_mtdt_file <- params$anovaKseaMetadata | |
1213 | |
1214 ``` | |
1215 | |
1216 ```{r echo = FALSE} | |
1217 # Imputation method, should be one of | |
1218 # "random", "group-median", "median", or "mean" | |
1219 imputation_method <- params$imputationMethod | |
1220 | |
1221 # Selection of percentile of logvalue data to set the mean for random number | |
1222 # generation when using random imputation | |
1223 mean_percentile <- params$meanPercentile / 100.0 | |
1224 | |
1225 # deviation adjustment-factor for random values; real number. | |
1226 sd_percentile <- params$sdPercentile | |
1227 | |
1228 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" | |
1229 regex_sample_names <- params$regexSampleNames | |
1230 | |
1231 # Regular expression to extract Sample Grouping from Sample Name; | |
1232 # if error occurs, compare sample_treatment_levels vs. sample_name_matches | |
1233 # to see if groupings/pairs line up | |
1234 # e.g., "(\\d+)" | |
1235 regex_sample_grouping <- params$regexSampleGrouping | |
1236 | |
1237 one_way_all_categories_fname <- params$oneWayManyCategories | |
1238 one_way_all_categories <- try_catch_w_e( | |
1239 match.fun(one_way_all_categories_fname)) | |
1240 if (!is.function(one_way_all_categories$value)) { | |
1241 write("fatal error for parameter oneWayManyCategories:", stderr()) | |
1242 write(one_way_all_categories$value$message, stderr()) | |
1243 if (sys.nframe() > 0) quit(save = "no", status = 1) | |
1244 stop("Cannot continue. Goodbye.") | |
1245 } | |
1246 one_way_all_categories <- one_way_all_categories$value | |
1247 | |
1248 one_way_two_categories_fname <- params$oneWayManyCategories | |
1249 one_way_two_categories <- try_catch_w_e( | |
1250 match.fun(one_way_two_categories_fname)) | |
1251 if (!is.function(one_way_two_categories$value)) { | |
1252 cat("fatal error for parameter oneWayTwoCategories: \n") | |
1253 cat(one_way_two_categories$value$message, fill = TRUE) | |
1254 if (sys.nframe() > 0) quit(save = "no", status = 1) | |
1255 stop("Cannot continue. Goodbye.") | |
1256 } | |
1257 one_way_two_categories <- one_way_two_categories$value | |
1258 | |
1259 preproc_db <- params$preprocDb | |
1260 ksea_app_prep_db <- params$kseaAppPrepDb | |
1261 result <- file.copy( | |
1262 from = preproc_db, | |
1263 to = ksea_app_prep_db, | |
1264 overwrite = TRUE | |
1265 ) | |
1266 if (!result) { | |
1267 write( | |
1268 sprintf( | |
1269 "fatal error copying initial database '%s' to output '%s'", | |
1270 preproc_db, | |
1271 ksea_app_prep_db, | |
1272 ), | |
1273 stderr() | |
1274 ) | |
1275 if (sys.nframe() > 0) quit(save = "no", status = 1) | |
1276 stop("Cannot continue. Goodbye.") | |
1277 } | |
1278 ``` | |
1279 | |
1280 ```{r echo = FALSE} | |
1281 ### READ DATA | |
1282 | |
1283 # read.table reads a file in table format and creates a data frame from it. | |
1284 # - note that `quote = ""` means that quotation marks are treated literally. | |
1285 full_data <- read.table( | |
1286 file = input_file, | |
1287 sep = "\t", | |
1288 header = TRUE, | |
1289 quote = "", | |
1290 check.names = FALSE | |
1291 ) | |
1292 ``` | |
1293 | |
1294 # Extract Sample Names and Treatment Levels | |
1295 | |
1296 Column names parsed from input file are shown in Table 1; sample names and treatment levels, in Table 2. | |
1297 | |
1298 ```{r echo = FALSE, results = 'asis'} | |
1299 | |
1300 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) | |
1301 | |
1302 if (!fdc_is_integer) { | |
1303 if (length(data_column_indices) > 0) { | |
1304 first_data_column <- data_column_indices[1] | |
1305 } else { | |
1306 stop(paste("failed to convert firstDataColumn:", first_data_column)) | |
1307 } | |
1308 } | |
1309 | |
1310 cat( | |
1311 sprintf( | |
1312 paste( | |
1313 "\n\nThe input data file has peptide-intensity data for each sample", | |
1314 "in one of columns %d through %d.\n\n" | |
1315 ), | |
1316 min(data_column_indices), | |
1317 max(data_column_indices) | |
1318 ) | |
1319 ) | |
1320 | |
1321 # Write column names as a LaTeX enumerated list. | |
1322 column_name_df <- data.frame( | |
1323 column = seq_len(length(colnames(full_data))), | |
1324 name = paste0("\\verb@", colnames(full_data), "@") | |
1325 ) | |
1326 data_frame_latex( | |
1327 x = column_name_df, | |
1328 justification = "l l", | |
1329 centered = TRUE, | |
1330 caption = "Input data column names", | |
1331 anchor = const_table_anchor_bp, | |
1332 underscore_whack = FALSE | |
1333 ) | |
1334 | |
1335 ``` | |
1336 | |
1337 ```{r echo = FALSE, results = 'asis'} | |
1338 quant_data <- full_data[first_data_column:length(full_data)] | |
1339 quant_data[quant_data == 0] <- NA | |
1340 rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide | |
1341 # Extract factors and trt-replicates using regular expressions. | |
1342 # Typically: | |
1343 # regex_sample_names is "\\.\\d+[A-Z]$" | |
1344 # regex_sample_grouping is "\\d+" | |
1345 # This would distinguish trt-replicates by terminal letter [A-Z] | |
1346 # in sample names and group them into trts by the preceding digits. | |
1347 # e.g.: | |
1348 # group .1A .1B .1C into group 1; | |
1349 # group .2A .2B .2C, into group 2; | |
1350 # etc. | |
1351 m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE) | |
1352 sample_name_matches <- regmatches(names(quant_data), m) | |
1353 colnames(quant_data) <- sample_name_matches | |
1354 | |
1355 write_debug_file(quant_data) | |
1356 | |
1357 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) | |
1358 sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match)) | |
1359 number_of_samples <- length(sample_name_matches) | |
1360 sample_treatment_df <- data.frame( | |
1361 level = sample_treatment_levels, | |
1362 sample = sample_name_matches | |
1363 ) | |
1364 data_frame_latex( | |
1365 x = sample_treatment_df, | |
1366 justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}", | |
1367 centered = TRUE, | |
1368 caption = "Treatment levels", | |
1369 anchor = const_table_anchor_tbp, | |
1370 underscore_whack = FALSE | |
1371 ) | |
1372 ``` | |
1373 | |
1374 ```{r echo = FALSE, results = 'asis'} | |
1375 cat("\\newpage\n") | |
1376 ``` | |
1377 | |
1378 ## Are the log-transformed sample distributions similar? | |
1379 | |
1380 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | |
1381 | |
1382 quant_data[quant_data == 0] <- NA #replace 0 with NA | |
1383 quant_data_log <- log10(quant_data) | |
1384 | |
1385 rownames(quant_data_log) <- rownames(quant_data) | |
1386 colnames(quant_data_log) <- sample_name_matches | |
1387 | |
1388 write_debug_file(quant_data_log) | |
1389 | |
1390 # data visualization | |
1391 old_par <- par( | |
1392 mai = par("mai") + c(0.5, 0, 0, 0) | |
1393 ) | |
1394 # ref: https://r-charts.com/distribution/add-points-boxplot/ | |
1395 # Vertical plot | |
1396 boxplot( | |
1397 quant_data_log | |
1398 , las = 1 | |
1399 , col = const_boxplot_fill | |
1400 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
1401 , xlab = "Sample" | |
1402 ) | |
1403 par(old_par) | |
1404 | |
1405 | |
1406 | |
1407 cat("\n\n\n") | |
1408 cat("\n\n\n") | |
1409 | |
1410 ``` | |
1411 | |
1412 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} | |
1413 if (nrow(quant_data_log) > 1) { | |
1414 quant_data_log_stack <- stack(quant_data_log) | |
1415 ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) + | |
1416 ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + | |
1417 ggplot2::ylab("Probability density") + | |
1418 ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE) | |
1419 } else { | |
1420 cat("No density plot because there are too few peptides.\n\n") | |
1421 } | |
1422 ``` | |
1423 | |
1424 ## Globally, are peptide intensities are approximately unimodal? | |
1425 | |
1426 <!-- | |
1427 # bquote could be used as an alternative to latex2exp::TeX below particularly | |
1428 # and when plotting math expressions generally, at the expense of mastering | |
1429 # another syntax, which hardly seems worthwhile when I need to use TeX | |
1430 # elsewhere; here's an introduction to bquote: | |
1431 # https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ | |
1432 --> | |
1433 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} | |
1434 | |
1435 # identify the location of missing values | |
1436 fin <- is.finite(as.numeric(as.matrix(quant_data_log))) | |
1437 | |
1438 logvalues <- as.numeric(as.matrix(quant_data_log))[fin] | |
1439 logvalues_density <- density(logvalues) | |
1440 plot( | |
1441 x = logvalues_density, | |
1442 main = latex2exp::TeX( | |
1443 "Smoothed estimated probability density vs. $log_{10}$(peptide intensity)" | |
1444 ), | |
1445 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), | |
1446 ylab = "Probability density" | |
1447 ) | |
1448 hist( | |
1449 x = as.numeric(as.matrix(quant_data_log)), | |
1450 xlim = c(min(logvalues_density$x), max(logvalues_density$x)), | |
1451 breaks = 100, | |
1452 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"), | |
1453 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
1454 ) | |
1455 ``` | |
1456 | |
1457 ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values | |
1458 | |
1459 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} | |
1460 # determine quantile | |
1461 q1 <- quantile(logvalues, probs = mean_percentile)[1] | |
1462 | |
1463 # 1 = row of matrix (ie, phosphopeptide) | |
1464 sds <- apply(quant_data_log, 1, sd_finite) | |
1465 if (sum(!is.na(sds)) > 2) { | |
1466 plot( | |
1467 density(sds, na.rm = TRUE) | |
1468 , main = "Smoothed estimated probability density vs. std. deviation" | |
1469 , sub = "(probability estimation made with Gaussian smoothing)" | |
1470 , ylab = "Probability density" | |
1471 ) | |
1472 } else { | |
1473 cat( | |
1474 "At least two non-missing values are required to plot", | |
1475 "probability density.\n\n" | |
1476 ) | |
1477 } | |
1478 | |
1479 ``` | |
1480 | |
1481 ```{r echo = FALSE} | |
1482 # Determine number of cells to impute | |
1483 temp <- quant_data[is.na(quant_data)] | |
1484 | |
1485 # Determine number of values to impute | |
1486 number_to_impute <- length(temp) | |
1487 | |
1488 # Determine percent of missing values | |
1489 pct_missing_values <- | |
1490 round(length(temp) / (length(logvalues) + length(temp)) * 100) | |
1491 ``` | |
1492 | |
1493 ```{r echo = FALSE} | |
1494 | |
1495 # prep for trt-median based imputation | |
1496 | |
1497 ``` | |
1498 # Impute Missing Values | |
1499 | |
1500 ```{r echo = FALSE} | |
1501 | |
1502 imp_smry_pot_peptides_before <- nrow(quant_data_log) | |
1503 imp_smry_missing_values_before <- number_to_impute | |
1504 imp_smry_pct_missing <- pct_missing_values | |
1505 | |
1506 ``` | |
1507 | |
1508 ```{r echo = FALSE} | |
1509 #Determine number of cells to impute | |
1510 | |
1511 ``` | |
1512 ```{r echo = FALSE} | |
1513 | |
1514 # Identify which values are missing and need to be imputed | |
1515 ind <- which(is.na(quant_data), arr.ind = TRUE) | |
1516 | |
1517 ``` | |
1518 ```{r echo = FALSE, results = 'asis'} | |
1519 | |
1520 # Apply imputation | |
1521 switch( | |
1522 imputation_method | |
1523 , "group-median" = { | |
1524 quant_data_imp <- quant_data | |
1525 imputation_method_description <- | |
1526 paste("Substitute missing value with", | |
1527 "median peptide-intensity for sample group.\n" | |
1528 ) | |
1529 sample_level_integers <- as.integer(sample_treatment_levels) | |
1530 # Take the accurate ln(x+1) because the data are log-normally distributed | |
1531 # and because median can involve an average of two measurements. | |
1532 quant_data_imp <- log1p(quant_data_imp) | |
1533 for (i in seq_len(length(levels(sample_treatment_levels)))) { | |
1534 # Determine the columns for this factor-level | |
1535 level_cols <- i == sample_level_integers | |
1536 # Extract those columns | |
1537 lvlsbst <- quant_data_imp[, level_cols, drop = FALSE] | |
1538 # assign to ind the row-column pairs corresponding to each NA | |
1539 ind <- which(is.na(lvlsbst), arr.ind = TRUE) | |
1540 # No group-median exists if there is only one sample | |
1541 # a given ppep has no measurement; otherwise, proceed. | |
1542 if (ncol(lvlsbst) > 1) { | |
1543 the_centers <- | |
1544 apply(lvlsbst, 1, median, na.rm = TRUE) | |
1545 for (j in seq_len(nrow(lvlsbst))) { | |
1546 for (k in seq_len(ncol(lvlsbst))) { | |
1547 if (is.na(lvlsbst[j, k])) { | |
1548 lvlsbst[j, k] <- the_centers[j] | |
1549 } | |
1550 } | |
1551 } | |
1552 quant_data_imp[, level_cols] <- lvlsbst | |
1553 } | |
1554 } | |
1555 # Take the accurate e^x - 1 to match scaling of original input. | |
1556 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) | |
1557 good_rows <- !is.na(rowMeans(quant_data_imp)) | |
1558 } | |
1559 , "median" = { | |
1560 quant_data_imp <- quant_data | |
1561 imputation_method_description <- | |
1562 paste("Substitute missing value with", | |
1563 "median peptide-intensity across all sample classes.\n" | |
1564 ) | |
1565 # Take the accurate ln(x+1) because the data are log-normally distributed | |
1566 # and because median can involve an average of two measurements. | |
1567 quant_data_imp <- log1p(quant_data_imp) | |
1568 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]] | |
1569 # Take the accurate e^x - 1 to match scaling of original input. | |
1570 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) | |
1571 good_rows <- !is.nan(rowMeans(quant_data_imp)) | |
1572 } | |
1573 , "mean" = { | |
1574 quant_data_imp <- quant_data | |
1575 imputation_method_description <- | |
1576 paste("Substitute missing value with", | |
1577 "geometric-mean peptide intensity across all sample classes.\n" | |
1578 ) | |
1579 # Take the accurate ln(x+1) because the data are log-normally distributed, | |
1580 # so arguments to mean should be previously transformed. | |
1581 # this will have to be | |
1582 quant_data_imp <- log1p(quant_data_imp) | |
1583 # Assign to NA cells the mean for the row | |
1584 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]] | |
1585 # Take the accurate e^x - 1 to match scaling of original input. | |
1586 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) | |
1587 good_rows <- !is.nan(rowMeans(quant_data_imp)) | |
1588 } | |
1589 , "random" = { | |
1590 quant_data_imp <- quant_data | |
1591 m1 <- median(sds, na.rm = TRUE) * sd_percentile #sd to be used is the median sd | |
1592 # If you want results to be reproducible, you will want to call | |
1593 # base::set.seed before calling stats::rnorm | |
1594 imputation_method_description <- | |
1595 paste("Substitute each missing value with random intensity", | |
1596 sprintf( | |
1597 "random intensity $N \\sim (%0.2f, %0.2f)$.\n", | |
1598 q1, m1 | |
1599 ) | |
1600 ) | |
1601 cat(sprintf("mean_percentile (from input parameter) is %2.0f\n\n", | |
1602 100 * mean_percentile)) | |
1603 cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n", | |
1604 sd_percentile)) | |
1605 quant_data_imp[ind] <- | |
1606 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) | |
1607 quant_data_imp_ln <- log1p(quant_data_imp) | |
1608 good_rows <- !is.nan(rowMeans(quant_data_imp)) | |
1609 } | |
1610 ) | |
1611 quant_data_imp_log10 <- quant_data_imp_ln * const_log10_e | |
1612 | |
1613 if (length(good_rows) < 1) { | |
1614 print("ERROR: Cannot impute data; there are no good rows!") | |
1615 return(-1) | |
1616 } | |
1617 ``` | |
1618 | |
1619 ```{r echo = FALSE, results = 'asis'} | |
1620 cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description) | |
1621 ``` | |
1622 | |
1623 ```{r echo = FALSE} | |
1624 | |
1625 imp_smry_pot_peptides_after <- sum(good_rows) | |
1626 imp_smry_rejected_after <- sum(!good_rows) | |
1627 imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ])) | |
1628 ``` | |
1629 ```{r echo = FALSE, results = 'asis'} | |
1630 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf | |
1631 tabular_lines_fmt <- paste( | |
1632 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) | |
1633 " \\caption{Imputation Results}", | |
1634 " \\centering", # \centering centers the table on the page | |
1635 " \\begin{tabular}{l c c c}", | |
1636 " \\hline\\hline", | |
1637 " \\ & potential peptides & missing values & rejected", | |
1638 " peptides \\\\ [0.5ex]", | |
1639 " \\hline", | |
1640 " before imputation & %d & %d (%d\\%s) & \\\\", | |
1641 " after imputation & %d & %d & %d \\\\ [1ex]", | |
1642 " \\hline", | |
1643 " \\end{tabular}", | |
1644 #" \\label{table:nonlin}", # may be used to refer this table in the text | |
1645 "\\end{table}", | |
1646 sep = "\n" | |
1647 ) | |
1648 tabular_lines <- | |
1649 sprintf( | |
1650 tabular_lines_fmt, | |
1651 imp_smry_pot_peptides_before, | |
1652 imp_smry_missing_values_before, | |
1653 imp_smry_pct_missing, | |
1654 "%", | |
1655 imp_smry_pot_peptides_after, | |
1656 imp_smry_missing_values_after, | |
1657 imp_smry_rejected_after | |
1658 ) | |
1659 cat(tabular_lines) | |
1660 ``` | |
1661 ```{r echo = FALSE} | |
1662 | |
1663 | |
1664 # Zap rows where imputation was ineffective | |
1665 full_data <- full_data [good_rows, ] | |
1666 quant_data <- quant_data [good_rows, ] | |
1667 | |
1668 quant_data_imp <- quant_data_imp[good_rows, ] | |
1669 write_debug_file(quant_data_imp) | |
1670 quant_data_imp_good_rows <- quant_data_imp | |
1671 | |
1672 write_debug_file(quant_data_imp_good_rows) | |
1673 ``` | |
1674 | |
1675 ```{r echo = FALSE, results = 'asis'} | |
1676 | |
1677 can_plot_before_after_imp <- TRUE | |
1678 d_combined <- | |
1679 as.numeric( | |
1680 as.matrix( | |
1681 log10(quant_data_imp) | |
1682 ) | |
1683 ) | |
1684 d_original <- | |
1685 as.numeric( | |
1686 as.matrix( | |
1687 log10(quant_data_imp[!is.na(quant_data)]) | |
1688 ) | |
1689 ) | |
1690 | |
1691 if (sum(!is.na(d_original)) > 2) { | |
1692 d_original <- density(d_original) | |
1693 } else { | |
1694 can_plot_before_after_imp <- FALSE | |
1695 } | |
1696 if (can_plot_before_after_imp && sum(is.na(d_combined)) < 1) { | |
1697 d_combined <- density(d_combined) | |
1698 } else { | |
1699 can_plot_before_after_imp <- FALSE | |
1700 } | |
1701 | |
1702 if (sum(is.na(quant_data)) > 0) { | |
1703 # There ARE missing values | |
1704 d_imputed <- | |
1705 as.numeric( | |
1706 as.matrix( | |
1707 log10(quant_data_imp[is.na(quant_data)]) | |
1708 ) | |
1709 ) | |
1710 if (can_plot_before_after_imp && sum(is.na(d_imputed)) < 1) { | |
1711 d_imputed <- (density(d_imputed)) | |
1712 } else { | |
1713 can_plot_before_after_imp <- FALSE | |
1714 } | |
1715 } else { | |
1716 # There are NO missing values | |
1717 d_imputed <- d_combined | |
1718 } | |
1719 | |
1720 ``` | |
1721 | |
1722 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | |
1723 zero_sd_rownames <- | |
1724 rownames(quant_data_imp)[ | |
1725 is.na((apply(quant_data_imp, 1, sd, na.rm = TRUE)) == 0) | |
1726 ] | |
1727 | |
1728 if (length(zero_sd_rownames) >= nrow(quant_data_imp)) { | |
1729 stop("All peptides have zero standard deviation. Cannot continue.") | |
1730 } | |
1731 if (length(zero_sd_rownames) > 0) { | |
1732 cat( | |
1733 sprintf("%d peptides with zero variance were removed from statistical consideration", | |
1734 length(zero_sd_rownames) | |
1735 ) | |
1736 ) | |
1737 zap_named_rows <- function(df, nms) { | |
1738 return(df[!(row.names(df) %in% nms), ]) | |
1739 } | |
1740 quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames) | |
1741 quant_data <- zap_named_rows(quant_data, zero_sd_rownames) | |
1742 full_data <- zap_named_rows(full_data, zero_sd_rownames) | |
1743 } | |
1744 | |
1745 if (sum(is.na(quant_data)) > 0) { | |
1746 cat("\\leavevmode\\newpage\n") | |
1747 # data visualization | |
1748 old_par <- par( | |
1749 mai = par("mai") + c(0.5, 0, 0, 0) | |
1750 ) | |
1751 # Copy quant data to x | |
1752 x <- quant_data | |
1753 # x gets to have values of: | |
1754 # - NA for observed values | |
1755 # - 1 for missing values | |
1756 x[is.na(x)] <- 0 | |
1757 x[x > 1] <- NA | |
1758 x[x == 0] <- 1 | |
1759 # Log-transform imputed data | |
1760 # update variable because rows may have been eliminated from quant_data_imp | |
1761 quant_data_imp_log10 <- log10(quant_data_imp) | |
1762 | |
1763 write_debug_file(quant_data_imp_log10) | |
1764 | |
1765 # Set blue_dots to log of quant data or NA for NA quant data | |
1766 blue_dots <- log10(quant_data) | |
1767 # Set red_dots to log of imputed data or NA for observed quant data | |
1768 red_dots <- quant_data_imp_log10 * x | |
1769 | |
1770 count_red <- sum(!is.na(red_dots)) | |
1771 count_blue <- sum(!is.na(blue_dots)) | |
1772 ylim_save <- ylim <- c( | |
1773 min(red_dots, blue_dots, na.rm = TRUE), | |
1774 max(red_dots, blue_dots, na.rm = TRUE) | |
1775 ) | |
1776 show_stripchart <- | |
1777 50 > (count_red + count_blue) / length(sample_name_matches) | |
1778 if (show_stripchart) { | |
1779 boxplot_sub <- "Light blue = data before imputation; Red = imputed data" | |
1780 } else { | |
1781 boxplot_sub <- "" | |
1782 } | |
1783 | |
1784 # Vertical plot | |
1785 colnames(blue_dots) <- sample_name_matches | |
1786 boxplot( | |
1787 blue_dots | |
1788 , las = 1 # "always horizontal" | |
1789 , col = const_boxplot_fill | |
1790 , ylim = ylim | |
1791 , main = "Peptide intensities after eliminating unusable peptides" | |
1792 , sub = boxplot_sub | |
1793 , xlab = "Sample" | |
1794 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
1795 ) | |
1796 | |
1797 if (show_stripchart) { | |
1798 # Points | |
1799 # ref: https://r-charts.com/distribution/add-points-boxplot/ | |
1800 # NA values are not plotted | |
1801 stripchart( | |
1802 blue_dots, # Data | |
1803 method = "jitter", # Random noise | |
1804 jitter = const_stripchart_jitter, | |
1805 pch = 19, # Pch symbols | |
1806 cex = const_stripsmall_cex, # Size of symbols reduced | |
1807 col = "lightblue", # Color of the symbol | |
1808 vertical = TRUE, # Vertical mode | |
1809 add = TRUE # Add it over | |
1810 ) | |
1811 stripchart( | |
1812 red_dots, # Data | |
1813 method = "jitter", # Random noise | |
1814 jitter = const_stripchart_jitter, | |
1815 pch = 19, # Pch symbols | |
1816 cex = const_stripsmall_cex, # Size of symbols reduced | |
1817 col = "red", # Color of the symbol | |
1818 vertical = TRUE, # Vertical mode | |
1819 add = TRUE # Add it over | |
1820 ) | |
1821 | |
1822 } | |
1823 if (TRUE) { | |
1824 # show measured values in blue on left half-violin plot | |
1825 cat("\\leavevmode\n\\quad\n\n\\quad\n\n") | |
1826 vioplot::vioplot( | |
1827 x = lapply(blue_dots, function(x) x[!is.na(x)]), | |
1828 col = "lightblue1", | |
1829 side = "left", | |
1830 plotCentre = "line", | |
1831 ylim = ylim_save, | |
1832 main = "Distributions of observed and imputed data", | |
1833 sub = "Light blue = observed data; Pink = imputed data", | |
1834 xlab = "Sample", | |
1835 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
1836 ) | |
1837 red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) | |
1838 cols_to_delete <- c() | |
1839 for (ix in seq_len(length(red_violins))) { | |
1840 if (length(red_violins[[ix]]) < 1) { | |
1841 cols_to_delete <- c(cols_to_delete, ix) | |
1842 } | |
1843 } | |
1844 # destroy any unimputable columns | |
1845 if (!is.null(cols_to_delete)) { | |
1846 red_violins <- red_violins[-cols_to_delete] | |
1847 } | |
1848 # plot imputed values in red on right half-violin plot | |
1849 vioplot::vioplot( | |
1850 x = red_violins, | |
1851 col = "lightpink1", | |
1852 side = "right", | |
1853 plotCentre = "line", | |
1854 add = TRUE | |
1855 ) | |
1856 } | |
1857 | |
1858 par(old_par) | |
1859 | |
1860 # density plot | |
1861 cat("\\leavevmode\n\n\n\n\n\n\n") | |
1862 if (can_plot_before_after_imp) { | |
1863 ylim <- c( | |
1864 0, | |
1865 max( | |
1866 if (is.list(d_combined)) d_combined$y else d_combined, | |
1867 if (is.list(d_original)) d_original$y else d_original, | |
1868 if (is.list(d_imputed)) d_imputed$y else d_imputed, | |
1869 na.rm = TRUE | |
1870 ) | |
1871 ) | |
1872 plot( | |
1873 d_combined, | |
1874 ylim = ylim, | |
1875 sub = | |
1876 paste( | |
1877 "Blue = data before imputation; Red = imputed data;", | |
1878 "Black = combined" | |
1879 ), | |
1880 main = "Density of peptide intensity before and after imputation", | |
1881 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), | |
1882 ylab = "Probability density" | |
1883 ) | |
1884 lines(d_original, col = "blue") | |
1885 lines(d_imputed, col = "red") | |
1886 } else { | |
1887 cat( | |
1888 "There are too few points to plot the density of peptide intensity", | |
1889 "before and after imputation." | |
1890 ) | |
1891 } | |
1892 cat("\\leavevmode\\newpage\n") | |
1893 } | |
1894 ``` | |
1895 | |
1896 # Perform Quantile Normalization | |
1897 | |
1898 The excellent `normalize.quantiles` function from | |
1899 *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)* | |
1900 performs "quantile normalization" as described Bolstad *et al.* (2003), | |
1901 DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)* | |
1902 and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*, | |
1903 i.e., it assumes that the goal is to detect | |
1904 subtle differences among grossly similar samples (having similar distributions) | |
1905 by equailzing intra-quantile quantitations. | |
1906 Unfortunately, one software library upon which it depends | |
1907 *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)* | |
1908 that requires that a specific, non-concurrent version of the library be | |
1909 installed. The installation command equivalent to what was used to install the library to produce the results presented here is: | |
1910 ``` | |
1911 conda install bioconductor-preprocesscore openblas=0.3.3 | |
1912 ``` | |
1913 | |
1914 | |
1915 <!-- | |
1916 # Apply quantile normalization using preprocessCore::normalize.quantiles | |
1917 # --- | |
1918 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html | |
1919 # except this: https://support.bioconductor.org/p/122925/#9135989 | |
1920 # says to install it like this: | |
1921 # ``` | |
1922 # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) | |
1923 # ``` | |
1924 # conda installation (necessary because of a bug in recent openblas): | |
1925 # conda install bioconductor-preprocesscore openblas=0.3.3 | |
1926 # ... | |
1927 # --- | |
1928 # normalize.quantiles {preprocessCore} -- Quantile Normalization | |
1929 # | |
1930 # Description: | |
1931 # Using a normalization based upon quantiles, this function normalizes a | |
1932 # matrix of probe level intensities. | |
1933 # | |
1934 # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the | |
1935 # assumption that the data is missing at random. | |
1936 # | |
1937 # Usage: | |
1938 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) | |
1939 # | |
1940 # Arguments: | |
1941 # | |
1942 # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. | |
1943 # | |
1944 # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, | |
1945 # but in certain situations not making a copy of the matrix, but instead normalizing | |
1946 # it in place will be more memory friendly. | |
1947 # | |
1948 # - keep.names: Boolean option to preserve matrix row and column names in output. | |
1949 # | |
1950 # Details: | |
1951 # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. | |
1952 # No special allowances are made for outliers. If you make use of quantile normalization | |
1953 # please cite Bolstad et al, Bioinformatics (2003). | |
1954 # | |
1955 # This functions will handle missing data (ie NA values), based on | |
1956 # the assumption that the data is missing at random. | |
1957 # | |
1958 # Note that the current implementation optimizes for better memory usage | |
1959 # at the cost of some additional run-time. | |
1960 # | |
1961 # Value: A normalized matrix. | |
1962 # | |
1963 # Author: Ben Bolstad, bmbolstad.com | |
1964 # | |
1965 # References | |
1966 # | |
1967 # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide | |
1968 # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf | |
1969 # | |
1970 # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of | |
1971 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias | |
1972 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 | |
1973 # http://bmbolstad.com/misc/normalize/normalize.html | |
1974 # ... | |
1975 --> | |
1976 ```{r echo = FALSE, results = 'asis'} | |
1977 | |
1978 if (nrow(quant_data_imp) > 0) { | |
1979 quant_data_imp_qn <- preprocessCore::normalize.quantiles( | |
1980 as.matrix(quant_data_imp), keep.names = TRUE | |
1981 ) | |
1982 } else { | |
1983 quant_data_imp_qn <- as.matrix(quant_data_imp) | |
1984 } | |
1985 | |
1986 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) | |
1987 | |
1988 write_debug_file(quant_data_imp_qn) | |
1989 | |
1990 quant_data_imp_qn_log <- log10(quant_data_imp_qn) | |
1991 | |
1992 write_debug_file(quant_data_imp_qn_log) | |
1993 | |
1994 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) | |
1995 | |
1996 sel <- apply(quant_data_imp_qn_ls, 1, any_nan) | |
1997 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls | |
1998 | |
1999 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ] | |
2000 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) | |
2001 | |
2002 quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls) | |
2003 | |
2004 write_debug_file(quant_data_imp_qn_ls) | |
2005 write_debug_file(quant_data_imp_qn_ls2) | |
2006 | |
2007 # Create data.frame used by ANOVA analysis | |
2008 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) | |
2009 ``` | |
2010 | |
2011 <!-- ACE insertion begin --> | |
2012 ## Are normalized, imputed, log-transformed sample distributions similar? | |
2013 | |
2014 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | |
2015 | |
2016 # Save unimputed quant_data_log for plotting below | |
2017 unimputed_quant_data_log <- quant_data_log | |
2018 | |
2019 # log10 transform (after preparing for zero values, | |
2020 # which should never happen...) | |
2021 quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001 | |
2022 quant_data_log <- log10(quant_data_imp_qn) | |
2023 | |
2024 how_many_peptides <- nrow(quant_data_log) | |
2025 | |
2026 if ((how_many_peptides) > 0) { | |
2027 cat( | |
2028 sprintf( | |
2029 "Intensities for %d peptides:\n\n\n", | |
2030 how_many_peptides | |
2031 ) | |
2032 ) | |
2033 cat("\n\n\n") | |
2034 | |
2035 | |
2036 # data visualization | |
2037 old_par <- par( | |
2038 mai = par("mai") + c(0.5, 0, 0, 0) | |
2039 , oma = par("oma") + c(0.5, 0, 0, 0) | |
2040 ) | |
2041 # ref: https://r-charts.com/distribution/add-points-boxplot/ | |
2042 # Vertical plot | |
2043 colnames(quant_data_log) <- sample_name_matches | |
2044 boxplot( | |
2045 quant_data_log | |
2046 , las = 1 | |
2047 , col = const_boxplot_fill | |
2048 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
2049 , xlab = "Sample" | |
2050 ) | |
2051 par(old_par) | |
2052 } else { | |
2053 cat("There are no peptides to plot\n") | |
2054 } | |
2055 | |
2056 cat("\n\n\n") | |
2057 | |
2058 ``` | |
2059 | |
2060 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} | |
2061 if (nrow(quant_data_log) > 1) { | |
2062 quant_data_log_stack <- stack(quant_data_log) | |
2063 ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) + | |
2064 ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + | |
2065 ggplot2::ylab("Probability density") + | |
2066 ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE) | |
2067 } else { | |
2068 cat("No density plot because there are fewer than two peptides to plot.\n\n") | |
2069 } | |
2070 ``` | |
2071 ```{r echo = FALSE, results = 'asis'} | |
2072 cat("\\leavevmode\\newpage\n") | |
2073 ``` | |
2074 | |
2075 # ANOVA Analysis | |
2076 | |
2077 ```{r, echo = FALSE} | |
2078 # Make new data frame containing only Phosphopeptides | |
2079 # to connect preANOVA to ANOVA (connect_df) | |
2080 connect_df <- data.frame( | |
2081 data_table_imp_qn_lt$Phosphopeptide | |
2082 , data_table_imp_qn_lt[, first_data_column] | |
2083 ) | |
2084 colnames(connect_df) <- c("Phosphopeptide", "Intensity") | |
2085 ``` | |
2086 | |
2087 ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
2088 count_of_treatment_levels <- length(levels(sample_treatment_levels)) | |
2089 if (count_of_treatment_levels < 2) { | |
2090 nuke_control_sequences <- | |
2091 function(s) { | |
2092 s <- gsub("[\\]", "xyzzy_plugh", s) | |
2093 s <- gsub("[$]", "\\\\$", s) | |
2094 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) | |
2095 return(s) | |
2096 } | |
2097 cat( | |
2098 "ERROR!!!! Cannot perform ANOVA analysis", | |
2099 "(see next page)\\newpage\n" | |
2100 ) | |
2101 cat( | |
2102 "ERROR: ANOVA analysis", | |
2103 "requires two or more factor levels!\n\n\n" | |
2104 ) | |
2105 | |
2106 cat("\n\n\n\n\n") | |
2107 cat("Unparsed sample names are:\n\n\n", | |
2108 "\\begin{quote}\n", | |
2109 paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"), | |
2110 "\n\\end{quote}\n\n") | |
2111 | |
2112 regex_sample_names <- nuke_control_sequences(regex_sample_names) | |
2113 | |
2114 cat("\\leavevmode\n\n\n") | |
2115 cat("Parsing rule for SampleNames is", | |
2116 "\n\n\n", | |
2117 "\\text{'", | |
2118 regex_sample_names, | |
2119 "'}\n\n\n", | |
2120 sep = "" | |
2121 ) | |
2122 | |
2123 cat("\nParsed sample names are:\n", | |
2124 "\\begin{quote}\n", | |
2125 paste(sample_name_matches, collapse = "\n\n\n"), | |
2126 "\n\\end{quote}\n\n") | |
2127 | |
2128 regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping) | |
2129 | |
2130 cat("\\leavevmode\n\n\n") | |
2131 cat("Parsing rule for SampleGrouping is", | |
2132 "\n\n\n", | |
2133 "\\text{'", | |
2134 regex_sample_grouping, | |
2135 "'}\n\n\n", | |
2136 sep = "" | |
2137 ) | |
2138 | |
2139 cat("\n\n\n") | |
2140 cat("Sample group assignments are:\n", | |
2141 "\\begin{quote}\n", | |
2142 paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"), | |
2143 "\n\\end{quote}\n\n") | |
2144 | |
2145 } else { | |
2146 | |
2147 p_value_data_anova_ps <- | |
2148 apply( | |
2149 quant_data_imp_qn_log, | |
2150 1, | |
2151 anova_func, | |
2152 grouping_factor = sample_treatment_levels, | |
2153 one_way_f = one_way_all_categories | |
2154 ) | |
2155 | |
2156 p_value_data_anova_ps_fdr <- | |
2157 p.adjust(p_value_data_anova_ps, method = "fdr") | |
2158 p_value_data <- data.frame( | |
2159 phosphopeptide = full_data[, 1] | |
2160 , | |
2161 raw_anova_p = p_value_data_anova_ps | |
2162 , | |
2163 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr | |
2164 ) | |
2165 | |
2166 # output ANOVA file to constructed filename, | |
2167 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" | |
2168 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" | |
2169 | |
2170 # Re-output datasets to include p-values | |
2171 metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3]) | |
2172 write.table( | |
2173 cbind(metadata_plus_p, quant_data_imp), | |
2174 file = imputed_data_filename, | |
2175 sep = "\t", | |
2176 col.names = TRUE, | |
2177 row.names = FALSE, | |
2178 quote = FALSE | |
2179 ) | |
2180 | |
2181 write.table( | |
2182 cbind(metadata_plus_p, quant_data_imp_qn_log), | |
2183 file = imp_qn_lt_data_filenm, | |
2184 sep = "\t", | |
2185 col.names = TRUE, | |
2186 row.names = FALSE, | |
2187 quote = FALSE | |
2188 ) | |
2189 | |
2190 | |
2191 p_value_data <- | |
2192 p_value_data[order(p_value_data$fdr_adjusted_anova_p), ] | |
2193 | |
2194 first_page_suppress <- 1 | |
2195 number_of_peptides_found <- 0 | |
2196 cutoff <- val_fdr[1] | |
2197 for (cutoff in val_fdr) { | |
2198 if (number_of_peptides_found > 49) { | |
2199 cat("\\leavevmode\n\n\n") | |
2200 break | |
2201 } | |
2202 | |
2203 #loop through FDR cutoffs | |
2204 | |
2205 filtered_p <- | |
2206 p_value_data[ | |
2207 which(p_value_data$fdr_adjusted_anova_p < cutoff), | |
2208 , drop = FALSE | |
2209 ] | |
2210 filtered_data_filtered <- | |
2211 quant_data_imp_qn_log[ | |
2212 rownames(filtered_p), | |
2213 , drop = FALSE | |
2214 ] | |
2215 filtered_data_filtered <- | |
2216 filtered_data_filtered[ | |
2217 order(filtered_p$fdr_adjusted_anova_p), | |
2218 , drop = FALSE | |
2219 ] | |
2220 | |
2221 # <!-- ACE insertion start --> | |
2222 | |
2223 if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) { | |
2224 if (first_page_suppress == 1) { | |
2225 first_page_suppress <- 0 | |
2226 } else { | |
2227 cat("\\newpage\n") | |
2228 } | |
2229 if (nrow(filtered_data_filtered) > 1) { | |
2230 subsection_header(sprintf( | |
2231 "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n", | |
2232 nrow(filtered_data_filtered), | |
2233 cutoff | |
2234 )) | |
2235 } else { | |
2236 subsection_header(sprintf( | |
2237 "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n", | |
2238 rownames(filtered_data_filtered)[1], | |
2239 cutoff | |
2240 )) | |
2241 } | |
2242 cat("\n\n\n") | |
2243 cat("\n\n\n") | |
2244 | |
2245 old_oma <- par("oma") | |
2246 old_par <- par( | |
2247 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), | |
2248 oma = old_oma * c(1, 1, 0.3, 1), | |
2249 cex.main = 0.9, | |
2250 cex.axis = 0.7, | |
2251 fin = c(9, 7.25) | |
2252 ) | |
2253 # ref: https://r-charts.com/distribution/add-points-boxplot/ | |
2254 # Vertical plot | |
2255 colnames(filtered_data_filtered) <- sample_name_matches | |
2256 tryCatch( | |
2257 boxplot( | |
2258 filtered_data_filtered, | |
2259 main = "Imputed, normalized intensities", # no line plot | |
2260 las = 1, | |
2261 col = const_boxplot_fill, | |
2262 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
2263 ), | |
2264 error = function(e) print(e) | |
2265 ) | |
2266 par(old_par) | |
2267 } else { | |
2268 cat(sprintf( | |
2269 "%s < %0.2f\n\n\n\n\n", | |
2270 "No peptides were found to have cutoff adjusted p-value", | |
2271 cutoff | |
2272 )) | |
2273 } | |
2274 | |
2275 if (nrow(filtered_data_filtered) > 0) { | |
2276 # Add Phosphopeptide column to anova_filtered table | |
2277 # The assumption here is that the first intensity is unique; | |
2278 # this is a hokey assumption but almost definitely will | |
2279 # be true in the real world unless there is a computation | |
2280 # error upstream. | |
2281 anova_filtered_merge <- base::merge( | |
2282 x = connect_df, | |
2283 y = filtered_data_filtered, | |
2284 by.x = "Intensity", | |
2285 by.y = 1 | |
2286 ) | |
2287 anova_filtered_merge_order <- rownames(filtered_p) | |
2288 | |
2289 anova_filtered <- data.frame( | |
2290 ppep = anova_filtered_merge$Phosphopeptide, | |
2291 intense = anova_filtered_merge$Intensity, | |
2292 data = anova_filtered_merge[, 2:number_of_samples + 1] | |
2293 ) | |
2294 colnames(anova_filtered) <- | |
2295 c("Phosphopeptide", colnames(filtered_data_filtered)) | |
2296 | |
2297 # Merge qualitative columns into the ANOVA data | |
2298 output_table <- data.frame(anova_filtered$Phosphopeptide) | |
2299 output_table <- base::merge( | |
2300 x = output_table, | |
2301 y = data_table_imp_qn_lt, | |
2302 by.x = "anova_filtered.Phosphopeptide", | |
2303 by.y = "Phosphopeptide" | |
2304 ) | |
2305 | |
2306 # Produce heatmap to visualize significance and the effect of imputation | |
2307 | |
2308 anova_filtered_merge_format <- sapply( | |
2309 X = filtered_p$fdr_adjusted_anova_p | |
2310 , | |
2311 FUN = function(x) { | |
2312 if (x > 0.0001) | |
2313 paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s") | |
2314 else | |
2315 paste0("(%0.4e) %s") | |
2316 } | |
2317 ) | |
2318 | |
2319 cat_hm_heading <- function(m, cutoff) { | |
2320 cat("\\newpage\n") | |
2321 if (nrow(m) > intensity_hm_rows) { | |
2322 subsection_header( | |
2323 paste( | |
2324 sprintf("Heatmap for the %d most-significant peptides", | |
2325 intensity_hm_rows), | |
2326 sprintf("whose adjusted p-value < %0.2f\n", cutoff) | |
2327 ) | |
2328 ) | |
2329 } else { | |
2330 if (nrow(m) == 1) { | |
2331 return(FALSE) | |
2332 } else { | |
2333 subsection_header( | |
2334 paste( | |
2335 sprintf("Heatmap for %d usable peptides whose", nrow(m)), | |
2336 sprintf("adjusted p-value < %0.2f\n", cutoff) | |
2337 ) | |
2338 ) | |
2339 } | |
2340 } | |
2341 cat("\n\n\n") | |
2342 cat("\n\n\n") | |
2343 return(TRUE) | |
2344 } | |
2345 | |
2346 # construct matrix with appropriate rownames | |
2347 m <- | |
2348 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) | |
2349 if (nrow(m) > 0) { | |
2350 rownames_m <- rownames(m) | |
2351 rownames(m) <- sapply( | |
2352 X = seq_len(nrow(m)) | |
2353 , | |
2354 FUN = function(i) { | |
2355 sprintf( | |
2356 anova_filtered_merge_format[i], | |
2357 filtered_p$fdr_adjusted_anova_p[i], | |
2358 rownames_m[i] | |
2359 ) | |
2360 } | |
2361 ) | |
2362 } | |
2363 # draw the heading and heatmap | |
2364 if (nrow(m) > 0) { | |
2365 number_of_peptides_found <- | |
2366 draw_intensity_heatmap( | |
2367 m = m, | |
2368 cutoff = cutoff, | |
2369 hm_heading_function = cat_hm_heading, | |
2370 hm_main_title = "Unimputed, unnormalized log(intensities)", | |
2371 suppress_row_dendrogram = FALSE | |
2372 ) | |
2373 } | |
2374 } | |
2375 } | |
2376 } | |
2377 cat("\\leavevmode\n\n\n") | |
2378 ``` | |
2379 | |
2380 ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
2381 | |
2382 if (count_of_treatment_levels > 1) { | |
2383 # Prepare two-way contrasts with adjusted p-values | |
2384 # Strategy: | |
2385 # - use imputed, log-transformed data: | |
2386 # - remember this when computing log2(fold-change) | |
2387 # - each contrast is between a combination of trt levels | |
2388 # - for each contrast, compute samples that are members | |
2389 # - compute one-way test: | |
2390 # - use `oneway.test` (Welch test) if numbers of samples | |
2391 # are not equivalent between trt levels | |
2392 # - otherwise, aov is fine but offers no advantage | |
2393 # - adjust p-value, assuming that | |
2394 # (# of pppeps)*(# of contrasts) tests were performed | |
2395 | |
2396 # Each contrast is between a combination of trt levels | |
2397 m2 <- combn( | |
2398 x = seq_len(length(levels(sample_treatment_levels))), | |
2399 m = 2, | |
2400 simplify = TRUE | |
2401 ) | |
2402 contrast_count <- ncol(m2) | |
2403 | |
2404 # For each contrast, compute samples that are members | |
2405 # - local function to construct a data.frame for each contrast | |
2406 # - the contrast in the first "column" | |
2407 f_m2 <- | |
2408 function(cntrst, lvl1, lvl2) { | |
2409 return( | |
2410 data.frame( | |
2411 contrast = cntrst, | |
2412 level = sample_treatment_levels[ | |
2413 sample_treatment_levels %in% | |
2414 levels(sample_treatment_levels)[c(lvl1, lvl2)] | |
2415 ], | |
2416 label = sample_name_matches[ | |
2417 sample_treatment_levels %in% | |
2418 levels(sample_treatment_levels)[c(lvl1, lvl2)] | |
2419 ] | |
2420 ) | |
2421 ) | |
2422 } | |
2423 # - compute a df for each contrast | |
2424 sample_level_dfs <- lapply( | |
2425 X = 1:contrast_count, | |
2426 FUN = function(i) f_m2(i, m2[1, i], m2[2, i]) | |
2427 ) | |
2428 # - compute a single df for all contrasts | |
2429 combined_contrast_df <- Reduce(f = rbind, x = sample_level_dfs) | |
2430 | |
2431 # - dispose objects to free resources | |
2432 rm(sample_level_dfs) | |
2433 | |
2434 # - write the df to a DB for later join-per-contrast | |
2435 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) | |
2436 | |
2437 RSQLite::dbWriteTable( | |
2438 conn = db, | |
2439 name = "contrast", | |
2440 value = combined_contrast_df, | |
2441 overwrite = TRUE | |
2442 ) | |
2443 | |
2444 # Create UK for insert | |
2445 ddl_exec(db, " | |
2446 CREATE UNIQUE INDEX IF NOT EXISTS contrast__uk__idx | |
2447 ON contrast(contrast, label); | |
2448 " | |
2449 ) | |
2450 # Create indexes for join | |
2451 ddl_exec(db, " | |
2452 -- index for join in contrast_ppep_smpl_qnlt on a.label < b.label | |
2453 CREATE INDEX IF NOT EXISTS contrast__label__idx | |
2454 ON contrast(label); | |
2455 " | |
2456 ) | |
2457 ddl_exec(db, " | |
2458 -- index for joining two contrast_lvl_ppep_avg_quant on contrast | |
2459 CREATE INDEX IF NOT EXISTS contrast__contrast__idx | |
2460 ON contrast(contrast); | |
2461 " | |
2462 ) | |
2463 ddl_exec(db, " | |
2464 -- index for joining two contrast_lvl_ppep_avg_quant on phophospep | |
2465 CREATE INDEX IF NOT EXISTS contrast__level__idx | |
2466 ON contrast(level); | |
2467 " | |
2468 ) | |
2469 # - dispose objects to free resources | |
2470 rm(combined_contrast_df) | |
2471 | |
2472 # Use imputed, log-transformed data | |
2473 # - remember that this was donoe when computing log2(fold-change) | |
2474 # - melt data matrix for use in later join-per-contrast | |
2475 casted <- cbind( | |
2476 data.frame(vrbl = rownames(quant_data_imp_qn_log)), | |
2477 quant_data_imp_qn_log | |
2478 ) | |
2479 quant_data_imp_qn_log_melted <- reshape2::melt( | |
2480 casted, | |
2481 id.vars = "vrbl" | |
2482 ) | |
2483 colnames(quant_data_imp_qn_log_melted) <- | |
2484 c("phosphopep", "sample", "quant") | |
2485 # - dispose objects to free resources | |
2486 rm(casted) | |
2487 | |
2488 # - write the df to a DB for use in later join-per-contrast | |
2489 RSQLite::dbWriteTable( | |
2490 conn = db, | |
2491 name = "ppep_smpl_qnlt", | |
2492 value = quant_data_imp_qn_log_melted, | |
2493 overwrite = TRUE | |
2494 ) | |
2495 # Create UK for insert | |
2496 ddl_exec(db, " | |
2497 CREATE UNIQUE INDEX IF NOT EXISTS ppep_smpl_qnlt__uk__idx | |
2498 ON ppep_smpl_qnlt(phosphopep, sample); | |
2499 " | |
2500 ) | |
2501 # Create index for join | |
2502 ddl_exec(db, " | |
2503 -- index for join in contrast_ppep_smpl_qnlt | |
2504 CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__sample__idx | |
2505 ON ppep_smpl_qnlt(sample); | |
2506 " | |
2507 ) | |
2508 ddl_exec(db, " | |
2509 -- index for joining two contrast_lvl_ppep_avg_quant on phopho.pep | |
2510 CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__phosphopep__idx | |
2511 ON ppep_smpl_qnlt(phosphopep); | |
2512 " | |
2513 ) | |
2514 # - dispose objects to free resources | |
2515 rm(quant_data_imp_qn_log_melted) | |
2516 | |
2517 # - drop views if exist | |
2518 ddl_exec(db, " | |
2519 -- drop view dependent on contrast_lvl_ppep_avg_quant | |
2520 DROP VIEW IF EXISTS v_contrast_log2_fc; | |
2521 " | |
2522 ) | |
2523 ddl_exec(db, " | |
2524 -- drop table dependent on contrast_ppep_smpl_qnlt | |
2525 DROP TABLE IF EXISTS contrast_lvl_ppep_avg_quant; | |
2526 " | |
2527 ) | |
2528 ddl_exec(db, " | |
2529 DROP TABLE IF EXISTS contrast_lvl_lvl_metadata; | |
2530 " | |
2531 ) | |
2532 ddl_exec(db, " | |
2533 DROP VIEW IF EXISTS v_contrast_lvl_metadata; | |
2534 " | |
2535 ) | |
2536 ddl_exec(db, " | |
2537 -- drop view dependent on contrast_ppep_smpl_qnlt | |
2538 DROP VIEW IF EXISTS v_contrast_lvl_ppep_avg_quant; | |
2539 " | |
2540 ) | |
2541 ddl_exec(db, " | |
2542 DROP VIEW IF EXISTS v_contrast_lvl_lvl; | |
2543 " | |
2544 ) | |
2545 ddl_exec(db, " | |
2546 -- drop view upon which other views depend | |
2547 DROP VIEW IF EXISTS contrast_ppep_smpl_qnlt; | |
2548 " | |
2549 ) | |
2550 # - create view | |
2551 dml_no_rows_exec(db, " | |
2552 -- view contrast_ppep_smpl_qnlt is used for each phopshopep to | |
2553 -- compute p-value for test of trt effect for two trt levels | |
2554 CREATE VIEW contrast_ppep_smpl_qnlt | |
2555 AS | |
2556 SELECT contrast, | |
2557 level, | |
2558 phosphopep, | |
2559 sample, | |
2560 quant | |
2561 FROM contrast AS c, | |
2562 ppep_smpl_qnlt AS q | |
2563 WHERE q.sample = c.label | |
2564 ORDER BY contrast, level, phosphopep | |
2565 ; | |
2566 " | |
2567 ) | |
2568 # - create simplification views | |
2569 dml_no_rows_exec(db, " | |
2570 CREATE VIEW v_contrast_lvl_metadata | |
2571 AS | |
2572 SELECT contrast, | |
2573 level, | |
2574 group_concat(label, ';') AS samples | |
2575 FROM contrast | |
2576 GROUP BY contrast, level | |
2577 /* view v_contrast_lvl_metadata is used | |
2578 to simplify creation of table contrast_lvl_lvl_metadata */ | |
2579 ; | |
2580 " | |
2581 ) | |
2582 dml_no_rows_exec(db, " | |
2583 CREATE VIEW v_contrast_lvl_ppep_avg_quant | |
2584 AS | |
2585 SELECT contrast, | |
2586 level, | |
2587 phosphopep, | |
2588 avg(quant) AS avg_quant | |
2589 FROM contrast_ppep_smpl_qnlt | |
2590 GROUP BY contrast, level, phosphopep | |
2591 /* view v_contrast_lvl_ppep_avg_quant is used | |
2592 to simplify view v_contrast_log2_fc */ | |
2593 ; | |
2594 " | |
2595 ) | |
2596 | |
2597 # - create contrast-metadata table | |
2598 dml_no_rows_exec(db, " | |
2599 CREATE TABLE contrast_lvl_lvl_metadata | |
2600 AS | |
2601 SELECT DISTINCT | |
2602 a.contrast AS ab_contrast, | |
2603 a.level AS a_level, | |
2604 b.level AS b_level, | |
2605 a.samples AS a_samples, | |
2606 b.samples AS b_samples, | |
2607 'log2(level_'||a.level||'/level_'||b.level||')' | |
2608 AS fc_description | |
2609 FROM v_contrast_lvl_metadata AS a, | |
2610 v_contrast_lvl_metadata AS b | |
2611 WHERE a.contrast = b.contrast | |
2612 AND a.level > b.level | |
2613 /* view v_contrast_lvl_lvl is used | |
2614 to simplify view v_contrast_log2_fc */ | |
2615 ; | |
2616 " | |
2617 ) | |
2618 # - create pseudo-materialized view table | |
2619 dml_no_rows_exec(db, " | |
2620 CREATE VIEW v_contrast_lvl_lvl | |
2621 AS | |
2622 SELECT DISTINCT | |
2623 a.contrast AS ab_contrast, | |
2624 a.level AS a_level, | |
2625 b.level AS b_level | |
2626 FROM contrast AS a, | |
2627 contrast AS b | |
2628 WHERE a.contrast = b.contrast | |
2629 AND a.level > b.level | |
2630 /* view v_contrast_lvl_lvl is used | |
2631 to simplify view v_contrast_log2_fc */ | |
2632 ; | |
2633 " | |
2634 ) | |
2635 | |
2636 # - create view to compute log2(fold-change) | |
2637 dml_no_rows_exec(db, " | |
2638 CREATE VIEW v_contrast_log2_fc | |
2639 AS | |
2640 SELECT ab.ab_contrast AS contrast, | |
2641 m.a_level AS a_level, | |
2642 c.avg_quant AS a_quant, | |
2643 m.a_samples AS a_samples, | |
2644 ab.b_level AS b_level, | |
2645 d.avg_quant AS b_quant, | |
2646 m.b_samples AS b_samples, | |
2647 m.fc_description AS fc_description, | |
2648 3.32193 * ( d.avg_quant - c.avg_quant) AS log2_fc, | |
2649 d.phosphopep AS phosphopep | |
2650 FROM contrast_lvl_lvl_metadata AS m, | |
2651 v_contrast_lvl_ppep_avg_quant AS d, | |
2652 v_contrast_lvl_lvl AS ab | |
2653 INNER JOIN v_contrast_lvl_ppep_avg_quant AS c | |
2654 ON c.contrast = ab.ab_contrast | |
2655 AND c.level = ab.a_level | |
2656 WHERE d.contrast = ab.ab_contrast | |
2657 AND m.ab_contrast = ab.ab_contrast | |
2658 AND d.level = ab.b_level | |
2659 AND d.phosphopep = c.phosphopep | |
2660 /* view to compute log2(fold-change) */ | |
2661 ; | |
2662 " | |
2663 ) | |
2664 | |
2665 # For each contrast, compute samples that are members | |
2666 # compute one-way test: | |
2667 # - use `oneway.test` (Welch test) if numbers of samples | |
2668 # are not equivalent between trt levels | |
2669 # - otherwise, aov is fine but offers no advantage | |
2670 for (contrast in contrast_count:2) { | |
2671 invisible(contrast) | |
2672 } | |
2673 for (contrast in 1:contrast_count) { | |
2674 contrast_df <- sqldf::sqldf( | |
2675 x = paste0(" | |
2676 SELECT level, phosphopep, sample, quant | |
2677 FROM contrast_ppep_smpl_qnlt | |
2678 WHERE contrast = ", contrast, " | |
2679 ORDER BY phosphopep, level, sample | |
2680 "), | |
2681 connection = db | |
2682 ) | |
2683 contrast_cast <- reshape2::dcast( | |
2684 data = contrast_df, | |
2685 formula = phosphopep ~ sample, | |
2686 value.var = "quant" | |
2687 ) | |
2688 contrast_cast_ncol <- ncol(contrast_cast) | |
2689 contrast_cast_data <- contrast_cast[, 2:contrast_cast_ncol] | |
2690 contrast_cast_samples <- colnames(contrast_cast_data) | |
2691 | |
2692 # - order grouping_factor by order of sample columns of contrast_cast_data | |
2693 grouping_factor <- sqldf::sqldf( | |
2694 x = paste0(" | |
2695 SELECT sample, level | |
2696 FROM contrast_ppep_smpl_qnlt | |
2697 WHERE contrast = ", contrast, " | |
2698 ORDER BY phosphopep, level, sample | |
2699 LIMIT ", contrast_cast_ncol - 1 | |
2700 ), | |
2701 connection = db | |
2702 ) | |
2703 rownames(grouping_factor) <- grouping_factor$sample | |
2704 grouping_factor <- grouping_factor[, "level", drop = FALSE] | |
2705 | |
2706 # - run the two-level (one-way) test | |
2707 p_value_data_contrast_ps <- | |
2708 apply( | |
2709 X = contrast_cast_data, | |
2710 MARGIN = 1, # apply to rows | |
2711 FUN = anova_func, | |
2712 grouping_factor = | |
2713 as.factor(as.numeric(grouping_factor$level)), # anova_func arg2 | |
2714 one_way_f = one_way_two_categories, # anova_func arg3 | |
2715 simplify = TRUE # TRUE is the default for simplify | |
2716 ) | |
2717 contrast_data_adj_p_values <- p.adjust( | |
2718 p = p_value_data_contrast_ps, | |
2719 method = "fdr", | |
2720 n = length(p_value_data_contrast_ps) # this is the default, length(p) | |
2721 ) | |
2722 # - compute the fold-change | |
2723 contrast_p_df <- | |
2724 data.frame( | |
2725 contrast = contrast, | |
2726 phosphopep = contrast_cast$phosphopep, | |
2727 p_value_raw = p_value_data_contrast_ps, | |
2728 p_value_adj = contrast_data_adj_p_values | |
2729 ) | |
2730 db_write_table_overwrite <- (contrast < 2) | |
2731 db_write_table_append <- !db_write_table_overwrite | |
2732 RSQLite::dbWriteTable( | |
2733 conn = db, | |
2734 name = "contrast_ppep_p_val", | |
2735 value = contrast_p_df, | |
2736 append = db_write_table_append | |
2737 ) | |
2738 # Create UK for insert | |
2739 ddl_exec(db, " | |
2740 CREATE UNIQUE INDEX IF NOT EXISTS contrast_ppep_p_val__uk__idx | |
2741 ON contrast_ppep_p_val(phosphopep, contrast); | |
2742 " | |
2743 ) | |
2744 } | |
2745 # Perhaps this could be done more elegantly using unique keys | |
2746 # or creating the tables before saving data to them, but this | |
2747 # is fast and, if the database exists on disk rather than in | |
2748 # memory, it doesn't stress memory. | |
2749 dml_no_rows_exec(db, " | |
2750 CREATE TEMP table contrast_log2_fc | |
2751 AS | |
2752 SELECT * | |
2753 FROM v_contrast_log2_fc | |
2754 ORDER BY contrast, phosphopep | |
2755 ; | |
2756 " | |
2757 ) | |
2758 dml_no_rows_exec(db, " | |
2759 CREATE TEMP table ppep_p_val | |
2760 AS | |
2761 SELECT p_value_raw, | |
2762 p_value_adj, | |
2763 contrast AS p_val_contrast, | |
2764 phosphopep AS p_val_ppep | |
2765 FROM contrast_ppep_p_val | |
2766 ORDER BY contrast, phosphopep | |
2767 ; | |
2768 " | |
2769 ) | |
2770 dml_no_rows_exec(db, " | |
2771 DROP TABLE IF EXISTS contrast_log2_fc_p_val | |
2772 ; | |
2773 " | |
2774 ) | |
2775 dml_no_rows_exec(db, " | |
2776 CREATE TABLE contrast_log2_fc_p_val | |
2777 AS | |
2778 SELECT a.*, | |
2779 b.p_value_raw, | |
2780 b.p_value_adj, | |
2781 b.p_val_contrast, | |
2782 b.p_val_ppep | |
2783 FROM contrast_log2_fc a, ppep_p_val b | |
2784 WHERE a.rowid = b.rowid | |
2785 AND a.phosphopep = b.p_val_ppep | |
2786 ; | |
2787 " | |
2788 ) | |
2789 # Create UK | |
2790 ddl_exec(db, " | |
2791 CREATE UNIQUE INDEX IF NOT EXISTS contrast_log2_fc_p_val__uk__idx | |
2792 ON contrast_log2_fc_p_val(phosphopep, contrast); | |
2793 " | |
2794 ) | |
2795 # Create indices for future queries | |
2796 ddl_exec(db, " | |
2797 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__contrast__idx | |
2798 ON contrast_log2_fc_p_val(contrast); | |
2799 " | |
2800 ) | |
2801 ddl_exec(db, " | |
2802 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__phosphopep__idx | |
2803 ON contrast_log2_fc_p_val(phosphopep); | |
2804 " | |
2805 ) | |
2806 ddl_exec(db, " | |
2807 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_raw__idx | |
2808 ON contrast_log2_fc_p_val(p_value_raw); | |
2809 " | |
2810 ) | |
2811 ddl_exec(db, " | |
2812 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_adj__idx | |
2813 ON contrast_log2_fc_p_val(p_value_adj); | |
2814 " | |
2815 ) | |
2816 dml_no_rows_exec(db, " | |
2817 DROP VIEW IF EXISTS v_contrast_log2_fc_p_val | |
2818 ; | |
2819 " | |
2820 ) | |
2821 dml_no_rows_exec(db, " | |
2822 CREATE VIEW v_contrast_log2_fc_p_val | |
2823 AS | |
2824 SELECT contrast, | |
2825 a_level, | |
2826 a_samples, | |
2827 b_level, | |
2828 b_samples, | |
2829 a_quant, | |
2830 b_quant, | |
2831 fc_description, | |
2832 log2_fc, | |
2833 p_value_raw, | |
2834 p_value_adj, | |
2835 phosphopep | |
2836 FROM contrast_log2_fc_p_val | |
2837 ORDER BY contrast, phosphopep | |
2838 ; | |
2839 " | |
2840 ) | |
2841 ddl_exec(db, " | |
2842 DROP TABLE IF EXISTS kseaapp_metadata | |
2843 ; | |
2844 " | |
2845 ) | |
2846 dml_no_rows_exec(db, " | |
2847 CREATE TABLE kseaapp_metadata | |
2848 AS | |
2849 WITH extended(deppep, ppep, gene_name, uniprot_id, phosphoresidue) AS ( | |
2850 SELECT DISTINCT | |
2851 deppep.seq, | |
2852 ppep.seq, | |
2853 GeneName||';', | |
2854 UniProtID||';', | |
2855 PhosphoResidue||';' | |
2856 FROM | |
2857 ppep, deppep, mrgfltr_metadata | |
2858 WHERE | |
2859 mrgfltr_metadata.ppep_id = ppep.id | |
2860 AND | |
2861 ppep.deppep_id = deppep.id | |
2862 ) | |
2863 SELECT | |
2864 ppep AS `ppep`, | |
2865 SUBSTR(uniprot_id, 1, INSTR(uniprot_id,';') - 1 ) AS `Protein`, | |
2866 SUBSTR(gene_name, 1, INSTR(gene_name,';') - 1 ) AS `Gene`, | |
2867 deppep AS `Peptide`, | |
2868 REPLACE( | |
2869 REPLACE( | |
2870 SUBSTR(phosphoresidue, 1, INSTR(phosphoresidue,';') - 1 ), | |
2871 'p', | |
2872 '' | |
2873 ), | |
2874 ', ', | |
2875 ';' | |
2876 ) AS `Residue.Both` | |
2877 FROM extended | |
2878 ; | |
2879 " | |
2880 ) | |
2881 # Create indexes for join | |
2882 ddl_exec(db, " | |
2883 CREATE INDEX IF NOT EXISTS kseaapp_metadata__ppep__idx | |
2884 ON kseaapp_metadata(ppep); | |
2885 " | |
2886 ) | |
2887 ddl_exec(db, " | |
2888 DROP VIEW IF EXISTS v_kseaapp_contrast | |
2889 ; | |
2890 " | |
2891 ) | |
2892 dml_no_rows_exec(db, " | |
2893 CREATE VIEW v_kseaapp_contrast | |
2894 AS | |
2895 SELECT a.*, b.Protein, b.Gene, b.Peptide, b.`Residue.Both` | |
2896 FROM v_contrast_log2_fc_p_val a, kseaapp_metadata b | |
2897 WHERE b.ppep = a.phosphopep | |
2898 ; | |
2899 " | |
2900 ) | |
2901 ddl_exec(db, " | |
2902 DROP VIEW IF EXISTS v_kseaapp_input | |
2903 ; | |
2904 " | |
2905 ) | |
2906 dml_no_rows_exec(db, " | |
2907 CREATE VIEW v_kseaapp_input | |
2908 AS | |
2909 SELECT v.contrast, | |
2910 v.phosphopep, | |
2911 m.`Protein`, | |
2912 m.`Gene`, | |
2913 m.`Peptide`, | |
2914 m.`Residue.Both`, | |
2915 v.p_value_raw AS `p`, | |
2916 v.log2_fc AS `FC` | |
2917 FROM kseaapp_metadata AS m, | |
2918 v_contrast_log2_fc_p_val AS v | |
2919 WHERE m.ppep = v.phosphopep | |
2920 AND NOT m.`Gene` = 'No_Gene_Name' | |
2921 AND NOT v.log2_fc = 0 | |
2922 ; | |
2923 " | |
2924 ) | |
2925 } | |
2926 ``` | |
2927 | |
2928 ```{r echo = FALSE, results = 'asis'} | |
2929 cat("\\newpage\n") | |
2930 ``` | |
2931 | |
2932 # KSEA Analysis | |
2933 | |
2934 Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package: | |
2935 | |
2936 - The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp | |
2937 - The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415). | |
2938 - An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/). | |
2939 | |
2940 For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as: | |
2941 | |
2942 $$ | |
2943 \text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j} | |
2944 $$ | |
2945 | |
2946 and fold-enrichment is computed as: | |
2947 | |
2948 $$ | |
2949 \text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j} | |
2950 $$ | |
2951 | |
2952 where: | |
2953 | |
2954 - $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change|})$ in intensities (for contrast $j$) of known substrates of the kinase $i$, | |
2955 - $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, and | |
2956 - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$, | |
2957 - $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset. | |
2958 - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score. | |
2959 | |
2960 $\text{FDR}_{j,i}$ is computed from the $p$-value for the z-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x) | |
2961 | |
2962 Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive). | |
2963 | |
2964 Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`. | |
2965 | |
2966 - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis. | |
2967 - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)). | |
2968 - Kinase-strate deata were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads). | |
2969 | |
2970 ```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
2971 | |
2972 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) | |
2973 | |
2974 # -- eliminate the table that's about to be defined | |
2975 ddl_exec(db, " | |
2976 DROP TABLE IF EXISTS site_metadata; | |
2977 ") | |
2978 | |
2979 # -- define the site_metadata table | |
2980 ddl_exec(db, " | |
2981 CREATE TABLE site_metadata( | |
2982 id INTEGER PRIMARY KEY | |
2983 , site_type_id INTEGER REFERENCES site_type(id) | |
2984 , full TEXT UNIQUE ON CONFLICT IGNORE | |
2985 , abbrev TEXT | |
2986 , pattern TEXT | |
2987 , motif TEXT | |
2988 ); | |
2989 ") | |
2990 | |
2991 # -- populate the table with initial values | |
2992 ddl_exec(db, " | |
2993 INSERT INTO site_metadata(full, abbrev, site_type_id) | |
2994 SELECT DISTINCT kinase_map, kinase_map, site_type_id | |
2995 FROM ppep_gene_site | |
2996 ORDER BY kinase_map; | |
2997 ") | |
2998 | |
2999 # -- drop bogus KSData view if exists | |
3000 ddl_exec(db, " | |
3001 DROP VIEW IF EXISTS ks_data_v; | |
3002 ") | |
3003 | |
3004 # -- create view to serve as an impostor for KSEAapp::KSData | |
3005 ddl_exec(db, " | |
3006 CREATE VIEW IF NOT EXISTS ks_data_v | |
3007 AS | |
3008 SELECT | |
3009 'NA' AS KINASE, | |
3010 'NA' AS KIN_ACC_ID, | |
3011 kinase_map AS GENE, | |
3012 'NA' AS KIN_ORGANISM, | |
3013 'NA' AS SUBSTRATE, | |
3014 0 AS SUB_GENE_ID, | |
3015 'NA' AS SUB_ACC_ID, | |
3016 gene_names AS SUB_GENE, | |
3017 'NA' AS SUB_ORGANISM, | |
3018 phospho_peptide AS SUB_MOD_RSD, | |
3019 0 AS SITE_GROUP_ID, | |
3020 'NA' AS 'SITE_7AA', | |
3021 2 AS networkin_score, | |
3022 type_name AS Source | |
3023 FROM ppep_gene_site_view; | |
3024 ") | |
3025 | |
3026 contrast_metadata_df <- | |
3027 sqldf::sqldf("select * from contrast_lvl_lvl_metadata", connection = db) | |
3028 rslt <- new_env() | |
3029 rslt$score_list <- list() | |
3030 rslt$name_list <- list() | |
3031 rslt$longname_list <- list() | |
3032 | |
3033 ddl_exec(db, " | |
3034 DROP TABLE IF EXISTS contrast_ksea_scores; | |
3035 " | |
3036 ) | |
3037 | |
3038 next_index <- 0 | |
3039 err_na_subscr_df_const <- | |
3040 "missing values are not allowed in subscripted assignments of data frames" | |
3041 | |
3042 for (i_cntrst in seq_len(nrow(contrast_metadata_df))) { | |
3043 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] | |
3044 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] | |
3045 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] | |
3046 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) | |
3047 contrast_longlabel <- ( | |
3048 sprintf( | |
3049 "Trt %s {%s} -> Trt %s {%s}", | |
3050 contrast_metadata_df[i_cntrst, "b_level"], | |
3051 gsub( | |
3052 pattern = ";", | |
3053 replacement = ", ", | |
3054 x = contrast_metadata_df[i_cntrst, "b_samples"], | |
3055 fixed = TRUE | |
3056 ), | |
3057 contrast_metadata_df[i_cntrst, "a_level"], | |
3058 gsub( | |
3059 pattern = ";", | |
3060 replacement = ", ", | |
3061 x = contrast_metadata_df[i_cntrst, "a_samples"], | |
3062 fixed = TRUE | |
3063 ) | |
3064 ) | |
3065 ) | |
3066 kseaapp_input <- | |
3067 sqldf::sqldf( | |
3068 x = sprintf(" | |
3069 SELECT `Protein`, `Gene`, `Peptide`, phosphopep AS `Residue.Both`, `p`, `FC` | |
3070 FROM v_kseaapp_input | |
3071 WHERE contrast = %d | |
3072 ", | |
3073 i_cntrst | |
3074 ), | |
3075 connection = db | |
3076 ) | |
3077 | |
3078 pseudo_ksdata <- dbReadTable(db, "ks_data_v") | |
3079 | |
3080 # This hack is because SQL table has the log2-transformed values | |
3081 kseaapp_input[, "FC"] <- 2 ** kseaapp_input[, "FC", drop = TRUE] | |
3082 main_title <- ( | |
3083 sprintf( | |
3084 "Change from treatment %s to treatment %s", | |
3085 contrast_metadata_df[i_cntrst, "b_level"], | |
3086 contrast_metadata_df[i_cntrst, "a_level"] | |
3087 ) | |
3088 ) | |
3089 sub_title <- contrast_longlabel | |
3090 tryCatch( | |
3091 expr = { | |
3092 ksea_scores_rslt <- | |
3093 ksea_scores( | |
3094 ksdata = pseudo_ksdata, # KSEAapp::KSData, | |
3095 px = kseaapp_input, | |
3096 networkin = TRUE, | |
3097 networkin_cutoff = 2 | |
3098 ) | |
3099 | |
3100 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { | |
3101 next_index <- 1 + next_index | |
3102 rslt$score_list[[next_index]] <- ksea_scores_rslt | |
3103 rslt$name_list[[next_index]] <- contrast_label | |
3104 rslt$longname_list[[next_index]] <- contrast_longlabel | |
3105 low_fdr_print( | |
3106 rslt = rslt, | |
3107 i_cntrst = i_cntrst, | |
3108 i = next_index, | |
3109 a_level = cntrst_a_level, | |
3110 b_level = cntrst_b_level, | |
3111 fold_change = cntrst_fold_change, | |
3112 caption = contrast_longlabel | |
3113 ) | |
3114 } | |
3115 }, | |
3116 error = function(e) str(e) | |
3117 ) | |
3118 } | |
3119 | |
3120 plotted_kinases <- NULL | |
3121 if (length(rslt$score_list) > 1) { | |
3122 for (i in seq_len(length(ksea_heatmap_titles))) { | |
3123 hdr <- ksea_heatmap_titles[[i]] | |
3124 which_kinases <- i | |
3125 | |
3126 cat("\\clearpage\n\\begin{center}\n") | |
3127 if (i == const_ksea_astrsk_kinases) { | |
3128 subsection_header(hdr) | |
3129 } else { | |
3130 subsection_header(hdr) | |
3131 } | |
3132 cat("\\end{center}\n") | |
3133 | |
3134 plotted_kinases <- ksea_heatmap( | |
3135 # the data frame outputs from the KSEA.Scores() function, in list format | |
3136 score_list = rslt$score_list, | |
3137 # a character vector of all the sample names for heatmap annotation: | |
3138 # - the names must be in the same order as the data in score_list | |
3139 # - please avoid long names, as they may get cropped in the final image | |
3140 sample_labels = rslt$name_list, | |
3141 # character string of either "p.value" or "FDR" indicating the data column | |
3142 # to use for marking statistically significant scores | |
3143 stats = c("p.value", "FDR")[2], | |
3144 # a numeric value between 0 and infinity indicating the min. number of | |
3145 # substrates a kinase must have to be included in the heatmap | |
3146 m_cutoff = 1, | |
3147 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff | |
3148 # for indicating significant kinases in the heatmap | |
3149 p_cutoff = 0.05, | |
3150 # a binary input of TRUE or FALSE, indicating whether or not to perform | |
3151 # hierarchical clustering of the sample columns | |
3152 sample_cluster = TRUE, | |
3153 # a binary input of TRUE or FALSE, indicating whether or not to export | |
3154 # the heatmap as a .png image into the working directory | |
3155 export = FALSE, | |
3156 # additional arguments to gplots::heatmap.2, such as: | |
3157 # - main: main title of plot | |
3158 # - xlab: x-axis label | |
3159 # - ylab: y-axis label | |
3160 xlab = "Contrast", | |
3161 ylab = "Kinase", | |
3162 # print which kinases: | |
3163 # - 1 : all kinases | |
3164 # - 2 : significant kinases | |
3165 # - 3 : non-significant kinases | |
3166 which_kinases = which_kinases | |
3167 ) | |
3168 cat("\\begin{center}\n") | |
3169 cat("Color intensities reflects $z$-score magnitudes; hue reflects $z$-score sign. Asterisks reflect significance.\n") | |
3170 cat("\\end{center}\n") | |
3171 } # end for (i in ... | |
3172 } # end if (length ... | |
3173 | |
3174 for (i_cntrst in seq_len(length(rslt$score_list))) { | |
3175 next_index <- i_cntrst | |
3176 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] | |
3177 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] | |
3178 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] | |
3179 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) | |
3180 contrast_longlabel <- ( | |
3181 sprintf( | |
3182 "Trt %s {%s} -> Trt %s {%s}", | |
3183 contrast_metadata_df[i_cntrst, "b_level"], | |
3184 gsub( | |
3185 pattern = ";", | |
3186 replacement = ", ", | |
3187 x = contrast_metadata_df[i_cntrst, "b_samples"], | |
3188 fixed = TRUE | |
3189 ), | |
3190 contrast_metadata_df[i_cntrst, "a_level"], | |
3191 gsub( | |
3192 pattern = ";", | |
3193 replacement = ", ", | |
3194 x = contrast_metadata_df[i_cntrst, "a_samples"], | |
3195 fixed = TRUE | |
3196 ) | |
3197 ) | |
3198 ) | |
3199 main_title <- ( | |
3200 sprintf( | |
3201 "Change from treatment %s to treatment %s", | |
3202 contrast_metadata_df[i_cntrst, "b_level"], | |
3203 contrast_metadata_df[i_cntrst, "a_level"] | |
3204 ) | |
3205 ) | |
3206 sub_title <- contrast_longlabel | |
3207 tryCatch( | |
3208 expr = { | |
3209 ksea_scores_rslt <- rslt$score_list[[next_index]] | |
3210 | |
3211 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { | |
3212 low_fdr_barplot( | |
3213 rslt = rslt, | |
3214 i_cntrst = i_cntrst, | |
3215 i = next_index, | |
3216 a_level = cntrst_a_level, | |
3217 b_level = cntrst_b_level, | |
3218 fold_change = cntrst_fold_change, | |
3219 caption = contrast_longlabel | |
3220 ) | |
3221 } | |
3222 }, | |
3223 error = function(e) str(e) | |
3224 ) | |
3225 } | |
3226 ``` | |
3227 | |
3228 ```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
3229 | |
3230 # Use enriched kinases to find enriched kinase-substrate pairs | |
3231 enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash)) | |
3232 all_enriched_substrates <- sqldf(" | |
3233 SELECT | |
3234 gene AS kinase, | |
3235 ppep, | |
3236 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label | |
3237 FROM ( | |
3238 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
3239 FROM pseudo_ksdata | |
3240 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | |
3241 ) | |
3242 GROUP BY ppep | |
3243 ") | |
3244 | |
3245 # helper used to label per-kinase substrate enrichment figure | |
3246 cat_enriched_heading <- function(m, cut_args) { | |
3247 cutoff <- cut_args$cutoff | |
3248 kinase <- cut_args$kinase | |
3249 statistic <- cut_args$statistic | |
3250 threshold <- cut_args$threshold | |
3251 cat("\\newpage\n") | |
3252 if (nrow(m) > intensity_hm_rows) { | |
3253 subsection_header( | |
3254 paste( | |
3255 sprintf( | |
3256 "Lowest p-valued %d (of %d) enriched %s-substrates,", | |
3257 intensity_hm_rows, | |
3258 nrow(m), | |
3259 kinase | |
3260 ), | |
3261 sprintf(" KSEA %s < %0.2f\n", statistic, threshold) | |
3262 ) | |
3263 ) | |
3264 } else { | |
3265 if (nrow(m) == 1) { | |
3266 return(FALSE) | |
3267 } else { | |
3268 subsection_header( | |
3269 paste( | |
3270 sprintf( | |
3271 "%d enriched %s-substrates,", | |
3272 nrow(m), | |
3273 kinase | |
3274 ), | |
3275 sprintf( | |
3276 " KSEA %s < %0.2f\n", | |
3277 statistic, | |
3278 threshold | |
3279 ) | |
3280 ) | |
3281 ) | |
3282 } | |
3283 } | |
3284 cat("\n\n\n") | |
3285 cat("\n\n\n") | |
3286 return(TRUE) | |
3287 } | |
3288 | |
3289 # Disabling heatmaps for substrates pending decision whether to eliminate them altogether | |
3290 if (FALSE) | |
3291 for (kinase_name in sort(enriched_kinases$kinase)) { | |
3292 enriched_substrates <- | |
3293 all_enriched_substrates[ | |
3294 all_enriched_substrates$kinase == kinase_name, | |
3295 , | |
3296 drop = FALSE | |
3297 ] | |
3298 # Get the intensity values for the heatmap | |
3299 enriched_intensities <- | |
3300 as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE]) | |
3301 # Remove rows having too many NA values to be relevant | |
3302 na_counter <- is.na(enriched_intensities) | |
3303 na_counts <- apply(na_counter, 1, sum) | |
3304 enriched_intensities <- | |
3305 enriched_intensities[na_counts < ncol(enriched_intensities) / 2, , drop = FALSE] | |
3306 # Rename the rows with the display-name for the heatmap | |
3307 rownames(enriched_intensities) <- | |
3308 sapply( | |
3309 X = rownames(enriched_intensities), | |
3310 FUN = function(rn) { | |
3311 enriched_substrates[enriched_substrates$ppep == rn, "label"] | |
3312 } | |
3313 ) | |
3314 # Format as matrix for heatmap | |
3315 m <- as.matrix(enriched_intensities) | |
3316 # Draw the heading and heatmap | |
3317 if (nrow(m) > 0) { | |
3318 cut_args <- new_env() | |
3319 cut_args$cutoff <- cutoff | |
3320 cut_args$kinase <- kinase_name | |
3321 cut_args$statistic <- ksea_cutoff_statistic | |
3322 cut_args$threshold <- ksea_cutoff_threshold | |
3323 number_of_peptides_found <- | |
3324 draw_intensity_heatmap( | |
3325 m = m, | |
3326 cutoff = cut_args, | |
3327 hm_heading_function = cat_enriched_heading, | |
3328 hm_main_title | |
3329 = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates", | |
3330 suppress_row_dendrogram = FALSE | |
3331 ) | |
3332 } | |
3333 } | |
3334 | |
3335 # Write output tabular files | |
3336 | |
3337 # get kinase, ppep, concat(kinase) tuples for enriched kinases | |
3338 | |
3339 kinase_ppep_label <- sqldf(" | |
3340 WITH | |
3341 t(ppep, label) AS | |
3342 ( | |
3343 SELECT DISTINCT | |
3344 SUB_MOD_RSD AS ppep, | |
3345 group_concat(gene, '; ') AS label | |
3346 FROM pseudo_ksdata | |
3347 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | |
3348 GROUP BY ppep | |
3349 ), | |
3350 k(kinase, ppep_join) AS | |
3351 ( | |
3352 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join | |
3353 FROM pseudo_ksdata | |
3354 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | |
3355 ) | |
3356 SELECT k.kinase, t.ppep, t.label | |
3357 FROM t, k | |
3358 WHERE t.ppep = k.ppep_join | |
3359 ORDER BY k.kinase, t.ppep | |
3360 ") | |
3361 | |
3362 # extract what we need from full_data | |
3363 impish <- cbind(rownames(quant_data_imp), quant_data_imp) | |
3364 colnames(impish)[1] <- "Phosphopeptide" | |
3365 data_table_imputed_sql <- " | |
3366 SELECT | |
3367 f.*, | |
3368 k.label AS KSEA_enrichments, | |
3369 q.* | |
3370 FROM | |
3371 metadata_plus_p f | |
3372 LEFT JOIN kinase_ppep_label k | |
3373 ON f.Phosphopeptide = k.ppep, | |
3374 impish q | |
3375 WHERE | |
3376 f.Phosphopeptide = q.Phosphopeptide | |
3377 " | |
3378 data_table_imputed <- sqldf(data_table_imputed_sql) | |
3379 # Zap the duplicated 'Phosphopeptide' column named 'ppep' | |
3380 data_table_imputed <- | |
3381 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] | |
3382 | |
3383 # Output with imputed, un-normalized data | |
3384 | |
3385 write.table( | |
3386 data_table_imputed | |
3387 , file = imputed_data_filename | |
3388 , sep = "\t" | |
3389 , col.names = TRUE | |
3390 , row.names = FALSE | |
3391 , quote = FALSE | |
3392 ) | |
3393 | |
3394 | |
3395 #output quantile normalized data | |
3396 impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) | |
3397 colnames(impish)[1] <- "Phosphopeptide" | |
3398 data_table_imputed <- sqldf(data_table_imputed_sql) | |
3399 # Zap the duplicated 'Phosphopeptide' column named 'ppep' | |
3400 data_table_imputed <- | |
3401 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] | |
3402 write.table( | |
3403 data_table_imputed, | |
3404 file = imp_qn_lt_data_filenm, | |
3405 sep = "\t", | |
3406 col.names = TRUE, | |
3407 row.names = FALSE, | |
3408 quote = FALSE | |
3409 ) | |
3410 | |
3411 ppep_kinase <- sqldf(" | |
3412 SELECT DISTINCT k.ppep, k.kinase | |
3413 FROM ( | |
3414 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep | |
3415 FROM pseudo_ksdata | |
3416 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | |
3417 ) k | |
3418 ORDER BY k.ppep, k.kinase | |
3419 ") | |
3420 | |
3421 RSQLite::dbWriteTable( | |
3422 conn = db, | |
3423 name = "ksea_enriched_ks", | |
3424 value = ppep_kinase, | |
3425 append = FALSE | |
3426 ) | |
3427 | |
3428 RSQLite::dbWriteTable( | |
3429 conn = db, | |
3430 name = "anova_signif", | |
3431 value = p_value_data, | |
3432 append = FALSE | |
3433 ) | |
3434 | |
3435 ddl_exec(db, " | |
3436 DROP VIEW IF EXISTS stats_metadata_v; | |
3437 " | |
3438 ) | |
3439 dml_no_rows_exec(db, " | |
3440 CREATE VIEW stats_metadata_v | |
3441 AS | |
3442 SELECT DISTINCT m.*, | |
3443 p.raw_anova_p, | |
3444 p.fdr_adjusted_anova_p, | |
3445 kek.kinase AS ksea_enrichments | |
3446 FROM | |
3447 mrgfltr_metadata_view m | |
3448 LEFT JOIN anova_signif p | |
3449 ON m.phospho_peptide = p.phosphopeptide | |
3450 LEFT JOIN ksea_enriched_ks kek | |
3451 ON m.phospho_peptide = kek.ppep | |
3452 ; | |
3453 " | |
3454 ) | |
3455 | |
3456 write.table( | |
3457 dbReadTable(db, "stats_metadata_v"), | |
3458 file = anova_ksea_mtdt_file, | |
3459 sep = "\t", | |
3460 col.names = TRUE, | |
3461 row.names = FALSE, | |
3462 quote = FALSE | |
3463 ) | |
3464 | |
3465 | |
3466 ``` | |
3467 | |
3468 ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
3469 cat("\\leavevmode\n\n\n") | |
3470 | |
3471 # write parameters to report | |
3472 | |
3473 param_unlist <- unlist(as.list(params)) | |
3474 param_df <- data.frame( | |
3475 parameter = paste0("\\verb@", names(param_unlist), "@"), | |
3476 value = paste0("\\verb@", gsub("$", "\\$", param_unlist, fixed = TRUE), "@") | |
3477 ) | |
3478 | |
3479 data_frame_latex( | |
3480 x = param_df, | |
3481 justification = "p{0.35\\linewidth} p{0.6\\linewidth}", | |
3482 centered = TRUE, | |
3483 caption = "Input parameters", | |
3484 anchor = const_table_anchor_bp, | |
3485 underscore_whack = FALSE | |
3486 ) | |
3487 | |
3488 # write parameters to SQLite output | |
3489 | |
3490 mqppep_anova_script_param_df <- data.frame( | |
3491 script = "mqppep_anova_script.Rmd", | |
3492 parameter = names(param_unlist), | |
3493 value = param_unlist | |
3494 ) | |
3495 ddl_exec(db, " | |
3496 DROP TABLE IF EXISTS script_parameter; | |
3497 " | |
3498 ) | |
3499 ddl_exec(db, " | |
3500 CREATE TABLE IF NOT EXISTS script_parameter( | |
3501 script TEXT, | |
3502 parameter TEXT, | |
3503 value ANY, | |
3504 UNIQUE (script, parameter) ON CONFLICT REPLACE | |
3505 ) | |
3506 ; | |
3507 " | |
3508 ) | |
3509 RSQLite::dbWriteTable( | |
3510 conn = db, | |
3511 name = "script_parameter", | |
3512 value = mqppep_anova_script_param_df, | |
3513 append = TRUE | |
3514 ) | |
3515 | |
3516 # We are done with output | |
3517 RSQLite::dbDisconnect(db) | |
3518 ``` | |
3519 <!-- | |
3520 There's gotta be a better way... | |
3521 | |
3522 loaded_packages_df <- sessioninfo::package_info("loaded") | |
3523 loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library) | |
3524 loaded_packages_df <- data.frame( | |
3525 package = loaded_packages_df$package, | |
3526 version = loaded_packages_df$loadedversion, | |
3527 date = loaded_packages_df$date | |
3528 ) | |
3529 data_frame_latex( | |
3530 x = loaded_packages_df, | |
3531 justification = "l | l l", | |
3532 centered = FALSE, | |
3533 caption = "Loaded R packages", | |
3534 anchor = const_table_anchor_bp | |
3535 ) | |
3536 --> |