Mercurial > repos > galaxyp > mqppep_preproc
diff mqppep_anova_script.Rmd @ 1:b76c75521d91 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 43e7a43b545c24b2dc33d039198551c032aa79be
author | galaxyp |
---|---|
date | Fri, 28 Oct 2022 18:26:42 +0000 |
parents | 8dfd5d2b5903 |
children | a5e7469dfdfa |
line wrap: on
line diff
--- a/mqppep_anova_script.Rmd Mon Jul 11 19:22:54 2022 +0000 +++ b/mqppep_anova_script.Rmd Fri Oct 28 18:26:42 2022 +0000 @@ -7,81 +7,153 @@ date: - "May 28, 2018" - "; revised June 23, 2022" +lot: true output: pdf_document: toc: true - toc_depth: 3 + toc_depth: 2 keep_tex: true -header-includes: - - \usepackage{longtable} - - \newcommand\T{\rule{0pt}{2.6ex}} % Top strut - - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut + dev: pdf + includes: + in_header: mqppep_anova_preamble.tex +latex_macros: false +raw_tex: true +urlcolor: blue params: alphaFile: "test-data/alpha_levels.tabular" inputFile: "test-data/test_input_for_anova.tabular" preprocDb: "test-data/test_input_for_anova.sqlite" kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] - show_toc: true - firstDataColumn: "^Intensity[^_]" - imputationMethod: !r c("group-median", "median", "mean", "random")[1] - meanPercentile: 1 - sdPercentile: 1.0 regexSampleNames: "\\.\\d+[A-Z]$" regexSampleGrouping: "\\d+" + groupFilterPatterns: ".+" + groupFilter: !r c("none", "exclude", "include")[1] + imputationMethod: !r c("group-median", "median", "mean", "random")[4] + kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5] + #imputationMethod: !r c("group-median", "median", "mean", "random")[1] + + # how should sample groups be interpreted? + # - "f": fixed patterns (like `grep -F`) + # - "p": PERL-compatible (like `grep -P`) + # - "r": extended grep patterns (like `grep -E`) + # use what case sensitivity? + # - "i": case insensitive matching (like `grep -i`) + groupFilterMode: !r c("r", "ri", "p", "pi", "f", "fi")[1] + # what pattern should be used for the first column + # (extended grep pattern, case sensitive) + firstDataColumn: "^Intensity[^_]" + # for small random value imputation, what percentile should be center? + meanPercentile: 50 + #meanPercentile: 1 + # for small random value imputation, what should `s / mean(x)` ratio be? + sdPercentile: 1.0 + # output path for imputed data file imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" + # output path for imputed/quantile-normalized/log-transformed data file imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" + # output path for contents of `stats_metadata_v` table anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt" + # how to test one variable with > 2 categories (e.g., aov or kruskal.test) oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1] + # how to test one variable with 2 categories (e.g., oneway.test) oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3] - kseaCutoffStatistic: !r c("p.value", "FDR")[2] - kseaCutoffThreshold: !r c( 0.1, 0.05)[2] - kseaMinKinaseCount: 1 - intensityHeatmapRows: 75 + # what should be the minimum quality for consideration in both + minQuality: 0 + # correct KSEA with FDR (recommended) or raw p-value + kseaCutoffStatistic: !r c("FDR", "p.value")[1] + # correct KSEA threshold 0.05 (conventional) or higher (perhaps better) + # "perhaps better" meaning that KSEA is an hypothesis-generator, not -test + #kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5)[1] + # minimum number of substrates required for a kinase to be considered in KSEA + kseaMinSubstrateCount: 1 + # Should KSEA be performed aggregating signed log2FC or absolute? + # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores + # TRUE use abs(log2FC) for KSEA as Justin Drake requested; this is a + # justifiable deviation from the KSEAapp::KSEA.Scores algorithm. + kseaUseAbsoluteLog2FC: TRUE + #kseaUseAbsoluteLog2FC: FALSE + # minimum number of observed values per sample-group + intensityMinValuesPerGroup: 1 + # maximum number of heatmap rows (result are poor when > 50) + intensityHeatmapRows: 50 + # what should be the primary criterion to eliminate excessive heatmap rows + intensityHeatmapCriteria: !r c("quality", "na_count", "p_value")[1] + # should correlation among substrates be used (rather than covariance) + correlateSubstrates: TRUE + # only show covariance among variables having variance > 1 + filterCovVarGT1: TRUE + # maximum number of residues to display for ppeps in rownames or columnames + ppepTruncN: 10 + # maximum number of characters of subgenes to display in rownames or columnames + subgeneTruncN: 10 + # maximum number of characters for paste(subgene, ppep) for enrichment plots + substTruncN: 20 + # should boxplots use variable-width boxes to reflect # of samples + boxPlotVarWidth: TRUE + # should boxplots use notched boxes to reflect difference between samples + boxPlotNotch: TRUE + # look-up tables for kinase descriptions + kinaseNameUprtLutBz2: "./kinase_name_uniprot_lut.tabular.bz2" + kinaseUprtDescLutBz2: "./kinase_uniprot_description_lut.tabular.bz2" + # should debugging trace messages be printed? + showEnrichedSubstrates: FALSE + + # should debugging nb/nbe messages be printed? + printNBMsgs: FALSE + # should debugging trace messages be printed? + printTraceMsgs: FALSE + # when debugging files are needed, set debugFileBasePath to the path + # to the directory where they should be writtn + debugFileBasePath: !r if (TRUE) NULL else "test-data" --- -<!-- - kseaCutoffStatistic: !r c("p.value", "FDR")[2] - kseaCutoffThreshold: !r c(0.05, 0.1)[1] - - alphaFile: "test-data/alpha_levels.tabular" - inputFile: "test-data/test_input_for_anova.tabular" - preprocDb: "test-data/test_input_for_anova.sqlite" - kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] - - alphaFile: "test-data/alpha_levels.tabular" - inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular" - preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite" - kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2] - - alphaFile: "test-data/alpha_levels.tabular" - inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular" - preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite" - kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] - - alphaFile: "test-data/alpha_levels.tabular" - inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular" - preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite" - kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] - - inputFile: "test-data/density_failure.preproc_tab.tabular" - kseaAppPrepDb: !r c(":memory:", "mqppep.sqlite")[2] - latex_document: default ---> -```{r setup, include = FALSE} + +```{r setup, include = FALSE, results = 'asis'} + +# simple debug messaging +print_nb_messages <- params$printNBMsgs + +nb <- if (!print_nb_messages) { + function(...) invisible() + } else { + function(..., f = cat) f("\n$\\exists{}\\supset\\forall{}$", ...) + } + +nbe <- if (!print_nb_messages) { + function(...) invisible() + } else { + function(..., f = cat, file = stderr()) { + cat( + stringi::stri_unescape_unicode("\nNBE \\u2203\\u2283\\u2200"), + ..., + file = file + ) + } + } + #ref for debugging: https://yihui.org/tinytex/r/#debugging options(tinytex.verbose = TRUE) # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 -knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) +knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10), dpi = 300) # freeze the random number generator so the same results will be produced # from run to run set.seed(28571) ### LIBRARIES + +if (print_nb_messages) nbe("library(gplots)") library(gplots) +if (print_nb_messages) nbe("library(caret)") +# load caret for nearZeroVar +if (print_nb_messages) nbe("Please ignore the messages about systemd, if any.\n") +library(caret) +if (print_nb_messages) nbe("library(DBI)") library(DBI) +if (print_nb_messages) nbe("library(RSQLite)") library(RSQLite) +if (print_nb_messages) nbe("library(sqldf)\n") # Suppress "Warning: no DISPLAY variable so Tk is not available" suppressWarnings(suppressMessages(library(sqldf))) @@ -97,30 +169,42 @@ ### CONSTANTS -const_parfin <- par("fin") -const_boxplot_fill <- "grey94" -const_stripchart_cex <- 0.5 -const_stripsmall_cex <- - sqrt(const_stripchart_cex * const_stripchart_cex / 2) -const_stripchart_jitter <- 0.3 -const_write_debug_files <- FALSE -const_table_anchor_bp <- "bp" -const_table_anchor_ht <- "ht" -const_table_anchor_p <- "p" -const_table_anchor_tbp <- "tbp" - - +const_boxplot_fill <- "grey94" const_ksea_astrsk_kinases <- 1 const_ksea_nonastrsk_kinases <- 2 const_ksea_all_kinases <- 3 - -const_log10_e <- log10(exp(1)) - -### FUNCTIONS - -# from `demo(error.catching)` +const_log10_e <- log10(exp(1)) +const_stripchart_cex <- 0.5 +const_stripchart_jitter <- 0.3 +const_table_anchor_bp <- "bp" +const_table_anchor_ht <- "ht" +const_table_anchor_p <- "p" +const_table_anchor_t <- "t" +const_table_anchor_tbp <- "tbp" + + +### GLOBAL VARIABLES (params) + +## functions to process params + +is_string_null_or_empty <- function(x) { + # N. B. non-strings are intentionally treated as NULL + if (is.null(x)) + TRUE + else if (!is.character(x)) + TRUE + else x == "" +} + ##' Catch *and* save both errors and warnings, and in the case of ##' a warning, also keep the computed result. +##' return result as list(value = ..., warning = ...) +##' - value will be: +##' - the result if no exception is thrown +##' - the exception if an exception is caught +##' - warning will be a string except perhaps when warning argument is not NULL +##' +##' adapted from `demo(error.catching)` ##' ##' @title tryCatch both warnings (with value) and errors ##' @param expr an \R expression to evaluate @@ -128,32 +212,428 @@ ##' 'value' may be an error caught. ##' @author Martin Maechler; ##' Copyright (C) 2010-2012 The R Core Team -try_catch_w_e <- function(expr) { - wrn <- NULL - # warning handler - w_handler <- function(w) { - wrn <<- w - invokeRestart("muffleWarning") +try_catch_w_e <- + function(expr, error = function(e) e, warning = NULL) { + wrn <- NULL + # warning handler + w_handler <- + if (is.function(warning)) + warning + else + function(w) { + wrn <<- w + invokeRestart("muffleWarning") + } + e_handler <- + if (is.function(error)) + error + else + function(e) e + # return result as list(value = ..., warning = ...) + # - value will be: + # - the result if no exception is thrown + # - the exception if an exception is caught + list( + value = withCallingHandlers( + tryCatch( + expr, + error = e_handler + ), + warning = w_handler + ), + warning = wrn + ) + } + +see_kvp <- + function(format, key, value, suffix = "") { + if ( + !all( + is.character(format), + is.character(key), + is.character(value), + is.character(suffix) + ) + ) { + cat("all arguments to see_kvp should be character") + knitr::knit_exit() + } + result <- sprintf(format, value) + if (length(result) > 1) { + sprintf( + "%s = c(%s)%s", + whack_underscores(key), + paste(result, collapse = ", "), + suffix + ) + } else { + sprintf( + "%s = %s%s", + key, + result, + suffix + ) + } + } + +see_logical <- + function(x, suffix = "", xprssn = deparse1(substitute(x))) { + result <- as.character(as.logical(x)) + # handle NAs and NaNs + result[is.na(result)] <- "NA" + see_kvp( + format = "%s", + key = xprssn, + value = result, + suffix = suffix + ) + } + +see_numeric <- + function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) { + if (is.numeric(digits) && is.numeric(x)) { + digits <- as.integer(digits) + digits <- min(16, max(0, digits)) + format <- paste0("%0.", as.character(digits), "g") + result <- sprintf(format, x) + see_kvp( + format = "%s", + key = xprssn, + value = result, + suffix = suffix + ) + } + } + +see_character <- + function(x, suffix = "", xprssn = deparse1(substitute(x))) { + if (is.character(x)) { + see_kvp( + format = "%s", + key = xprssn, + value = sprintf("\"%s\"", x), + suffix = suffix + ) + } + } + +see_variable <- + function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) { + if (is.character(x)) { + see_character(x, suffix, xprssn) + } else if (is.numeric(x)) { + see_numeric(x, suffix, digits, xprssn) + } else if (is.logical(x)) { + see_logical(x, suffix, xprssn) + } else { + f <- file("") + sink(f) + str(x) + msg <- paste(readLines(f), collapse = "\n") + sink() + close(f) + paste0( + "see_variable - str(", + xprssn, + "):\n", + msg, "\n" + ) + } + } + +# ref: https://tug.org/texinfohtml/latex2e.html +# LaTeX sets aside the following characters for special purposes. +# For example, the percent sign % is for comments. +# They are called reserved characters or special characters. +# They are all discussed elsewhere in this manual. +# +# $ % & { } _ ~ ^ \ # +# +# If you want a reserved character to be printed as itself, in the text body +# font, for all but the final three characters in that list simply put +# a backslash \ in front of the character. +# Thus, typing \$1.23 will produce $1.23 in your output. +# +# As to the last three characters, to get a tilde in the text body font, +# use \~{} (omitting the curly braces would result in the next character +# receiving a tilde accent). +# Similarly, to get a text body font circumflex use \^{}. +# To get a backslash in the font of the text body enter \textbackslash{}. +whack_math <- + function(v) { + v <- as.character(v) + w <- gsub("\\", "\\textbackslash ", v, fixed = TRUE) + w <- Reduce( + f = function(l, r) { + gsub(r, paste0("\\", r), l, fixed = TRUE) + }, + x = c("#", "$", "%", "&", "{", "}", "_"), + init = w + ) + w <- gsub("^", "\\^{}", w, fixed = TRUE) + return(w) + if (all(v == w)) + v + else + paste0("\\texttt{", w, "}") } - list( - value = withCallingHandlers( - tryCatch( - expr, - error = function(e) e - ), - warning = w_handler - ), - warning = wrn +whack_underscores <- whack_math + +## dump params to stderr (remove this eventually) + +if (FALSE) nbe(see_variable(params)) + +## unlist params for eventual output + +param_unlist <- unlist(as.list(params)) + +# no need to whack underscores and dollars because this is verbatim +param_df <- data.frame( + parameter = paste0("\\verb@", names(param_unlist), "@"), + value = paste0( + "\n\\begin{tiny}\n\\verb@", + param_unlist, + "@\n\\end{tiny}" + ) + ) +param_df <- data.frame( + parameter = names(param_unlist), + value = param_unlist + ) +param_df <- param_df[order(param_df$parameter), ] + +## general output control + +debug_file_base_path <- params$debugFileBasePath +print_trace_messages <- params$printTraceMsgs +show_enriched_substrates <- params$showEnrichedSubstrates +boxplot_varwidth <- params$boxPlotVarWidth +boxplot_notch <- params$boxPlotNotch + +## parameters for static data + +kinase_name_uprt_lut_bz2 <- params$kinaseNameUprtLutBz2 +kinase_uprt_desc_lut_bz2 <- params$kinaseUprtDescLutBz2 + +## parameters for input file + +preproc_db <- params$preprocDb +alpha_file <- params$alphaFile +input_file <- params$inputFile + +# First data column - ideally, this could be detected via +# regexSampleNames, but for now leave it as is. +first_data_column <- params$firstDataColumn +fdc_is_integer <- is.integer(first_data_column) +if (fdc_is_integer) { + first_data_column <- as.integer(params$firstDataColumn) +} + +## parameters for output files + +ksea_app_prep_db <- params$kseaAppPrepDb +imputed_data_filename <- params$imputedDataFilename +imp_qn_lt_data_filenm <- params$imputedQNLTDataFile +anova_ksea_mtdt_file <- params$anovaKseaMetadata + +## parameters for imputation + +# Imputation method, should be one of +# "random", "group-median", "median", or "mean" +imputation_method <- params$imputationMethod + +# Selection of percentile of logvalue data to set the mean for random number +# generation when using random imputation +mean_percentile <- params$meanPercentile / 100.0 + +# deviation adjustment-factor for random values; real number. +sd_percentile <- params$sdPercentile + +## parameters for group parsing and filtering + +# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" +regex_sample_names <- params$regexSampleNames +# Regular expression to extract Sample Grouping from Sample Name; +# if error occurs, compare smpl_trt vs. sample_name_matches +# to see if groupings/pairs line up +# e.g., "(\\d+)" + +regex_sample_grouping <- params$regexSampleGrouping +# What are the patterns for filtering sample groups? +# How should sample groups be filtered? +# - none: do not filter +# - include: include sample groups matching filter +# - exclude: include sample groups not matching filter + +sample_group_filter <- params$groupFilter +if (grepl("f", params$groupFilterMode, fixed = TRUE)) { + sample_group_filter_perl <- FALSE + sample_group_filter_fixed <- TRUE +} else if (grepl("p", params$groupFilterMode, fixed = TRUE)) { + sample_group_filter_perl <- TRUE + sample_group_filter_fixed <- FALSE +} else { # normal regex + sample_group_filter_perl <- FALSE + sample_group_filter_fixed <- FALSE +} + +sample_group_filter_nocase <- + grepl("i", params$groupFilterMode, fixed = TRUE) + +# What PCRE patterns should be included or excluded +group_filter_patterns_csv <- params$groupFilterPatterns +sample_group_filter_patterns <- strsplit( + x = group_filter_patterns_csv, + split = ",", + fixed = TRUE + )[[1]] + +## parameters for hypothesis testing + +one_way_all_categories_fname <- params$oneWayManyCategories + +one_way_all_categories <- try_catch_w_e( + match.fun(one_way_all_categories_fname)) + +if (!is.function(one_way_all_categories$value)) { + write("fatal error for parameter oneWayManyCategories:", stderr()) + write(one_way_all_categories$value$message, stderr()) + if (sys.nframe() > 0) { + cat("Cannot continue and quit() failed. Goodbye.") + knitr::knit_exit() + quit(save = "no", status = 1) + } +} + +one_way_all_categories <- one_way_all_categories$value + +one_way_two_categories_fname <- params$oneWayManyCategories +one_way_two_categories <- try_catch_w_e( + match.fun(one_way_two_categories_fname)) +if (!is.function(one_way_two_categories$value)) { + cat("fatal error for parameter oneWayTwoCategories: \n") + cat(one_way_two_categories$value$message, fill = TRUE) + if (sys.nframe() > 0) { + cat("Cannot continue and quit() failed. Goodbye.") + knitr::knit_exit() + quit(save = "no", status = 1) + } +} +one_way_two_categories <- one_way_two_categories$value + +## parameters for KSEA + +ksea_cutoff_statistic <- params$kseaCutoffStatistic +ksea_cutoff_threshold <- params$kseaCutoffThreshold +ksea_min_substrate_count <- params$kseaMinSubstrateCount + +## parameters for global variables consumed by functions + +# intensityHeatmapCriteria: !r c("na_count", "p_value")[2] # TODO switch to 1 +# TODO Validate within list +g_intensity_hm_criteria <- params$intensityHeatmapCriteria +if (is_string_null_or_empty(g_intensity_hm_criteria)) { + cat("invalid intensityHeatmapCriteria parameter (must be string)") + knitr::knit_exit() +} +switch( + g_intensity_hm_criteria, + "quality" = NULL, + "na_count" = NULL, + "p_value" = NULL, + { + with( + params, + cat( + sprintf( + "invalid %s (must be %s)", + see_variable(intensityHeatmapCriteria), + "one of quality or na_count or p_value" + ) + ) + ) + knitr::knit_exit() + } +) + +# intensityHeatmapRows: 50 +# TODO Validate >> 0 < 75 +g_intensity_hm_rows <- params$intensityHeatmapRows +if (!is.integer(g_intensity_hm_rows) || g_intensity_hm_rows < 1) { + cat("invalid intensityHeatmapRows (must be integer > 0)") + knitr::knit_exit() +} + +g_intensity_min_per_class <- params$intensityMinValuesPerGroup +if (!is.integer(g_intensity_min_per_class) || g_intensity_min_per_class < 0) { + cat("invalid intensityMinValuesPerGroup (must be integer > -1") + knitr::knit_exit() +} + +if (is.na(as.logical(g_correlate_substrates <- params$correlateSubstrates))) { + cat("invalid correlateSubstrates (must be TRUE or FALSE)") + knitr::knit_exit() +} + +if (is.na(as.logical(g_filter_cov_var_gt_1 <- params$filterCovVarGT1))) { + cat("invalid filterCovVarGT1 parameter (must be TRUE or FALSE)") + knitr::knit_exit() +} + +# TODO Validate >> 0 < 30 +g_ppep_trunc_n <- params$ppepTruncN + +# TODO Validate >> 0 < 30 +g_subgene_trunc_n <- params$subgeneTruncN + +# TODO Validate >> 0 < 30 +g_sbstr_trunc_n <- params$substTruncN + + +### OPERATORS + +# Test for exclusion +# ref: https://www.reneshbedre.com/blog/in-operator-r.html +`%notin%` <- Negate(`%in%`) + +# Augmented assignment +# ref: https://www2.cs.arizona.edu/icon/refernce/infix2.htm#aug_assign +`%||:=%` <- function(lvalue, ...) { + pf <- parent.frame() + rvalue <- Reduce(paste0, x = ..., init = lvalue) + assign( + x = as.character(substitute(lvalue)), + value = rvalue, + pos = pf + ) + invisible(rvalue) +} + +### FUNCTIONS + +no_op <- + function() { + } +# this function is not used in this file and should be removed while +# factoring out reusable code +all_apply <- function(f, v, na_rm = TRUE, ...) { + Reduce( + f = function(l, r) if (na_rm && is.na(r)) TRUE else l && r, + x = sapply(X = v, FUN = f, ...), + init = TRUE ) } - -write_debug_file <- function(s) { - if (const_write_debug_files) { - s_path <- sprintf("test-data/%s.txt", deparse(substitute(s))) - print(sprintf("DEBUG writing file %s", spath)) +write_debug_file <- function(data_frame) { + if (!is.null(debug_file_base_path)) { + s_path <- + sprintf( + "%s/%s.txt", + debug_file_base_path, + deparse(substitute(data_frame)) + ) write.table( - s, + data_frame, file = s_path, sep = "\t", col.names = TRUE, @@ -174,6 +654,137 @@ new.env(parent = emptyenv()) } +# make apply readable for rows +row_apply <- function(x, fun, ..., simplify = TRUE) { + apply(x, MARGIN = 1, fun, ..., simplify = TRUE) +} + +# make apply readable for columns +column_apply <- function(x, fun, ..., simplify = TRUE) { + apply(x, MARGIN = 2, fun, ..., simplify = TRUE) +} + +##' Produce a vector of boolean values whose i-th value is TRUE when any +##' member of v matches the i-th membr of s, where i in 1:seq_len(length(s)) +##' +##' @title Search multiple strings for matches of multiple substrings +##' @param v a vector of substrings to match +##' @param s a vector of strings to search for matches +##' @param ... additional arguments to grepl +##' @return a list with keys in s and valuse that are vectors of elements of v +##' @author Art Eschenlauer +##' Copyright (C) 2022 Art Eschenlauer; +##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms +mgrepl <- function(v, s, ...) { + grpl_rslt <- rep_len(0, length(s)) + for (vi in v) { + grpl_rslt_v <- sapply( + X = s, + FUN = function(t) { + Reduce( + f = function(l, r) if (is.null(l)) r else c(l, r), + x = sapply( + X = vi, + FUN = function(f) grepl(f, t, ...) + ), + init = c() + ) + }, + simplify = "array" + ) + grpl_rslt <- grpl_rslt + grpl_rslt_v + } + rslt <- unname(grpl_rslt > 0) +} + +##' Produce positions in a vector where succeeding value != current valus +##' +##' @title Search vector for neighboring positions having different values +##' @param v a vector of comparable numeric values (e.g. integers) +##' @return a vector of positions i where v[i] != v[i + 1] +##' @author Art Eschenlauer +##' Copyright (C) 2022 Art Eschenlauer; +##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms +transition_positions <- function(v) { + Reduce( + f = function(l, i) if ((i != 1) && (v[i - 1] != v[i])) c(l, i - 1) else l, + x = seq_along(v)[-1:0], + init = c() + ) +} + +### figure debug functions + +cat_par_vector <- function(par_name, lbl = "", newlines = TRUE) { + cat( + sprintf( + "%spar(%s) = c(%s)%s", + lbl, + par_name, + paste(par(par_name), collapse = ", "), + if (newlines) "\n\n" else "" + ) + ) +} + +cat_margins <- function(lbl = NULL) { + for (p in c("fig", "fin", "mar", "mai", "omd", "omi", "oma")) + cat_par_vector(p, if (!is.null(lbl)) paste0(lbl, " ") else NULL) +} + +cat_variable <- + function(x, suffix = "", digits = 3, force_str = FALSE) { + xprssn <- deparse1(substitute(x)) + if (force_str || is.matrix(x) || is.list(x) || is.data.frame(x)) { + cat( + paste0( + "\n\\texttt{\\textbf{", + whack_underscores(xprssn), + "}} [", + typeof(x), + ",", + mode(x), + "] =\n" + ) + ) + cat("\n\\begin{verbatim}\n") + str(x) + cat("\n\\end{verbatim}\n") + } else { + cat("\n", see_variable(x, suffix, digits, xprssn)) + } + } + +### structure helper functions + +# ref: staque.R - Icon-oriented stack and queue operations +# - https://gist.github.com/eschen42/917690355e53918b9e7ba7138a02d1f8 +# +# sq_get(v):x produces the leftmost element of v and removes it from v, +# but produces NA if v is empty +sq_get <- function(v) { + if (length(v) == 0) return(NA) + assign(as.character(substitute(v)), v[-1], parent.frame()) + return(v[1]) +} +# +# sq_put(v,x1,...,xn):v puts x1, x2, ..., xn onto the right end of v, +# producing v. +# Values are pushed in order from left to right, +# so xn becomes the last (rightmost) value on v. +# sq_put(v) with no second argument does nothing. +sq_put <- function(v, x = NA, ...) { + pf <- parent.frame() + if (is.null(x)) return(pf$v) + if ( + !(length(x) > 1) && + !rlang::is_closure(x) && + is.na(x) + ) return(pf$v) + assign(as.character(substitute(v)), c(v, x, ...), pf) + pf[[as.character(substitute(v))]] +} + ### numerical/statistical helper functions any_nan <- function(x) { @@ -186,6 +797,7 @@ sd(x[ok]) } +# compute anova raw p-value anova_func <- function(x, grouping_factor, one_way_f) { subject <- data.frame( intensity = x @@ -203,12 +815,421 @@ pvalue } +# This code adapted from matrixcalc::is.positive.definite +# Notably, this simply tests without calling stop() +is_positive_definite <- function(x, tol = 1e-08) { + if (!is.matrix(x)) + return(FALSE) + if (!is.numeric(x)) + return(FALSE) + if (nrow(x) < 1) + return(FALSE) + if (ncol(x) < 1) + return(FALSE) + if (nrow(x) != ncol(x)) + return(FALSE) + sum_symm <- sum(x == t(x), na.rm = TRUE) + value_count <- Reduce("*", dim(x)) + if (sum_symm != value_count) + return(FALSE) + eigenvalues <- eigen(x, only.values = TRUE)$values + n <- nrow(x) + for (i in 1:n) { + if (abs(eigenvalues[i]) < tol) { + eigenvalues[i] <- 0 + } + } + if (any(eigenvalues <= 0)) { + return(FALSE) + } + return(TRUE) +} ### LaTeX functions -latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) { - v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v - cat( +# Use this like print.data.frame, from which it is adapted: +data_frame_table_latex <- + function( + x, + # digits to pass to format.data.frame + digits = NULL, + # TRUE -> right-justify columns; FALSE -> left-justify + right = TRUE, + # maximumn number of rows to print + max = NULL, + # string with justification of each column + justification = NULL, + # TRUE to center on page + centered = TRUE, + # optional caption + caption = NULL, + # h(inline); b(bottom); t (top) or p (separate page) + anchor = "h", + # set underscore_whack to TRUE to escape underscores + underscore_whack = TRUE, + # how to emit results + emit = cat + ) { + if (is.null(justification)) + justification <- + Reduce( + f = paste, + x = rep_len(if (right) "r" else "l", length(colnames(x))) + ) + n <- length(rownames(x)) + if (length(x) == 0L) { + emit( + sprintf( + # if n is one, use singular 'row', else use plural 'rows' + ngettext( + n, + "data frame with 0 columns and %d row", + "data frame with 0 columns and %d rows" + ), + n + ), + "\n", + sep = "" + ) + } else if (n == 0L) { + emit("0 rows for:\n") + latex_itemized_list( + v = names(x), + underscore_whack = underscore_whack + ) + } else { + if (is.null(max)) + max <- getOption("max.print", 99999L) + if (!is.finite(max)) { + cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max) + knitr::knit_exit() + } + omit <- (n0 <- max %/% length(x)) < n + m <- as.matrix( + format.data.frame( + if (omit) x[seq_len(n0), , drop = FALSE] else x, + digits = digits, + na.encode = FALSE + ) + ) + emit( + # h(inline); b(bottom); t (top) or p (separate page) + paste0("\\begin{table}[", anchor, "]"), + "\\leavevmode", + sep = "\n" + ) + if (!is.null(caption)) + emit(paste0(" \\caption{", caption, "}")) + if (centered) emit("\\centering\n") + emit( + paste( + " \\begin{tabular}{", + justification, + "}\n", + sep = "" + ) + ) + + # ref for top and bottom struts (\T and \B): + # https://tex.stackexchange.com/a/50355 + if (!is.null(caption)) + emit("\\B \\\\\n") + latex_table_row( + v = colnames(m), + extra = " \\T \\B", + underscore_whack = underscore_whack + ) + emit("\\hline \\\\\n") + for (i in seq_len(length(m[, 1]))) { + latex_table_row( + v = m[i, ], + underscore_whack = underscore_whack + ) + } + emit( + paste( + " \\end{tabular}", + "\\end{table}", + sep = "\n" + ) + ) + if (omit) + emit(" [ reached 'max' / getOption(\"max.print\") -- omitted", + n - n0, "rows ]\n") + } + invisible(x) + } + +# Use this like print.data.frame, from which it is adapted: +data_frame_tabbing_latex <- + function( + x, + # vector of tab stops, in inches + tabstops, + # vector of headings, registered with tab-stops + headings = colnames(x), + # digits to pass to format.data.frame + digits = NULL, + # maximumn number of rows to print + max = NULL, + # optional caption + caption = NULL, + # set underscore_whack to TRUE to escape underscores + underscore_whack = TRUE, + # flag for landscape mode + landscape = FALSE, + # flag indicating that subsubsection should be used for caption + # rather than subsection + use_subsubsection_header = TRUE, + # character-size indicator; for possible values, see: + # https://tug.org/texinfohtml/latex2e.html#Font-sizes + charactersize = "small", + # set verbatim to TRUE to debug output + verbatim = FALSE + ) { + + hlinport <- if (landscape) { + function() cat("\\hlinlscp \\\\\n") + } else { + function() cat("\\hlinport \\\\\n") + } + + tabstops_tex <- + Reduce( + f = function(l, r) paste0(l, r), + x = sprintf("\\hspace{%0.2fin}\\=", tabstops), + init = "" + ) + + n <- length(rownames(x)) + if (length(x) == 0L) { + cat( + sprintf( + # if n is one, use singular 'row', else use plural 'rows' + ngettext( + n, + "data frame with 0 columns and %d row", + "data frame with 0 columns and %d rows" + ), + n + ), + "\n", + sep = "" + ) + } else if (n == 0L) { + cat("0 rows for:\n") + latex_itemized_list( + v = names(x), + underscore_whack = underscore_whack + ) + } else { + if (is.null(max)) + max <- getOption("max.print", 99999L) + if (!is.finite(max)) { + cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max) + knitr::knit_exit() + } + omit <- (n0 <- max %/% length(x)) < n + m <- as.matrix( + format.data.frame( + if (omit) x[seq_len(n0), , drop = FALSE] else x, + digits = digits, + na.encode = FALSE + ) + ) + if (landscape) + cat("\n\\begin{landscape}") + tex_caption <- + if (!is.null(caption)) sprintf("\\captionof{table}{%s}\n", caption) + else "\n" + # build the column names, which have multiple lines when + # length(headings) is a multiple of the number of columns + column_names <- "" + while (length(headings) > 0) { + my_row <- c() + for (i in 1:(1 + length(tabstops))) { + my_field <- sq_get(headings) + sq_put(my_row, if (is.na(my_field)) "" else my_field) + } + column_names %||:=% latex_tabbing_row( + v = my_row, + underscore_whack = underscore_whack, + action = paste0 + ) + } + + # Begin tabbing environment after beginning charactersize environment + if (verbatim) cat("\n\\begin{verbatim}") + cat( + paste0( + "\n\\begin{", charactersize, "}", tex_caption, + "\\begin{tabwrap}{", tabstops_tex, "}\n" + ) + ) + # emit column names + cat(column_names) + # emit hline + hlinport() + for (i in seq_len(length(m[, 1]))) { + my_row <- latex_tabbing_row( + v = m[i, ], + underscore_whack = underscore_whack, + action = paste0 + ) + if (FALSE) + cat(my_row) + else + cat(my_row) + } + hlinport() + if (omit) + cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", + n - n0, "rows ]\n") + # End charactersize environment after ending tabbing environment + cat(paste0("\\end{tabwrap}\n\\end{", charactersize, "}\n")) + if (verbatim) cat("\\end{verbatim}\n") + if (landscape) + cat("\\end{landscape}\n") + } + invisible(x) + } + +param_df_noexit <- + function(e = NULL) { + data_frame_tabbing_latex( + x = param_df, + tabstops = c(1.75), + underscore_whack = TRUE, + caption = "Input parameters", + verbatim = FALSE + ) + if (!is.null(e)) { + sink(stderr()) + cat("Caught fatal error:\n\n") + str(e) + sink() + } + } + +param_df_exit <- + function(e = NULL) { + param_df_noexit(e) + knitr::knit_exit() + exit(-1) + } + +# exit with exit code (default 0) and optional msg +exit <- + function(code = 0, msg = NULL, use_stderr = FALSE) { + if (!is.null(msg)) { + if (use_stderr) sink(stderr()) + cat("\n\n", msg, "\n\n") + if (use_stderr) sink() + } + q(save = "no", status = code) + } + +# make control sequences into printable latex sequences +latex_printable_control_seqs <- + function(s) { + s <- gsub("[\\]", "xyzzy_plugh", s) + s <- gsub("[$]", "\\\\$", s) + s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) + return(s) + } +nolatex_verbatim <- + function(expr) eval(expr) + +latex_verbatim <- + function(expr) { + arg_string <- deparse1(substitute(expr)) + cat("\n\\begin{verbatim}\n___\n") + tryCatch( + expr = expr, + error = param_df_exit, + #ACE error = + #ACE function(e) { + #ACE cat("Caught error:\n\n") + #ACE str(e) + #ACE knitr::knit_exit() + #ACE stop(e) + #ACE }, + finally = cat("...\n\\end{verbatim}\n") + ) + } + +latex_samepage <- + function(expr) { + arg_string <- deparse1(substitute(expr)) + cat("\n\\begin{samepage}\n") + tryCatch( + expr = expr, + error = param_df_exit, + #ACE error = + #ACE function(e) { + #ACE cat("Caught error:\n\n") + #ACE str(e) + #ACE knitr::knit_exit() + #ACE stop(e) + #ACE }, + finally = cat("\n\\end{samepage}\n") + ) + } + +# return the result of invocation after showing parameters +# ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ +latex_show_invocation <- + function(f, f_name = deparse1(substitute(f)), head_patch = FALSE) { + function(...) { + my_env <- (as.list(environment())) + va <- list(...) + my_rslt <- new_env() + my_rslt$rslt <- NULL + latex_verbatim( + expr = { + cat(sprintf("\n .. Local variables for '%s':\n\n", f_name)) + str(va) + if (!head_patch) { + # return this result + # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ + cat(sprintf("\n .. Invoking '%s'\n", f_name)) + tryCatch( + { + cat("\n\\end{verbatim}\n") + rslt <- do.call(f, va) + }, + error = param_df_exit, + #ACE error = function(e) { + #ACE cat("\n\\begin{verbatim}\n") + #ACE str(e) + #ACE cat("\n\\end{verbatim}\n") + #ACE knitr::knit_exit() + #ACE stop(e) + #ACE }, + finally = cat("\n\\begin{verbatim}\n") + ) + cat(sprintf("\n .. '%s' returned:\n", f_name)) + str(rslt) + my_rslt$rslt <- rslt + } + } + ) + # return the result of invocation with the shown parameters + # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ + if (head_patch) my_rslt$rslt <- do.call(f, va) + (my_rslt$rslt) + } + } + +latex_collapsed_vector <- function( + collapse_string, + v, + underscore_whack = TRUE, + action = cat0 + ) { + v_sub <- + if (underscore_whack) whack_underscores(v) else v + action( paste0( v_sub, collapse = collapse_string @@ -242,113 +1263,36 @@ cat(" \\\\\n") } -# Use this like print.data.frame, from which it is adapted: -data_frame_latex <- - function( - x, - ..., - # digits to pass to format.data.frame - digits = NULL, - # TRUE -> right-justify columns; FALSE -> left-justify - right = TRUE, - # maximumn number of rows to print - max = NULL, - # string with justification of each column - justification = NULL, - # TRUE to center on page - centered = TRUE, - # optional caption - caption = NULL, - # h(inline); b(bottom); t (top) or p (separate page) - anchor = "h", - # set underscore_whack to TRUE to escape underscores - underscore_whack = TRUE +latex_tabbing_row <- function( + v, + extra = "", + underscore_whack = TRUE, + action = cat0 ) { - if (is.null(justification)) - justification <- - Reduce( - f = paste, - x = rep_len(if (right) "r" else "l", length(colnames(x))) - ) - n <- length(rownames(x)) - if (length(x) == 0L) { - cat( - sprintf( - # if n is one, use singular 'row', else use plural 'rows' - ngettext( - n, - "data frame with 0 columns and %d row", - "data frame with 0 columns and %d rows" - ), - n - ), - "\n", - sep = "" - ) - } else if (n == 0L) { - cat("0 rows for:\n") - latex_itemized_list( - v = names(x), - underscore_whack = underscore_whack - ) - } else { - if (is.null(max)) - max <- getOption("max.print", 99999L) - if (!is.finite(max)) - stop("invalid 'max' / getOption(\"max.print\"): ", - max) - omit <- (n0 <- max %/% length(x)) < n - m <- as.matrix( - format.data.frame( - if (omit) x[seq_len(n0), , drop = FALSE] else x, - digits = digits, - na.encode = FALSE - ) - ) - cat( - # h(inline); b(bottom); t (top) or p (separate page) - paste0("\\begin{table}[", anchor, "]\n") - ) - if (!is.null(caption)) - cat(paste0(" \\caption{", caption, "}")) - if (centered) cat("\\centering\n") - cat( - paste( - " \\begin{tabular}{", - justification, - "}\n", - sep = "" - ) - ) - # ref: https://tex.stackexchange.com/a/50353 - # Describes use of \rule{0pt}{3ex} - if (!is.null(caption)) - cat("\\B \\\\ \\hline\\hline\n") - # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 - latex_table_row( - v = colnames(m), - extra = "\\T\\B", - underscore_whack = underscore_whack - ) - cat("\\hline\n") - for (i in seq_len(length(m[, 1]))) { - latex_table_row( - v = m[i, ], - underscore_whack = underscore_whack - ) - } - cat( - paste( - " \\end{tabular}", - "\\end{table}", - sep = "\n" - ) - ) - if (omit) - cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", - n - n0, "rows ]\n") - } - invisible(x) + # latex_collapsed_vector applies action to result of paste0; + # by default, action = cat; + # hence, a scalar string is assigned to v_collapsed + v_collapsed <- + latex_collapsed_vector( + "} \\> \\tabfill{", + v, + underscore_whack, + action = paste0 + ) + action( + "\\tabfill{", + v_collapsed, + "}", + extra, + " \\\\\n" + ) + } + +# N.B. use con = "" to emulate regular cat +fcat0 <- + function(..., sprtr = " ", cnnctn = file()) { + cat0(..., sep = sprtr, file = cnnctn) + invisible(cnnctn) } hypersub <- @@ -356,32 +1300,39 @@ hyper <- tolower(s) hyper <- gsub("[^a-z0-9]+", "-", hyper) hyper <- gsub("[-]+", "-", hyper) + hyper <- gsub("[_]+", "-", hyper) hyper <- sub("^[-]", "", hyper) hyper <- sub("[-]$", "", hyper) return(hyper) } -subsection_header <- - function(s) { - hyper <- hypersub(s) - cat( - sprintf( - "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n", - hyper, s, hyper - ) - ) +table_href <- function(s = "offset", caption = "") { + paste0("\\hyperlink{table.\\arabic{", s, "}}{Table \\arabic{", s, "}}") + } + +table_offset <- function(i = 0, s = "offset", new = FALSE) { + paste0( + if (new) paste0("\\newcounter{", s, "}\n") else "", + "\\setcounter{", s, "}{\\value{table}}\n", + paste0(if (i > 0) rep(paste0("\\stepcounter{", s, "}"), i), "\n") + ) } -subsubsection_header <- - function(s) { +a_section_header <- + function(s, prefix = "") { hyper <- hypersub(s) - cat( - sprintf( - "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n", - hyper, s, hyper - ) + my_subsection_header <- sprintf( + "\\hypertarget{%s}{\\%ssection{%s}\\label{%s}}\n", + hyper, + prefix, + gsub("_", "\\_", s, fixed = TRUE), + hyper ) + my_subsection_header } +section_header <- function(s) a_section_header(s, "") +subsection_header <- function(s) a_section_header(s, "sub") +subsubsection_header <- function(s) a_section_header(s, "subsub") ### SQLite functions @@ -419,10 +1370,54 @@ ### KSEA functions and helpers -# Adapted from KSEAapp::KSEA.Scores to allow retrieval of: -# - maximum log2(FC) +#' The KSEA App Analysis (KSEA Kinase Scores Only) +#' +#' Compute KSEA kinase scores and statistics from phoshoproteomics data input +#' Adapted from KSEAapp::KSEA.Scores to allow retrieval of maximum log2(FC) +#' +#' Result is an R data.frame with column names +#' "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" +#' "Please refer to the original Casado et al. publication for detailed +#' description of these columns and what they represent: +#' +#' - Kinase.Gene indicates the gene name for each kinase. +#' - mS represents the mean log2(fold change) of all the +#' kinase's substrates. +#' - Enrichment is the background-adjusted value of the kinase's mS. +#' - m is the total number of detected substrates +#' from the experimental dataset for each kinase. +#' - z.score is the normalized score for each kinase, weighted by +#' the number of identified substrates. +#' - p.value represents the statistical assessment for the z.score. +#' - FDR is the p-value adjusted for multiple hypothesis testing +#' using the Benjamini & Hochberg method." +#' +#' @param ksdata the Kinase-Substrate dataset uploaded from the file +#' prefaced with "PSP&NetworKIN_" +#' available from github.com/casecpb/KSEA/ +#' @param px the experimental data file formatted as described in +#' the KSEA.Complete() documentation +#' @param networkin a binary input of TRUE or FALSE, indicating whether +#' or not to include NetworKIN predictions, where +#' \code{NetworKIN = TRUE} +#' means include NetworKIN predictions +#' @param networkin_cutoff a numeric value between 1 and infinity setting +#' the minimum NetworKIN score +#' (this can be omitted if NetworKIN = FALSE) +#' +#' @return creates a new R data.frame with all the KSEA kinase +#' scores, along with each one's statistical +#' assessment, as described herein. +#' +#' @references +#' +#' Casado et al. (2013) Sci Signal. 6(268):rs6 +#' +#' Hornbeck et al. (2015) Nucleic Acids Res. 43:D512-20 +#' +#' Horn et al. (2014) Nature Methods 11(6):603-4 +#' ksea_scores <- function( - # For human data, typically, ksdata = KSEAapp::ksdata ksdata, @@ -444,9 +1439,13 @@ # A numeric value between 1 and infinity setting the minimum NetworKIN # score (can be left out if networkin = FALSE) - networkin_cutoff + networkin_cutoff, + + # Minimum substrate count, necessary to adjust the p-value appropriately. + minimum_substrate_count ) { + # no px$FC should be <= 0, but abs(px$FC) is used below as a precaution. if (length(grep(";", px$Residue.Both)) == 0) { # There are no Residue.Both entries having semicolons, so new is # simply px except two columns are renamed and a column is added @@ -492,18 +1491,53 @@ # Eliminate rows having missing values (e.g., non-imputed data) new <- new[complete.cases(new$log2_fc), ] } - if (networkin == TRUE) { - # When NetworKIN is true, filter on NetworKIN.cutoff which includes - # PhosphoSitePlus data *because its networkin_score is set to Inf* - ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ] - ksdata_filtered <- ksdata_filtered[ - (ksdata_filtered$networkin_score >= networkin_cutoff), ] - } else { - # Otherwise, simply use PhosphSitePlus rows - ksdata_filtered <- ksdata[ - grep("PhosphoSitePlus", ksdata$Source), ] + # At this point, new$log2_fc is signed according to which contrast has + # the greater intensity + # To take the magnitude into account without taking the direction into + # account, set params$kseaUseAbsoluteLog2FC to TRUE + # + # Should KSEA be performed aggregating signed log2FC or absolute? + # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores + if (params$kseaUseAbsoluteLog2FC) { + # TRUE use abs(log2FC) for KSEA as Justin requested; this is a + # justifiable deviation from the KSEAapp::KSEA.Scores algorithm. + new$log2_fc <- abs(new$log2_fc) + } + + monitor_filtration_on_stderr <- TRUE + if (monitor_filtration_on_stderr) { + # set to TRUE to monitor filtration on stderr + sink(stderr()) + cat(see_variable(networkin, "\n")) } - # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD + ksdata_filtered <- + sqldf( + sprintf("%s %s", + "select * from ksdata where not Source = 'NetworKIN'", + if (networkin) + sprintf("or networkin_score >= %d", networkin_cutoff) + else + "" + ) + ) + if (monitor_filtration_on_stderr) { + cat(see_variable(sqldf( + "select count(*), Source from ksdata group by Source"), "\n")) + cat(see_variable(sqldf( + "select count(*), Source from ksdata_filtered group by Source"), "\n")) + sink() + } + + ############################################################################ + # Line numbers below refer to lines of: + # https://github.com/casecpb/KSEAapp/blob/master/R/KSEA.Scores.R + # I would put the original line in a comment but then lint would complain... + # - Indeed, I had to rename all the variables because lint didn't like names + # containing periods or capital letters. + # ACE + ############################################################################ + # + # (1) Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD # colnames of ksdata_filtered: # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID" # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID" @@ -516,6 +1550,8 @@ # INNER JOIN new b # ON a.SUB_GENE = b.SUB_GENE # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD + # (KSEA.Scores.R line # 105) + # "Extract KSData.filtered annotations that are only found in new" ksdata_dataset <- base::merge(ksdata_filtered, new) # colnames of ksdata_dataset: # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" @@ -523,24 +1559,31 @@ # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein" # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform) # Re-order dataset; prior to accounting for isoforms + # (KSEA.Scores.R line # 106) ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] # Extract non-isoform accession in UniProtKB + # (KSEA.Scores.R line # 107) ksdata_dataset$uniprot_no_isoform <- sapply( ksdata_dataset$KIN_ACC_ID, function(x) unlist(strsplit(as.character(x), split = "-"))[1] ) + # "last expression collapses isoforms ... for easy processing" # Discard previous results while selecting interesting columns ... + # (KSEA.Scores.R line # 110) ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] # Column names are now: # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p" # "FC" "log2_fc" "Source" # Make column names human-readable + # (KSEA.Scores.R line # 111) colnames(ksdata_dataset_abbrev) <- c( "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p", "FC", "log2FC", "Source" ) # SELECT * FROM ksdata_dataset_abbrev # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p + # (KSEA.Scores.R line # 112) + # "Extract KSData.filtered annotations that are only found in new" ksdata_dataset_abbrev <- ksdata_dataset_abbrev[ order( @@ -549,6 +1592,7 @@ ksdata_dataset_abbrev$Substrate.Mod, ksdata_dataset_abbrev$p), ] + if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev)) # First aggregation step to account for multiply phosphorylated peptides # and differing peptide sequences; the goal here is to combine results # for all measurements of the same substrate. @@ -560,12 +1604,16 @@ # ORDER BY `Kinase.Gene`; # in two steps: # (1) compute average log_2(fold-change) + # "take the mean of the log2FC amongst phosphosite duplicates" + # (KSEA.Scores.R line # 115) ksdata_dataset_abbrev <- aggregate( log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source, data = ksdata_dataset_abbrev, FUN = mean ) + if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev)) # (2) order by Kinase.Gene + # (KSEA.Scores.R line # 117) ksdata_dataset_abbrev <- ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] # SELECT `Kinase.Gene`, count(*) @@ -573,9 +1621,14 @@ # GROUP BY `Kinase.Gene`; # in two steps: # (1) Extract the list of Kinase.Gene names + # "@@@@@@@@@@@@@@@@@@@@" + # "Do analysis for KSEA" + # "@@@@@@@@@@@@@@@@@@@@" + # (KSEA.Scores.R line # 124) kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev, # named by Kinase.Gene + # (KSEA.Scores.R line # 125) kinase_list <- as.matrix(table(kinase_list)) # Second aggregation step to account for all substrates per kinase # CREATE TABLE mean_fc @@ -583,50 +1636,123 @@ # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC # FROM ksdata_dataset_abbrev # GROUP BY `Kinase.Gene` - mean_fc <- aggregate( - log2FC ~ Kinase.Gene, - data = ksdata_dataset_abbrev, - FUN = mean - ) - # mean_fc columns: "Kinase.Gene", "log2FC" - if (FALSE) { - # I need to re-think this; I was trying to find the most-represented - # peptide, but that horse has already left the barn - # SELECT `Kinase.Gene`, max(abs(log2FC)) AS log2FC - # FROM ksdata_dataset_abbrev - # GROUP BY `Kinase.Gene` - max_fc <- aggregate( + # (KSEA.Scores.R line # 127) + if (print_nb_messages) nb(see_variable(ksdata_dataset_abbrev), "\n") + mean_fc <- + aggregate( log2FC ~ Kinase.Gene, data = ksdata_dataset_abbrev, - FUN = function(r) max(abs(r)) + FUN = mean ) - } + if (print_nb_messages) nbe(see_variable(mean_fc), "\n") + + # for contrast j + # for each kinase i + # extract log2 of fold-change (from `new` above) + # (used in KSEA.Scores.R lines # 130 & 132) + log2_fc_j_each_i <- + new$log2_fc + + # for contrast j + # for all kinases i + # compute mean of abs(log2 of fold-change) + # (used in KSEA.Scores.R lines # 130) + mean_abs_log2_fc_j_all_i <- + mean(abs(log2_fc_j_each_i), na.rm = TRUE) + + # for contrast j + # for all kinases i + # compute mean of log2 of fold-change + # (used in KSEA.Scores.R lines # 132) + mean_log2_fc_j_all_i <- + mean(log2_fc_j_each_i, na.rm = TRUE) + + # Reorder mean_fc, although I don't see why + # (KSEA.Scores.R line 128 + mean_fc <- mean_fc[order(mean_fc[, 1]), ] + # mean_fc columns so far: "Kinase.Gene", "log2FC" + # - Kinase.Gene + # indicates the gene name for each kinase. # Create column 3: mS - mean_fc$m_s <- mean_fc[, 2] + # - mS + # represents the mean log2(fold change) of all the + # kinase's substrates. + # (KSEA.Scores.R line # 129) + mean_fc$m_s <- + mean_fc_m_s <- mean_fc[, 2] + # Create column 4: Enrichment - mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE)) - # Create column 5: m, count of substrates - mean_fc$m <- kinase_list + # - Enrichment + # is the background-adjusted value of the kinase's mS. + # (KSEA.Scores.R line # 130) + mean_fc$enrichment <- + mean_fc_m_s / mean_abs_log2_fc_j_all_i + + # Create column 5: m, count of substrates of kinase (count of j for i) + # - m + # is the total number of detected substrates + # from the experimental dataset for each kinase. + # (KSEA.Scores.R line # 131) + mean_fc$m <- + mean_fc_m <- kinase_list + + # Create column 6: z-score - mean_fc$z_score <- ( - (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) * - sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE) + # - z.score + # is the normalized score for each kinase, weighted by + # the number of identified substrates. + # (KSEA.Scores.R line # 132) + mean_fc$z_score <- + (mean_fc_m_s - mean_log2_fc_j_all_i) * sqrt(mean_fc_m) / + sd(log2_fc_j_each_i, na.rm = TRUE) + # Create column 7: p-value, deduced from z-score - mean_fc$p_value <- pnorm(-abs(mean_fc$z_score)) + # - p.value + # represents the statistical assessment for the z.score. + # (KSEA.Scores.R line # 133) + # "one-tailed p-value" + mean_fc$p_value <- + pnorm(-abs(mean_fc$z_score)) + + # zap excluded kinases; this must be done before adjusting p-value + if (TRUE) { + mean_fc <- + mean_fc[ + mean_fc$m >= minimum_substrate_count, + , + drop = FALSE + ] + } + + #ACE nb(see_variable(nrow(mean_fc)), "\n") # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value - mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr") - - # Remove log2FC column, which is duplicated as mS - mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2] + # - FDR + # is the p-value adjusted for multiple hypothesis testing + # using the Benjamini & Hochberg method." + # (KSEA.Scores.R line # 134) + mean_fc$fdr <- + p.adjust(mean_fc$p_value, method = "fdr") + + # It makes no sense to leave Z-scores negative when using + # absolute value of fold-change + if (params$kseaUseAbsoluteLog2FC) { + mean_fc$z_score <- abs(mean_fc$z_score) + } + + # Remove second column (log2FC), which is duplicated as mS + # (KSEA.Scores.R line # 136) + mean_fc <- + mean_fc[order(mean_fc$Kinase.Gene), -2] # Correct the column names which we had to hack because of the linter... colnames(mean_fc) <- c( "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" ) + # (KSEA.Scores.R line # 138) return(mean_fc) } -low_fdr_barplot <- function( +ksea_low_fdr_barplot_factory <- function( rslt, i_cntrst, i, @@ -658,57 +1784,94 @@ "p.value" = { k$p_value }, - stop( - sprintf( - "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", - ksea_cutoff_statistic + { + cat( + sprintf( + "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", + ksea_cutoff_statistic + ) ) - ) - ) + param_df_exit() + knitr::knit_exit() + } + ) k <- k[selector < ksea_cutoff_threshold, ] - - if (nrow(k) > 1) { - op <- par(mai = c(1, 1.5, 0.4, 0.4)) + nrow_k <- nrow(k) + + #ACE nbe(see_variable(fdr_barplot_dataframe <- k)) + + if (nrow_k > 0) { + max_nchar_rowname <- max(nchar(rownames(k))) + my_cex_names <- 1.0 / (1 + nrow_k / 50) + + if (print_trace_messages) cat_margins("Initially") + if (print_trace_messages) cat_variable(nrow_k, "\n\n", 0) + if (print_trace_messages) cat_variable(my_cex_names, "\n\n", 0) + if (print_trace_messages) cat_variable(max_nchar_rowname, "\n\n", 0) + + # fin: The figure region dimensions, (width, height), in inches. + # mar: A numerical vector of the form c(bottom, left, top, right) + # that gives the number of lines of margin to be specified + # on the four sides of the plot; default: c(5, 4, 4, 2) + 0.1 + +# mar: The figure region dimensions, (width, height), in inches. numeric_z_score <- as.numeric(k$z_score) - z_score_order <- order(numeric_z_score) + bar_order <- order(-as.numeric(k$p_value)) kinase_name <- k$kinase_gene long_caption <- sprintf( - "Kinase z-score, %s < %s, %s", + "Kinase z-score, %s, KSEA %s < %s", + caption, ksea_cutoff_statistic, - ksea_cutoff_threshold, - caption + ksea_cutoff_threshold ) my_cex_caption <- 65.0 / max(65.0, nchar(long_caption)) - cat("\n\\clearpage\n") - barplot( - height = numeric_z_score[z_score_order], - border = NA, - xpd = FALSE, - cex.names = 1.0, - cex.axis = 1.0, - main = long_caption, - cex.main = my_cex_caption, - names.arg = kinase_name[z_score_order], - horiz = TRUE, - srt = 45, - las = 1) - par(op) + # return a function that draws the plot + function() { + par_fin <- par("fin") # vector of width_in_inches and height_in_inches) + op <- par( + bg = if (print_trace_messages) "yellow" else "white", + fin = c(par_fin[1], min(par_fin[2], 2.5 + nrow_k / 6)), + mar = par("mar") + + c(3 / nrow_k, (1 + max_nchar_rowname * my_cex_names) / 2, 0, 0) + # bottom, left, top, right + ) + on.exit(par(op)) + if (print_trace_messages) cat_margins("Eventually") + + barplot( + height = numeric_z_score[bar_order], + border = NA, + xpd = FALSE, + cex.names = my_cex_names, + main = long_caption, + cex.main = my_cex_caption, + names.arg = kinase_name[bar_order], + horiz = TRUE, + srt = 45, + las = 1, + cex.axis = 0.9 + ) + } } + } else { + no_op } } # note that this adds elements to the global variable `ksea_asterisk_hash` -low_fdr_print <- function( +ksea_low_fdr_print <- function( rslt, i_cntrst, i, a_level, b_level, fold_change, - caption + caption, + write_db = TRUE, # if TRUE, write to DB, else print table + anchor = c(const_table_anchor_p, const_table_anchor_t) ) { rslt_score_list_i <- rslt$score_list[[i]] if (!is.null(rslt_score_list_i)) { @@ -734,13 +1897,17 @@ "p.value" = { k$p_value }, - stop( - sprintf( - "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", - ksea_cutoff_statistic + { + cat( + sprintf( + "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", + ksea_cutoff_statistic + ) ) - ) - ) + param_df_exit() + knitr::knit_exit() + } + ) k <- k[selector < ksea_cutoff_threshold, ] # save kinase names to ksea_asterisk_hash @@ -748,110 +1915,175 @@ ksea_asterisk_hash[[kinase_name]] <- 1 } - db_write_table_overwrite <- (i_cntrst < 2) - db_write_table_append <- !db_write_table_overwrite - RSQLite::dbWriteTable( - conn = db, - name = "contrast_ksea_scores", - value = contrast_ksea_scores, - append = db_write_table_append + if (write_db) { + db_write_table_overwrite <- (i_cntrst < 2) + db_write_table_append <- !db_write_table_overwrite + RSQLite::dbWriteTable( + conn = db, + name = "contrast_ksea_scores", + value = contrast_ksea_scores, + append = db_write_table_append + ) + "" + } else { + selector <- switch( + ksea_cutoff_statistic, + "FDR" = { + contrast_ksea_scores$fdr + }, + "p.value" = { + contrast_ksea_scores$p_value + }, + { + cat( + sprintf( + "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", + ksea_cutoff_statistic + ) + ) + param_df_exit() + knitr::knit_exit() + } ) - selector <- switch( - ksea_cutoff_statistic, - "FDR" = { - contrast_ksea_scores$fdr - }, - "p.value" = { - contrast_ksea_scores$p_value - }, - stop( - sprintf( - "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", - ksea_cutoff_statistic + if (print_nb_messages) nbe(see_variable(contrast_ksea_scores)) + output_df <- contrast_ksea_scores[ + selector < ksea_cutoff_threshold, + c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count", + "z_score", "p_value", "fdr") + ] + output_df$kinase_gene <- + gsub( + "_", + "\\\\_", + output_df$kinase_gene + ) + colnames(output_df) <- + c( + colnames(output_df)[1], + colnames(output_df)[2], + "enrichment", + "m_s", + "z_score", + "p_value", + "fdr" + ) + #ACE output_order <- with(output_df, order(fdr)) + output_order <- with(output_df, order(p_value)) + output_df <- output_df[output_order, ] + + output_df[, 2] <- sprintf("%0.3g", output_df[, 2]) + output_df$fdr <- sprintf("%0.4f", output_df$fdr) + output_df$p_value <- sprintf("%0.2e", output_df$p_value) + output_df$z_score <- sprintf("%0.2f", output_df$z_score) + output_df$m_s <- sprintf("%d", output_df$m_s) + output_df$enrichment <- sprintf("%0.3g", output_df$enrichment) + output_ncol <- ncol(output_df) + colnames(output_df) <- + c( + "Kinase", + "\\(\\overline{{\\lvert}\\log_2 (\\text{fold-change}){\\rvert}}\\)", + "Enrichment", + "Substrates", + "z-score", + "p-value", + "FDR" + ) + selector <- switch( + ksea_cutoff_statistic, + "FDR" = { + rslt$score_list[[i]]$FDR + }, + "p.value" = { + rslt$score_list[[i]]$p.value + }, + { + cat( + sprintf( + "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", + ksea_cutoff_statistic + ) + ) + param_df_exit() + knitr::knit_exit() + } + ) + if (sum(selector < ksea_cutoff_threshold) > 0) { + if (print_nb_messages) nbe(see_variable(output_df)) + math_caption <- gsub("{", "\\{", caption, fixed = TRUE) + math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE) + # with ( + # output_df, + # ) + if (TRUE) { + output_df$Kinase <- whack_underscores(output_df$Kinase) + data_frame_tabbing_latex( + x = output_df, + # vector of tab stops, in inches + tabstops = c(1.0, 1.2, 1.0, 1.0, 1.0, 1.0), + # vector of headings, registered with tab-stops + headings = colnames(output_df), + # digits to pass to format.data.frame + digits = NULL, + # maximumn number of rows to print + max = NULL, + # optional caption + caption = sprintf( + "\\text{%s}, KSEA %s < %s", + math_caption, + ksea_cutoff_statistic, + ksea_cutoff_threshold + ), + # set underscore_whack to TRUE to escape underscores + underscore_whack = FALSE, + # flag for landscape mode + landscape = FALSE, + # flag indicating that subsubsection should be used for caption + # rather than subsection + use_subsubsection_header = TRUE, + # character-size indicator; for possible values, see: + # https://tug.org/texinfohtml/latex2e.html#Font-sizes + charactersize = "small", + # set verbatim to TRUE to debug output + verbatim = FALSE + ) + } else { + data_frame_table_latex( + x = output_df, + justification = "l c c c c c c", + centered = TRUE, + caption = sprintf( + "\\text{%s}, KSEA %s < %s", + math_caption, + ksea_cutoff_statistic, + ksea_cutoff_threshold + ), + anchor = anchor, + underscore_whack = FALSE + ) + } + } else { + cat( + sprintf( + "\\break + No kinases had + \\(\\text{KSEA %s}_\\text{enrichment} < %s\\) + for contrast %s\\hfill\\break\n", + ksea_cutoff_statistic, + ksea_cutoff_threshold, + caption ) ) - ) - output_df <- contrast_ksea_scores[ - selector < ksea_cutoff_threshold, - c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count", - "z_score", "p_value", "fdr") - ] - output_order <- with(output_df, order(mean_log2_fc, kinase_gene)) - output_df <- output_df[output_order, ] - colnames(output_df) <- - c( - colnames(output_df)[1], - colnames(output_df)[2], - "enrichment", - "m_s", - "z_score", - "p_value", - "fdr" - ) - output_df$fdr <- sprintf("%0.4f", output_df$fdr) - output_df$p_value <- sprintf("%0.2e", output_df$p_value) - output_df$z_score <- sprintf("%0.2f", output_df$z_score) - output_df$m_s <- sprintf("%d", output_df$m_s) - output_df$enrichment <- sprintf("%0.2f", output_df$enrichment) - output_ncol <- ncol(output_df) - colnames(output_df) <- - c( - "Kinase", - "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)", - "Enrichment", - "Substrates", - "z-score", - "p-value", - "FDR" - ) - selector <- switch( - ksea_cutoff_statistic, - "FDR" = { - rslt$score_list[[i]]$FDR - }, - "p.value" = { - rslt$score_list[[i]]$p.value - }, - stop( - sprintf( - "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", - ksea_cutoff_statistic - ) - ) - ) - if (sum(selector < ksea_cutoff_threshold) > 0) { - math_caption <- gsub("{", "\\{", caption, fixed = TRUE) - math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE) - data_frame_latex( - x = output_df, - justification = "l c c c c c c", - centered = TRUE, - caption = sprintf( - "\\text{%s}, %s < %s", - math_caption, - ksea_cutoff_statistic, - ksea_cutoff_threshold - ), - anchor = const_table_anchor_p - ) - } else { - cat( - sprintf( - "\\break - No kinases had - \\(\\text{%s}_\\text{enrichment} < %s\\) - for contrast %s\\hfill\\break\n", - ksea_cutoff_statistic, - ksea_cutoff_threshold, - caption - ) - ) + } } + } else { + "" } } # create_breaks is a helper for ksea_heatmap create_breaks <- function(merged_scores) { + if (sum(!is.na(merged_scores)) < 2) + return(NULL) if (min(merged_scores, na.rm = TRUE) < -1.6) { breaks_neg <- seq(-1.6, 0, length.out = 30) breaks_neg <- @@ -889,14 +2121,135 @@ return(color_breaks) } +hm2plus <- function( + x, + mat = matrix( + c( + c(0, 4, 0), + c(0, 3, 3), + c(2, 1, 1) + ), + nrow = 3, + ncol = 3, + byrow = TRUE + ), + denwid = 0.5, + denhgt = 0.15, + widths = c(0.5, 2.5, 1.5), + heights = c(0.4, 0.15, 3.95), + divergent = FALSE, + notecol = "grey50", + trace = "none", + margins = c(6, 20), + srtcol = 90, + srtrow = 0, + density_info = "none", + key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), + key_par = list(), + hclustfun = hclust, + ... +) { + + varargs <- list(...) + if (FALSE) # this is to avoid commenting out code to pass linting... + my_hm2 <- latex_show_invocation(heatmap.2, head_patch = FALSE) + else + my_hm2 <- heatmap.2 + + x <- as.matrix(x) + if (sum(!is.na(x)) < 1) + return(NULL) + color_count <- 1 + max(64, length(as.vector(x))) # 8 was not enough + break_count <- 1 + color_count + min_nonax <- min(x, na.rm = TRUE) + max_nonax <- max(x, na.rm = TRUE) + if (print_nb_messages) nb("within hm2plus", see_variable(divergent), "\n") + if (divergent) { + zlim <- max(abs(min_nonax), abs(max_nonax)) + if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n")) + breaks <- (zlim) / (break_count:1) + if (print_nb_messages) nb(see_variable(breaks, "\n")) + breaks <- breaks - median(breaks) + zlim <- c(-zlim, zlim) + if (print_nb_messages) nb(see_variable(zlim, "\n")) + } else { + zlim <- max(abs(min_nonax), abs(max_nonax)) + if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n")) + breaks <- zlim / (break_count:1) + if (print_nb_messages) nb(see_variable(breaks, "\n")) + if (max_nonax < 0) { + breaks <- breaks - zlim + zlim <- c(-zlim, 0) + } else { + zlim <- c(0, zlim) + } + if (print_nb_messages) nb(see_variable(zlim, "\n")) + } + nonax <- x + nonax[is.na(x)] <- min_nonax + if (is.null(widths)) widths <- c(denwid, 4 - denwid, 1.5) + if (is.null(heights)) heights <- c(0.4, denhgt, 4.0) + colors <- + if (divergent && min_nonax < 0) { + # divergent colors on both sides of zero + colorRampPalette(c("red", "white", "blue"))(color_count) + } else if (divergent && min_nonax > 0) { + # "divergent" colors > zero + colorRampPalette(c("white", "blue"))(color_count) + } else if (divergent && max_nonax < 0) { + # "divergent" colors < zero + colorRampPalette(c("red", "white"))(color_count) + } else { + # "non-divergent" colors including zero + hcl.colors(color_count, "YlOrRd", rev = TRUE) + } + + #ACE if (print_nb_messages) nb("within hm2plus", see_variable(key_par), "\n") + #ACE if (print_nb_messages) nb(see_variable(colors, "\n")) + #ACE key_par$col = colors + #ACE key_par$breaks = breaks + + if (print_nb_messages) nb(see_variable(par(), "\n")) #ACE TODO remove me + if (print_nb_messages) cat("\\leavevmode\n\\linebreak\n") #ACE TODO remove me + suppressWarnings( + my_hm2( + x = x, + col = colors, + #ACE symkey = FALSE, + density.info = density_info, + srtCol = srtcol, + srtRow = srtrow, + margins = margins, + lwid = widths, + lhei = heights, + key.title = NA, + key.xlab = key_xlab, + key.par = key_par, + lmat = mat, + notecol = notecol, + trace = trace, + bg = "yellow", + hclustfun = hclustfun, + #ACE breaks = breaks, + oldstyle = FALSE, + ... # varargs + ) + ) + # implicitly returning value returned by heatmap.2 +} + # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap draw_kseaapp_summary_heatmap <- function( - x, - sample_cluster, - merged_asterisk, - my_cex_row, - color_breaks, - margins, + x, # matrix with row/col names already formatted + sample_cluster, # a binary input of TRUE or FALSE, + # indicating whether or not to perform + # hierarchical clustering of the sample columns + merged_asterisk, # matrix having dimensions of x, values "*" or "" + color_breaks, # breaks for color gradation, from create_breaks + # passed to `breaks` argument of `image` + margins = c(8, 15), # two integers setting the bottom and right margins + # to accommodate row and column labels + master_cex = 0.7, # basis for text sizes ... ) { merged_scores <- x @@ -908,44 +2261,128 @@ "' rather than 'matrix'.\n\n" ) ) - } else if (nrow(x) < 2) { - cat("No plot because matrix x has ", nrow(x), " rows.\n\n") - cat("\\begin{verbatim}\n") - str(x) - cat("\\end{verbatim}\n") - } else if (ncol(x) < 2) { - cat("No plot because matrix x has ", ncol(x), " columns.\n\n") - cat("\\begin{verbatim}\n") - str(x) - cat("\\end{verbatim}\n") - } else { - gplots::heatmap.2( - x = merged_scores, - Colv = sample_cluster, - scale = "none", - cellnote = merged_asterisk, - notecol = "white", - cexCol = 0.9, - # Heuristically assign size of row labels - cexRow = min(1.0, ((3 * my_cex_row) ^ 1.7) / 2.25), - srtCol = 45, - srtRow = 45, - notecex = 3 * my_cex_row, - col = color_breaks[[2]], - density.info = "none", - trace = "none", - breaks = color_breaks[[1]], - lmat = rbind(c(0, 3), c(2, 1), c(0, 4)), - lhei = c(0.4, 8.0, 1.1), - lwid = c(0.5, 3), - key = FALSE, - margins = margins, - ... - ) + cat_variable(x) + return(FALSE) + } + if (print_trace_messages) cat(sprintf("master_cex = %03f\n\n", master_cex)) + nrow_x <- nrow(x) + ncol_x <- ncol(x) + #if (nrow_x < 2) { + if (nrow_x < 1) { + cat("No plot because matrix has no rows.\n\n") + return(FALSE) + } else if (nrow_x < 2) { + cat("No plot because matrix has one row. Matrix looks like this:\n\n") + cat("\n\\begin{verbatim}\n") + print(x) + cat("\n\\end{verbatim}\n") + return(FALSE) + } else if (ncol_x < 2) { + cat("No plot because matrix x has ", ncol_x, " columns.\n\n") + cat_variable(x) + return(FALSE) } + max_nchar_rowname <- max(nchar(rownames(x))) + max_nchar_colname <- max(nchar(colnames(x))) + my_limit <- g_intensity_hm_rows + + my_row_cex_scale <- master_cex * 150 / nrow_x + my_col_cex_scale <- 3.0 + my_asterisk_scale <- 0.4 * my_row_cex_scale + my_row_warp <- 1 + my_note_warp <- 2 + my_row_warp <- 1 + my_row_cex_asterisk <- + master_cex * my_row_warp * my_asterisk_scale + + + my_col_cex <- my_col_cex_scale * master_cex + my_row_cex <- min(3.5 * my_row_cex_asterisk, my_col_cex) + my_key_cex <- 1.286 + my_hm2_cex <- 1 * master_cex + my_offset <- (4.8 / (9 + nrow_x / 10)) - 0.4 + if (print_trace_messages) cat(sprintf("nrow_x = %03f\n\n", nrow_x)) + if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset)) + my_offset <- 0.05 + if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset)) + my_scale <- 3.0 + if (ncol_x < 10 && nrow_x < 10) + my_scale <- my_scale * 10 / (10 - nrow_x) * 10 / (10 - ncol_x) + + my_heights <- c( + 0.15, + 3.85 - my_offset, + 0.5 + my_offset + ) + my_margins <- c(1, 1) + + c( + margins[1] * 0.08 * max_nchar_colname * my_col_cex, + margins[2] * 0.04 * max_nchar_rowname * my_row_cex + ) + + my_notecex <- + my_scale * + min( + 1.1, + my_row_cex_asterisk * my_note_warp, + my_col_cex * my_note_warp + ) + + if (print_trace_messages) { + cat_variable(my_heights, suffix = "; ") + cat_variable(my_margins, suffix = "\n\n") + cat_variable(my_row_cex_scale, suffix = "; ") + cat_variable(my_col_cex_scale, suffix = "\n\n") + cat_variable(my_row_cex_asterisk, suffix = "\n\n") + cat_variable(my_row_cex, suffix = "; ") + cat_variable(my_col_cex, suffix = "\n\n") + cat_variable(my_row_cex, suffix = "; ") + cat_variable(my_col_cex, suffix = "\n\n") + } + + hm2plus( + x = merged_scores, + Colv = sample_cluster, + cellnote = merged_asterisk, + cex = my_hm2_cex, + cexCol = my_col_cex, + cexRow = my_row_cex, + denhgt = 0.15, + density_info = "none", + denwid = 0.5, + divergent = TRUE, + key_par = list(cex = my_key_cex), + key_xlab = "Z-score", + margins = my_margins, + notecex = my_scale * min( + 1.5, + my_row_cex_asterisk * my_note_warp, + my_col_cex * my_note_warp + ), + notecol = "white", + scale = "none", + srtcol = 90, + srtrow = 0, + trace = "none", + mat = matrix( + c( + c(0, 3, 3), + c(2, 1, 1), + c(0, 4, 0) + ), + nrow = 3, + ncol = 3, + byrow = TRUE + ), + widths = c(0.5, 3.1, 0.9), + heights = my_heights, + ... + ) + return(TRUE) } -# Adapted from KSEAapp::KSEA.Heatmap +# function drawing heatmap of contrast fold-change for each kinase, +# adapted from KSEAapp::KSEA.Heatmap ksea_heatmap <- function( # the data frame outputs from the KSEA.Scores() function, in list format score_list, @@ -961,16 +2398,19 @@ m_cutoff, # a numeric value between 0 and 1 indicating the p-value/FDR cutoff # for indicating significant kinases in the heatmap - p_cutoff = - stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"), + p_cutoff = { + cat("argument 'p_cutoff' is required for function 'ksea_heatmap'") + param_df_exit() + knitr::knit_exit() + }, # a binary input of TRUE or FALSE, indicating whether or not to perform # hierarchical clustering of the sample columns sample_cluster, # a binary input of TRUE or FALSE, indicating whether or not to export # the heatmap as a .png image into the working directory export = FALSE, - # bottom and right margins; adjust as needed if contrast names are too long - margins = c(6, 20), + # bottom and right margins; adjust as needehttps://tex.stackexchange.com/a/56795d if contrast names are too long + margins = c(6, 6), # print which kinases? # - Mandatory argument, must be one of const_ksea_.*_kinases which_kinases, @@ -993,7 +2433,7 @@ master <- Reduce( f = function(...) { - base::merge(..., by = "Kinase.Gene", all = FALSE) + base::merge(..., by = "Kinase.Gene", all = TRUE) }, x = score_list_m ) @@ -1019,15 +2459,13 @@ } merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff)) - # begin hack to print only significant rows asterisk_rows <- rowSums(merged_asterisk == "*") > 0 all_rows <- rownames(merged_stats) names(asterisk_rows) <- all_rows non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE]) asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE]) - merged_scores_asterisk <- merged_scores[names(asterisk_rows), ] - merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), ] - # end hack to print only significant rows + merged_scores_asterisk <- merged_scores[names(asterisk_rows), , drop = FALSE] + merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), , drop = FALSE] row_list <- list() row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows @@ -1036,14 +2474,17 @@ i <- which_kinases my_row_names <- row_list[[i]] - scrs <- merged_scores[my_row_names, ] - stts <- merged_stats[my_row_names, ] + scrs <- merged_scores[my_row_names, , drop = FALSE] + stts <- merged_stats[my_row_names, , drop = FALSE] merged_asterisk <- as.matrix(asterisk(stts, p_cutoff)) color_breaks <- create_breaks(scrs) + if (is.null(color_breaks)) { + cat("No plot because matrix has too few rows.\n\n") + return(NULL) + } plot_height <- nrow(scrs) ^ 0.55 plot_width <- ncol(scrs) ^ 0.7 - my_cex_row <- 0.25 * 16 / plot_height if (export == "TRUE") { png( "KSEA.Merged.Heatmap.png", @@ -1053,96 +2494,600 @@ pointsize = 14 ) } - draw_kseaapp_summary_heatmap( + did_draw <- draw_kseaapp_summary_heatmap( x = scrs, sample_cluster = sample_cluster, merged_asterisk = merged_asterisk, - my_cex_row = my_cex_row, color_breaks = color_breaks, margins = margins ) if (export == "TRUE") { dev.off() } + if (!did_draw) + return(NULL) return(my_row_names) } -# helper for heatmaps of phosphopeptide intensities - -draw_intensity_heatmap <- +# helpers for heatmaps of phosphopeptide intensities + +# factory producing function to truncate string after n characters +trunc_n <- function(n) { + function(x) { + sapply( + X = x, + FUN = function(s) { + if (is.na(s)) + return("NA") + cond <- try_catch_w_e(nchar(s) > n) + if (!is.logical(cond$value)) { + return(cond$value$message) + } else if (cond$value) { + paste0( + strtrim(s, n), + "..." + ) + } else { + s + } + }, + USE.NAMES = FALSE + ) + } + } +trunc_long_ppep <- function(x) trunc_n(40)(x) +trunc_ppep <- function(x) trunc_n(g_ppep_trunc_n)(x) +trunc_subgene <- function(x) trunc_n(g_subgene_trunc_n)(x) +trunc_enriched_substrate <- function(x) trunc_n(g_sbstr_trunc_n)(x) + +# factory producing a function that returns a covariance +# matrix's rows (and columns) having variance > v_min +keep_cov_w_var_gtr_min <- function(v_min) { + function(x) { + if (!is.matrix(x)) + return(NULL) + keepers <- sapply( + X = seq_len(nrow(x)), + FUN = function(i) { + if (x[i, i] < v_min) + NA + else + x[i, i] + } + ) + names(keepers) <- rownames(x) + keepers <- keepers[!is.na(keepers)] + keepers <- names(keepers) + if (length(keepers) == 0) + return(NULL) + x[keepers, keepers] + } +} +# function that returns a matrix's rows having variance > 1 +keep_cov_w_var_gtr_1 <- keep_cov_w_var_gtr_min(1) + +# factory producing a function that returns +# - either a matrix's rows (rows = TRUE) +# - or a matrix's columns (rows = FALSE) +# having variance > v_min +keep_var_gtr_min <- function(v_min) { + function(x, rows = TRUE) { + nrowcol <- if (rows) nrow else ncol + if (!is.matrix(x)) + return(NULL) + keepers <- sapply( + X = seq_len(nrowcol(x)), + FUN = function(i) { + row_var <- var( + if (rows) x[i, ] else x[, i], + na.rm = TRUE + ) + if (is.na(row_var) || row_var <= v_min) NA else i + } + ) + keepers <- keepers[!is.na(keepers)] + if (rows) x[keepers, ] else x[, keepers] + } +} + +keep_var_gtr_0 <- keep_var_gtr_min(0) + +# function drawing heatmap of phosphopeptide intensities +ppep_heatmap <- function( m, # matrix with rownames already formatted cutoff, # cutoff used by hm_heading_function - hm_heading_function, # construct and cat heading from m and cutoff + hm_heading_function, # construct $ cat heading from m and cutoff hm_main_title, # main title for plot (drawn below heading) suppress_row_dendrogram = TRUE, # set to false to show dendrogram - max_peptide_count # experimental: - = intensity_hm_rows, # values of 50 and 75 worked well - ... # passthru parameters for heatmap + max_peptide_count = # experimental: + g_intensity_hm_rows, # values of 50 and 75 worked well + master_cex = 1.0, # basis for text sizes + margins = NULL, # optional margins (bottom, right) + cellnote = NULL, # optional matrix of character; dim = dim(m) + adj = 0.5, # adjust text: 0 left, 0.5 middle, 1 right + ... # passthru to hm2plus or heatmap.2 ) { + use_heatmap_1 <- FALSE peptide_count <- 0 # emit the heading for the heatmap if (hm_heading_function(m, cutoff)) { - peptide_count <- min(max_peptide_count, nrow(m)) - if (nrow(m) > 1) { + nrow_m <- nrow(m) + peptide_count <- min(max_peptide_count, nrow_m) + if (nrow_m > 1) { m_margin <- m[peptide_count:1, ] - # Margin setting was heuristically derived - margins <- - c(0.5, # col - max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row - ) - } - if (nrow(m) > 1) { + # Margin was heuristically derived to accommodate the widest label + row_mchar_max <- max(nchar(rownames(m_margin))) + col_mchar_max <- max(nchar(colnames(m_margin))) + row_margin <- master_cex * row_mchar_max * 2.6 + col_margin <- master_cex * col_mchar_max * 2.6 + if (print_trace_messages) cat(sprintf("row_margin = %0.3f; ", row_margin)) + if (print_trace_messages) cat(sprintf("col_margin = %0.3f; ", col_margin)) + hm_call <- NULL tryCatch( { - old_oma <- par("oma") - par(cex.main = 0.6) - # Heuristically determined character size adjustment formula - char_contractor <- - 250000 / ( - max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows + # set non-argument parameters for hm_call inner function + my_row_cex <- + master_cex * 200000 / ( + (max(nchar(rownames(m_margin)))^2) * g_intensity_hm_rows ) - heatmap( - m[peptide_count:1, ], - Rowv = if (suppress_row_dendrogram) NA else NULL, - Colv = NA, - cexRow = char_contractor, - cexCol = char_contractor * 50 / max_peptide_count, - scale = "row", - margins = margins, - main = - "Unimputed, unnormalized log(intensities)", - xlab = "", - las = 1, - ... + m_hm <- m[peptide_count:1, , drop = FALSE] + if (is.null(cellnote)) { + cellnote <- matrix("", nrow = nrow(m_hm), ncol = ncol(m_hm)) + cellnote[is.na(m_hm)] <- "NA" + } else { + cellnote <- cellnote[peptide_count:1, , drop = FALSE] + } + m_hm[is.na(m_hm)] <- 0 + nrow_m_hm <- nrow(m_hm) + ncol_m_hm <- ncol(m_hm) + if (nrow_m_hm < 1 || ncol_m_hm < 1) + return(peptide_count) # return zero as initialized above + my_limit <- g_intensity_hm_rows + + + my_row_cex <- master_cex * (100 / (2 + row_mchar_max)) + my_col_cex <- master_cex * 6 * row_margin / col_margin + my_col_adj <- min(my_col_cex, my_row_cex) / my_col_cex + my_col_cex <- min(my_col_cex, my_row_cex) + col_margin <- sqrt(my_col_adj) * col_margin + if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex)) + if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex)) + if (is.null(margins)) my_margins <- + c( + (my_col_cex + col_margin), # col + (my_row_cex + row_margin) / my_row_cex # row + ) + else + my_margins <- margins + + if (print_trace_messages) cat( + sprintf( + "my_margins = c(%s)\n\n", + paste(my_margins, collapse = ", ") + ) + ) + my_hm2_cex <- 2 * master_cex + my_key_cex <- 0.9 - 0.1 * (g_intensity_hm_rows + nrow_m_hm) / g_intensity_hm_rows + my_key_warp <- 1.5 * 22.75 / row_margin + my_key_cex <- min(1.10, my_key_warp * my_key_cex) + my_hgt_scale <- 3.70 - 0.4 * (max(1, 0.9 * my_row_cex) - 1) + my_hgt_scale <- 3.75 # 3.615 + my_hgt_scale <- 3.60 # 3.615 + if (print_trace_messages) + cat_variable(my_hgt_scale, "\n\n", 3) + my_warp <- max(0.1, 1.4 * (7.5 + nrow_m) / g_intensity_hm_rows) + if (print_trace_messages) + cat_variable(my_warp, "\n\n", 3) + # added 0.9 heuristically... + my_plot_height <- + (0.566 + 0.354 * (nrow_m / g_intensity_hm_rows)) * + min(my_hgt_scale, my_hgt_scale * my_warp) + my_plot_height <- min(3.65, my_plot_height * g_intensity_hm_rows / 50) + my_heights <- c( + 0.3, # title and top dendrogram + my_plot_height, # plot and bottom margin + 4.15 - my_hgt_scale, # legend + 0.05 + my_hgt_scale - my_plot_height # whitespace below legend ) + my_note_cex <- min(0.8, my_row_cex, my_col_cex) + if (print_trace_messages) { + cat_variable(my_plot_height, "\n\n", 3) + cat_variable(4.19 - my_hgt_scale, "\n\n", 3) + cat_variable(nrow_m_hm, "; ", 0) + cat_variable(ncol_m_hm, "; ", 0) + cat_variable(my_row_cex, "; ", 3) + cat_variable(my_col_cex, "; ", 3) + cat_variable(my_note_cex, "; ", 3) + cat_variable(my_key_cex, "\n\n", 3) + cat_variable(my_hgt_scale, "; ", 3) + cat_variable(my_plot_height, "; ", 3) + cat_variable(my_warp, "\n\n", 3) + cat_variable(my_heights, "; ", 2) + cat_variable(sum(my_heights), "\n\n", 3) + } + + # define hm_call inner function + hm_call <- function(x, scaling, title) { + my_cex_main <- min(5.0, 220 / nchar(title)) + op <- par( + cex.main = my_cex_main * master_cex, + adj = adj + ) + if ( + !is.null( + hm2plus( + x, + Colv = NA, + Rowv = TRUE, + cexRow = my_row_cex, + cexCol = my_col_cex, + dendrogram = "row", + las = 1, + main = title, + key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), + cex = my_hm2_cex, + key_par = list(cex = my_key_cex), + margins = my_margins, + widths = c(0.4, 2.6, 1.5), + heights = my_heights, + mat = matrix( + c( + c(0, 3, 3), + c(2, 1, 1), + c(0, 4, 0), + c(0, 0, 0) + ), + nrow = 4, + ncol = 3, + byrow = TRUE + ), + na.rm = TRUE, + scale = scaling, + srtcol = 90, + srtrow = 0, + xlab = "", + cellnote = cellnote, + notecex = my_note_cex, + ... + ) + ) + ) { + if (print_trace_messages) cat( + sprintf( + "my_heights = c(%s); sum = %0.3f\n\n", + paste( + sprintf("%0.3f", my_heights), + collapse = ", " + ), + sum(my_heights) + ) + ) + if (print_trace_messages) cat( + sprintf("my_key_cex = %0.3f\n\n", + my_key_cex) + ) + if (print_trace_messages) cat( + sprintf("my_key_cex/my_heights[3] = %0.3f\n\n", + my_key_cex / my_heights[3]) + ) + if (print_trace_messages) cat( + sprintf("my_heights[2]/my_heights[3] = %0.3f\n\n", + my_heights[2] / my_heights[3]) + ) + } + par(op) + } + + # invoke hm_call inner function + if (sum(rowSums(!is.na(m_hm)) < 2)) + hm_call( + m_hm, + "none", + "log(intensities), unscaled, unimputed, and unnormalized" + ) + else + hm_call( + m_hm, + "row", + "log(intensities), row-scaled, unimputed, and unnormalized" + ) }, error = function(e) { - cat( - sprintf( - "\nCould not draw heatmap, possibly because of too many missing values. Internal message: %s\n", - e$message + if (!is.null(hm_call)) { + m_hm[is.na(m_hm)] <- 0 + tryCatch( + { + if (nrow(m_hm) > 1) + hm_call( + m_hm, + "none", + paste( + "log(intensities), unscaled,", + "zero-imputed, unnormalized" + ) + ) + else + cat("\nThere are too few peptides to produce a heatmap.\n") + }, + error = function(r) { + cat( + sprintf( + "\n%s %s Internal message: %s\n\\newline\n\n", + "Failure drawing heatmap,", + "possibly because of too many missing values.\n\\newline\n\n", + r$message + ) + ) + cat_margins() + } + ) + } else { + cat( + "\nFailure drawing heatmap, possibly because of too many missing values.\n" ) - ) - }, - finally = par(old_oma) + } + } ) } } - return(peptide_count) + # return value: + peptide_count } + +# function drawing heatmap of correlations if they exist, else covariances +cov_heatmap <- + function( + m, # matrix with rownames already formatted + top_substrates = FALSE, + ... # passthru to hm2plus or heatmap.2 + ) { + if (print_nb_messages) nbe(see_variable(m), " [", nrow(m), "x", ncol(m), "\n") + #ACE nb(rowSums(m, na.rm = TRUE)) + #ACE bad_rows <- (rowSums(m, na.rm = TRUE) == 0) + #ACE nb(see_variable(bad_rows)) + #ACE m <- m[-bad_rows, , drop = FALSE] + colnames_m <- colnames(m) + is_na_m <- is.na(m) + tmp <- m + tmp[is_na_m] <- 0 + + tmp <- m[, 0 < colSums(x = tmp)] # by default, na.rm is FALSE + + colnames_tmp <- colnames(tmp) + + my_low_p_seq <- seq( + from = min(g_intensity_hm_rows, nrow(m)), + to = 1, + by = -1 + ) + + if (g_correlate_substrates) { + # zap samples having zero or near-zero variance + tmp[is.na(tmp)] <- 0 + nzv <- caret::nearZeroVar( + tmp, # matrix of values, samples x variables + freqCut = 1.01, # min(freq most prevalent value / + # freq second most prevalent) + uniqueCut = 99 # max(number of unique values / + # total number of samples) + ) + tmp <- if (length(nzv) > 0) { + m[, -nzv, drop = FALSE] + } else { + m + } + } else { + tmp <- m[my_low_p_seq, , drop = FALSE] + } + + + t_m <- t(tmp) + t_m[is.na(t_m)] <- 0 + prefiltered_nrow <- ncol(t_m) + + my_corcov <- cov(t_m) + did_filter_rows <- did_filter_cols <- FALSE + if (g_correlate_substrates && !is_positive_definite(my_corcov)) { + my_correlate_substrates <- FALSE + t_m <- t(m[my_low_p_seq, , drop = FALSE]) + t_m[is.na(t_m)] <- 0 + unfiltered_row_count <- ncol(t_m) + unfiltered_col_count <- nrow(t_m) + + # zap empty samples + t_m <- t_m[0 < rowSums(x = t_m), ] + # zap substrates present in fewer than two samples + foo <- t_m > 0 + foo <- colSums(x = foo) > 1 + t_m <- t_m[, foo] + + did_filter_rows <- unfiltered_row_count > ncol(t_m) + did_filter_cols <- unfiltered_col_count > nrow(t_m) + + colnames_tmp <- rownames(t_m) + my_corcov <- cov(t_m) + if (g_filter_cov_var_gt_1) { + my_corcov <- keep_cov_w_var_gtr_1(my_corcov) + } + } else if (g_correlate_substrates) { + my_corcov <- cov2cor(my_corcov) + my_correlate_substrates <- TRUE + } else { + my_correlate_substrates <- FALSE + if (g_filter_cov_var_gt_1) my_corcov <- keep_cov_w_var_gtr_1(my_corcov) + } + + omitted_samples <- colnames_m[colnames_m %notin% colnames_tmp] + suffix <- if (length(omitted_samples) > 1) "s" else "" + + f_omissions <- + function(is_corr) { + cat( + sprintf( + "Below is the %s plot for %s substrates", + if (is_corr) "correlation" else "covariance", + sprintf( + if (top_substrates) + "%0.0f \"highest-quality\"" + else + "%0.0f", + ncol(t_m) + ) + ) + ) + if (did_filter_cols) { + cat(sprintf(", omitting sample%s ", suffix)) + latex_collapsed_vector(", ", omitted_samples) + } + cat(".\n\n") + } + + if (is.null(my_corcov) || sum(!is.na(t_m)) < 2) { + cat( + sprintf( + "\\newline\n%s %s plot.\n", + "Insufficient covariance to produce", + if (my_correlate_substrates) + "correlation" + else + "covariance" + ), + "\\newpage\n" + ) + return(NULL) + } + + cat("\\leavevmode\n", "\\newpage\n") + f_omissions(my_correlate_substrates) + + master_cex <- 0.4 + max_nchar <- max(nchar(rownames(t_m))) + my_limit <- g_intensity_hm_rows + diminution <- sqrt(my_limit / (my_limit + ncol(t_m))) + my_row_cex <- + my_col_cex <- + min(1.75, master_cex * 9 * diminution ^ 1.5) + my_margin <- 3 + my_row_cex * 64 / (8 + max_nchar) + my_key_cex <- 1.4 + my_hm2_cex <- 1.0 * master_cex + my_hgt_scale <- 3.50 - 0.26 * (max(0.4, my_key_cex) - 0.4) + my_hgt_scale <- 2.7 + + my_legend_height <- 4.0 - my_hgt_scale + my_legend_height <- 0.5 * my_key_cex + my_warp <- 0.65 * (my_limit + ncol(t_m)) / my_limit + my_warp <- 0.8 + my_legend_height <- 0.77 + my_legend_height <- 0.67 + my_plot_height <- my_hgt_scale + (1 - my_warp) * my_legend_height + my_legend_height <- my_warp * my_legend_height + + parjust <- par(adj = 0.5) + on.exit(par(parjust)) + my_corcov <- my_corcov[order(rownames(my_corcov)), ] + my_main <- + sprintf("%s among %s substrates %s", + if (my_correlate_substrates) "Correlation" + else "Covariance", + kinase_name, + if (!my_correlate_substrates && + g_filter_cov_var_gt_1 && + did_filter_rows + ) + "having variance > 1" + else "" + ) + my_main_nchar <- nchar(my_main) + my_heights <- c( + 0.3, + my_plot_height, + my_legend_height # was 4.0 - my_hgt_scale # was 4.19 + ) + if (print_trace_messages) cat(sprintf("max_nchar = %0.3f; ", max_nchar)) + if (print_trace_messages) cat(sprintf("my_margin = %0.3f; ", my_margin)) + if (print_trace_messages) cat(sprintf("my_plot_height = %0.3f\n\n", my_plot_height)) + if (print_trace_messages) cat(sprintf("master_cex = %0.3f; ", master_cex)) + if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex)) + if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex)) + if (print_trace_messages) cat(sprintf("my_key_cex = %0.3f\n\n", my_key_cex)) + if (print_trace_messages) cat(sprintf("my_hgt_scale = %0.3f\n\n", my_hgt_scale)) + if (print_trace_messages) cat(sprintf("legend height = %0.3f\n\n", my_legend_height)) + if (print_trace_messages) cat( + sprintf( + "my_heights = c(%s); sum = %0.3f\n\n", + paste( + sprintf("%0.3f", my_heights), + collapse = ", " + ), + sum(my_heights) + ) + ) + op <- par(cex.main = (30 + my_main_nchar) / my_main_nchar) + on.exit(par(op)) + hm2plus( + x = my_corcov, + cex = my_hm2_cex, + cexCol = my_col_cex, + cexRow = my_row_cex, + density_info = "none", + denhgt = 0.15, + denwid = 0.5, + divergent = TRUE, + key_par = list(cex = my_key_cex), + key_xlab = if (my_correlate_substrates) "Correlation" + else "Covariance", + main = my_main, + mat = matrix( + c( + c(0, 3, 3), + c(2, 1, 1), + c(0, 4, 0) + ), + nrow = 3, + ncol = 3, + byrow = TRUE + ), + heights = my_heights, + margins = c(my_margin, my_margin), + widths = c(0.5, 3.1, 0.9), + scale = "none", + symkey = TRUE, + symbreaks = TRUE, + symm = FALSE #TODO evaluate TRUE + # ... + ) + } # end cov_heatmap + +### FILE IMPORT + +# function reading bzipped file to data.frame +bzip2df <- function(d, f, ctor = bzfile) { + # read.delim file (by default, compressed by bzip2) + if (file.exists(f)) { + conn <- NULL + pf <- parent.frame() + tryCatch( + assign( + as.character(substitute(d)), + read.delim(conn <- bzfile(f, open = "r")), + pf + ), + finally = if (!is.null(conn)) close(conn) + ) + } +} + ``` -```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} -cat("\\listoftables\n") -``` # Purpose -To perform for phosphopeptides: +The purpose of this analysis is to perform for phosphopeptides: - imputation of missing values, - quantile normalization, -- ANOVA (using the R stats::`r params$oneWayManyCategories` function), and +- ANOVA (using the R stats::`r params$oneWayManyCategories` function), +- assignment of an FDR-adjusted $p$-value and a "quality score" to each phosphopeptide, and - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases: - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org) - The Human Proteome Database [http://hprd.org](http://hprd.org) @@ -1151,13 +3096,15 @@ ```{r include = FALSE} -### GLOBAL VARIABLES - -# parameters for KSEA - -ksea_cutoff_statistic <- params$kseaCutoffStatistic -ksea_cutoff_threshold <- params$kseaCutoffThreshold -ksea_min_kinase_count <- params$kseaMinKinaseCount +if (params$kseaUseAbsoluteLog2FC) { + sfc <- "|s|" + pfc <- "|p|" + pfc_txt <- "$\\text{absolute value}({\\log_2 (\\text{fold-change})})$" +} else { + sfc <- "s" + pfc <- "p" + pfc_txt <- "${\\log_2 (\\text{fold-change}})$" +} ksea_heatmap_titles <- list() ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <- @@ -1177,24 +3124,11 @@ # hash to hold names of significantly enriched kinases ksea_asterisk_hash <- new_env() -# READ PARAMETERS (mostly) - -intensity_hm_rows <- params$intensityHeatmapRows -# Input Filename -input_file <- params$inputFile - -# First data column - ideally, this could be detected via regexSampleNames, -# but for now leave it as is. -first_data_column <- params$firstDataColumn -fdc_is_integer <- is.integer(first_data_column) -if (fdc_is_integer) { - first_data_column <- as.integer(params$firstDataColumn) -} +# PROCESS (mostly read) PARAMETERS # False discovery rate adjustment for ANOVA # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 -val_fdr <- - read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "") +val_fdr <- read.table(file = alpha_file, sep = "\t", header = FALSE, quote = "") if ( ncol(val_fdr) != 1 || @@ -1202,62 +3136,17 @@ sum(val_fdr[, 1] < 0) || sum(val_fdr[, 1] > 1) ) { - stop("alphaFile should be one column of numbers within the range [0.0,1.0]") + cat("alphaFile should be one column of numbers within the range [0.0,1.0]") + param_df_exit() + knitr::knit_exit() } val_fdr <- val_fdr[, 1] -#Imputed Data filename -imputed_data_filename <- params$imputedDataFilename -imp_qn_lt_data_filenm <- params$imputedQNLTDataFile -anova_ksea_mtdt_file <- params$anovaKseaMetadata - ``` -```{r echo = FALSE} -# Imputation method, should be one of -# "random", "group-median", "median", or "mean" -imputation_method <- params$imputationMethod - -# Selection of percentile of logvalue data to set the mean for random number -# generation when using random imputation -mean_percentile <- params$meanPercentile / 100.0 - -# deviation adjustment-factor for random values; real number. -sd_percentile <- params$sdPercentile - -# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" -regex_sample_names <- params$regexSampleNames - -# Regular expression to extract Sample Grouping from Sample Name; -# if error occurs, compare sample_treatment_levels vs. sample_name_matches -# to see if groupings/pairs line up -# e.g., "(\\d+)" -regex_sample_grouping <- params$regexSampleGrouping - -one_way_all_categories_fname <- params$oneWayManyCategories -one_way_all_categories <- try_catch_w_e( - match.fun(one_way_all_categories_fname)) -if (!is.function(one_way_all_categories$value)) { - write("fatal error for parameter oneWayManyCategories:", stderr()) - write(one_way_all_categories$value$message, stderr()) - if (sys.nframe() > 0) quit(save = "no", status = 1) - stop("Cannot continue. Goodbye.") -} -one_way_all_categories <- one_way_all_categories$value - -one_way_two_categories_fname <- params$oneWayManyCategories -one_way_two_categories <- try_catch_w_e( - match.fun(one_way_two_categories_fname)) -if (!is.function(one_way_two_categories$value)) { - cat("fatal error for parameter oneWayTwoCategories: \n") - cat(one_way_two_categories$value$message, fill = TRUE) - if (sys.nframe() > 0) quit(save = "no", status = 1) - stop("Cannot continue. Goodbye.") -} -one_way_two_categories <- one_way_two_categories$value - -preproc_db <- params$preprocDb -ksea_app_prep_db <- params$kseaAppPrepDb +```{r echo = FALSE, results = 'asis'} + + result <- file.copy( from = preproc_db, to = ksea_app_prep_db, @@ -1272,9 +3161,19 @@ ), stderr() ) - if (sys.nframe() > 0) quit(save = "no", status = 1) - stop("Cannot continue. Goodbye.") + if (sys.nframe() > 0) { + cat("Cannot continue and quit() failed. Goodbye.") + param_df_exit() + knitr::knit_exit() + # in case knit_exit doesn't exit + quit(save = "no", status = 1) + } } + +if (FALSE) { + write.table(x = param_df, file = "test-data/params.txt") +} + ``` ```{r echo = FALSE} @@ -1289,55 +3188,75 @@ quote = "", check.names = FALSE ) + ``` -# Extract Sample Names and Treatment Levels - -Column names parsed from input file are shown in Table 1; sample names and treatment levels, in Table 2. + +# Extraction of Sample Classes and Names from Input Data ```{r echo = FALSE, results = 'asis'} data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) +my_column_names <- names(full_data) if (!fdc_is_integer) { if (length(data_column_indices) > 0) { first_data_column <- data_column_indices[1] } else { - stop(paste("failed to convert firstDataColumn:", first_data_column)) + cat(paste("failed to convert firstDataColumn:", first_data_column)) + param_df_exit() + knitr::knit_exit() } } cat( sprintf( paste( - "\n\nThe input data file has peptide-intensity data for each sample", - "in one of columns %d through %d.\n\n" + "\n\nThe input data file has peptide-intensity data", + "in columns %d (\"%s\") through %d (\"%s\")." ), - min(data_column_indices), - max(data_column_indices) + tmp <- min(data_column_indices), + my_column_names[tmp], + tmp <- max(data_column_indices), + my_column_names[tmp] ) ) -# Write column names as a LaTeX enumerated list. -column_name_df <- data.frame( - column = seq_len(length(colnames(full_data))), - name = paste0("\\verb@", colnames(full_data), "@") +if (TRUE) { + cat0( + table_offset(i = 1, new = TRUE), + "Sample classes and names are shown in ", + table_href(), + ".\n\n" ) -data_frame_latex( - x = column_name_df, - justification = "l l", - centered = TRUE, - caption = "Input data column names", - anchor = const_table_anchor_bp, - underscore_whack = FALSE +} else { + cat0( + "\\newcounter{offset}\n", + "\\setcounter{offset}{\\value{table}}\n", + "\\stepcounter{offset}\n", + "Sample classes and names are shown in ", + table_href(), + ".\n\n" ) +} + +#TODO remove this unused variable and assignment +if (FALSE) { + # Write column names as a LaTeX enumerated list. + column_name_df <- data.frame( + column = seq_len(length(colnames(full_data))), + name = paste0("\\verb@", colnames(full_data), "@") + ) +} ``` ```{r echo = FALSE, results = 'asis'} +# extract intensity columns from full_data to quant_data quant_data <- full_data[first_data_column:length(full_data)] quant_data[quant_data == 0] <- NA rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide +full_data_names <- colnames(quant_data) # Extract factors and trt-replicates using regular expressions. # Typically: # regex_sample_names is "\\.\\d+[A-Z]$" @@ -1355,20 +3274,133 @@ write_debug_file(quant_data) rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) -sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match)) -number_of_samples <- length(sample_name_matches) +smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match)) + +if (print_nb_messages) nbe(see_variable(smpl_trt, "\n\n")) +if (print_nb_messages) nbe(see_variable(sample_name_matches, "\n\n")) +if (print_nb_messages) nbe(see_variable(full_data_names, "\n\n")) + +sample_treatment_df <- + save_sample_treatment_df <- + data.frame( + class = smpl_trt, + sample = sample_name_matches, + full_sample_names = full_data_names + ) + +if (print_nb_messages) nbe(see_variable(sample_treatment_df, "\n\n")) + +# reorder data +my_order <- with(sample_treatment_df, order(class, sample)) +quant_data <- quant_data[, my_order] +sample_name_matches <- sample_name_matches[my_order] +smpl_trt <- smpl_trt[my_order] sample_treatment_df <- data.frame( - level = sample_treatment_levels, + class = smpl_trt, sample = sample_name_matches ) -data_frame_latex( + +# filter smpl_trt as appropriate +if (sample_group_filter %in% c("include", "exclude")) { + include_sample <- + mgrepl( + v = sample_group_filter_patterns, + s = as.character(smpl_trt), + fixed = sample_group_filter_fixed, + perl = sample_group_filter_perl, + ignore.case = sample_group_filter_nocase + ) + if (sum(include_sample) < 2) { + errmsg <- + paste( + "ERROR:", + sum(include_sample), + "samples are too few for analysis;", + "check input parameters for sample-name parsing" + ) + cat0( + errmsg, + "\\stepcounter{offset}\n", + " in ", + table_href(), + ".\n\n" + ) + data_frame_tabbing_latex( + x = save_sample_treatment_df, + tabstops = c(1.25, 1.25), + caption = "Sample classes", + use_subsubsection_header = FALSE + ) + data_frame_tabbing_latex( + x = + param_df[ + c("regexSampleNames", + "regexSampleGrouping", + "groupFilterPatterns", + "groupFilter", + "groupFilterMode" + ), + ], + tabstops = c(1.75), + underscore_whack = TRUE, + caption = "Input parameters for sample-name parsing", + verbatim = FALSE + ) + param_df_exit() + knitr::knit_exit() + return(invisible(-1)) + } + sample_treatment_df <- + if (sample_group_filter == "include") + sample_treatment_df[include_sample, ] + else + sample_treatment_df[!include_sample, ] +} else { + include_sample <- rep.int(TRUE, length(smpl_trt)) +} +sample_name_matches <- sample_treatment_df$sample +rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) +smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match)) +sample_treatment_df$class <- smpl_trt + +# filter quant_data as appropriate +number_of_samples <- length(sample_name_matches) +quant_data <- quant_data[, sample_name_matches] + +sample_level_integers <- as.integer(smpl_trt) +sample_treatment_levels <- levels(smpl_trt) +count_of_treatment_levels <- length(sample_treatment_levels) + +# for each phosphopeptide, across treatment levels, compute minimum +# count of observed (i.e., non-missing) values +my_env <- new_env() +for (l in sample_treatment_levels) + my_env[[as.character(l)]] <- + as.vector(rowSums(!is.na(quant_data[l == smpl_trt]))) +min_group_obs_count <- row_apply( + x = Reduce( + f = function(l, r) cbind(l, my_env[[r]]), + x = sample_treatment_levels, + init = c() + ), + fun = min + ) +names(min_group_obs_count) <- rownames(quant_data) +rm(my_env) + +# display (possibly-filtered) results +cat("\\newpage\n") + +if (sum(include_sample) > 1) { +data_frame_tabbing_latex( x = sample_treatment_df, - justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}", - centered = TRUE, - caption = "Treatment levels", - anchor = const_table_anchor_tbp, - underscore_whack = FALSE + tabstops = c(1.25), + caption = "Sample classes", + use_subsubsection_header = FALSE ) +} +sample_name_grow <- 10 / (10 + max(nchar(sample_name_matches) + 6)) +sample_colsep <- transition_positions(as.integer(sample_treatment_df$class)) ``` ```{r echo = FALSE, results = 'asis'} @@ -1377,7 +3409,7 @@ ## Are the log-transformed sample distributions similar? -```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} +```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} quant_data[quant_data == 0] <- NA #replace 0 with NA quant_data_log <- log10(quant_data) @@ -1387,25 +3419,81 @@ write_debug_file(quant_data_log) -# data visualization -old_par <- par( - mai = par("mai") + c(0.5, 0, 0, 0) -) -# ref: https://r-charts.com/distribution/add-points-boxplot/ -# Vertical plot -boxplot( - quant_data_log -, las = 1 -, col = const_boxplot_fill -, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") -, xlab = "Sample" -) -par(old_par) - - - -cat("\n\n\n") -cat("\n\n\n") +g_ppep_distrib_ctl <- new_env() +g_ppep_distrib_ctl$xlab_line <- 3.5 + 11.86 * (0.67 - sample_name_grow) +g_ppep_distrib_ctl$mai_bottom <- (0.5 + 3.95 * (0.67 - sample_name_grow)) +g_ppep_distrib_ctl$axis <- (0.6 + 0.925 * (0.67 - sample_name_grow)) + +my_ppep_distrib_bxp <- function( + x + , sample_name_grow = sample_name_grow + , main + , varwidth = FALSE + , sub = NULL + , xlab + , ylab + , col = const_boxplot_fill + , notch = FALSE + , ppep_distrib_ctl = g_ppep_distrib_ctl + , ... + ) { + my_xlab_line <- g_ppep_distrib_ctl$xlab_line + my_mai_bottom <- g_ppep_distrib_ctl$mai_bottom + my_axis <- g_ppep_distrib_ctl$axis + + if (print_trace_messages) { + cat_variable(my_xlab_line, suffix = "; ") + cat_variable(my_mai_bottom, suffix = "; ") + cat_variable(my_axis, suffix = "\n\n") + } + + old_par <- par( + mai = par("mai") + c(my_mai_bottom, 0, 0, 0), + cex.axis = my_axis, + cex.lab = 1.2 + ) + tryCatch( + { + # Vertical plot + boxplot( + x + , las = 2 + , col = col + , main = main + , sub = NULL + , ylab = ylab + , xlab = NULL + , notch = notch + , varwidth = varwidth + , ... + ) + title( + sub = sub + , cex.sub = 1.0 + , line = my_xlab_line + 1 + ) + title( + xlab = xlab + , line = my_xlab_line + ) + }, + finally = par(old_par) + ) + } + +my_ppep_distrib_bxp( + x = quant_data_log + , sample_name_grow = sample_name_grow + , main = "Peptide intensities for each sample" + , varwidth = boxplot_varwidth + , sub = "Box widths reflect number of peptides for sample" + , xlab = "Sample" + , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + , col = const_boxplot_fill + , notch = FALSE + ) + +cat("\n\n\n\n") ``` @@ -1454,6 +3542,8 @@ ) ``` +# Characterization of Input Data + ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} @@ -1461,7 +3551,7 @@ q1 <- quantile(logvalues, probs = mean_percentile)[1] # 1 = row of matrix (ie, phosphopeptide) -sds <- apply(quant_data_log, 1, sd_finite) +sds <- row_apply(quant_data_log, sd_finite) if (sum(!is.na(sds)) > 2) { plot( density(sds, na.rm = TRUE) @@ -1495,7 +3585,7 @@ # prep for trt-median based imputation ``` -# Impute Missing Values +# Imputation of Missing Values ```{r echo = FALSE} @@ -1526,11 +3616,10 @@ paste("Substitute missing value with", "median peptide-intensity for sample group.\n" ) - sample_level_integers <- as.integer(sample_treatment_levels) # Take the accurate ln(x+1) because the data are log-normally distributed # and because median can involve an average of two measurements. quant_data_imp <- log1p(quant_data_imp) - for (i in seq_len(length(levels(sample_treatment_levels)))) { + for (i in seq_len(count_of_treatment_levels)) { # Determine the columns for this factor-level level_cols <- i == sample_level_integers # Extract those columns @@ -1541,7 +3630,7 @@ # a given ppep has no measurement; otherwise, proceed. if (ncol(lvlsbst) > 1) { the_centers <- - apply(lvlsbst, 1, median, na.rm = TRUE) + row_apply(lvlsbst, median, na.rm = TRUE) for (j in seq_len(nrow(lvlsbst))) { for (k in seq_len(ncol(lvlsbst))) { if (is.na(lvlsbst[j, k])) { @@ -1565,7 +3654,7 @@ # Take the accurate ln(x+1) because the data are log-normally distributed # and because median can involve an average of two measurements. quant_data_imp <- log1p(quant_data_imp) - quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]] + quant_data_imp[ind] <- row_apply(quant_data_imp, median, na.rm = TRUE)[ind[, 1]] # Take the accurate e^x - 1 to match scaling of original input. quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) good_rows <- !is.nan(rowMeans(quant_data_imp)) @@ -1581,7 +3670,7 @@ # this will have to be quant_data_imp <- log1p(quant_data_imp) # Assign to NA cells the mean for the row - quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]] + quant_data_imp[ind] <- row_apply(quant_data_imp, mean, na.rm = TRUE)[ind[, 1]] # Take the accurate e^x - 1 to match scaling of original input. quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) good_rows <- !is.nan(rowMeans(quant_data_imp)) @@ -1625,11 +3714,40 @@ imp_smry_pot_peptides_after <- sum(good_rows) imp_smry_rejected_after <- sum(!good_rows) imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ])) + +# From ?`%in%`, %in% is currently defined as function(x, table) match(x, table, nomatch = 0) > 0 + +sink(stderr()) +print("`%in%`:") +print(`%in%`) +sink() + +stock_in <- + names(good_rows) %in% + names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) +if (print_nb_messages) nbe(see_variable(stock_in), "\n") + +explicit_in <- + 0 < match( + names(good_rows), + names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) + ) +if (print_nb_messages) nbe(see_variable(explicit_in), "\n") + +great_enough_row_names <- good_rows[ + names(good_rows) %in% + names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) +] +if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n") +great_enough_row_names <- great_enough_row_names[great_enough_row_names] +if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n") ``` + ```{r echo = FALSE, results = 'asis'} # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf tabular_lines_fmt <- paste( "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) + " \\leavevmode", " \\caption{Imputation Results}", " \\centering", # \centering centers the table on the page " \\begin{tabular}{l c c c}", @@ -1637,8 +3755,9 @@ " \\ & potential peptides & missing values & rejected", " peptides \\\\ [0.5ex]", " \\hline", - " before imputation & %d & %d (%d\\%s) & \\\\", - " after imputation & %d & %d & %d \\\\ [1ex]", + " before imputation & %d & %d (%d\\%s) & \\\\", + " after imputation & %d & %d & %d \\\\", + " after keep comparable & %d & & %d \\\\ [1ex]", " \\hline", " \\end{tabular}", #" \\label{table:nonlin}", # may be used to refer this table in the text @@ -1654,18 +3773,37 @@ "%", imp_smry_pot_peptides_after, imp_smry_missing_values_after, - imp_smry_rejected_after + imp_smry_rejected_after, + length(great_enough_row_names), + imp_smry_pot_peptides_before - + length(great_enough_row_names) ) cat(tabular_lines) ``` -```{r echo = FALSE} - - -# Zap rows where imputation was ineffective + +```{r filter_good_rows, echo = FALSE} + +if (print_nb_messages) nbe("before name extraction, ", see_variable(length(good_rows)), " ", see_variable(good_rows), "\n") +good_rows <- names(good_rows[names(great_enough_row_names)]) +if (print_nb_messages) nbe("after name extraction, ", see_variable(length(good_rows)), see_variable(good_rows), "\n") + +#ACE min_group_obs_count <- min_group_obs_count[names(great_enough_row_names)] +#ACE nbe("good_rows") +#ACE nbe(see_variable(good_rows)) +#ACE nbe("names(min_group_obs_count) before filter for good rows") +#ACE nbe(see_variable(names(min_group_obs_count))) +min_group_obs_count <- min_group_obs_count[good_rows] +#ACE nbe("min_group_obs_count after filter for good rows") +#ACE nbe(see_variable(names(min_group_obs_count))) + +# Zap rows where imputation was insufficiently effective full_data <- full_data [good_rows, ] quant_data <- quant_data [good_rows, ] - +quant_data_log <- quant_data_log [good_rows, ] + +if (print_nb_messages) nbe("before row filter, ", see_variable(nrow(quant_data_imp)), "\n") quant_data_imp <- quant_data_imp[good_rows, ] +if (print_nb_messages) nbe("after row filter, ", see_variable(nrow(quant_data_imp)), "\n") write_debug_file(quant_data_imp) quant_data_imp_good_rows <- quant_data_imp @@ -1719,35 +3857,41 @@ ``` -```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} +```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} zero_sd_rownames <- rownames(quant_data_imp)[ - is.na((apply(quant_data_imp, 1, sd, na.rm = TRUE)) == 0) + is.na((row_apply(quant_data_imp, sd, na.rm = TRUE)) == 0) ] if (length(zero_sd_rownames) >= nrow(quant_data_imp)) { - stop("All peptides have zero standard deviation. Cannot continue.") + cat("All peptides have zero standard deviation. Cannot continue.") + param_df_exit() + knitr::knit_exit() } if (length(zero_sd_rownames) > 0) { cat( - sprintf("%d peptides with zero variance were removed from statistical consideration", - length(zero_sd_rownames) + sprintf( + "%d %s %s", + length(zero_sd_rownames), + "peptides with zero variance", + "were removed from statistical consideration" ) ) zap_named_rows <- function(df, nms) { return(df[!(row.names(df) %in% nms), ]) } - quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames) - quant_data <- zap_named_rows(quant_data, zero_sd_rownames) - full_data <- zap_named_rows(full_data, zero_sd_rownames) + quant_data_imp <- + zap_named_rows(quant_data_imp, zero_sd_rownames) + quant_data <- + zap_named_rows(quant_data, zero_sd_rownames) + full_data <- + zap_named_rows(full_data, zero_sd_rownames) + min_group_obs_count <- + min_group_obs_count[names(min_group_obs_count) %notin% zero_sd_rownames] } if (sum(is.na(quant_data)) > 0) { cat("\\leavevmode\\newpage\n") - # data visualization - old_par <- par( - mai = par("mai") + c(0.5, 0, 0, 0) - ) # Copy quant data to x x <- quant_data # x gets to have values of: @@ -1776,22 +3920,24 @@ show_stripchart <- 50 > (count_red + count_blue) / length(sample_name_matches) if (show_stripchart) { - boxplot_sub <- "Light blue = data before imputation; Red = imputed data" + boxplot_sub <- "Light blue = data before imputation; Red = imputed data;" } else { boxplot_sub <- "" } # Vertical plot colnames(blue_dots) <- sample_name_matches - boxplot( - blue_dots - , las = 1 # "always horizontal" - , col = const_boxplot_fill - , ylim = ylim + my_ppep_distrib_bxp( + x = blue_dots + , sample_name_grow = sample_name_grow , main = "Peptide intensities after eliminating unusable peptides" - , sub = boxplot_sub + , varwidth = boxplot_varwidth + , sub = paste(boxplot_sub, "Box widths reflect number of peptides for sample") , xlab = "Sample" , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + , col = const_boxplot_fill + , notch = FALSE + , ylim = ylim ) if (show_stripchart) { @@ -1803,7 +3949,7 @@ method = "jitter", # Random noise jitter = const_stripchart_jitter, pch = 19, # Pch symbols - cex = const_stripsmall_cex, # Size of symbols reduced + cex = const_stripchart_cex, # Size of symbols reduced col = "lightblue", # Color of the symbol vertical = TRUE, # Vertical mode add = TRUE # Add it over @@ -1813,49 +3959,71 @@ method = "jitter", # Random noise jitter = const_stripchart_jitter, pch = 19, # Pch symbols - cex = const_stripsmall_cex, # Size of symbols reduced + cex = const_stripchart_cex, # Size of symbols reduced col = "red", # Color of the symbol vertical = TRUE, # Vertical mode add = TRUE # Add it over ) } - if (TRUE) { - # show measured values in blue on left half-violin plot - cat("\\leavevmode\n\\quad\n\n\\quad\n\n") - vioplot::vioplot( - x = lapply(blue_dots, function(x) x[!is.na(x)]), - col = "lightblue1", - side = "left", - plotCentre = "line", - ylim = ylim_save, - main = "Distributions of observed and imputed data", - sub = "Light blue = observed data; Pink = imputed data", - xlab = "Sample", - ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") +} +``` + +```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} +if (sum(is.na(quant_data)) > 0) { + # show measured values in blue on left half-violin plot + cat("\\leavevmode\n\\quad\n\n\\quad\n\n") + old_par <- par( + mai = par("mai") + c(g_ppep_distrib_ctl$mai_bottom, 0, 0, 0), + cex.axis = g_ppep_distrib_ctl$axis, + cex.lab = 1.2 + ) + tryCatch( + { + vioplot::vioplot( + x = lapply(blue_dots, function(x) x[!is.na(x)]), + col = "lightblue1", + side = "left", + plotCentre = "line", + ylim = ylim_save, + main = "Distributions of observed and imputed data", + sub = NULL, + las = 2, + xlab = NULL, + ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + ) + title( + sub = "Light blue = observed data; Pink = imputed data", + cex.sub = 1.0, + line = g_ppep_distrib_ctl$xlab_line + 1 ) - red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) - cols_to_delete <- c() - for (ix in seq_len(length(red_violins))) { - if (length(red_violins[[ix]]) < 1) { - cols_to_delete <- c(cols_to_delete, ix) + title( + xlab = "Sample", + line = g_ppep_distrib_ctl$xlab_line + ) + red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) + cols_to_delete <- c() + for (ix in seq_len(length(red_violins))) { + if (length(red_violins[[ix]]) < 1) { + cols_to_delete <- c(cols_to_delete, ix) + } } - } - # destroy any unimputable columns - if (!is.null(cols_to_delete)) { - red_violins <- red_violins[-cols_to_delete] - } - # plot imputed values in red on right half-violin plot - vioplot::vioplot( - x = red_violins, - col = "lightpink1", - side = "right", - plotCentre = "line", - add = TRUE - ) - } - - par(old_par) + # destroy any unimputable columns + if (!is.null(cols_to_delete)) { + red_violins <- red_violins[-cols_to_delete] + } + # plot imputed values in red on right half-violin plot + vioplot::vioplot( + x = red_violins, + col = "lightpink1", + side = "right", + plotCentre = "line", + add = TRUE + ) + + }, + finally = par(old_par) + ) # density plot cat("\\leavevmode\n\n\n\n\n\n\n") @@ -1893,88 +4061,88 @@ } ``` -# Perform Quantile Normalization +# Quantile Normalization The excellent `normalize.quantiles` function from *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)* performs "quantile normalization" as described Bolstad *et al.* (2003), DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)* -and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*, +and its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html), i.e., it assumes that the goal is to detect subtle differences among grossly similar samples (having similar distributions) -by equailzing intra-quantile quantitations. -Unfortunately, one software library upon which it depends +by equalizing intra-quantile quantitations^[Unfortunately, +one software library upon which `preprocessCore` depends *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)* -that requires that a specific, non-concurrent version of the library be +that requires that a specific, non-concurrent version of the library (`openblas` version $0.3.3$) be installed. The installation command equivalent to what was used to install the library to produce the results presented here is: -``` - conda install bioconductor-preprocesscore openblas=0.3.3 -``` +\linebreak +` conda install bioconductor-preprocesscore openblas=0.3.3`]. <!-- -# Apply quantile normalization using preprocessCore::normalize.quantiles -# --- -# tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html -# except this: https://support.bioconductor.org/p/122925/#9135989 -# says to install it like this: -# ``` -# BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) -# ``` -# conda installation (necessary because of a bug in recent openblas): -# conda install bioconductor-preprocesscore openblas=0.3.3 -# ... -# --- -# normalize.quantiles {preprocessCore} -- Quantile Normalization -# -# Description: -# Using a normalization based upon quantiles, this function normalizes a -# matrix of probe level intensities. -# -# THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the -# assumption that the data is missing at random. -# -# Usage: -# normalize.quantiles(x, copy = TRUE, keep.names = FALSE) -# -# Arguments: -# -# - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. -# -# - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, -# but in certain situations not making a copy of the matrix, but instead normalizing -# it in place will be more memory friendly. -# -# - keep.names: Boolean option to preserve matrix row and column names in output. -# -# Details: -# This method is based upon the concept of a quantile-quantile plot extended to n dimensions. -# No special allowances are made for outliers. If you make use of quantile normalization -# please cite Bolstad et al, Bioinformatics (2003). -# -# This functions will handle missing data (ie NA values), based on -# the assumption that the data is missing at random. -# -# Note that the current implementation optimizes for better memory usage -# at the cost of some additional run-time. -# -# Value: A normalized matrix. -# -# Author: Ben Bolstad, bmbolstad.com -# -# References -# -# - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide -# Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf -# -# - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of -# Normalization Methods for High Density Oligonucleotide Array Data Based on Bias -# and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 -# http://bmbolstad.com/misc/normalize/normalize.html -# ... + # Apply quantile normalization using preprocessCore::normalize.quantiles + # --- + # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html + # except this: https://support.bioconductor.org/p/122925/#9135989 + # says to install it like this: + # ``` + # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) + # ``` + # conda installation (necessary because of a bug in recent openblas): + # conda install bioconductor-preprocesscore openblas=0.3.3 + # ... + # --- + # normalize.quantiles {preprocessCore} -- Quantile Normalization + # + # Description: + # Using a normalization based upon quantiles, this function normalizes a + # matrix of probe level intensities. + # + # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the + # assumption that the data is missing at random. + # + # Usage: + # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) + # + # Arguments: + # + # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. + # + # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, + # but in certain situations not making a copy of the matrix, but instead normalizing + # it in place will be more memory friendly. + # + # - keep.names: Boolean option to preserve matrix row and column names in output. + # + # Details: + # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. + # No special allowances are made for outliers. If you make use of quantile normalization + # please cite Bolstad et al, Bioinformatics (2003). + # + # This functions will handle missing data (ie NA values), based on + # the assumption that the data is missing at random. + # + # Note that the current implementation optimizes for better memory usage + # at the cost of some additional run-time. + # + # Value: A normalized matrix. + # + # Author: Ben Bolstad, bmbolstad.com + # + # References + # + # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide + # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf + # + # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of + # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias + # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 + # http://bmbolstad.com/misc/normalize/normalize.html + # ... --> ```{r echo = FALSE, results = 'asis'} +if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp)), "\n") if (nrow(quant_data_imp) > 0) { quant_data_imp_qn <- preprocessCore::normalize.quantiles( as.matrix(quant_data_imp), keep.names = TRUE @@ -1983,17 +4151,20 @@ quant_data_imp_qn <- as.matrix(quant_data_imp) } +if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn)), "\n") + quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) - write_debug_file(quant_data_imp_qn) quant_data_imp_qn_log <- log10(quant_data_imp_qn) - write_debug_file(quant_data_imp_qn_log) +if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n") +if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n") + quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) -sel <- apply(quant_data_imp_qn_ls, 1, any_nan) +sel <- row_apply(quant_data_imp_qn_ls, any_nan) quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ] @@ -2008,10 +4179,9 @@ data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) ``` -<!-- ACE insertion begin --> ## Are normalized, imputed, log-transformed sample distributions similar? -```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} +```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} # Save unimputed quant_data_log for plotting below unimputed_quant_data_log <- quant_data_log @@ -2034,6 +4204,22 @@ # data visualization + if (TRUE) { + + my_ppep_distrib_bxp( + x = quant_data_log + , sample_name_grow = sample_name_grow + , main = "Peptide intensities for each sample" + , varwidth = boxplot_varwidth + , sub = NULL + , xlab = "Sample" + , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + , col = const_boxplot_fill + , notch = boxplot_notch + ) + + } else { + old_par <- par( mai = par("mai") + c(0.5, 0, 0, 0) , oma = par("oma") + c(0.5, 0, 0, 0) @@ -2043,12 +4229,16 @@ colnames(quant_data_log) <- sample_name_matches boxplot( quant_data_log - , las = 1 + , las = 2 + , cex.axis = 0.9 * sample_name_grow , col = const_boxplot_fill , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") , xlab = "Sample" + , notch = boxplot_notch + , varwidth = boxplot_varwidth ) par(old_par) + } } else { cat("There are no peptides to plot\n") } @@ -2074,7 +4264,28 @@ # ANOVA Analysis -```{r, echo = FALSE} +## Assignment of $p$-value and quality score + +For each phosphopeptide, ANOVA analysis was performed to compute a $p$-value representing the evidence against the "null hypothesis" ($H_0$) that the intensity does not vary significantly among sample groups. + +However, because as more and more phosphopeptides are tested, there is increasd probability that, by random chance, a given peptide will have a $p$-value that appears to indicate significance. For this reason, the $p$-values were adjusted by applying the False Discovery Rate (FDR) correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x). + +Furthermore, to give more weight to phosphopeptides having fewer missing values, an (arbitrarily defined) quality score was assigned to each, defined as: + +$$ +\textit{quality}_j + = \frac{1 + o_{j}}{v_{j}(1 + o_{j}) + 0.005} +$$ + +where: + +- $o_j$ is the minimum number of non-missing observations per sample group for substrate $j$ for all sample groups, and +- $v_j$ is the FDR-adjusted ANOVA $p$-value for substrate $j$. + + +```{r, echo = FALSE, results = 'asis'} +cat("\\newpage\n") + # Make new data frame containing only Phosphopeptides # to connect preANOVA to ANOVA (connect_df) connect_df <- data.frame( @@ -2084,16 +4295,10 @@ colnames(connect_df) <- c("Phosphopeptide", "Intensity") ``` -```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} -count_of_treatment_levels <- length(levels(sample_treatment_levels)) +```{r anova, echo = FALSE, fig.dim = c(10, 12), results = 'asis'} +g_can_run_ksea <- FALSE +old_oma <- par("oma") if (count_of_treatment_levels < 2) { - nuke_control_sequences <- - function(s) { - s <- gsub("[\\]", "xyzzy_plugh", s) - s <- gsub("[$]", "\\\\$", s) - s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) - return(s) - } cat( "ERROR!!!! Cannot perform ANOVA analysis", "(see next page)\\newpage\n" @@ -2109,7 +4314,7 @@ paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"), "\n\\end{quote}\n\n") - regex_sample_names <- nuke_control_sequences(regex_sample_names) + regex_sample_names <- latex_printable_control_seqs(regex_sample_names) cat("\\leavevmode\n\n\n") cat("Parsing rule for SampleNames is", @@ -2125,7 +4330,7 @@ paste(sample_name_matches, collapse = "\n\n\n"), "\n\\end{quote}\n\n") - regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping) + regex_sample_grouping <- latex_printable_control_seqs(regex_sample_grouping) cat("\\leavevmode\n\n\n") cat("Parsing rule for SampleGrouping is", @@ -2144,31 +4349,90 @@ } else { - p_value_data_anova_ps <- - apply( - quant_data_imp_qn_log, - 1, - anova_func, - grouping_factor = sample_treatment_levels, - one_way_f = one_way_all_categories - ) + if (print_nb_messages) nbe("computing p_value_data_anova_ps\n") + if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n") + if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n") + if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log[, ".NE.7C"]), "\n") + if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log), "\n") + if (print_nb_messages) nbe(see_variable(anova_func), "\n") + if (print_nb_messages) nbe(see_variable(smpl_trt), "\n") + if (print_nb_messages) nbe(see_variable(one_way_all_categories), "\n") + tryCatch( + { + p_value_data_anova_ps <- + row_apply( + quant_data_imp_qn_log, + anova_func, + grouping_factor = smpl_trt, + one_way_f = one_way_all_categories + ) + }, + error = function(e) { + mesg <- paste("Could not compute ANOVA because", e$message) + cat("\n\n", mesg, "\n\n") + param_df_noexit(e) + sink(stderr()) + cat("\n\n", mesg, "\n\n") + values <- paste(param_df$parameter, "=", param_df$value, collapse = "\n") + cat(values) + sink() + knitr::knit_exit() + exit(code = 1) + } + ) + if (print_nb_messages) nbe(see_variable(p_value_data_anova_ps), "\n") p_value_data_anova_ps_fdr <- p.adjust(p_value_data_anova_ps, method = "fdr") + my_ppep_v <- full_data[, 1] + p_value_data <- list( + phosphopeptide = my_ppep_v, + raw_anova_p = p_value_data_anova_ps, + fdr_adjusted_anova_p = p_value_data_anova_ps_fdr, + missing_values = rowSums(is.na(quant_data)), + min_group_obs_count = min_group_obs_count + ) p_value_data <- data.frame( - phosphopeptide = full_data[, 1] - , - raw_anova_p = p_value_data_anova_ps - , - fdr_adjusted_anova_p = p_value_data_anova_ps_fdr + phosphopeptide = my_ppep_v, + raw_anova_p = p_value_data_anova_ps, + fdr_adjusted_anova_p = p_value_data_anova_ps_fdr, + missing_values = rowSums(is.na(quant_data)), + min_group_obs_count = min_group_obs_count ) + p_value_data$quality <- 1.0 / with( + p_value_data, + fdr_adjusted_anova_p + 0.005 / (1 + min_group_obs_count) + ) + + p_value_data$ranking <- + with( + p_value_data, + switch( + g_intensity_hm_criteria, + "quality" = order(-quality), + "na_count" = order(missing_values, fdr_adjusted_anova_p), + # the default is "p_value" + order(fdr_adjusted_anova_p) + ) + ) + p_value_data <- p_value_data[p_value_data$ranking, , drop = FALSE] + + write.table( + p_value_data, + file = "p_value_data.txt", + sep = "\t", + col.names = TRUE, + row.names = FALSE, + quote = FALSE + ) + # output ANOVA file to constructed filename, # e.g. "Outputfile_pST_ANOVA_STEP5.txt" - # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" + # becomes "Outputfile_pST_ANOVA_STEP5_FDR0.05.txt" # Re-output datasets to include p-values - metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3]) + metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:ncol(p_value_data)]) write.table( cbind(metadata_plus_p, quant_data_imp), file = imputed_data_filename, @@ -2188,61 +4452,97 @@ ) - p_value_data <- - p_value_data[order(p_value_data$fdr_adjusted_anova_p), ] - first_page_suppress <- 1 number_of_peptides_found <- 0 cutoff <- val_fdr[1] for (cutoff in val_fdr) { - if (number_of_peptides_found > 49) { + #loop through FDR cutoffs + if (number_of_peptides_found > g_intensity_hm_rows - 1) { cat("\\leavevmode\n\n\n") break } - #loop through FDR cutoffs + bool_1 <- (p_value_data$fdr_adjusted_anova_p < cutoff) + bool_2 <- (p_value_data$min_group_obs_count >= g_intensity_min_per_class) + g_can_run_ksea <- g_can_run_ksea || (sum(bool_2) > 0) + bool_4 <- (p_value_data$quality >= params$minQuality) + bool_3 <- as.logical( + as.integer(bool_1) * + as.integer(bool_2) * + as.integer(bool_4) + ) + if (print_trace_messages) { + if (length(bool_1) > 30) { + cat_variable(bool_1, force_str = TRUE) + cat_variable(bool_2, force_str = TRUE) + cat_variable(bool_3, force_str = TRUE) + } else { + cat_variable(bool_1, suffix = "\n\n") + cat_variable(bool_2, suffix = "\n\n") + cat_variable(bool_3, suffix = "\n\n") + } + cat_variable(length(bool_3), digits = 0, suffix = "; ") + cat_variable(sum(bool_3), digits = 0, suffix = "\n\n") + } filtered_p <- - p_value_data[ - which(p_value_data$fdr_adjusted_anova_p < cutoff), - , drop = FALSE - ] + p_value_data[bool_3, , drop = FALSE] + filtered_p <- + filtered_p[!is.na(filtered_p$phosphopeptide), , drop = FALSE] + + if (print_trace_messages) + cat_variable(filtered_p, force_str = TRUE) + + have_remaining_peptides <- sum(bool_3, na.rm = TRUE) > 0 + filter_result_string <- + sprintf( + "%s, %s of %0.0f peptides remained having both %s and %s.\n\n", + "After filtering for ANOVA results", + if (have_remaining_peptides) + as.character(sum(bool_3, na.rm = TRUE)) + else + "none", + length(bool_3), + sprintf("adjusted p-value < %s", as.character(signif(cutoff, 2))), + sprintf( + "more than %0.0f observations in some groups", + max(0, g_intensity_min_per_class - 1) + ) + ) + filtered_data_filtered <- quant_data_imp_qn_log[ rownames(filtered_p), , drop = FALSE ] + # order by p-value filtered_data_filtered <- filtered_data_filtered[ order(filtered_p$fdr_adjusted_anova_p), , drop = FALSE ] - # <!-- ACE insertion start --> - - if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) { + if (have_remaining_peptides && nrow(filtered_p) > 0 && nrow(filtered_data_filtered) > 0) { if (first_page_suppress == 1) { first_page_suppress <- 0 } else { cat("\\newpage\n") } - if (nrow(filtered_data_filtered) > 1) { - subsection_header(sprintf( - "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n", - nrow(filtered_data_filtered), - cutoff - )) - } else { - subsection_header(sprintf( - "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n", - rownames(filtered_data_filtered)[1], - cutoff - )) - } - cat("\n\n\n") - cat("\n\n\n") - - old_oma <- par("oma") + latex_samepage({ + cat(filter_result_string) + if (nrow(filtered_data_filtered) > 1) { + cat(subsection_header(sprintf( + "Intensity distributions for %d phosphopeptides\n", + nrow(filtered_data_filtered) + ))) + } else { + cat(subsection_header(sprintf( + "Intensity distribution for one phosphopeptide (%s)\n", + rownames(filtered_data_filtered)[1] + ))) + } + }) # end latex_samepage + old_par <- par( mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), oma = old_oma * c(1, 1, 0.3, 1), @@ -2250,18 +4550,24 @@ cex.axis = 0.7, fin = c(9, 7.25) ) - # ref: https://r-charts.com/distribution/add-points-boxplot/ # Vertical plot colnames(filtered_data_filtered) <- sample_name_matches tryCatch( boxplot( filtered_data_filtered, main = "Imputed, normalized intensities", # no line plot - las = 1, + las = 2, + cex.axis = 0.9 * sample_name_grow, col = const_boxplot_fill, - ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + ylab = latex2exp::TeX("$log_{10}$(peptide intensity)"), + notch = FALSE, + varwidth = boxplot_varwidth ), - error = function(e) print(e) + error = function(e) { + print(e) + cat_margins() + } + ) par(old_par) } else { @@ -2272,7 +4578,7 @@ )) } - if (nrow(filtered_data_filtered) > 0) { + if (have_remaining_peptides && nrow(filtered_data_filtered) > 0) { # Add Phosphopeptide column to anova_filtered table # The assumption here is that the first intensity is unique; # this is a hokey assumption but almost definitely will @@ -2305,37 +4611,26 @@ # Produce heatmap to visualize significance and the effect of imputation - anova_filtered_merge_format <- sapply( - X = filtered_p$fdr_adjusted_anova_p - , - FUN = function(x) { - if (x > 0.0001) - paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s") - else - paste0("(%0.4e) %s") - } - ) - cat_hm_heading <- function(m, cutoff) { - cat("\\newpage\n") - if (nrow(m) > intensity_hm_rows) { - subsection_header( + if (nrow(m) > g_intensity_hm_rows) { + cat("\\clearpage\n") + cat(subsection_header( paste( sprintf("Heatmap for the %d most-significant peptides", - intensity_hm_rows), + g_intensity_hm_rows), sprintf("whose adjusted p-value < %0.2f\n", cutoff) ) - ) + )) } else { - if (nrow(m) == 1) { + if (nrow(m) == 0) { return(FALSE) } else { - subsection_header( + cat(subsection_header( paste( - sprintf("Heatmap for %d usable peptides whose", nrow(m)), + sprintf("Heatmap for %d usable peptide genes whose", nrow(m)), sprintf("adjusted p-value < %0.2f\n", cutoff) ) - ) + )) } } cat("\n\n\n") @@ -2346,40 +4641,125 @@ # construct matrix with appropriate rownames m <- as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) - if (nrow(m) > 0) { + nrow_m <- nrow(m) + ncol_m <- ncol(m) + if (nrow_m > 0) { rownames_m <- rownames(m) - rownames(m) <- sapply( - X = seq_len(nrow(m)) - , + q <- data.frame(pepname = rownames_m) + g <- sqldf(" + SELECT q.pepname, substr(met.Gene_Name, 1, 30) AS gene_name + FROM q, metadata_plus_p AS met + WHERE q.pepname = met.Phosphopeptide + ORDER BY q.rowid + ") + tmp <- sapply( + X = seq_len(nrow(g)), + FUN = function(i) { + pre <- strsplit(g$gene_name[i], "; ")[[1]] + rslt <- paste(unique(pre), sep = "; ") + return(rslt) + } + ) + tmp <- unlist(tmp) + tmp <- + make.names(tmp, unique = TRUE) + tmp <- sub( + "No_Gene_Name", + "not_found", + tmp, + fixed = TRUE + ) + ten_trunc_names <- trunc_ppep(rownames_m) + tmp <- sapply( + X = seq_len(nrow_m), FUN = function(i) { sprintf( - anova_filtered_merge_format[i], - filtered_p$fdr_adjusted_anova_p[i], - rownames_m[i] + "(%s) %s", + tmp[i], + ten_trunc_names[i] ) } ) + rownames(m) <- tmp } # draw the heading and heatmap - if (nrow(m) > 0) { + if (nrow_m > 0) { number_of_peptides_found <- - draw_intensity_heatmap( + ppep_heatmap( m = m, cutoff = cutoff, hm_heading_function = cat_hm_heading, - hm_main_title = "Unimputed, unnormalized log(intensities)", - suppress_row_dendrogram = FALSE + hm_main_title = + "log(intensities), row-scaled, unimputed, unnormalized", + suppress_row_dendrogram = FALSE, + master_cex = 0.35, + sepcolor = "black", + colsep = sample_colsep ) + if (number_of_peptides_found > 1) { + cat("\\leavevmode\n") + } } } } } -cat("\\leavevmode\n\n\n") +cat(filter_result_string) +cat("\\leavevmode\n") + +if (!g_can_run_ksea) { + errmsg <- paste("Cannot proceed with KSEA analysis", + "because too many values are missing.") + if (FALSE) cat0( + errmsg, + "\\stepcounter{offset}\n", + "\\stepcounter{offset}\n", + "\\stepcounter{offset}\n", + " in ", + table_href(), + ".\n\n" + ) + if (FALSE) { + if (print_nb_messages) nbe(see_variable(p_value_data)) + } else { + if (print_nb_messages) nbe(see_variable(p_value_data)) + + display_p_value_data <- p_value_data + display_p_value_data$raw_anova_p <- + sprintf("%0.3g", display_p_value_data$raw_anova_p) + display_p_value_data$fdr_adjusted_anova_p <- + sprintf("%0.3g", display_p_value_data$fdr_adjusted_anova_p) + display_p_value_data$quality <- + sprintf("%0.3g", display_p_value_data$quality) + + headers_1st_line <- + c("", "Raw ANOVA", "FDR-adj.", "Missing", "Min. #", "", "") + headers_2nd_line <- + c("Phosphopeptide", "p-value", "p-value", "values", "group-obs", "Quality", "Ranking") + data_frame_tabbing_latex( + x = display_p_value_data, + tabstops = c(2.75, 0.80, 0.80, 0.5, 0.6, 0.60), + use_subsubsection_header = FALSE, + headings = c(headers_1st_line, headers_2nd_line), + caption = "ANOVA results" + + ) + } + data_frame_tabbing_latex( + x = save_sample_treatment_df, + tabstops = c(1.25, 1.25), + caption = "Sample classes", + use_subsubsection_header = FALSE + ) + param_df_exit() + knitr::knit_exit() + return(invisible(-1)) +} + ``` ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} -if (count_of_treatment_levels > 1) { +if (g_can_run_ksea && count_of_treatment_levels > 1) { # Prepare two-way contrasts with adjusted p-values # Strategy: # - use imputed, log-transformed data: @@ -2395,7 +4775,7 @@ # Each contrast is between a combination of trt levels m2 <- combn( - x = seq_len(length(levels(sample_treatment_levels))), + x = seq_len(length(levels(smpl_trt))), m = 2, simplify = TRUE ) @@ -2409,13 +4789,13 @@ return( data.frame( contrast = cntrst, - level = sample_treatment_levels[ - sample_treatment_levels %in% - levels(sample_treatment_levels)[c(lvl1, lvl2)] + level = smpl_trt[ + smpl_trt %in% + levels(smpl_trt)[c(lvl1, lvl2)] ], label = sample_name_matches[ - sample_treatment_levels %in% - levels(sample_treatment_levels)[c(lvl1, lvl2)] + smpl_trt %in% + levels(smpl_trt)[c(lvl1, lvl2)] ] ) ) @@ -2595,6 +4975,7 @@ ) # - create contrast-metadata table + if (print_nb_messages) nbe("CREATE TABLE contrast_lvl_lvl_metadata") dml_no_rows_exec(db, " CREATE TABLE contrast_lvl_lvl_metadata AS @@ -2705,12 +5086,11 @@ # - run the two-level (one-way) test p_value_data_contrast_ps <- - apply( - X = contrast_cast_data, - MARGIN = 1, # apply to rows - FUN = anova_func, + row_apply( + x = contrast_cast_data, + fun = anova_func, grouping_factor = - as.factor(as.numeric(grouping_factor$level)), # anova_func arg2 + as.factor(grouping_factor$level), # anova_func arg2 one_way_f = one_way_two_categories, # anova_func arg3 simplify = TRUE # TRUE is the default for simplify ) @@ -2922,6 +5302,8 @@ ; " ) + # We are done with DDL and insertion + RSQLite::dbDisconnect(db) } ``` @@ -2929,7 +5311,7 @@ cat("\\newpage\n") ``` -# KSEA Analysis +# KSEA Analysis Summaries Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package: @@ -2940,24 +5322,24 @@ For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as: $$ -\text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j} +\text{kinase enrichment }z\text{-score}_{j,i} = \frac{(\overline{`r sfc`}_{j,i} - \overline{`r pfc`}_j)\sqrt{m_{j,i}}}{\delta_j} $$ and fold-enrichment is computed as: $$ -\text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j} +\text{Enrichment}_{j,i} = \frac{\overline{`r sfc`}_{j,i}}{\overline{`r pfc`}_j} $$ where: -- $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change|})$ in intensities (for contrast $j$) of known substrates of the kinase $i$, -- $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, and +- $\overline{`r sfc`}_{j,i}$ is the mean `r pfc_txt` in intensities of known substrates of the kinase $i$ in contrast $j$, +- $\overline{`r pfc`}_j$ is the mean `r pfc_txt` of all phosphosites identified in contrast $j$, and - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$, -- $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset. +- $\delta_j$ is the standard deviation of the $\log_2 (\text{fold-change})$ for contrast $j$ across all phosphosites in the dataset. - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score. -$\text{FDR}_{j,i}$ is computed from the $p$-value for the z-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x) +$\text{FDR}_{j,i}$ is the False Discovery Rate corrected kinase enrichment score. Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive). @@ -2965,9 +5347,14 @@ - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis. - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)). -- Kinase-strate deata were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads). - -```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} +- Kinase-substrate data were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads). + +For each enriched kinase, a heatmap showing the intensities is presented for up to `r g_intensity_hm_rows` substrates, i.e., those substrates having the highest"quality". + +Where possible, a heatmap of the correlations among these the selected substrates is also presented; if correlations cannot be computed (because of too many missing values), then the covariances are heatmapped for substrates having a variance greater than 1. + +```{r ksea, echo = FALSE, fig.dim = c(12, 14.5), results = 'asis'} +cat("\\clearpage\n") db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) @@ -3046,21 +5433,9 @@ contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) contrast_longlabel <- ( sprintf( - "Trt %s {%s} -> Trt %s {%s}", + "Class %s -> Class %s", contrast_metadata_df[i_cntrst, "b_level"], - gsub( - pattern = ";", - replacement = ", ", - x = contrast_metadata_df[i_cntrst, "b_samples"], - fixed = TRUE - ), - contrast_metadata_df[i_cntrst, "a_level"], - gsub( - pattern = ";", - replacement = ", ", - x = contrast_metadata_df[i_cntrst, "a_samples"], - fixed = TRUE - ) + contrast_metadata_df[i_cntrst, "a_level"] ) ) kseaapp_input <- @@ -3091,18 +5466,44 @@ expr = { ksea_scores_rslt <- ksea_scores( - ksdata = pseudo_ksdata, # KSEAapp::KSData, - px = kseaapp_input, - networkin = TRUE, - networkin_cutoff = 2 + ksdata = pseudo_ksdata, + px = kseaapp_input, + networkin = TRUE, + networkin_cutoff = 2, + minimum_substrate_count = ksea_min_substrate_count ) + if (FALSE) { + ksea_scores_rslt <- + ksea_scores_rslt[ + ksea_scores_rslt$m >= ksea_min_substrate_count, + , + drop = FALSE + ] + } + + if (FALSE) { + data_frame_tabbing_latex( + x = ksea_scores_rslt, + tabstops = c(0.8, 0.8, 0.8, 0.8, 0.8, 0.8), + caption = paste("KSEA scores for contrast ", + cntrst_b_level, "to", cntrst_a_level), + use_subsubsection_header = FALSE + ) + } + + if (FALSE) { + if (print_nb_messages) nbe("Output contents of `ksea_scores_rslt` table\n") + cat_variable(ksea_scores_rslt) + cat("\n\\clearpage\n") + } + if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { next_index <- 1 + next_index rslt$score_list[[next_index]] <- ksea_scores_rslt rslt$name_list[[next_index]] <- contrast_label rslt$longname_list[[next_index]] <- contrast_longlabel - low_fdr_print( + ksea_low_fdr_print( rslt = rslt, i_cntrst = i_cntrst, i = next_index, @@ -3113,21 +5514,24 @@ ) } }, - error = function(e) str(e) + error = function(e) { + str(e) + cat_margins() + } ) } plotted_kinases <- NULL -if (length(rslt$score_list) > 1) { +if (g_can_run_ksea && length(rslt$score_list) > 1) { for (i in seq_len(length(ksea_heatmap_titles))) { hdr <- ksea_heatmap_titles[[i]] which_kinases <- i cat("\\clearpage\n\\begin{center}\n") if (i == const_ksea_astrsk_kinases) { - subsection_header(hdr) + cat(subsection_header(hdr)) } else { - subsection_header(hdr) + cat(subsection_header(hdr)) } cat("\\end{center}\n") @@ -3146,7 +5550,7 @@ m_cutoff = 1, # a numeric value between 0 and 1 indicating the p-value/FDR cutoff # for indicating significant kinases in the heatmap - p_cutoff = 0.05, + p_cutoff = params$kseaCutoffThreshold, # a binary input of TRUE or FALSE, indicating whether or not to perform # hierarchical clustering of the sample columns sample_cluster = TRUE, @@ -3163,131 +5567,446 @@ # - 1 : all kinases # - 2 : significant kinases # - 3 : non-significant kinases - which_kinases = which_kinases + which_kinases = which_kinases, + margins = c(7, 15) ) - cat("\\begin{center}\n") - cat("Color intensities reflects $z$-score magnitudes; hue reflects $z$-score sign. Asterisks reflect significance.\n") - cat("\\end{center}\n") + if (!is.null(plotted_kinases)) { + cat("\\begin{center}\n") + if (which_kinases != const_ksea_nonastrsk_kinases) + cat("Asterisks reflect significance.\n") + cat("\\end{center}\n") + } } # end for (i in ... } # end if (length ... - -for (i_cntrst in seq_len(length(rslt$score_list))) { - next_index <- i_cntrst - cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] - cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] - cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] - contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) - contrast_longlabel <- ( - sprintf( - "Trt %s {%s} -> Trt %s {%s}", - contrast_metadata_df[i_cntrst, "b_level"], - gsub( - pattern = ";", - replacement = ", ", - x = contrast_metadata_df[i_cntrst, "b_samples"], - fixed = TRUE - ), - contrast_metadata_df[i_cntrst, "a_level"], - gsub( - pattern = ";", - replacement = ", ", - x = contrast_metadata_df[i_cntrst, "a_samples"], - fixed = TRUE +``` + +```{r kseabar_calc, echo = FALSE, fig.dim = c(9.5, 6), results = 'asis'} +ksea_prints <- list() +ksea_barplots <- list() + for (i_cntrst in seq_len(length(rslt$score_list))) { + next_index <- i_cntrst + cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] + cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] + cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] + contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) + contrast_longlabel <- ( + sprintf( + "Class %s -> Class %s", + contrast_metadata_df[i_cntrst, "b_level"], + contrast_metadata_df[i_cntrst, "a_level"] + ) + ) + main_title <- ( + sprintf( + "Change from treatment %s to treatment %s", + contrast_metadata_df[i_cntrst, "b_level"], + contrast_metadata_df[i_cntrst, "a_level"] + ) ) + sub_title <- contrast_longlabel + tryCatch( + expr = { + ksea_scores_rslt <- rslt$score_list[[next_index]] + if (print_nb_messages) nbe(see_variable(ksea_scores_rslt)) #ACE + + if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { + sink(deferred <- file()) + ksea_low_fdr_print( + rslt = rslt, + i_cntrst = i_cntrst, + i = next_index, + a_level = cntrst_a_level, + b_level = cntrst_b_level, + fold_change = cntrst_fold_change, + caption = contrast_longlabel, + write_db = FALSE, + anchor = const_table_anchor_t + ) + cat("\n") + sink() + lines <- + paste( + readLines(deferred, warn = FALSE), + collapse = "\n" + ) + close(deferred) + sq_put(ksea_prints, lines) + sink(stderr()) + cat("\n---\n") + cat_variable(ksea_prints) + barplot_closure <- + ksea_low_fdr_barplot_factory( + rslt = rslt, + i_cntrst = i_cntrst, + i = next_index, + a_level = cntrst_a_level, + b_level = cntrst_b_level, + fold_change = cntrst_fold_change, + caption = contrast_longlabel + ) + if (rlang::is_closure(barplot_closure)) + sq_put(ksea_barplots, barplot_closure) + else + sq_put(ksea_barplots, no_op) + str(ksea_barplots) + cat("\n...\n") + sink() + } + }, + error = function(e) { + str(e) + cat_margins() + } + ) + } +``` + +```{r phosphoelm_kinase_upid_desc, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'} + +have_kinase_descriptions <- + if (!is.null(bzip2df(kinase_uprt_desc_lut, kinase_uprt_desc_lut_bz2)) && + !is.null(bzip2df(kinase_name_uprt_lut, kinase_name_uprt_lut_bz2)) + ) { + rownames(kinase_uprt_desc_lut) <- kinase_uprt_desc_lut$UniProtID + kinase_name_to_desc_uprt <- function(s) { + rslt <- NULL + tryCatch( + { + which_rows <- eval(s == kinase_name_uprt_lut$kinase) + kinase_uprtid <- + kinase_name_uprt_lut[which_rows, 2] + # filter for first _HUMAN match if any + grepl_human <- grepl("_HUMAN$", kinase_uprtid) + if (0 < sum(grepl_human)) + kinase_uprtid <- kinase_uprtid[grepl_human] + # filter for first match if any + if (0 < length(kinase_uprtid)) { + kinase_uprtid <- kinase_uprtid[1] + kinase_desc <- kinase_uprt_desc_lut[kinase_uprtid, 2] + if (!is.na(kinase_desc)) + rslt <- c(kinase_desc, kinase_uprtid) + else + rslt <- c(kinase_desc, "") + } + }, + warning = str + ) + rslt + } + TRUE + } else { + kinase_name_to_desc_uprt <- function(s) NULL + FALSE + } +``` + +```{r write_params, echo = FALSE, results = 'asis'} +# perhaps this should be moved into the functions section, eventually ... +write_params <- function(db) { + # write parameters to report + + # write parameters to SQLite output + + mqppep_anova_script_param_df <- data.frame( + script = "mqppep_anova_script.Rmd", + parameter = names(param_unlist), + value = param_unlist ) + ddl_exec(db, " + DROP TABLE IF EXISTS script_parameter; + " ) - main_title <- ( - sprintf( - "Change from treatment %s to treatment %s", - contrast_metadata_df[i_cntrst, "b_level"], - contrast_metadata_df[i_cntrst, "a_level"] - ) + ddl_exec(db, " + CREATE TABLE IF NOT EXISTS script_parameter( + script TEXT, + parameter TEXT, + value ANY, + UNIQUE (script, parameter) ON CONFLICT REPLACE + ) + ; + " + ) + RSQLite::dbWriteTable( + conn = db, + name = "script_parameter", + value = mqppep_anova_script_param_df, + append = TRUE ) - sub_title <- contrast_longlabel - tryCatch( - expr = { - ksea_scores_rslt <- rslt$score_list[[next_index]] - - if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { - low_fdr_barplot( - rslt = rslt, - i_cntrst = i_cntrst, - i = next_index, - a_level = cntrst_a_level, - b_level = cntrst_b_level, - fold_change = cntrst_fold_change, - caption = contrast_longlabel - ) - } - }, - error = function(e) str(e) - ) + + loaded_packages_df <- sessioninfo::package_info("loaded") + loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library) + loaded_packages_df <- data.frame( + package = loaded_packages_df$package, + version = loaded_packages_df$loadedversion, + date = loaded_packages_df$date + ) + #ACE cat("\\clearpage\n\\section{R package versions}\n") + #ACE data_frame_tabbing_latex( + #ACE x = loaded_packages_df, + #ACE tabstops = c(2.5, 1.25), + #ACE caption = "R package versions" + #ACE ) + cat("\\clearpage\n\\section{Input parameter settings}\n") + data_frame_tabbing_latex( + x = param_df, + tabstops = c(1.75), + underscore_whack = TRUE, + caption = "Input parameters", + verbatim = FALSE + ) +} + +if (!have_kinase_descriptions) { + write_params(db) + # We are done with output + RSQLite::dbDisconnect(db) + param_df_exit() + knitr::knit_exit() } ``` -```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} - -# Use enriched kinases to find enriched kinase-substrate pairs -enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash)) -all_enriched_substrates <- sqldf(" - SELECT - gene AS kinase, - ppep, - '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label - FROM ( - SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep - FROM pseudo_ksdata - WHERE GENE IN (SELECT kinase FROM enriched_kinases) +```{r kseabar, echo = FALSE, fig.dim = c(9.5, 12.3), results = 'asis'} +if (have_kinase_descriptions) { + my_section_header <- + sprintf( + "inases whose KSEA %s < %s\n", + ksea_cutoff_statistic, + signif(ksea_cutoff_threshold, 2) + ) + + # Use enriched kinases to find enriched kinase-substrate pairs + enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash)) + + enriched_kinase_descs <- + Reduce( + f = function(l, r) { + lkup <- kinase_name_to_desc_uprt(r) + if (is.null(lkup)) l + else r2 <- rbind( + l, + data.frame( + kinase = r, + uniprot_id = lkup[2], + description = lkup[1] + ) + ) + }, + x = enriched_kinases$kinase, + init = NULL + ) + + if (length(enriched_kinase_descs) > 0 && nrow(enriched_kinase_descs) > 0) { + cat("\n\\clearpage\n") + data_frame_tabbing_latex( + x = enriched_kinase_descs, + tabstops = c(0.9, 1.3), + headings = c("Kinase", "UniProt ID", "Description"), + caption = paste0("Descriptions of k", my_section_header) + ) + } + + if (FALSE) { + cat_variable(sqldf("SELECT kinase FROM enriched_kinases")) + cat_variable(sqldf(" + SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep + FROM pseudo_ksdata + WHERE gene IN (SELECT kinase FROM enriched_kinases) + ")) + data_frame_table_latex( + x = sqldf(" + SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep + FROM pseudo_ksdata + WHERE gene IN (SELECT kinase FROM enriched_kinases) + "), + justification = "l l l", + centered = TRUE, + caption = "substrates of enriched kinases", + anchor = c(const_table_anchor_p, const_table_anchor_t), + underscore_whack = TRUE ) - GROUP BY ppep - ") - -# helper used to label per-kinase substrate enrichment figure -cat_enriched_heading <- function(m, cut_args) { - cutoff <- cut_args$cutoff - kinase <- cut_args$kinase - statistic <- cut_args$statistic - threshold <- cut_args$threshold - cat("\\newpage\n") - if (nrow(m) > intensity_hm_rows) { - subsection_header( - paste( + data_frame_table_latex( + x = sqldf(" + SELECT + gene AS kinase, + ppep, + sub_gene, + '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label, + fdr_adjusted_anova_p, + quality, + min_group_obs_count + FROM ( + SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep + FROM pseudo_ksdata + WHERE gene IN (SELECT kinase FROM enriched_kinases) + ), + p_value_data + WHERE ppep = phosphopeptide + GROUP BY kinase, ppep + ORDER BY kinase, ppep, p_value_data.quality DESC + "), + justification = "l l l l l l l", + centered = TRUE, + caption = "labeled substrates of enriched kinases", + anchor = c(const_table_anchor_p, const_table_anchor_t), + underscore_whack = TRUE + ) + } + all_enriched_substrates <- sqldf(" + SELECT + gene AS kinase, + ppep, + sub_gene, + '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label, + fdr_adjusted_anova_p, + quality, + min_group_obs_count + FROM ( + SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep + FROM pseudo_ksdata + WHERE gene IN (SELECT kinase FROM enriched_kinases) + ), + p_value_data + WHERE ppep = phosphopeptide + GROUP BY kinase, ppep + ORDER BY kinase, ppep, p_value_data.quality DESC + ") + + all_enriched_substrates <- + all_enriched_substrates[ + all_enriched_substrates$quality >= params$minQuality, + , + drop = FALSE + ] + + all_enriched_substrates$sub_gene <- + sub( + " ///.*", + " ...", + all_enriched_substrates$sub_gene + ) + + all_enriched_substrates$label <- + with( + all_enriched_substrates, + sprintf( + "(%s-%s) %s", + kinase, + trunc_subgene(sub_gene), + ppep + ) + ) + + # this global is set to TRUE by cat_enriched_heading immediately below + g_neednewpage <- FALSE + + # helper used to label per-kinase substrate enrichment figure + cat_enriched_heading <- function(m, cut_args) { + cutoff <- cut_args$cutoff + kinase <- cut_args$kinase + if (g_neednewpage) cat("\\newpage\n") + g_neednewpage <- TRUE + if (nrow(m) > g_intensity_hm_rows) { + cat(subsection_header( sprintf( - "Lowest p-valued %d (of %d) enriched %s-substrates,", - intensity_hm_rows, + "Highest-quality %d (of %d) enriched %s-substrates", + g_intensity_hm_rows, nrow(m), kinase - ), - sprintf(" KSEA %s < %0.2f\n", statistic, threshold) - ) - ) - } else { - if (nrow(m) == 1) { - return(FALSE) + ) + )) } else { - subsection_header( - paste( + if (nrow(m) == 0) { + return(FALSE) + } else { + nrow_m <- nrow(m) + cat(subsection_header( sprintf( - "%d enriched %s-substrates,", - nrow(m), - kinase - ), - sprintf( - " KSEA %s < %0.2f\n", - statistic, - threshold + "%d enriched %s-substrate%s", + nrow_m, + kinase, + if (nrow_m > 1) "s" else "" ) + )) + } + } + cat("\n\n\n") + cat("\n\n\n") + return(TRUE) + } + + # -------------------------------- + # hack begin - show all substrates + enriched_substrates <- all_enriched_substrates + # add "FALSE &&" to prevent listing of substrates + if (show_enriched_substrates && nrow(enriched_substrates) > 0) { + short_row_names <- sub( + "$FAILED_MATCH_GENE_NAME", + "not_found", + enriched_substrates$sub_gene, + fixed = TRUE + ) + + if (print_nb_messages) nbe(see_variable(enriched_substrates)) + substrates_df <- with( + enriched_substrates, + data.frame( + kinase = kinase, + substrate = sub(" ///*", "...", short_row_names), + anova_p_value = signif(fdr_adjusted_anova_p, 2), + min_group_obs_count = signif(min_group_obs_count, 0), + quality = signif(quality, 3), + sequence = trunc_n(30)(ppep) ) ) - } + + substrates_df <- substrates_df[ + with(substrates_df, order(kinase, -quality)), + , + drop = FALSE + ] + + if (print_nb_messages) nbe(see_variable(substrates_df)) + if (nrow(substrates_df) < 1) + substrates_df$sequence <- c() + if (print_nb_messages) nbe(see_variable(substrates_df)) + names(substrates_df) <- headers_2nd_line <- + c("Kinase", "Substrate", "p-value", "per group)", "quality", "Sequence") + headers_1st_line <- c("", "", "ANOVA", "min(values", "", "") + data_frame_tabbing_latex( + x = substrates_df, + tabstops = c(1.2, 0.8, 0.5, 0.65, 0.5), + headings = c(headers_1st_line, headers_2nd_line), + caption = "Details for all enriched substrates of enriched kinases" + ) + rm( + enriched_substrates, + substrates_df, + short_row_names, + headers_1st_line, + headers_2nd_line + ) } - cat("\n\n\n") - cat("\n\n\n") - return(TRUE) + cat("\\clearpage\n") + # hack end - show all substrates + # -------------------------------- + + # print deferred tables and graphs for kinases from contrasts + for (i_cntrst in seq_len(length(ksea_prints))) { + #latex_samepage({ + cat(ksea_prints[[i_cntrst]]) + cat("\n") + ksea_barplots[[i_cntrst]]() + cat("\n") + cat("\\clearpage\n") + #}) + } + } - -# Disabling heatmaps for substrates pending decision whether to eliminate them altogether -if (FALSE) +``` + +```{r enriched, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'} +if (g_can_run_ksea) { + g_did_enriched_header <- FALSE for (kinase_name in sort(enriched_kinases$kinase)) { enriched_substrates <- all_enriched_substrates[ @@ -3295,135 +6014,256 @@ , drop = FALSE ] + ten_trunc_ppep <- trunc_enriched_substrate(enriched_substrates$ppep) + enriched_substrates$label <- with( + enriched_substrates, + sprintf( + "(%s) %s", + make.names( + sub("$FAILED_MATCH_GENE_NAME", "not_found", sub_gene, fixed = TRUE), + unique = TRUE + ), + ten_trunc_ppep + ) + ) # Get the intensity values for the heatmap enriched_intensities <- as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE]) + # Remove rows having too many NA values to be relevant - na_counter <- is.na(enriched_intensities) - na_counts <- apply(na_counter, 1, sum) - enriched_intensities <- - enriched_intensities[na_counts < ncol(enriched_intensities) / 2, , drop = FALSE] + good_rows <- (rowSums(enriched_intensities, na.rm = TRUE) != 0) + #ACE nbe(see_variable(good_rows), "\n") + enriched_substrates <- enriched_substrates[good_rows, , drop = FALSE] + enriched_intensities <- enriched_intensities[good_rows, , drop = FALSE] + # Rename the rows with the display-name for the heatmap - rownames(enriched_intensities) <- + short_row_names <- sub( + "$FAILED_MATCH_GENE_NAME", + "not_found", + enriched_substrates$sub_gene, + fixed = TRUE + ) + short_row_names <- + make.names(short_row_names, unique = TRUE) + long_row_names <- sapply( X = rownames(enriched_intensities), FUN = function(rn) { enriched_substrates[enriched_substrates$ppep == rn, "label"] } ) + rownames(enriched_intensities) <- long_row_names # Format as matrix for heatmap m <- as.matrix(enriched_intensities) + rownames(m) <- trunc_enriched_substrate(rownames(m)) + + #ACE nb("m with bad rows: ", see_variable(m), "\n") + #ACE good_rows <- (rowSums(m, na.rm = TRUE) != 0) + #ACE nb(see_variable(good_rows), "\n") + #ACE m <- m[good_rows, , drop = FALSE] + #ACE nb("m without(?) bad rows: ", see_variable(m), "\n") + #ACE nb(see_variable(short_row_names), "\n") + #ACE local_short_row_names <- short_row_names[good_rows] + #ACE local_long_row_names <- long_row_names[good_rows] + #ACE local_enriched_intensities <- enriched_intensities[local_long_row_names, ] + # Draw the heading and heatmap - if (nrow(m) > 0) { + nrow_m <- nrow(m) + if (nrow_m > 0) { + if (!g_did_enriched_header) { + cat("\n\\clearpage\n") + cat(section_header(paste0("K", my_section_header))) + g_did_enriched_header <- TRUE + } + is_na_m <- is.na(m) + cellnote_m <- is_na_m + cellnote_m[!is_na_m] <- "" + cellnote_m[is_na_m] <- "NA" cut_args <- new_env() cut_args$cutoff <- cutoff cut_args$kinase <- kinase_name cut_args$statistic <- ksea_cutoff_statistic cut_args$threshold <- ksea_cutoff_threshold number_of_peptides_found <- - draw_intensity_heatmap( + ppep_heatmap( m = m, + cellnote = cellnote_m, cutoff = cut_args, hm_heading_function = cat_enriched_heading, hm_main_title = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates", - suppress_row_dendrogram = FALSE + suppress_row_dendrogram = FALSE, + master_cex = 0.35, + sepcolor = "black", + colsep = sample_colsep ) + if (number_of_peptides_found > 1) { + + tryCatch( + { + rownames(m) <- short_row_names + cov_heatmap(m, nrow_m > g_intensity_hm_rows) + }, + error = function(e) { + cat( + sprintf( + "ERROR: %s\n\\newline\n", + mget("e") + ) + ) + cat( + sprintf( + "message: %s\n\\newline\n", + e$message + ) + ) + cat_margins() + } + ) + } + substrates_df <- with( + enriched_substrates, + data.frame( + substrate = sub(" ///*", "...", short_row_names), + sequence = trunc_long_ppep(ppep), + anova_p_value = signif(fdr_adjusted_anova_p, 2), + min_group_obs_count = signif(min_group_obs_count, 0), + quality = signif(quality, 3) + ) + ) + excess_substrates <- nrow(substrates_df) > g_intensity_hm_rows + if (excess_substrates) + substrates_df <- substrates_df[1:g_intensity_hm_rows, ] + names(substrates_df) <- headers_2nd_line <- + c("Substrate", "Sequence", "p-value", "per group)", "quality") + headers_1st_line <- c("", "", "ANOVA", "min(values", "") + if (1 < nrow(enriched_substrates)) + cat("\n\\newpage\n") + cat(subsubsection_header( + sprintf( + "Details for %s%s-substrates", + if (excess_substrates) + sprintf( + "%s \"highest quality\" ", + g_intensity_hm_rows + ) + else "", + kinase_name + ) + )) + substrates_df <- substrates_df[order(-substrates_df$quality), ] + data_frame_tabbing_latex( + x = substrates_df, + tabstops = c(0.8, 3.8, 0.6, 0.8), + headings = c(headers_1st_line, headers_2nd_line) + ) + } else { + if (print_nb_messages) nbe(see_variable(nrow_m > 0), "\n") } + if (print_nb_messages) nb("end kinase ", kinase_name, "\n") } -# Write output tabular files - -# get kinase, ppep, concat(kinase) tuples for enriched kinases - -kinase_ppep_label <- sqldf(" - WITH - t(ppep, label) AS - ( - SELECT DISTINCT - SUB_MOD_RSD AS ppep, - group_concat(gene, '; ') AS label + # Write output tabular files + + # get kinase, ppep, concat(kinase) tuples for enriched kinases + + if (print_nb_messages) nb("kinase_ppep_label <- ...\n") + if (print_nb_messages) nbe("kinase_ppep_label <- ...\n") + kinase_ppep_label <- sqldf(" + WITH + t(ppep, label) AS + ( + SELECT DISTINCT + SUB_MOD_RSD AS ppep, + group_concat(gene, '; ') AS label + FROM pseudo_ksdata + WHERE GENE IN (SELECT kinase FROM enriched_kinases) + GROUP BY ppep + ), + k(kinase, ppep_join) AS + ( + SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join FROM pseudo_ksdata WHERE GENE IN (SELECT kinase FROM enriched_kinases) - GROUP BY ppep - ), - k(kinase, ppep_join) AS - ( - SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join - FROM pseudo_ksdata - WHERE GENE IN (SELECT kinase FROM enriched_kinases) - ) - SELECT k.kinase, t.ppep, t.label - FROM t, k - WHERE t.ppep = k.ppep_join - ORDER BY k.kinase, t.ppep - ") - -# extract what we need from full_data -impish <- cbind(rownames(quant_data_imp), quant_data_imp) -colnames(impish)[1] <- "Phosphopeptide" -data_table_imputed_sql <- " - SELECT - f.*, - k.label AS KSEA_enrichments, - q.* - FROM - metadata_plus_p f - LEFT JOIN kinase_ppep_label k - ON f.Phosphopeptide = k.ppep, - impish q - WHERE - f.Phosphopeptide = q.Phosphopeptide - " -data_table_imputed <- sqldf(data_table_imputed_sql) -# Zap the duplicated 'Phosphopeptide' column named 'ppep' -data_table_imputed <- - data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] - -# Output with imputed, un-normalized data - -write.table( - data_table_imputed - , file = imputed_data_filename - , sep = "\t" - , col.names = TRUE - , row.names = FALSE - , quote = FALSE + ) + SELECT k.kinase, t.ppep, t.label + FROM t, k + WHERE t.ppep = k.ppep_join + ORDER BY k.kinase, t.ppep + ") + + + # extract what we need from full_data + impish <- cbind(rownames(quant_data_imp), quant_data_imp) + colnames(impish)[1] <- "Phosphopeptide" + data_table_imputed_sql <- " + SELECT + f.*, + k.label AS KSEA_enrichments, + q.* + FROM + metadata_plus_p f + LEFT JOIN kinase_ppep_label k + ON f.Phosphopeptide = k.ppep, + impish q + WHERE + f.Phosphopeptide = q.Phosphopeptide + " + data_table_imputed <- sqldf(data_table_imputed_sql) + # Zap the duplicated 'Phosphopeptide' column named 'ppep' + data_table_imputed <- + data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] + + # Output imputed, un-normalized data + if (print_nb_messages) nb("Output imputed, un-normalized data tabular file\n") + if (print_nb_messages) nbe("Output imputed, un-normalized data tabular file\n") + write.table( + data_table_imputed + , file = imputed_data_filename + , sep = "\t" + , col.names = TRUE + , row.names = FALSE + , quote = FALSE + ) + + + #output quantile normalized data + impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) + colnames(impish)[1] <- "Phosphopeptide" + data_table_imputed <- sqldf(data_table_imputed_sql) + # Zap the duplicated 'Phosphopeptide' column named 'ppep' + data_table_imputed <- + data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] + if (print_nb_messages) nb("Output quantile normalized data tabular file\n") + if (print_nb_messages) nbe("Output quantile normalized data tabular file\n") + write.table( + data_table_imputed, + file = imp_qn_lt_data_filenm, + sep = "\t", + col.names = TRUE, + row.names = FALSE, + quote = FALSE ) - -#output quantile normalized data -impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) -colnames(impish)[1] <- "Phosphopeptide" -data_table_imputed <- sqldf(data_table_imputed_sql) -# Zap the duplicated 'Phosphopeptide' column named 'ppep' -data_table_imputed <- - data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] -write.table( - data_table_imputed, - file = imp_qn_lt_data_filenm, - sep = "\t", - col.names = TRUE, - row.names = FALSE, - quote = FALSE -) - -ppep_kinase <- sqldf(" - SELECT DISTINCT k.ppep, k.kinase - FROM ( - SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep - FROM pseudo_ksdata - WHERE GENE IN (SELECT kinase FROM enriched_kinases) - ) k - ORDER BY k.ppep, k.kinase - ") - -RSQLite::dbWriteTable( - conn = db, - name = "ksea_enriched_ks", - value = ppep_kinase, - append = FALSE - ) + ppep_kinase <- sqldf(" + SELECT DISTINCT k.ppep, k.kinase + FROM ( + SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep + FROM pseudo_ksdata + WHERE GENE IN (SELECT kinase FROM enriched_kinases) + ) k + ORDER BY k.ppep, k.kinase + ") + + RSQLite::dbWriteTable( + conn = db, + name = "ksea_enriched_ks", + value = ppep_kinase, + append = FALSE + ) +} + +if (print_nb_messages) nb("RSQLite::dbWriteTable anova_signif\n") RSQLite::dbWriteTable( conn = db, @@ -3453,6 +6293,8 @@ " ) +if (print_nb_messages) nb("Output contents of `stats_metadata_v` table to tabular file\n") +if (print_nb_messages) nbe("Output contents of `stats_metadata_v` table to tabular file\n") write.table( dbReadTable(db, "stats_metadata_v"), file = anova_ksea_mtdt_file, @@ -3462,75 +6304,21 @@ quote = FALSE ) +cat("\n\\clearpage\n") ``` +# Data-processing summary flowchart + +![Flowchart showing ANOVA and KSEA data-processing steps](KSEA_impl_flowchart.pdf) + ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} cat("\\leavevmode\n\n\n") -# write parameters to report - -param_unlist <- unlist(as.list(params)) -param_df <- data.frame( - parameter = paste0("\\verb@", names(param_unlist), "@"), - value = paste0("\\verb@", gsub("$", "\\$", param_unlist, fixed = TRUE), "@") - ) - -data_frame_latex( - x = param_df, - justification = "p{0.35\\linewidth} p{0.6\\linewidth}", - centered = TRUE, - caption = "Input parameters", - anchor = const_table_anchor_bp, - underscore_whack = FALSE - ) - -# write parameters to SQLite output - -mqppep_anova_script_param_df <- data.frame( - script = "mqppep_anova_script.Rmd", - parameter = names(param_unlist), - value = param_unlist - ) -ddl_exec(db, " - DROP TABLE IF EXISTS script_parameter; - " -) -ddl_exec(db, " - CREATE TABLE IF NOT EXISTS script_parameter( - script TEXT, - parameter TEXT, - value ANY, - UNIQUE (script, parameter) ON CONFLICT REPLACE - ) - ; - " -) -RSQLite::dbWriteTable( - conn = db, - name = "script_parameter", - value = mqppep_anova_script_param_df, - append = TRUE -) - +write_params(db) # We are done with output RSQLite::dbDisconnect(db) + +cat("\\clearpage\n\\section{R package versions}\n") +utils::toLatex(utils::sessionInfo()) ``` -<!-- -There's gotta be a better way... - -loaded_packages_df <- sessioninfo::package_info("loaded") -loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library) -loaded_packages_df <- data.frame( - package = loaded_packages_df$package, - version = loaded_packages_df$loadedversion, - date = loaded_packages_df$date - ) -data_frame_latex( - x = loaded_packages_df, - justification = "l | l l", - centered = FALSE, - caption = "Loaded R packages", - anchor = const_table_anchor_bp - ) --->