mqppep_anova_script.Rmd @ changeset 5:b889e05ce77d (draft, default, tip)
Mercurial repository: repos / galaxyp / mqppep_preproc
Commit message: planemo upload for repository
https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep
commit aa5f4a19e76ec636812865293b8ee9b196122121

author   | galaxyp
date     | Fri, 17 Feb 2023 22:38:40 +0000
parents  | 5c2beb4eb41c
children | (none)
---
title: "MaxQuant Phosphoproteomic Enrichment Pipeline ANOVA/KSEA"
author:
  - "Nick Graham^[ORCiD 0000-0002-6811-1941, University of Southern California: Los Angeles, CA, US]"
  - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]"
  - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]"
date:
  - "May 28, 2018"
  - "; revised December 7, 2022"
lot: true
output:
  pdf_document:
    toc: true
    toc_depth: 2
    keep_tex: true
    dev: pdf
    includes:
      in_header: mqppep_anova_preamble.tex
    latex_macros: false
    raw_tex: true
urlcolor: blue
params:
  alphaFile: "test-data/alpha_levels.tabular"
  inputFile: "test-data/test_input_for_anova.tabular"
  preprocDb: "test-data/test_input_for_anova.sqlite"
  kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
  regexSampleNames: "\\.\\d+[A-Z]$"
  regexSampleGrouping: "\\d+"
  groupFilterPatterns: ".+"
  groupFilter: !r c("none", "exclude", "include")[1]
  imputationMethod: !r c("group-median", "median", "mean", "random")[4]
  kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5]
  #imputationMethod: !r c("group-median", "median", "mean", "random")[1]
  # how should the sample-group filter patterns be interpreted?
  # - "f": fixed patterns (like `grep -F`)
  # - "p": PERL-compatible (like `grep -P`)
  # - "r": extended grep patterns (like `grep -E`)
  # use what case sensitivity?
  # - "i": case-insensitive matching (like `grep -i`)
  groupFilterMode: !r c("r", "ri", "p", "pi", "f", "fi")[1]
  # what pattern should be used for the first data column
  # (extended grep pattern, case sensitive)
  firstDataColumn: "^Intensity[^_]"
  # for small-random-value imputation, what percentile should be the center?
  meanPercentile: 50
  #meanPercentile: 1
  # for small-random-value imputation, what should the `s / mean(x)` ratio be?
  sdPercentile: 1.0
  # output path for imputed data file
  imputedDataFilename: "test-data/imputedDataFilename.txt"
  # output path for imputed/quantile-normalized/log-transformed data file
  imputedQNLTDataFile: "test-data/imputedQNLTDataFile.txt"
  # output path for contents of `stats_metadata_v` table
  anovaKseaMetadata: "test-data/anovaKseaMetadata.txt"
  # how to test one variable with > 2 categories (e.g., aov or kruskal.test)
  oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1]
  # how to test one variable with 2 categories (e.g., oneway.test)
  oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3]
  # what should be the minimum quality for consideration in both
  minQuality: 0
  # cut off KSEA by FDR (recommended) or by raw p-value
  kseaCutoffStatistic: !r c("FDR", "p.value")[1]
  # KSEA threshold: 0.05 (conventional) or higher (perhaps better),
  # "perhaps better" because KSEA is a hypothesis-generator, not a test
  #kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5)[1]
  # minimum number of substrates required for a kinase to be considered in KSEA
  kseaMinSubstrateCount: 1
  # Should KSEA aggregate signed log2FC or absolute log2FC?
  #   FALSE: use raw log2FC for KSEA, as KSEAapp::KSEA.Scores does
  #   TRUE:  use abs(log2FC) for KSEA, as Justin Drake requested; this is a
  #          justifiable deviation from the KSEAapp::KSEA.Scores algorithm.
kseaUseAbsoluteLog2FC: TRUE #kseaUseAbsoluteLog2FC: FALSE # minimum number of observed values per sample-group intensityMinValuesPerGroup: 1 # maximum number of heatmap rows (result are poor when > 50) intensityHeatmapRows: 50 # what should be the primary criterion to eliminate excessive heatmap rows intensityHeatmapCriteria: !r c("quality", "na_count", "p_value")[1] # should correlation among substrates be used (rather than covariance) correlateSubstrates: TRUE # only show covariance among variables having variance > 1 filterCovVarGT1: FALSE # maximum number of residues to display for ppeps in rownames or columnames ppepTruncN: 10 # maximum number of characters of subgenes to display in rownames or columnames subgeneTruncN: 10 # maximum number of characters for paste(subgene, ppep) for enrichment plots substTruncN: 20 # should boxplots use variable-width boxes to reflect # of samples boxPlotVarWidth: TRUE # should boxplots use notched boxes to reflect difference between samples boxPlotNotch: TRUE # look-up tables for kinase descriptions kinaseNameUprtLutBz2: "./kinase_name_uniprot_lut.tabular.bz2" kinaseUprtDescLutBz2: "./kinase_uniprot_description_lut.tabular.bz2" # should debugging trace messages be printed? showEnrichedSubstrates: FALSE # should row-scaling be applied to heatmaps: "none" or "row" defaultHeatMapRowScaling: !r c("none", "row")[1] # how missing values be displayed on heat maps: "NA" or " " heatMapNAcellNote: !r c(" ", "NA")[1] # how missing values be displayed on heat maps: "NA" or " " heatMapNAgrey: "#D8D8D8" # temporary hack heatMapNAsubstitute: TRUE # what color should be used for missing values be displayed on heat maps heatMapNAcellColor: "grey15" # should debugging trace messages be printed? printTraceMsgs: FALSE # when debugging files are needed, set debugFileBasePath to the path # to the directory where they should be written debugFileBasePath: !r if (TRUE) NULL else "test-data" # should debugging nb/nbe messages be printed? printNBMsgs: FALSE #printNBMsgs: TRUE --- ```{r setup, include = FALSE, results = 'asis'} # simple debug messaging print_nb_messages <- params$printNBMsgs nb <- if (!print_nb_messages) { function(...) invisible() } else { function(..., f = cat) f("\n$\\exists{}\\supset\\forall{}$", ...) } nbe <- if (!print_nb_messages) { function(...) invisible() } else { function(..., f = cat, file = stderr()) { cat( stringi::stri_unescape_unicode("\nN.B. 
\\u2203\\u2283\\u2200"), ..., file = file ) } } #ref for debugging: https://yihui.org/tinytex/r/#debugging options(tinytex.verbose = TRUE) # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10), dpi = 300) # freeze the random number generator so the same results will be produced # from run to run set.seed(28571) ### LIBRARIES if (print_nb_messages) nbe("library(gplots)") library(gplots) if (print_nb_messages) nbe("library(caret)") # load caret for nearZeroVar if (print_nb_messages) nbe("Please ignore any messages about systemd.\n") library(caret) if (print_nb_messages) nbe("library(DBI)") library(DBI) if (print_nb_messages) nbe("library(RSQLite)") library(RSQLite) if (print_nb_messages) nbe("library(sqldf)\n") # Suppress "Warning: no DISPLAY variable so Tk is not available" suppressWarnings(suppressMessages(library(sqldf))) # required but not added to search list: # - DBI # - RSQLite # - ggplot2 # - knitr # - latex2exp # - preprocessCore # - reshape2 # - vioplot ### CONSTANTS const_boxplot_fill <- "grey94" const_ksea_astrsk_kinases <- 1 const_ksea_nonastrsk_kinases <- 2 const_ksea_all_kinases <- 3 const_log10_e <- log10(exp(1)) const_stripchart_cex <- 0.5 const_stripchart_jitter <- 0.3 const_table_anchor_bp <- "bp" const_table_anchor_ht <- "ht" const_table_anchor_p <- "p" const_table_anchor_t <- "t" const_table_anchor_tbp <- "tbp" ### GLOBAL VARIABLES (params) ## functions to process params is_string_null_or_empty <- function(x) { # N. B. non-strings are intentionally treated as NULL if (is.null(x)) TRUE else if (!is.character(x)) TRUE else x == "" } ##' Catch *and* save both errors and warnings, and in the case of ##' a warning, also keep the computed result. ##' return result as list(value = ..., warning = ...) ##' - value will be: ##' - the result if no exception is thrown ##' - the exception if an exception is caught ##' - warning will be a string except perhaps when warning argument is not NULL ##' ##' adapted from `demo(error.catching)` ##' ##' @title tryCatch both warnings (with value) and errors ##' @param expr an \R expression to evaluate ##' @return a list with 'value' and 'warning', where ##' 'value' may be an error caught. ##' @author Martin Maechler; ##' Copyright (C) 2010-2012 The R Core Team try_catch_w_e <- function(expr, error = function(e) e, warning = NULL) { wrn <- NULL # warning handler w_handler <- if (is.function(warning)) warning else function(w) { wrn <<- w invokeRestart("muffleWarning") } e_handler <- if (is.function(error)) error else function(e) e # return result as list(value = ..., warning = ...) 
# - value will be: # - the result if no exception is thrown # - the exception if an exception is caught list( value = withCallingHandlers( tryCatch( expr, error = e_handler ), warning = w_handler ), warning = wrn ) } see_kvp <- function(format, key, value, suffix = "") { if ( !all( is.character(format), is.character(key), is.character(value), is.character(suffix) ) ) { cat("all arguments to see_kvp should be character") knitr::knit_exit() } result <- sprintf(format, value) if (length(result) > 1) { sprintf( "%s = c(%s)%s", whack_underscores(key), paste(result, collapse = ", "), suffix ) } else { sprintf( "%s = %s%s", key, result, suffix ) } } see_logical <- function(x, suffix = "", xprssn = deparse1(substitute(x))) { result <- as.character(as.logical(x)) # handle NAs and NaNs result[is.na(result)] <- "NA" see_kvp( format = "%s", key = xprssn, value = result, suffix = suffix ) } see_numeric <- function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) { if (is.numeric(digits) && is.numeric(x)) { digits <- as.integer(digits) digits <- min(16, max(0, digits)) format <- paste0("%0.", as.character(digits), "g") result <- sprintf(format, x) see_kvp( format = "%s", key = xprssn, value = result, suffix = suffix ) } } see_character <- function(x, suffix = "", xprssn = deparse1(substitute(x))) { if (is.character(x)) { see_kvp( format = "%s", key = xprssn, value = sprintf("\"%s\"", x), suffix = suffix ) } } see_variable <- function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) { if (is.character(x)) { see_character(x, suffix, xprssn) } else if (is.numeric(x)) { see_numeric(x, suffix, digits, xprssn) } else if (is.logical(x)) { see_logical(x, suffix, xprssn) } else { f <- file("") sink(f) str(x) msg <- paste(readLines(f), collapse = "\n") sink() close(f) paste0( "see_variable - str(", xprssn, "):\n", msg, "\n" ) } } divert_warnings <- function(expr, classes = "warning") { withCallingHandlers( expr, warning = function(w) { cat(" divert_warnings: ", w$message, "\n", file = stderr()) if (inherits(w, classes)) tryInvokeRestart("muffleWarning") } ) } # ref: https://tug.org/texinfohtml/latex2e.html # LaTeX sets aside the following characters for special purposes. # For example, the percent sign % is for comments. # They are called reserved characters or special characters. # They are all discussed elsewhere in this manual. # # $ % & { } _ ~ ^ \ # # # If you want a reserved character to be printed as itself, in the text body # font, for all but the final three characters in that list simply put # a backslash \ in front of the character. # Thus, typing \$1.23 will produce $1.23 in your output. # # As to the last three characters, to get a tilde in the text body font, # use \~{} (omitting the curly braces would result in the next character # receiving a tilde accent). # Similarly, to get a text body font circumflex use \^{}. # To get a backslash in the font of the text body enter \textbackslash{}. 
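
# Illustrative aside (not executed): a minimal sketch of escaping the LaTeX
# reserved characters listed above with gsub(); `demo_tex_escape` is a
# hypothetical name used only for this example, and the escaping this script
# actually applies is `whack_math`, defined just below.
if (FALSE) {
  demo_tex_escape <- function(s) {
    # escape backslash first, to a form without braces, so that the brace
    # escapes added in the loop below do not mangle it
    s <- gsub("\\", "\\textbackslash ", s, fixed = TRUE)
    for (ch in c("#", "$", "%", "&", "{", "}", "_")) {
      s <- gsub(ch, paste0("\\", ch), s, fixed = TRUE)
    }
    s <- gsub("~", "\\~{}", s, fixed = TRUE)
    s <- gsub("^", "\\^{}", s, fixed = TRUE)
    s
  }
  # e.g., demo_tex_escape("50%_of_spectra") yields the characters 50\%\_of\_spectra
  demo_tex_escape("50%_of_spectra")
}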
whack_math <- if (TRUE) { function(v) { v <- as.character(v) w <- v w <- gsub("\\", "\\textbackslash ", v, fixed = TRUE) w <- Reduce( f = function(l, r) { ptrn <- paste0("\\", r) for (i in seq_along(l)) { if (!grepl(ptrn, l[i], fixed = TRUE)) { l[i] <- gsub(r, ptrn, l[i], fixed = TRUE) } } l }, x = c("#", "$", "%", "&", "{", "}", "_"), init = w ) w <- gsub("^", "\\^{}", w, fixed = TRUE) w <- gsub("\\textbackslash \\_", "\\_", w, fixed = TRUE) return(w) } } else { function(v) { v <- as.character(v) w <- v w <- gsub("\\", "\\textbackslash ", v, fixed = TRUE) w <- Reduce( f = function(l, r) { if (length(l) > 1) { cat("whack_math: deparse1(v) = `", deparse1(v), "`\n", file = stderr()) cat("whack_math: v = `", v, "`\n", file = stderr()) cat(" Reduce f: l = `", l, "` r = `", r, "`\n", file = stderr() ) divert_warnings(grepl(r, l, fixed = TRUE)) } ptrn <- paste0("\\", r) cat("ptrn: `", ptrn, "`\n", file = stderr()) for (i in seq_along(l)) { cat(" before l[i] = ", l[i], "\n", file = stderr()) if (!grepl(ptrn, l[i], fixed = TRUE)) { l[i] <- gsub(r, ptrn, l[i], fixed = TRUE) } cat(" after l[i] = ", l[i], "\n", file = stderr()) } l }, x = c("#", "$", "%", "&", "{", "}", "_"), init = w ) w <- gsub("^", "\\^{}", w, fixed = TRUE) w <- gsub("\\textbackslash \\_", "\\_", w, fixed = TRUE) return(w) if (all(v == w)) v else paste0("\\texttt{", w, "}") } } whack_underscores <- whack_math ## dump params to stderr (remove this eventually) if (FALSE) nbe(see_variable(params)) ## unlist params for eventual output param_unlist <- unlist(as.list(params)) # no need to whack underscores and dollars because this is verbatim param_df <- data.frame( parameter = paste0("\\verb@", names(param_unlist), "@"), value = paste0( "\n\\begin{tiny}\n\\verb@", param_unlist, "@\n\\end{tiny}" ) ) param_df <- data.frame( parameter = names(param_unlist), value = param_unlist ) param_df <- param_df[order(param_df$parameter), ] ## general output control debug_file_base_path <- params$debugFileBasePath print_trace_messages <- params$printTraceMsgs show_enriched_substrates <- params$showEnrichedSubstrates boxplot_varwidth <- params$boxPlotVarWidth boxplot_notch <- params$boxPlotNotch ## parameters for static data kinase_name_uprt_lut_bz2 <- params$kinaseNameUprtLutBz2 kinase_uprt_desc_lut_bz2 <- params$kinaseUprtDescLutBz2 ## parameters for input file preproc_db <- params$preprocDb alpha_file <- params$alphaFile input_file <- params$inputFile # First data column - ideally, this could be detected via # regexSampleNames, but for now leave it as is. first_data_column <- params$firstDataColumn fdc_is_integer <- is.integer(first_data_column) if (fdc_is_integer) { first_data_column <- as.integer(params$firstDataColumn) } ## parameters for output files ksea_app_prep_db <- params$kseaAppPrepDb imputed_data_filename <- params$imputedDataFilename imp_qn_lt_data_filenm <- params$imputedQNLTDataFile anova_ksea_mtdt_file <- params$anovaKseaMetadata ## parameters for imputation # Imputation method, should be one of # "random", "group-median", "median", or "mean" imputation_method <- params$imputationMethod # Selection of percentile of logvalue data to set the mean for random number # generation when using random imputation mean_percentile <- params$meanPercentile / 100.0 # deviation adjustment-factor for random values; real number. 
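
# (The adjustment factor described above is read into sd_percentile just
# below.)  Illustrative aside (not executed): a rough sketch, under stated
# assumptions, of percentile-centered random imputation -- missing values
# are drawn from a normal distribution centered on a chosen percentile of
# the observed values, with spread scaled by a ratio.  `demo_impute_random`
# is a hypothetical name for this example only; the imputation this script
# actually performs is driven later by mean_percentile and sd_percentile.
if (FALSE) {
  demo_impute_random <- function(x, center_quantile = 0.5, sd_ratio = 1.0) {
    observed <- x[is.finite(x)]
    center <- quantile(observed, probs = center_quantile)
    # one plausible reading of the ratio; the script's own code governs
    spread <- sd_ratio * sd(observed)
    n_missing <- sum(!is.finite(x))
    x[!is.finite(x)] <- rnorm(n_missing, mean = center, sd = spread)
    x
  }
  demo_impute_random(c(5.1, NA, 4.8, NA, 5.6), center_quantile = 0.5)
}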
sd_percentile <- params$sdPercentile

## parameters for group parsing and filtering

# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
regex_sample_names <- params$regexSampleNames

# Regular expression to extract Sample Grouping from Sample Name;
# if an error occurs, compare smpl_trt vs. sample_name_matches
# to see if groupings/pairs line up
# e.g., "(\\d+)"
regex_sample_grouping <- params$regexSampleGrouping

# What are the patterns for filtering sample groups?
# How should sample groups be filtered?
# - none: do not filter
# - include: include sample groups matching filter
# - exclude: include sample groups not matching filter
sample_group_filter <- params$groupFilter
if (grepl("f", params$groupFilterMode, fixed = TRUE)) {
  sample_group_filter_perl <- FALSE
  sample_group_filter_fixed <- TRUE
} else if (grepl("p", params$groupFilterMode, fixed = TRUE)) {
  sample_group_filter_perl <- TRUE
  sample_group_filter_fixed <- FALSE
} else {
  # normal regex
  sample_group_filter_perl <- FALSE
  sample_group_filter_fixed <- FALSE
}
sample_group_filter_nocase <- grepl("i", params$groupFilterMode, fixed = TRUE)

# Which PCRE patterns should be included or excluded
group_filter_patterns_csv <- params$groupFilterPatterns
sample_group_filter_patterns <- strsplit(
  x = group_filter_patterns_csv,
  split = ",",
  fixed = TRUE
)[[1]]

## parameters for hypothesis testing

one_way_all_categories_fname <- params$oneWayManyCategories
one_way_all_categories <- try_catch_w_e(
  match.fun(one_way_all_categories_fname))
if (!is.function(one_way_all_categories$value)) {
  write("fatal error for parameter oneWayManyCategories:", stderr())
  write(one_way_all_categories$value$message, stderr())
  if (sys.nframe() > 0) {
    cat("Cannot continue and quit() failed. Goodbye.")
    knitr::knit_exit()
    quit(save = "no", status = 1)
  }
}
one_way_all_categories <- one_way_all_categories$value

one_way_two_categories_fname <- params$oneWayTwoCategories
one_way_two_categories <- try_catch_w_e(
  match.fun(one_way_two_categories_fname))
if (!is.function(one_way_two_categories$value)) {
  cat("fatal error for parameter oneWayTwoCategories: \n")
  cat(one_way_two_categories$value$message, fill = TRUE)
  if (sys.nframe() > 0) {
    cat("Cannot continue and quit() failed. 
Goodbye.") knitr::knit_exit() quit(save = "no", status = 1) } } one_way_two_categories <- one_way_two_categories$value ## parameters for KSEA ksea_cutoff_statistic <- params$kseaCutoffStatistic ksea_cutoff_threshold <- params$kseaCutoffThreshold ksea_min_substrate_count <- params$kseaMinSubstrateCount ## parameters for global variables consumed by functions # intensityHeatmapCriteria: !r c("na_count", "p_value")[2] # TODO switch to 1 # TODO Validate within list g_intensity_hm_criteria <- params$intensityHeatmapCriteria if (is_string_null_or_empty(g_intensity_hm_criteria)) { cat("invalid intensityHeatmapCriteria parameter (must be string)") knitr::knit_exit() } switch( g_intensity_hm_criteria, "quality" = NULL, "na_count" = NULL, "p_value" = NULL, { with( params, cat( sprintf( "invalid %s (must be %s)", see_variable(intensityHeatmapCriteria), "one of quality or na_count or p_value" ) ) ) knitr::knit_exit() } ) g_default_heatmap_row_scaling <- params$defaultHeatMapRowScaling if ( !is.character(g_default_heatmap_row_scaling) || !(g_default_heatmap_row_scaling %in% c("row", "none")) ) { cat("invalid defaultHeatMapRowScaling (must be 'row' or 'none')") knitr::knit_exit() } # intensityHeatmapRows: 50 # TODO Validate >> 0 < 75 g_intensity_hm_rows <- params$intensityHeatmapRows if (!is.integer(g_intensity_hm_rows) || g_intensity_hm_rows < 1) { cat("invalid intensityHeatmapRows (must be integer > 0)") knitr::knit_exit() } g_intensity_min_per_class <- params$intensityMinValuesPerGroup if (!is.integer(g_intensity_min_per_class) || g_intensity_min_per_class < 0) { cat("invalid intensityMinValuesPerGroup (must be integer > -1") knitr::knit_exit() } g_correlate_substrates <- params$correlateSubstrates if (is.na(as.logical(g_correlate_substrates))) { cat("invalid correlateSubstrates (must be TRUE or FALSE)") knitr::knit_exit() } g_filter_cov_var_gt_1 <- params$filterCovVarGT1 if (is.na(as.logical(g_filter_cov_var_gt_1))) { cat("invalid filterCovVarGT1 parameter (must be TRUE or FALSE)") knitr::knit_exit() } # TODO Validate >> 0 < 30 g_ppep_trunc_n <- params$ppepTruncN # TODO Validate >> 0 < 30 g_subgene_trunc_n <- params$subgeneTruncN # TODO Validate >> 0 < 30 g_sbstr_trunc_n <- params$substTruncN ### OPERATORS # Test for exclusion # ref: https://www.reneshbedre.com/blog/in-operator-r.html `%notin%` <- Negate(`%in%`) # Augmented assignment # ref: https://www2.cs.arizona.edu/icon/refernce/infix2.htm#aug_assign `%||:=%` <- function(lvalue, ...) { pf <- parent.frame() rvalue <- Reduce(paste0, x = ..., init = lvalue) assign( x = as.character(substitute(lvalue)), value = rvalue, pos = pf ) invisible(rvalue) } # Infix concatenation `%||%` <- function(lvalue, rvalue) paste0(lvalue, rvalue) ### FUNCTIONS no_op <- function() { } # this function is not used in this file and should be removed while # factoring out reusable code all_apply <- function(f, v, na_rm = TRUE, ...) { Reduce( f = function(l, r) if (na_rm && is.na(r)) TRUE else l && r, x = sapply(X = v, FUN = f, ...), init = TRUE ) } write_debug_file <- function(data_frame) { if (!is.null(debug_file_base_path)) { s_path <- sprintf( "%s/%s.txt", debug_file_base_path, deparse(substitute(data_frame)) ) write.table( data_frame, file = s_path, sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE ) } } # ref: http://adv-r.had.co.nz/Environments.html # "When creating your own environment, note that you should set its parent # environment to be the empty environment. This ensures you don't # accidentally inherit objects from somewhere else." 
# Caution: this prevents `with(my_env, expr)` from working when `expr` # contains anything from the global environment, even operators! # Hence, `x <- 1; get("x", new_env())` fails by design. new_env <- function() { new.env(parent = emptyenv()) } # make apply readable for rows row_apply <- function(x, fun, ..., simplify = TRUE) { apply(x, MARGIN = 1, fun, ..., simplify = TRUE) } # make apply readable for columns column_apply <- function(x, fun, ..., simplify = TRUE) { apply(x, MARGIN = 2, fun, ..., simplify = TRUE) } ##' Produce a vector of boolean values whose i-th value is TRUE when any ##' member of v matches the i-th membr of s, where i in 1:seq_len(length(s)) ##' ##' @title Search multiple strings for matches of multiple substrings ##' @param v a vector of substrings to match ##' @param s a vector of strings to search for matches ##' @param ... additional arguments to grepl ##' @return a list with keys in s and valuse that are vectors of elements of v ##' @author Art Eschenlauer ##' Copyright (C) 2022 Art Eschenlauer; ##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms mgrepl <- function(v, s, ...) { grpl_rslt <- rep_len(0, length(s)) for (vi in v) { grpl_rslt_v <- sapply( X = s, FUN = function(t) { Reduce( f = function(l, r) if (is.null(l)) r else c(l, r), x = sapply( X = vi, FUN = function(f) grepl(f, t, ...) ), init = c() ) }, simplify = "array" ) grpl_rslt <- grpl_rslt + grpl_rslt_v } unname(grpl_rslt > 0) } ##' Produce positions in a vector where succeeding value != current valus ##' ##' @title Search vector for neighboring positions having different values ##' @param v a vector of comparable numeric values (e.g. integers) ##' @return a vector of positions i where v[i] != v[i + 1] ##' @author Art Eschenlauer ##' Copyright (C) 2022 Art Eschenlauer; ##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms transition_positions <- function(v) { Reduce( f = function(l, i) if ((i != 1) && (v[i - 1] != v[i])) c(l, i - 1) else l, x = seq_along(v)[-1:0], init = c() ) } ### figure debug functions cat_par_vector <- function(par_name, lbl = "", newlines = TRUE) { cat( sprintf( "%spar(%s) = c(%s)%s", lbl, par_name, paste(par(par_name), collapse = ", "), if (newlines) "\n\n" else "" ) ) } cat_margins <- function(lbl = NULL) { for (p in c("fig", "fin", "mar", "mai", "omd", "omi", "oma")) cat_par_vector(p, if (!is.null(lbl)) paste0(lbl, " ") else NULL) } cat_variable <- function(x, suffix = "", digits = 3, force_str = FALSE) { xprssn <- deparse1(substitute(x)) if (force_str || is.matrix(x) || is.list(x) || is.data.frame(x)) { cat( paste0( "\n\\texttt{\\textbf{", whack_underscores(xprssn), "}} [", typeof(x), ",", mode(x), "] =\n" ) ) cat("\n\\begin{verbatim}\n") str(x) cat("\n\\end{verbatim}\n") } else { cat("\n", see_variable(x, suffix, digits, xprssn)) } } ### structure helper functions # ref: staque.R - Icon-oriented stack and queue operations # - https://gist.github.com/eschen42/917690355e53918b9e7ba7138a02d1f8 # # sq_get(v):x produces the leftmost element of v and removes it from v, # but produces NA if v is empty sq_get <- function(v) { if (length(v) == 0) return(NA) assign(as.character(substitute(v)), v[-1], parent.frame()) return(v[1]) } # # sq_put(v,x1,...,xn):v puts x1, x2, ..., xn onto the right end of v, # producing v. # Values are pushed in order from left to right, # so xn becomes the last (rightmost) value on v. # sq_put(v) with no second argument does nothing. sq_put <- function(v, x = NA, ...) 
{ pf <- parent.frame() if (is.null(x)) return(pf$v) if ( !(length(x) > 1) && !rlang::is_closure(x) && is.na(x) ) return(pf$v) assign(as.character(substitute(v)), c(v, x, ...), pf) pf[[as.character(substitute(v))]] } ### numerical/statistical helper functions any_nan <- function(x) { !any(x == "NaN") } # determine standard deviation of quantile to impute sd_finite <- function(x) { ok <- is.finite(x) sd(x[ok]) } # compute anova raw p-value anova_func <- function(x, grouping_factor, one_way_f) { subject <- data.frame( intensity = x ) x_aov <- one_way_f( formula = intensity ~ grouping_factor, data = subject ) pvalue <- if (identical(one_way_f, aov)) summary(x_aov)[[1]][["Pr(>F)"]][1] else pvalue <- x_aov$p.value pvalue } # This code adapted from matrixcalc::is.positive.definite # Notably, this simply tests without calling stop() # nolint start # squash un-actionable cyclomatic_complexity warning is_positive_definite <- function(x, tol = 1e-08) { # nolint end if (!is.matrix(x)) return(FALSE) if (!is.numeric(x)) return(FALSE) if (nrow(x) < 1) return(FALSE) if (ncol(x) < 1) return(FALSE) if (nrow(x) != ncol(x)) return(FALSE) sum_symm <- sum(x == t(x), na.rm = TRUE) value_count <- Reduce("*", dim(x)) if (sum_symm != value_count) return(FALSE) eigenvalues <- eigen(x, only.values = TRUE)$values n <- nrow(x) for (i in 1:n) { if (abs(eigenvalues[i]) < tol) { eigenvalues[i] <- 0 } } if (any(eigenvalues <= 0)) { return(FALSE) } return(TRUE) } ### LaTeX functions # Use this like print.data.frame, from which it is adapted: data_frame_table_latex <- function( x, # digits to pass to format.data.frame digits = NULL, # TRUE -> right-justify columns; FALSE -> left-justify right = TRUE, # maximumn number of rows to print max = NULL, # string with justification of each column justification = NULL, # TRUE to center on page centered = TRUE, # optional caption caption = NULL, # h(inline); b(bottom); t (top) or p (separate page) anchor = "h", # set underscore_whack to TRUE to escape underscores underscore_whack = TRUE, # how to emit results emit = cat ) { if (is.null(justification)) justification <- Reduce( f = paste, x = rep_len(if (right) "r" else "l", length(colnames(x))) ) n <- length(rownames(x)) if (length(x) == 0L) { emit( sprintf( # if n is one, use singular 'row', else use plural 'rows' ngettext( n, "data frame with 0 columns and %d row", "data frame with 0 columns and %d rows" ), n ), "\n", sep = "" ) } else if (n == 0L) { emit("0 rows for:\n") latex_itemized_list( v = names(x), underscore_whack = underscore_whack ) } else { if (is.null(max)) max <- getOption("max.print", 99999L) if (!is.finite(max)) { cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max) knitr::knit_exit() } omit <- (n0 <- max %/% length(x)) < n m <- as.matrix( format.data.frame( if (omit) x[seq_len(n0), , drop = FALSE] else x, digits = digits, na.encode = FALSE ) ) emit( # h(inline); b(bottom); t (top) or p (separate page) paste0("\\begin{table}[", anchor, "]"), "\\leavevmode", sep = "\n" ) if (!is.null(caption)) emit(paste0(" \\caption{", caption, "}")) if (centered) emit("\\centering\n") emit( paste( " \\begin{tabular}{", justification, "}\n", sep = "" ) ) # ref for top and bottom struts (\T and \B): # https://tex.stackexchange.com/a/50355 if (!is.null(caption)) emit("\\B \\\\\n") latex_table_row( v = colnames(m), extra = " \\T \\B", underscore_whack = underscore_whack ) emit("\\hline \\\\\n") for (i in seq_len(length(m[, 1]))) { latex_table_row( v = m[i, ], underscore_whack = underscore_whack ) } emit( paste( " 
\\end{tabular}", "\\end{table}", sep = "\n" ) ) if (omit) emit(" [ reached 'max' / getOption(\"max.print\") -- omitted", n - n0, "rows ]\n") } invisible(x) } # Use this like print.data.frame, from which it is adapted: # nolint start # squash un-actionable cyclomatic_complexity warning # squash un-actionable "no visible global for ..." data_frame_tabbing_latex <- function( # nolint end x, # vector of tab stops, in inches tabstops, # vector of headings, registered with tab-stops headings = colnames(x), # digits to pass to format.data.frame digits = NULL, # maximumn number of rows to print max = NULL, # optional caption caption = NULL, # set underscore_whack to TRUE to escape underscores underscore_whack = TRUE, # flag for landscape mode landscape = FALSE, # flag indicating that subsubsection should be used for caption # rather than subsection use_subsubsection_header = TRUE, # character-size indicator; for possible values, see: # https://tug.org/texinfohtml/latex2e.html#Font-sizes charactersize = "small", # set verbatim to TRUE to debug output verbatim = FALSE ) { hlinport <- if (landscape) { function() cat("\\hlinlscp \\\\\n") } else { function() cat("\\hlinport \\\\\n") } tabstops_tex <- Reduce( f = function(l, r) paste0(l, r), x = sprintf("\\hspace{%0.2fin}\\=", tabstops), init = "" ) n <- length(rownames(x)) if (length(x) == 0L) { cat( sprintf( # if n is one, use singular 'row', else use plural 'rows' ngettext( n, "data frame with 0 columns and %d row", "data frame with 0 columns and %d rows" ), n ), "\n", sep = "" ) } else if (n == 0L) { cat("0 rows for:\n") latex_itemized_list( v = names(x), underscore_whack = underscore_whack ) } else { if (is.null(max)) max <- getOption("max.print", 99999L) if (!is.finite(max)) { cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max) knitr::knit_exit() } omit <- (n0 <- max %/% length(x)) < n m <- as.matrix( format.data.frame( if (omit) x[seq_len(n0), , drop = FALSE] else x, digits = digits, na.encode = FALSE ) ) if (landscape) cat("\n\\begin{landscape}") tex_caption <- if (!is.null(caption)) sprintf("\\captionof{table}{%s}\n", caption) else "\n" # build the column names, which have multiple lines when # length(headings) is a multiple of the number of columns column_names <- "" while (length(headings) > 0) { my_row <- c() for (i in 1:(1 + length(tabstops))) { my_field <- sq_get(headings) sq_put(my_row, if (is.na(my_field)) "" else my_field) } column_names %||:=% latex_tabbing_row( v = my_row, underscore_whack = underscore_whack, action = paste0 ) } # Begin tabbing environment after beginning charactersize environment if (verbatim) cat("\n\\begin{verbatim}") cat( paste0( "\n\\begin{", charactersize, "}", tex_caption, "\\begin{tabwrap}{", tabstops_tex, "}\n" ) ) # emit column names cat(column_names) # emit hline hlinport() for (i in seq_len(length(m[, 1]))) { my_row <- latex_tabbing_row( v = m[i, ], underscore_whack = underscore_whack, action = paste0 ) if (FALSE) cat(my_row) else cat(my_row) } hlinport() if (omit) cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", n - n0, "rows ]\n") # End charactersize environment after ending tabbing environment cat(paste0("\\end{tabwrap}\n\\end{", charactersize, "}\n")) if (verbatim) cat("\\end{verbatim}\n") if (landscape) cat("\\end{landscape}\n") } invisible(x) } param_df_noexit <- function(e = NULL) { data_frame_tabbing_latex( x = param_df, tabstops = c(1.75), underscore_whack = TRUE, caption = "Input parameters", verbatim = FALSE ) if (!is.null(e)) { sink(stderr()) cat("Caught fatal 
error:\n\n") str(e) sink() } } param_df_exit <- function(e = NULL) { param_df_noexit(e) knitr::knit_exit() exit(-1) } # exit with exit code (default 0) and optional msg exit <- function(code = 0, msg = NULL, use_stderr = FALSE) { if (!is.null(msg)) { if (use_stderr) sink(stderr()) cat("\n\n", msg, "\n\n") if (use_stderr) sink() } q(save = "no", status = code) } # make control sequences into printable latex sequences latex_printable_control_seqs <- function(s) { s <- gsub("[\\]", "xyzzy_plugh", s) s <- gsub("[$]", "\\\\$", s) s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) return(s) } nolatex_verbatim <- function(expr) eval(expr) error_knitrexit_stop <- function(e) { cat("Caught error:\n\n") str(e) knitr::knit_exit() stop(e) } latex_verbatim <- function(expr) { cat("\n\\begin{verbatim}\n___\n") tryCatch( expr = expr, error = param_df_exit, finally = cat("...\n\\end{verbatim}\n") ) } latex_samepage <- function(expr) { cat("\n\\begin{samepage}\n") tryCatch( expr = expr, error = param_df_exit, finally = cat("\n\\end{samepage}\n") ) } # return the result of invocation after showing parameters # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ latex_show_invocation <- function(f, f_name = deparse1(substitute(f)), head_patch = FALSE) { function(...) { va <- list(...) my_rslt <- new_env() my_rslt$rslt <- NULL latex_verbatim( expr = { cat(sprintf("\n .. Local variables for '%s':\n\n", f_name)) str(va) if (!head_patch) { # return this result # ref: # https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ cat(sprintf("\n .. Invoking '%s'\n", f_name)) tryCatch( { cat("\n\\end{verbatim}\n") rslt <- do.call(f, va) }, error = param_df_exit, finally = cat("\n\\begin{verbatim}\n") ) cat(sprintf("\n .. '%s' returned:\n", f_name)) str(rslt) my_rslt$rslt <- rslt } } ) # return the result of invocation with the shown parameters # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ if (head_patch) my_rslt$rslt <- do.call(f, va) (my_rslt$rslt) } } latex_collapsed_vector <- function( collapse_string, v, underscore_whack = TRUE, action = cat0 ) { v_sub <- if (underscore_whack) whack_underscores(v) else v action( paste0( v_sub, collapse = collapse_string ) ) } latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { cat("\\begin{itemize}\n\\item ") latex_collapsed_vector(collapse_string, v, underscore_whack) cat("\n\\end{itemize}\n") } latex_itemized_list <- function(v, underscore_whack = TRUE) { latex_itemized_collapsed("\n\\item ", v, underscore_whack) } latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { cat("\\begin{enumerate}\n\\item ") latex_collapsed_vector(collapse_string, v, underscore_whack) cat("\n\\end{enumerate}\n") } latex_enumerated_list <- function(v) { latex_enumerated_collapsed("\n\\item ", v) } latex_table_row <- function(v, extra = "", underscore_whack = TRUE) { latex_collapsed_vector(" & ", v, underscore_whack) cat(extra) cat(" \\\\\n") } latex_tabbing_row <- function( v, extra = "", underscore_whack = TRUE, action = cat0 ) { # latex_collapsed_vector applies action to result of paste0; # by default, action = cat; # hence, a scalar string is assigned to v_collapsed v_collapsed <- latex_collapsed_vector( "} \\> \\tabfill{", v, underscore_whack, action = paste0 ) action( "\\tabfill{", v_collapsed, "}", extra, " \\\\\n" ) } # N.B. 
use con = "" to emulate regular cat fcat0 <- function(..., sprtr = " ", cnnctn = file()) { gsubfn::cat0(..., sep = sprtr, file = cnnctn) invisible(cnnctn) } hypersub <- function(s) { hyper <- tolower(s) hyper <- gsub("[^a-z0-9]+", "-", hyper) hyper <- gsub("[-]+", "-", hyper) hyper <- gsub("[_]+", "-", hyper) hyper <- sub("^[-]", "", hyper) hyper <- sub("[-]$", "", hyper) return(hyper) } table_href <- function(s = "offset", caption = "") { paste0("\\hyperlink{table.\\arabic{", s, "}}{Table \\arabic{", s, "}}") } table_offset <- function(i = 0, s = "offset", new = FALSE) { paste0( if (new) paste0("\\newcounter{", s, "}\n") else "", "\\setcounter{", s, "}{\\value{table}}\n", paste0(if (i > 0) rep(paste0("\\stepcounter{", s, "}"), i), "\n") ) } a_section_header <- function(s, prefix = "") { hyper <- hypersub(s) my_subsection_header <- sprintf( "\\hypertarget{%s}{\\%ssection{%s}\\label{%s}}\n", hyper, prefix, gsub("_", "\\_", s, fixed = TRUE), hyper ) my_subsection_header } section_header <- function(s) a_section_header(s, "") subsection_header <- function(s) a_section_header(s, "sub") subsubsection_header <- function(s) a_section_header(s, "subsub") ### SQLite functions ddl_exec <- function(db, sql) { discard <- DBI::dbExecute(conn = db, statement = sql) if (FALSE && discard != 0) { need_newpage <- TRUE if (need_newpage) { need_newpage <<- FALSE cat("\\newpage\n") } o_file <- stdout() cat("\n\\begin{verbatim}\n") cat(sql, file = o_file) cat(sprintf("\n%d rows affected by DDL\n", discard), file = o_file) cat("\n\\end{verbatim}\n") } } dml_no_rows_exec <- function(db, sql) { discard <- DBI::dbExecute(conn = db, statement = sql) if (discard != 0) { need_newpage <- TRUE if (need_newpage) { need_newpage <<- FALSE cat("\\newpage\n") } cat("\n\\begin{verbatim}\n") o_file <- stdout() cat(sql, file = o_file) cat(sprintf("\n%d rows affected by DML\n", discard), file = o_file) cat("\n\\end{verbatim}\n") } } ### KSEA functions and helpers #' The KSEA App Analysis (KSEA Kinase Scores Only) #' #' Compute KSEA kinase scores and statistics from phoshoproteomics data input #' Adapted from KSEAapp::KSEA.Scores to allow retrieval of maximum log2(FC) #' #' Result is an R data.frame with column names #' "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" #' "Please refer to the original Casado et al. publication for detailed #' description of these columns and what they represent: #' #' - Kinase.Gene indicates the gene name for each kinase. #' - mS represents the mean log2(fold change) of all the #' kinase's substrates. #' - Enrichment is the background-adjusted value of the kinase's mS. #' - m is the total number of detected substrates #' from the experimental dataset for each kinase. #' - z.score is the normalized score for each kinase, weighted by #' the number of identified substrates. #' - p.value represents the statistical assessment for the z.score. #' - FDR is the p-value adjusted for multiple hypothesis testing #' using the Benjamini & Hochberg method." 
#' #' @param ksdata the Kinase-Substrate dataset uploaded from the file #' prefaced with "PSP&NetworKIN_" #' available from github.com/casecpb/KSEA/ #' @param px the experimental data file formatted as described in #' the KSEA.Complete() documentation #' @param networkin a binary input of TRUE or FALSE, indicating whether #' or not to include NetworKIN predictions, where #' \code{NetworKIN = TRUE} #' means include NetworKIN predictions #' @param networkin_cutoff a numeric value between 1 and infinity setting #' the minimum NetworKIN score #' (this can be omitted if NetworKIN = FALSE) #' #' @return creates a new R data.frame with all the KSEA kinase #' scores, along with each one's statistical #' assessment, as described herein. #' #' @references #' #' Casado et al. (2013) Sci Signal. 6(268):rs6 #' #' Hornbeck et al. (2015) Nucleic Acids Res. 43:D512-20 #' #' Horn et al. (2014) Nature Methods 11(6):603-4 #' # nolint start # squash un-actionable cyclomatic_complexity warning ksea_scores <- function( # nolint end # For human data, typically, ksdata = KSEAapp::ksdata ksdata, # Input data file having columns: # - Protein : abbreviated protein name # - Gene : HUGO gene name # - Peptide : peptide sequence without indications of phosphorylation # - Reside.Both : position(s) of phosphorylation within Gene sequence # - First letter designates AA that is modified # - Numbers indicate position within Gene # - Multiple values are separated by semicolons # - p : p-value # - FC : fold-change px, # A binary input of TRUE or FALSE, indicating whether or not to include # NetworKIN predictions networkin, # A numeric value between 1 and infinity setting the minimum NetworKIN # score (can be left out if networkin = FALSE) networkin_cutoff, # Minimum substrate count, necessary to adjust the p-value appropriately. minimum_substrate_count ) { if (FALSE) { if (print_nb_messages) nbe("Output contents of `ksdata` table\n") cat_variable(ksdata) cat("\n\\clearpage\n") data_frame_tabbing_latex( x = ksdata[order(ksdata$GENE, ksdata$SUB_GENE), ], tabstops = c(0.3, 0.3, 1.2, 0.3, 0.3, 0.3, 0.3, 0.8, 0.3, 0.8, 0.3, 0.3, 0.3), caption = "ksdata dump", use_subsubsection_header = FALSE ) } # no px$FC should be <= 0, but abs(px$FC) is used below as a precaution. 
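
  # Illustrative aside (not executed): a toy `px` in the shape described in
  # the comments above; the values are invented.  A row whose Residue.Both
  # lists several semicolon-separated sites must be split into one row per
  # site before it can be merged with ksdata on SUB_MOD_RSD, which is what
  # the if/else below arranges.
  if (FALSE) {
    demo_px <- data.frame(
      Protein      = c("prot1", "prot2"),
      Gene         = c("AKT1", "GSK3B"),
      Peptide      = c("pep1", "pep2"),
      Residue.Both = c("S124;S129", "S9"),  # first row carries two sites
      p            = c(0.01, 0.04),
      FC           = c(2.2, 0.6)
    )
    # the first row yields two site entries, the second yields one
    strsplit(as.character(demo_px$Residue.Both), split = ";")
  }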
if (length(grep(";", px$Residue.Both)) == 0) { # There are no Residue.Both entries having semicolons, so new is # simply px except two columns are renamed and a column is added # for log2(abs(fold-change)) new <- px colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD") new$log2_fc <- log2(abs(as.numeric(as.character(new$FC)))) new <- new[complete.cases(new$log2_fc), ] } else { # Split each row having semicolons in Residue.Both into rows that are # duplicated in all respects except that each row has a single # member of the set "split-on-semicolon-Residue.Both" px_double <- px[grep(";", px$Residue.Both), ] residues <- as.character(px_double$Residue.Both) residues <- as.matrix(residues, ncol = 1) split <- strsplit(residues, split = ";") # x gets count of residues in each row, # i.e., 1 + count of semicolons x <- sapply(split, length) # Here is the set of split rows px_single <- data.frame( Protein = rep(px_double$Protein, x), Gene = rep(px_double$Gene, x), Peptide = rep(px_double$Peptide, x), Residue.Both = unlist(split), p = rep(px_double$p, x), FC = rep(px_double$FC, x) ) # new first gets the split rows new <- px[-grep(";", px$Residue.Both), ] # to new, append the rows that didn't need splitting in the first place new <- rbind(new, px_single) # map Gene to SUB_GENE # map Residue.Both to SUB_MOD_RSD colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD") # Eliminate any non-positive values to prevent introduction of # infinite or NaN values new[(0 <= new$log2_fc), "log2_fc"] <- NA # Because of preceding step, there is no need for abs in the next line new$log2_fc <- log2(as.numeric(as.character(new$FC))) # Convert any illegal values from NaN to NA new[is.nan(new$log2_fc), "log2_fc"] <- NA # Eliminate rows having missing values (e.g., non-imputed data) new <- new[complete.cases(new$log2_fc), ] } # At this point, new$log2_fc is signed according to which contrast has # the greater intensity # To take the magnitude into account without taking the direction into # account, set params$kseaUseAbsoluteLog2FC to TRUE # # Should KSEA be performed aggregating signed log2FC or absolute? # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores # nolint start # squash un-actionable "no visible global for ..." if (params$kseaUseAbsoluteLog2FC) { # nolint end # TRUE use abs(log2FC) for KSEA as Justin requested; this is a # justifiable deviation from the KSEAapp::KSEA.Scores algorithm. new$log2_fc <- abs(new$log2_fc) } monitor_filtration_on_stderr <- TRUE if (monitor_filtration_on_stderr) { # set to TRUE to monitor filtration on stderr sink(stderr()) cat(see_variable(networkin, "\n")) } ksdata_filtered <- sqldf::sqldf( sprintf("%s %s", "select * from ksdata where not Source = 'NetworKIN'", if (networkin) sprintf("or networkin_score >= %d", networkin_cutoff) else "" ) ) if (FALSE) write.csv(x = ksdata_filtered, file = "ksdata_filtered.csv") if (monitor_filtration_on_stderr) { cat(see_variable(sqldf::sqldf( "select count(*), Source from ksdata group by Source"), "\n")) cat(see_variable(sqldf::sqldf( "select count(*), Source from ksdata_filtered group by Source"), "\n")) sink() } ############################################################################ # Line numbers below refer to lines of: # https://github.com/casecpb/KSEAapp/blob/master/R/KSEA.Scores.R # I would put the original line in a comment but then lint would complain... # - Indeed, I had to rename all the variables because lint didn't like names # containing periods or capital letters. 
# ACE ############################################################################ # # (1) Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD # colnames of ksdata_filtered: # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID" # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID" # "SITE_...7_AA" "networkin_score" "Source" # colnames of new: # "Protein" "SUB_GENE" "Peptide" "SUB_MOD_RSD" "p" "FC" "log2_fc" # Equivalent to: # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc # FROM ksdata_filtered a # INNER JOIN new b # ON a.SUB_GENE = b.SUB_GENE # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD # (KSEA.Scores.R line # 105) # "Extract KSData.filtered annotations that are only found in new" ksdata_dataset <- base::merge(ksdata_filtered, new) # colnames of ksdata_dataset: # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein" # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform) # Re-order dataset; prior to accounting for isoforms # (KSEA.Scores.R line # 106) ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] # Extract non-isoform accession in UniProtKB # (KSEA.Scores.R line # 107) ksdata_dataset$uniprot_no_isoform <- sapply( ksdata_dataset$KIN_ACC_ID, function(x) unlist(strsplit(as.character(x), split = "-"))[1] ) # "last expression collapses isoforms ... for easy processing" # Discard previous results while selecting interesting columns ... # (KSEA.Scores.R line # 110) ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] # Column names are now: # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p" # "FC" "log2_fc" "Source" # Make column names human-readable # (KSEA.Scores.R line # 111) colnames(ksdata_dataset_abbrev) <- c( "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p", "FC", "log2FC", "Source" ) # SELECT * FROM ksdata_dataset_abbrev # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p # (KSEA.Scores.R line # 112) # "Extract KSData.filtered annotations that are only found in new" ksdata_dataset_abbrev <- ksdata_dataset_abbrev[ order( ksdata_dataset_abbrev$Kinase.Gene, ksdata_dataset_abbrev$Substrate.Gene, ksdata_dataset_abbrev$Substrate.Mod, ksdata_dataset_abbrev$p), ] if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev)) if (FALSE) write.csv( x = ksdata_dataset_abbrev, file = "ksdata_dataset_abbrev_no_aggregation.csv", row.names = FALSE ) # For some kinases, the only difference between kinase names in two databases # is the case of the the letters; dedup to eliminate this false distinction # The only imperfection with this approach is when the NetworKIN-only fliter # is in force and there is a duplication between NetworKIN and HPRD (which # outranks NetworKIN). Presently, the script does not invoke with this # function with the NetworKIN-only filter active. 
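
  # Illustrative aside (not executed): the deduplication idea used just
  # below, shown on a toy data frame with invented column names -- rank rows
  # within each case-folded kinase name (plus p and FC), ordering by source,
  # and keep only rank 1, so names differing only in letter case collapse to
  # one row (here HPRD outranks NetworKIN alphabetically, matching the
  # remark above).  The real query operates on ksdata_dataset_abbrev.
  if (FALSE) {
    demo_dup <- data.frame(
      kinase_gene = c("Akt1", "AKT1", "GSK3B"),
      p           = c(0.01, 0.01, 0.04),
      fc          = c(2.2, 2.2, 0.6),
      source      = c("NetworKIN", "HPRD", "HPRD")
    )
    sqldf::sqldf("
      select d.kinase_gene, d.p, d.fc, d.source
        from (select rank() over (partition by lower(kinase_gene), p, fc
                                  order by source) as rnk,
                     *
                from demo_dup) as d
       where d.rnk = 1
    ")
  }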
dedup_sql <- " select r.'Kinase.Gene', r.'Substrate.Gene', r.'Substrate.Mod', r.Peptide, r.p, r.FC, r.log2FC, r.Source from ( select rowid, rank() over (partition by lower(k.'Kinase.Gene'), k.p, k.FC order by k.Source ) as rank, * from ksdata_dataset_abbrev k ) as r where r.rank = 1 " ksdata_dataset_abbrev <- sqldf::sqldf(x = dedup_sql) if (FALSE) write.csv( x = ksdata_dataset_abbrev, file = "ksdata_dataset_abbrev_no_aggregation_dedup.csv", row.names = FALSE ) # First aggregation step to account for multiply phosphorylated peptides # and differing peptide sequences; the goal here is to combine results # for all measurements of the same substrate. # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, # `Source`, avg(log2FC) AS log2FC # FROM ksdata_dataset_abbrev # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, # `Source` # ORDER BY `Kinase.Gene`; # in two steps: # (1) compute average log_2(fold-change) # "take the mean of the log2FC amongst phosphosite duplicates" # (KSEA.Scores.R line # 115) ksdata_dataset_abbrev <- aggregate( log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source, data = ksdata_dataset_abbrev, FUN = mean ) if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev)) # (2) order by Kinase.Gene # (KSEA.Scores.R line # 117) ksdata_dataset_abbrev <- ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] # SELECT `Kinase.Gene`, count(*) # FROM ksdata_dataset_abbrev # GROUP BY `Kinase.Gene`; # in two steps: # (1) Extract the list of Kinase.Gene names # "@@@@@@@@@@@@@@@@@@@@" # "Do analysis for KSEA" # "@@@@@@@@@@@@@@@@@@@@" # (KSEA.Scores.R line # 124) kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev, # named by Kinase.Gene # (KSEA.Scores.R line # 125) kinase_list <- as.matrix(table(kinase_list)) # Second aggregation step to account for all substrates per kinase # CREATE TABLE mean_fc # AS # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC # FROM ksdata_dataset_abbrev # GROUP BY `Kinase.Gene` # (KSEA.Scores.R line # 127) if (print_nb_messages) nb(see_variable(ksdata_dataset_abbrev), "\n") mean_fc <- aggregate( log2FC ~ Kinase.Gene, data = ksdata_dataset_abbrev, FUN = mean ) if (print_nb_messages) nbe(see_variable(mean_fc), "\n") if (FALSE) write.csv(x = mean_fc, file = "mean_fc.csv") # for contrast j # for each kinase i # extract log2 of fold-change (from `new` above) # (used in KSEA.Scores.R lines # 130 & 132) log2_fc_j_each_i <- new$log2_fc # for contrast j # for all kinases i # compute mean of abs(log2 of fold-change) # (used in KSEA.Scores.R lines # 130) mean_abs_log2_fc_j_all_i <- mean(abs(log2_fc_j_each_i), na.rm = TRUE) # for contrast j # for all kinases i # compute mean of log2 of fold-change # (used in KSEA.Scores.R lines # 132) mean_log2_fc_j_all_i <- mean(log2_fc_j_each_i, na.rm = TRUE) # Reorder mean_fc, although I don't see why # (KSEA.Scores.R line 128 mean_fc <- mean_fc[order(mean_fc[, 1]), ] if (FALSE) write.csv(x = mean_fc, file = "mean_fc.csv") # mean_fc columns so far: "Kinase.Gene", "log2FC" # - Kinase.Gene # indicates the gene name for each kinase. # Create column 3: mS # - mS # represents the mean log2(fold change) of all the # kinase's substrates. # (KSEA.Scores.R line # 129) mean_fc$m_s <- mean_fc_m_s <- mean_fc[, 2] # Create column 4: Enrichment # - Enrichment # is the background-adjusted value of the kinase's mS. 
# (KSEA.Scores.R line # 130) mean_fc$enrichment <- mean_fc_m_s / mean_abs_log2_fc_j_all_i # Create column 5: m, count of substrates of kinase (count of j for i) # - m # is the total number of detected substrates # from the experimental dataset for each kinase. # (KSEA.Scores.R line # 131) mean_fc$m <- mean_fc_m <- kinase_list # Create column 6: z-score # - z.score # is the normalized score for each kinase, weighted by # the number of identified substrates. # (KSEA.Scores.R line # 132) mean_fc$z_score <- (mean_fc_m_s - mean_log2_fc_j_all_i) * sqrt(mean_fc_m) / sd(log2_fc_j_each_i, na.rm = TRUE) # Create column 7: p-value, deduced from z-score # - p.value # represents the statistical assessment for the z.score. # (KSEA.Scores.R line # 133) # "one-tailed p-value" mean_fc$p_value <- pnorm(-abs(mean_fc$z_score)) # zap excluded kinases; this must be done before adjusting p-value if (TRUE) { mean_fc <- mean_fc[ mean_fc$m >= minimum_substrate_count, , drop = FALSE ] } # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value # - FDR # is the p-value adjusted for multiple hypothesis testing # using the Benjamini & Hochberg method." # (KSEA.Scores.R line # 134) mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr") # It makes no sense to leave Z-scores negative when using # absolute value of fold-change # nolint start # squash un-actionable "no visible global for ..." if (params$kseaUseAbsoluteLog2FC) { # nolint end mean_fc$z_score <- abs(mean_fc$z_score) } # Remove second column (log2FC), which is duplicated as mS # (KSEA.Scores.R line # 136) mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2] # Correct the column names which we had to hack because of the linter... colnames(mean_fc) <- c( "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" ) # (KSEA.Scores.R line # 138) return(mean_fc) } ksea_low_fdr_barplot_factory <- function( rslt, i_cntrst, i, a_level, b_level, fold_change, caption ) { rslt_score_list_i <- rslt$score_list[[i]] if (!is.null(rslt_score_list_i)) { rslt_score_list_i_nrow <- nrow(rslt_score_list_i) k <- data.frame( contrast = as.integer(i_cntrst), a_level = rep.int(a_level, rslt_score_list_i_nrow), b_level = rep.int(b_level, rslt_score_list_i_nrow), kinase_gene = rslt_score_list_i$Kinase.Gene, mean_log2_fc = rslt_score_list_i$mS, enrichment = rslt_score_list_i$Enrichment, substrate_count = rslt_score_list_i$m, z_score = rslt_score_list_i$z.score, p_value = rslt_score_list_i$p.value, fdr = rslt_score_list_i$FDR ) selector <- switch( ksea_cutoff_statistic, "FDR" = { k$fdr }, "p.value" = { k$p_value }, { cat( sprintf( "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", ksea_cutoff_statistic ) ) param_df_exit() knitr::knit_exit() } ) k <- k[selector < ksea_cutoff_threshold, ] nrow_k <- nrow(k) if (nrow_k > 0) { max_nchar_rowname <- max(nchar(rownames(k))) my_cex_names <- 1.0 / (1 + nrow_k / 50) if (print_trace_messages) cat_margins("Initially") if (print_trace_messages) cat_variable(nrow_k, "\n\n", 0) if (print_trace_messages) cat_variable(my_cex_names, "\n\n", 0) if (print_trace_messages) cat_variable(max_nchar_rowname, "\n\n", 0) # fin: The figure region dimensions, (width, height), in inches. # mar: A numerical vector of the form c(bottom, left, top, right) # that gives the number of lines of margin to be specified # on the four sides of the plot; default: c(5, 4, 4, 2) + 0.1 # mar: The figure region dimensions, (width, height), in inches. 
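
    # Illustrative aside (not executed): the pattern used here is a function
    # factory -- the data for the plot are computed now, and a closure that
    # actually draws the barplot is returned so the caller decides when and
    # on which device to draw it.  A minimal sketch with invented names:
    if (FALSE) {
      demo_barplot_factory <- function(heights, labels) {
        function() {
          # widen the left margin so horizontal bar labels fit
          op <- par(mar = par("mar") + c(0, 6, 0, 0))
          on.exit(par(op))
          barplot(heights, names.arg = labels, horiz = TRUE, las = 1)
        }
      }
      draw_later <- demo_barplot_factory(c(2.5, -1.2), c("KIN1", "KIN2"))
      draw_later()  # drawing happens only when the closure is called
    }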
numeric_z_score <- as.numeric(k$z_score) bar_order <- order(-as.numeric(k$p_value)) kinase_name <- k$kinase_gene long_caption <- sprintf( "Kinase z-score, %s, KSEA %s < %s", caption, ksea_cutoff_statistic, ksea_cutoff_threshold ) my_cex_caption <- 65.0 / max(65.0, nchar(long_caption)) # return a function that draws the plot function() { par_fin <- par("fin") # vector of width_in_inches and height_in_inches) op <- par( bg = if (print_trace_messages) "yellow" else "white", fin = c(par_fin[1], min(par_fin[2], 2.5 + nrow_k / 6)), mar = par("mar") + c(3 / nrow_k, (1 + max_nchar_rowname * my_cex_names) / 2, 0, 0) # bottom, left, top, right ) on.exit(par(op)) if (print_trace_messages) cat_margins("Eventually") barplot( height = numeric_z_score[bar_order], border = NA, xpd = FALSE, cex.names = my_cex_names, main = long_caption, cex.main = my_cex_caption, names.arg = kinase_name[bar_order], horiz = TRUE, srt = 45, las = 1, cex.axis = 0.9 ) } } } else { no_op } } # note that this adds elements to the global variable `ksea_asterisk_hash` ksea_low_fdr_print <- function( rslt, i_cntrst, i, a_level, b_level, fold_change, caption, write_db = TRUE, # if TRUE, write to DB, else print table anchor = c(const_table_anchor_p, const_table_anchor_t) ) { rslt_score_list_i <- rslt$score_list[[i]] if (!is.null(rslt_score_list_i)) { rslt_score_list_i_nrow <- nrow(rslt_score_list_i) k <- contrast_ksea_scores <- data.frame( contrast = as.integer(i_cntrst), a_level = rep.int(a_level, rslt_score_list_i_nrow), b_level = rep.int(b_level, rslt_score_list_i_nrow), kinase_gene = rslt_score_list_i$Kinase.Gene, mean_log2_fc = rslt_score_list_i$mS, enrichment = rslt_score_list_i$Enrichment, substrate_count = rslt_score_list_i$m, z_score = rslt_score_list_i$z.score, p_value = rslt_score_list_i$p.value, fdr = rslt_score_list_i$FDR ) selector <- switch( ksea_cutoff_statistic, "FDR" = { k$fdr }, "p.value" = { k$p_value }, { cat( sprintf( "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", ksea_cutoff_statistic ) ) param_df_exit() knitr::knit_exit() } ) k <- k[selector < ksea_cutoff_threshold, ] # save kinase names to ksea_asterisk_hash for (kinase_name in k$kinase_gene) { ksea_asterisk_hash[[kinase_name]] <- 1 } if (write_db) { db_write_table_overwrite <- (i_cntrst < 2) db_write_table_append <- !db_write_table_overwrite RSQLite::dbWriteTable( conn = db, name = "contrast_ksea_scores", value = contrast_ksea_scores, append = db_write_table_append ) "" } else { selector <- switch( ksea_cutoff_statistic, "FDR" = { contrast_ksea_scores$fdr }, "p.value" = { contrast_ksea_scores$p_value }, { cat( sprintf( "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", ksea_cutoff_statistic ) ) param_df_exit() knitr::knit_exit() } ) if (print_nb_messages) nbe(see_variable(contrast_ksea_scores)) output_df <- contrast_ksea_scores[ selector < ksea_cutoff_threshold, c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count", "z_score", "p_value", "fdr") ] output_df$kinase_gene <- gsub( "_", "\\\\_", output_df$kinase_gene ) colnames(output_df) <- c( colnames(output_df)[1], colnames(output_df)[2], "enrichment", "m_s", "z_score", "p_value", "fdr" ) # nolint start output_order <- with(output_df, order(p_value)) # nolint end output_df <- output_df[output_order, ] output_df[, 2] <- sprintf("%0.3g", output_df[, 2]) output_df$fdr <- sprintf("%0.4f", output_df$fdr) output_df$p_value <- sprintf("%0.2e", output_df$p_value) output_df$z_score <- sprintf("%0.2f", output_df$z_score) output_df$m_s <- sprintf("%d", output_df$m_s) 
output_df$enrichment <- sprintf("%0.3g", output_df$enrichment) colnames(output_df) <- c( "Kinase or motif", "\\(\\overline{{\\lvert}\\log_2 (\\text{fold-change}){\\rvert}}\\)", "Enrichment", "Substrates", "z-score", "p-value", "FDR" ) selector <- switch( ksea_cutoff_statistic, "FDR" = { rslt$score_list[[i]]$FDR }, "p.value" = { rslt$score_list[[i]]$p.value }, { cat( sprintf( "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", ksea_cutoff_statistic ) ) param_df_exit() knitr::knit_exit() } ) if (sum(selector < ksea_cutoff_threshold) > 0) { if (print_nb_messages) nbe(see_variable(output_df)) math_caption <- gsub("{", "\\{", caption, fixed = TRUE) math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE) if (TRUE) { data_frame_tabbing_latex( x = output_df, # vector of tab stops, in inches tabstops = c(1.8, 1.2, 0.8, 0.8, 0.8, 0.8, 0.8), # vector of headings, registered with tab-stops headings = colnames(output_df), # digits to pass to format.data.frame digits = NULL, # maximumn number of rows to print max = NULL, # optional caption caption = sprintf( "\\text{%s}, KSEA %s < %s", math_caption, ksea_cutoff_statistic, ksea_cutoff_threshold ), # set underscore_whack to TRUE to escape underscores underscore_whack = FALSE, # flag for landscape mode landscape = FALSE, # flag indicating that subsubsection should be used for caption # rather than subsection use_subsubsection_header = TRUE, # character-size indicator; for possible values, see: # https://tug.org/texinfohtml/latex2e.html#Font-sizes charactersize = "small", # set verbatim to TRUE to debug output verbatim = FALSE ) } } else { cat( sprintf( "\\break No kinases had \\(\\text{KSEA %s}_\\text{enrichment} < %s\\) for contrast %s\\hfill\\break\n", ksea_cutoff_statistic, ksea_cutoff_threshold, caption ) ) } } } else { "" } } # create_breaks is a helper for ksea_heatmap create_breaks <- function(merged_scores) { if (sum(!is.na(merged_scores)) < 2) return(NULL) if (min(merged_scores, na.rm = TRUE) < -1.6) { breaks_neg <- seq(-1.6, 0, length.out = 30) breaks_neg <- append( seq(min(merged_scores, na.rm = TRUE), -1.6, length.out = 10), breaks_neg ) breaks_neg <- sort(unique(breaks_neg)) } else { breaks_neg <- seq(-1.6, 0, length.out = 30) } if (max(merged_scores, na.rm = TRUE) > 1.6) { breaks_pos <- seq(0, 1.6, length.out = 30) breaks_pos <- append( breaks_pos, seq(1.6, max(merged_scores, na.rm = TRUE), length.out = 10) ) breaks_pos <- sort(unique(breaks_pos)) } else { breaks_pos <- seq(0, 1.6, length.out = 30) } breaks_all <- unique(append(breaks_neg, breaks_pos)) mycol_neg <- gplots::colorpanel(n = length(breaks_neg), low = "blue", high = "white") mycol_pos <- gplots::colorpanel(n = length(breaks_pos) - 1, low = "white", high = "red") mycol <- unique(append(mycol_neg, mycol_pos)) color_breaks <- list(breaks_all, mycol) return(color_breaks) } # nolint start # squash un-actionable cyclomatic_complexity warning hm2plus <- function( # nolint end x, mat = matrix( c( c(0, 4, 0), c(0, 3, 3), c(2, 1, 1) ), nrow = 3, ncol = 3, byrow = TRUE ), denwid = 0.5, denhgt = 0.15, widths = c(0.5, 2.5, 1.5), heights = c(0.4, 0.15, 3.95), divergent = FALSE, notecol = "grey50", trace = "none", margins = c(6, 20), srtcol = 90, srtrow = 0, density_info = "none", key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), key_par = list(), hclustfun = hclust, ... ) { if (FALSE) # this is to avoid commenting out code to pass linting... 
my_hm2 <- latex_show_invocation(gplots::heatmap.2, head_patch = FALSE) else my_hm2 <- gplots::heatmap.2 x <- as.matrix(x) if (sum(!is.na(x)) < 1) return(NULL) color_count <- 1 + max(64, length(as.vector(x))) # 8 was not enough break_count <- 1 + color_count min_nonax <- min(x, na.rm = TRUE) max_nonax <- max(x, na.rm = TRUE) if (print_nb_messages) nb("within hm2plus", see_variable(divergent), "\n") if (divergent) { zlim <- max(abs(min_nonax), abs(max_nonax)) if (print_nb_messages) nb(see_variable(zlim, "\n")) breaks <- (zlim) / (break_count:1) if (print_nb_messages) nb(see_variable(breaks, "\n")) breaks <- breaks - median(breaks) zlim <- c(-zlim, zlim) if (print_nb_messages) nb(see_variable(zlim, "\n")) } else { zlim <- max(abs(min_nonax), abs(max_nonax)) if (print_nb_messages) nb(see_variable(zlim, "\n")) breaks <- zlim / (break_count:1) if (print_nb_messages) nb(see_variable(breaks, "\n")) if (max_nonax < 0) { breaks <- breaks - zlim zlim <- c(-zlim, 0) } else { zlim <- c(0, zlim) } if (print_nb_messages) nb(see_variable(zlim, "\n")) } nonax <- x # nolint start # squash un-actionable "no visible global for ..." if (params$heatMapNAsubstitute) nonax[is.na(x)] <- min_nonax # nolint end if (is.null(widths)) widths <- c(denwid, 4 - denwid, 1.5) if (is.null(heights)) heights <- c(0.4, denhgt, 4.0) colors <- if (divergent && min_nonax < 0) { # divergent colors on both sides of zero colorRampPalette(c("red", "white", "blue"))(color_count) } else if (divergent && min_nonax > 0) { # "divergent" colors > zero colorRampPalette(c("white", "blue"))(color_count) } else if (divergent && max_nonax < 0) { # "divergent" colors < zero colorRampPalette(c("red", "white"))(color_count) } else { # "non-divergent" colors including zero tmp <- hcl.colors(color_count, "YlOrRd", rev = TRUE) # nolint start # squash un-actionable "no visible global for ..." tmp[1] <- params$heatMapNAgrey # nolint end tmp } nbe("There are ", sum(is.na(x)), " NA values") suppressWarnings( my_hm2( x = x, col = colors, density.info = density_info, srtCol = srtcol, srtRow = srtrow, margins = margins, lwid = widths, lhei = heights, key.title = NA, key.xlab = key_xlab, key.par = key_par, lmat = mat, notecol = notecol, trace = trace, bg = "yellow", hclustfun = hclustfun, oldstyle = FALSE, ... # varargs ) ) # implicitly returning value returned by heatmap.2 } # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap # nolint start # squash un-actionable cyclomatic_complexity warning draw_kseaapp_summary_heatmap <- function( # nolint end x, # matrix with row/col names already formatted sample_cluster, # a binary input of TRUE or FALSE, # indicating whether or not to perform # hierarchical clustering of the sample columns merged_asterisk, # matrix having dimensions of x, values "*" or "" color_breaks, # breaks for color gradation, from create_breaks # passed to `breaks` argument of `image` margins = c(8, 15), # two integers setting the bottom and right margins # to accommodate row and column labels master_cex = 0.7, # basis for text sizes ... ) { merged_scores <- x if (!is.matrix(x)) { cat( paste0( "No plot because \\texttt{typeof(x)} is '", typeof(x), "' rather than 'matrix'.\n\n" ) ) cat_variable(x) return(FALSE) } if (print_trace_messages) cat(sprintf("master_cex = %03f\n\n", master_cex)) nrow_x <- nrow(x) ncol_x <- ncol(x) #if (nrow_x < 2) { if (nrow_x < 1) { cat("No plot because matrix has no rows.\n\n") return(FALSE) } else if (nrow_x < 2) { cat("No plot because matrix has one row. 
Matrix looks like this:\n\n") cat("\n\\begin{verbatim}\n") print(x) cat("\n\\end{verbatim}\n") return(FALSE) } else if (ncol_x < 2) { cat("No plot because matrix x has ", ncol_x, " columns.\n\n") cat_variable(x) return(FALSE) } max_nchar_rowname <- max(nchar(rownames(x))) max_nchar_colname <- max(nchar(colnames(x))) my_row_cex_scale <- master_cex * 150 / nrow_x # row cex shrink hack begin my_row_cex_scale <- master_cex * 150 / max(nrow_x, ncol_x) # row cex shrink hack end my_col_cex_scale <- 3.0 # col cex shrink hack begin if (ncol_x > 40) my_col_cex_scale <- 3.0 * 40 / ncol_x # col cex shrink hack end my_asterisk_scale <- 0.4 * my_row_cex_scale my_row_warp <- 1 my_note_warp <- 2 my_row_warp <- 1 my_row_cex_asterisk <- master_cex * my_row_warp * my_asterisk_scale my_col_cex <- my_col_cex_scale * master_cex my_row_cex <- min(3.5 * my_row_cex_asterisk, my_col_cex) my_key_cex <- 1.286 my_hm2_cex <- 1 * master_cex my_offset <- (4.8 / (9 + nrow_x / 10)) - 0.4 if (print_trace_messages) cat(sprintf("nrow_x = %03f\n\n", nrow_x)) if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset)) my_offset <- 0.05 if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset)) my_scale <- 3.0 if (ncol_x < 10 && nrow_x < 10) my_scale <- my_scale * 10 / (10 - nrow_x) * 10 / (10 - ncol_x) my_heights <- c( 0.15, 3.85 - my_offset, 0.5 + my_offset ) my_margins <- c(1, 1) + c( margins[1] * 0.08 * max_nchar_colname * my_col_cex, margins[2] * 0.04 * max_nchar_rowname * my_row_cex ) if (print_trace_messages) { cat_variable(my_heights, suffix = "; ") cat_variable(my_margins, suffix = "\n\n") cat_variable(my_row_cex_scale, suffix = "; ") cat_variable(my_col_cex_scale, suffix = "\n\n") cat_variable(my_row_cex_asterisk, suffix = "\n\n") cat_variable(my_row_cex, suffix = "; ") cat_variable(my_col_cex, suffix = "\n\n") cat_variable(my_row_cex, suffix = "; ") cat_variable(my_col_cex, suffix = "\n\n") } hm2plus( x = merged_scores, Colv = sample_cluster, cellnote = merged_asterisk, cex = my_hm2_cex, cexCol = my_col_cex, cexRow = my_row_cex, denhgt = 0.15, density_info = "none", denwid = 0.5, divergent = TRUE, key_par = list(cex = my_key_cex), key_xlab = "Z-score", margins = my_margins, notecex = my_scale * min( 1.5, my_row_cex_asterisk * my_note_warp, my_col_cex * my_note_warp ), notecol = "white", scale = "none", srtcol = 90, srtrow = 0, trace = "none", mat = matrix( c( c(0, 3, 3), c(2, 1, 1), c(0, 4, 0) ), nrow = 3, ncol = 3, byrow = TRUE ), widths = c(0.5, 3.1, 0.9), heights = my_heights, ... ) return(TRUE) } # function drawing heatmap of contrast fold-change for each kinase, # adapted from KSEAapp::KSEA.Heatmap # nolint start # squash un-actionable cyclomatic_complexity warning ksea_heatmap <- function( # nolint end # the data frame outputs from the KSEA.Scores() function, in list format score_list, # a character vector of all the sample names for heatmap annotation: # - the names must be in the same order as the data in score_list # - please avoid long names, as they may get cropped in the final image sample_labels, # character string of either "p.value" or "FDR" indicating the data column # to use for marking statistically significant scores stats, # a numeric value between 0 and infinity indicating the min. 
number of # substrates a kinase must have to be included in the heatmap m_cutoff, # a numeric value between 0 and 1 indicating the p-value/FDR cutoff # for indicating significant kinases in the heatmap p_cutoff = { cat("argument 'p_cutoff' is required for function 'ksea_heatmap'") param_df_exit() knitr::knit_exit() }, # a binary input of TRUE or FALSE, indicating whether or not to perform # hierarchical clustering of the sample columns sample_cluster, # a binary input of TRUE or FALSE, indicating whether or not to export # the heatmap as a .png image into the working directory export = FALSE, # bottom and right margins; adjust as needed if contrast names are too long margins = c(6, 6), # print which kinases? # - Mandatory argument, must be one of const_ksea_.*_kinases which_kinases, # additional arguments to gplots::heatmap.2, such as: # - main: main title of plot # - xlab: x-axis label # - ylab: y-axis label ... ) { filter_m <- function(dataset, m_cutoff) { filtered <- dataset[(dataset$m >= m_cutoff), ] return(filtered) } score_list_m <- lapply(score_list, function(...) filter_m(..., m_cutoff)) for (i in seq_len(length(score_list_m))) { names <- colnames(score_list_m[[i]])[c(2:7)] colnames(score_list_m[[i]])[c(2:7)] <- paste(names, i, sep = ".") } master <- Reduce( f = function(...) { base::merge(..., by = "Kinase.Gene", all = TRUE) }, x = score_list_m ) row.names(master) <- master$Kinase.Gene columns <- as.character(colnames(master)) merged_scores <- as.matrix(master[, grep("z.score", columns), drop = FALSE]) colnames(merged_scores) <- sample_labels merged_stats <- as.matrix(master[, grep(stats, columns)]) asterisk <- function(mtrx, p_cutoff) { new <- data.frame() for (i in seq_len(nrow(mtrx))) { for (j in seq_len(ncol(mtrx))) { my_value <- mtrx[i, j] if (!is.na(my_value) && my_value < p_cutoff) { new[i, j] <- "*" } else { new[i, j] <- "" } } } return(new) } merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff)) asterisk_rows <- rowSums(merged_asterisk == "*") > 0 all_rows <- rownames(merged_stats) names(asterisk_rows) <- all_rows non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE]) asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE]) row_list <- list() row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows row_list[[const_ksea_all_kinases]] <- all_rows row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows i <- which_kinases my_row_names <- row_list[[i]] scrs <- merged_scores[my_row_names, , drop = FALSE] stts <- merged_stats[my_row_names, , drop = FALSE] merged_asterisk <- as.matrix(asterisk(stts, p_cutoff)) color_breaks <- create_breaks(scrs) if (is.null(color_breaks)) { cat("No plot because matrix has too few rows.\n\n") return(NULL) } plot_height <- nrow(scrs) ^ 0.55 plot_width <- ncol(scrs) ^ 0.7 if (export == "TRUE") { png( "KSEA.Merged.Heatmap.png", width = plot_width * 300, height = 2 * plot_height * 300, res = 300, pointsize = 14 ) } did_draw <- draw_kseaapp_summary_heatmap( x = scrs, sample_cluster = sample_cluster, merged_asterisk = merged_asterisk, color_breaks = color_breaks, margins = margins ) if (export == "TRUE") { dev.off() } if (!did_draw) return(NULL) return(my_row_names) } # helpers for heatmaps of phosphopeptide intensities # factory producing function to truncate string after n characters trunc_n <- function(n) { function(x) { sapply( X = x, FUN = function(s) { if (is.na(s)) return("NA") cond <- try_catch_w_e(nchar(s) > n) if (!is.logical(cond$value)) { return(cond$value$message) } else if (cond$value) { paste0( strtrim(s, n), 
"..." ) } else { s } }, USE.NAMES = FALSE ) } } trunc_long_ppep <- function(x) trunc_n(40)(x) trunc_ppep <- function(x) trunc_n(g_ppep_trunc_n)(x) trunc_subgene <- function(x) trunc_n(g_subgene_trunc_n)(x) trunc_enriched_substrate <- function(x) trunc_n(g_sbstr_trunc_n)(x) # factory producing a function that returns a covariance # matrix's rows (and columns) having variance > v_min keep_cov_w_var_gtr_min <- function(v_min) { function(x) { if (!is.matrix(x)) return(NULL) keepers <- sapply( X = seq_len(nrow(x)), FUN = function(i) { if (x[i, i] < v_min) NA else x[i, i] } ) names(keepers) <- rownames(x) keepers <- keepers[!is.na(keepers)] keepers <- names(keepers) if (length(keepers) == 0) return(NULL) x[keepers, keepers] } } # function that returns a matrix's rows having variance > 1 keep_cov_w_var_gtr_1 <- keep_cov_w_var_gtr_min(1) # factory producing a function that returns # - either a matrix's rows (rows = TRUE) # - or a matrix's columns (rows = FALSE) # having variance > v_min keep_var_gtr_min <- function(v_min) { function(x, rows = TRUE) { nrowcol <- if (rows) nrow else ncol if (!is.matrix(x)) return(NULL) keepers <- sapply( X = seq_len(nrowcol(x)), FUN = function(i) { row_var <- var( if (rows) x[i, ] else x[, i], na.rm = TRUE ) if (is.na(row_var) || row_var <= v_min) NA else i } ) keepers <- keepers[!is.na(keepers)] if (rows) x[keepers, ] else x[, keepers] } } keep_var_gtr_0 <- keep_var_gtr_min(0) # function drawing heatmap of phosphopeptide intensities # nolint start # squash un-actionable cyclomatic_complexity warning ppep_heatmap <- # nolint end function( m, # matrix with rownames already formatted cutoff, # cutoff used by hm_heading_function hm_heading_function, # construct $ cat heading from m and cutoff hm_main_title, # main title for plot (drawn below heading) suppress_row_dendrogram = TRUE, # set to false to show dendrogram max_peptide_count = # experimental: g_intensity_hm_rows, # values of 50 and 75 worked well master_cex = 1.0, # basis for text sizes margins = NULL, # optional margins (bottom, right) cellnote = NULL, # optional matrix of character; dim = dim(m) adj = 0.5, # adjust text: 0 left, 0.5 middle, 1 right row_scaling = # should row-scaling be applied if possible? g_default_heatmap_row_scaling, ... # passthru to hm2plus or heatmap.2 ) { peptide_count <- 0 # emit the heading for the heatmap if (hm_heading_function(m, cutoff)) { nrow_m <- nrow(m) peptide_count <- min(max_peptide_count, nrow_m) if (nrow_m > 1) { m_margin <- m[peptide_count:1, ] # Margin was heuristically derived to accommodate the widest label row_mchar_max <- max(nchar(rownames(m_margin))) col_mchar_max <- max(nchar(colnames(m_margin))) row_margin <- master_cex * row_mchar_max * 2.6 col_margin <- master_cex * col_mchar_max * 2.6 if (print_trace_messages) cat( sprintf("row_margin = %0.3f; ", row_margin)) if (print_trace_messages) cat( sprintf("col_margin = %0.3f; ", col_margin)) hm_call <- NULL tryCatch( { # set non-argument parameters for hm_call inner function my_row_cex <- master_cex * 200000 / ( (max(nchar(rownames(m_margin)))^2) * g_intensity_hm_rows ) m_hm <- m[peptide_count:1, , drop = FALSE] if (is.null(cellnote)) { cellnote <- matrix("", nrow = nrow(m_hm), ncol = ncol(m_hm)) # nolint start # squash un-actionable "no visible global for ..." cellnote[is.na(m_hm)] <- params$heatMapNAcellNote # nolint end } else { cellnote <- cellnote[peptide_count:1, , drop = FALSE] } # nolint start # squash un-actionable "no visible global for ..." 
if (params$heatMapNAsubstitute) m_hm[is.na(m_hm)] <- 0 # nolint end nrow_m_hm <- nrow(m_hm) ncol_m_hm <- ncol(m_hm) if (nrow_m_hm < 1 || ncol_m_hm < 1) return(peptide_count) # return zero as initialized above my_row_cex <- master_cex * (100 / (2 + row_mchar_max)) my_col_cex <- master_cex * 6 * row_margin / col_margin my_col_adj <- min(my_col_cex, my_row_cex) / my_col_cex my_col_cex <- min(my_col_cex, my_row_cex) col_margin <- sqrt(my_col_adj) * col_margin if (print_trace_messages) cat( sprintf("my_row_cex = %0.3f; ", my_row_cex)) if (print_trace_messages) cat( sprintf("my_col_cex = %0.3f; ", my_col_cex)) if (is.null(margins)) my_margins <- c( (my_col_cex + col_margin), # col (my_row_cex + row_margin) / my_row_cex # row ) else my_margins <- margins if (print_trace_messages) cat( sprintf( "my_margins = c(%s)\n\n", paste(my_margins, collapse = ", ") ) ) my_hm2_cex <- 2 * master_cex my_key_cex <- 0.9 - 0.1 * (g_intensity_hm_rows + nrow_m_hm) / g_intensity_hm_rows my_key_warp <- 1.5 * 22.75 / row_margin my_key_cex <- min(1.10, my_key_warp * my_key_cex) my_hgt_scale <- 3.70 - 0.4 * (max(1, 0.9 * my_row_cex) - 1) my_hgt_scale <- 3.75 # 3.615 my_hgt_scale <- 3.60 # 3.615 if (print_trace_messages) cat_variable(my_hgt_scale, "\n\n", 3) my_warp <- max(0.1, 1.4 * (7.5 + nrow_m) / g_intensity_hm_rows) if (print_trace_messages) cat_variable(my_warp, "\n\n", 3) # added 0.9 heuristically... my_plot_height <- (0.566 + 0.354 * (nrow_m / g_intensity_hm_rows)) * min(my_hgt_scale, my_hgt_scale * my_warp) my_plot_height <- min(3.65, my_plot_height * g_intensity_hm_rows / 50) my_heights <- c( 0.3, # title and top dendrogram my_plot_height, # plot and bottom margin 4.15 - my_hgt_scale, # legend 0.05 + my_hgt_scale - my_plot_height # whitespace below legend ) my_note_cex <- min(0.8, my_row_cex, my_col_cex) if (print_trace_messages) { cat_variable(my_plot_height, "\n\n", 3) cat_variable(4.19 - my_hgt_scale, "\n\n", 3) cat_variable(nrow_m_hm, "; ", 0) cat_variable(ncol_m_hm, "; ", 0) cat_variable(my_row_cex, "; ", 3) cat_variable(my_col_cex, "; ", 3) cat_variable(my_note_cex, "; ", 3) cat_variable(my_key_cex, "\n\n", 3) cat_variable(my_hgt_scale, "; ", 3) cat_variable(my_plot_height, "; ", 3) cat_variable(my_warp, "\n\n", 3) cat_variable(my_heights, "; ", 2) cat_variable(sum(my_heights), "\n\n", 3) } # define hm_call inner function hm_call <- function(x, scaling, title) { my_cex_main <- min(5.0, 220 / nchar(title)) op <- par( cex.main = my_cex_main * master_cex, adj = adj ) if ( !is.null( hm2plus( x, Colv = NA, Rowv = TRUE, cexRow = my_row_cex, cexCol = my_col_cex, dendrogram = "row", las = 1, main = title, key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), cex = my_hm2_cex, key_par = list(cex = my_key_cex), margins = my_margins, widths = c(0.4, 2.6, 1.5), heights = my_heights, mat = matrix( c( c(0, 3, 3), c(2, 1, 1), c(0, 4, 0), c(0, 0, 0) ), nrow = 4, ncol = 3, byrow = TRUE ), na.rm = TRUE, scale = scaling, srtcol = 90, srtrow = 0, xlab = "", cellnote = cellnote, # ref for na.color: # https://cran.r-project.org/web/packages/gplots/gplots.pdf # nolint start # squash un-actionable "no visible global for ..." na.color = params$heatMapNAcellColor, # nolint end notecex = my_note_cex, ... 
) ) ) { if (print_trace_messages) cat( sprintf( "my_heights = c(%s); sum = %0.3f\n\n", paste( sprintf("%0.3f", my_heights), collapse = ", " ), sum(my_heights) ) ) if (print_trace_messages) cat( sprintf("my_key_cex = %0.3f\n\n", my_key_cex) ) if (print_trace_messages) cat( sprintf("my_key_cex/my_heights[3] = %0.3f\n\n", my_key_cex / my_heights[3]) ) if (print_trace_messages) cat( sprintf("my_heights[2]/my_heights[3] = %0.3f\n\n", my_heights[2] / my_heights[3]) ) } par(op) } # invoke hm_call inner function if (row_scaling != "row" || sum(rowSums(!is.na(m_hm)) < 2)) hm_call( m_hm, "none", "log(intensities), unimputed, and unnormalized" ) else hm_call( m_hm, "row", "log(intensities), row-scaled, unimputed, and unnormalized" ) }, error = function(e) { if (!is.null(hm_call)) { m_hm[is.na(m_hm)] <- 0 tryCatch( { if (nrow(m_hm) > 1) hm_call( m_hm, "none", "log(intensities), zero-imputed, unnormalized" ) else cat("\nThere are too few peptides to produce a heatmap.\n") }, error = function(r) { cat( sprintf( "\n%s %s Internal message: %s\n\\newline\n\n", "Failure drawing heatmap, possibly because of ", "too many missing values.\n\\newline\n\n", r$message ) ) cat_margins() } ) } else { cat( "\nFailure drawing heatmap, ", "possibly because of too many missing values.\n" ) } } ) } } # return value: peptide_count } # function drawing heatmap of correlations if they exist, else covariances # nolint start # squash un-actionable cyclomatic_complexity warning # squash un-actionable "no visible global for ..." cov_heatmap <- function( # nolint end m, # matrix with rownames already formatted kinase_name, top_substrates = FALSE, ... # passthru to hm2plus or heatmap.2 ) { if (print_nb_messages) nbe( see_variable(m), " [", nrow(m), "x", ncol(m), "\n") colnames_m <- colnames(m) is_na_m <- is.na(m) tmp <- m tmp[is_na_m] <- 0 tmp <- m[, 0 < colSums(x = tmp)] # by default, na.rm is FALSE colnames_tmp <- colnames(tmp) my_low_p_seq <- seq( from = min(g_intensity_hm_rows, nrow(m)), to = 1, by = -1 ) if (g_correlate_substrates) { # zap samples having zero or near-zero variance tmp[is.na(tmp)] <- 0 nzv <- caret::nearZeroVar( tmp, # matrix of values, samples x variables freqCut = 1.01, # min(freq most prevalent value / # freq second most prevalent) uniqueCut = 99 # max(number of unique values / # total number of samples) ) tmp <- if (length(nzv) > 0) { m[, -nzv, drop = FALSE] } else { m } } else { tmp <- m[my_low_p_seq, , drop = FALSE] } t_m <- t(tmp) t_m[is.na(t_m)] <- 0 my_corcov <- cov(t_m) did_filter_rows <- did_filter_cols <- FALSE if (g_correlate_substrates && !is_positive_definite(my_corcov)) { my_correlate_substrates <- FALSE t_m <- t(m[my_low_p_seq, , drop = FALSE]) t_m[is.na(t_m)] <- 0 unfiltered_row_count <- ncol(t_m) unfiltered_col_count <- nrow(t_m) # zap empty samples t_m <- t_m[0 < rowSums(x = t_m), ] # zap substrates present in fewer than two samples foo <- t_m > 0 foo <- colSums(x = foo) > 1 t_m <- t_m[, foo] did_filter_rows <- unfiltered_row_count > ncol(t_m) did_filter_cols <- unfiltered_col_count > nrow(t_m) colnames_tmp <- rownames(t_m) my_corcov <- cov(t_m) if (g_filter_cov_var_gt_1) { my_corcov <- keep_cov_w_var_gtr_1(my_corcov) } } else if (g_correlate_substrates) { my_corcov <- cov2cor(my_corcov) my_correlate_substrates <- TRUE } else { my_correlate_substrates <- FALSE if (g_filter_cov_var_gt_1) my_corcov <- keep_cov_w_var_gtr_1(my_corcov) } omitted_samples <- colnames_m[colnames_m %notin% colnames_tmp] suffix <- if (length(omitted_samples) > 1) "s" else "" f_omissions <- function(is_corr) { cat( 
sprintf( "Below is the %s plot for %s sites", if (is_corr) "correlation" else "covariance", sprintf( if (top_substrates) "%0.0f \"highest-quality\"" else "%0.0f", ncol(t_m) ) ) ) if (did_filter_cols) { cat(sprintf(", omitting sample%s ", suffix)) latex_collapsed_vector(", ", omitted_samples) } cat(".\n\n") } if (is.null(my_corcov) || sum(!is.na(t_m)) < 2) { cat( sprintf( "\\newline\n%s %s plot.\n", "Insufficient covariance to produce", if (my_correlate_substrates) "correlation" else "covariance" ), "\\newpage\n" ) return(NULL) } cat("\\leavevmode\n", "\\newpage\n") f_omissions(my_correlate_substrates) master_cex <- 0.4 max_nchar <- max(nchar(rownames(t_m))) my_limit <- g_intensity_hm_rows diminution <- sqrt(my_limit / (my_limit + ncol(t_m))) my_row_cex <- my_col_cex <- min(1.75, master_cex * 9 * diminution ^ 1.5) my_margin <- 3 + my_row_cex * 64 / (8 + max_nchar) my_key_cex <- 1.4 my_hm2_cex <- 1.0 * master_cex my_hgt_scale <- 3.50 - 0.26 * (max(0.4, my_key_cex) - 0.4) my_hgt_scale <- 2.7 my_legend_height <- 4.0 - my_hgt_scale my_legend_height <- 0.5 * my_key_cex my_warp <- 0.65 * (my_limit + ncol(t_m)) / my_limit my_warp <- 0.8 my_legend_height <- 0.77 my_legend_height <- 0.67 my_plot_height <- my_hgt_scale + (1 - my_warp) * my_legend_height my_legend_height <- my_warp * my_legend_height parjust <- par(adj = 0.5) on.exit(par(parjust)) my_corcov <- my_corcov[order(rownames(my_corcov)), ] my_main <- sprintf("%s among %s sites %s", if (my_correlate_substrates) "Correlation" else "Covariance", kinase_name, if (!my_correlate_substrates && g_filter_cov_var_gt_1 && did_filter_rows ) "having variance > 1" else "" ) my_main_nchar <- nchar(my_main) my_heights <- c( 0.3, my_plot_height, my_legend_height # was 4.0 - my_hgt_scale # was 4.19 ) if (print_trace_messages) cat(sprintf("max_nchar = %0.3f; ", max_nchar)) if (print_trace_messages) cat(sprintf("my_margin = %0.3f; ", my_margin)) if (print_trace_messages) cat( sprintf("my_plot_height = %0.3f\n\n", my_plot_height)) if (print_trace_messages) cat(sprintf("master_cex = %0.3f; ", master_cex)) if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex)) if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex)) if (print_trace_messages) cat( sprintf("my_key_cex = %0.3f\n\n", my_key_cex)) if (print_trace_messages) cat( sprintf("my_hgt_scale = %0.3f\n\n", my_hgt_scale)) if (print_trace_messages) cat( sprintf("legend height = %0.3f\n\n", my_legend_height)) if (print_trace_messages) cat( sprintf( "my_heights = c(%s); sum = %0.3f\n\n", paste( sprintf("%0.3f", my_heights), collapse = ", " ), sum(my_heights) ) ) op <- par(cex.main = (30 + my_main_nchar) / my_main_nchar) on.exit(par(op)) hm2plus( x = my_corcov, cex = my_hm2_cex, cexCol = my_col_cex, cexRow = my_row_cex, density_info = "none", denhgt = 0.15, denwid = 0.5, divergent = TRUE, key_par = list(cex = my_key_cex), key_xlab = if (my_correlate_substrates) "Correlation" else "Covariance", main = my_main, mat = matrix( c( c(0, 3, 3), c(2, 1, 1), c(0, 4, 0) ), nrow = 3, ncol = 3, byrow = TRUE ), heights = my_heights, margins = c(my_margin, my_margin), widths = c(0.5, 3.1, 0.9), scale = "none", symkey = TRUE, symbreaks = TRUE, symm = FALSE #TODO evaluate TRUE # ... 
) } # end cov_heatmap ### FILE IMPORT # function reading bzipped file to data.frame bzip2df <- function(d, f, ctor = bzfile) { # read.delim file (by default, compressed by bzip2) if (file.exists(f)) { conn <- NULL pf <- parent.frame() tryCatch( assign( as.character(substitute(d)), read.delim(conn <- bzfile(f, open = "r")), pf ), finally = if (!is.null(conn)) close(conn) ) } } ``` # Purpose The purpose of this analysis is to perform for phosphopeptides: - imputation of missing values, - quantile normalization, - ANOVA (using the R stats::`r params$oneWayManyCategories` function), - assignment of an FDR-adjusted $p$-value and a "quality score" to each phosphopeptide, and - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases: - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org) - The Human Proteome Database [http://hprd.org](http://hprd.org) - NetworKIN [http://networkin.science/](http://networkin.science/) - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx) ```{r include = FALSE} if (params$kseaUseAbsoluteLog2FC) { sfc <- "|s|" pfc <- "|p|" pfc_txt <- "$\\text{absolute value}({\\log_2 (\\text{fold-change})})$" } else { sfc <- "s" pfc <- "p" pfc_txt <- "${\\log_2 (\\text{fold-change}})$" } ksea_heatmap_titles <- list() ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <- sprintf( "Summary for all kinases enriched in one or more contrasts at %s < %s", ksea_cutoff_statistic, ksea_cutoff_threshold ) ksea_heatmap_titles[[const_ksea_all_kinases]] <- "Summary figure for all contrasts and all kinases" ksea_heatmap_titles[[const_ksea_nonastrsk_kinases]] <- sprintf( "Summary for all kinases not enriched at %s < %s in any contrast", ksea_cutoff_statistic, ksea_cutoff_threshold ) # hash to hold names of significantly enriched kinases ksea_asterisk_hash <- new_env() # PROCESS (mostly read) PARAMETERS # False discovery rate adjustment for ANOVA # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 val_fdr <- read.table(file = alpha_file, sep = "\t", header = FALSE, quote = "") if ( ncol(val_fdr) != 1 || sum(!is.numeric(val_fdr[, 1])) || sum(val_fdr[, 1] < 0) || sum(val_fdr[, 1] > 1) ) { cat("alphaFile should be one column of numbers within the range [0.0,1.0]") param_df_exit() knitr::knit_exit() } val_fdr <- val_fdr[, 1] ``` ```{r echo = FALSE, results = 'asis'} result <- file.copy( from = preproc_db, to = ksea_app_prep_db, overwrite = TRUE ) if (!result) { write( sprintf( "fatal error copying initial database '%s' to output '%s'", preproc_db, ksea_app_prep_db, ), stderr() ) if (sys.nframe() > 0) { cat("Cannot continue and quit() failed. Goodbye.") param_df_exit() knitr::knit_exit() # in case knit_exit doesn't exit quit(save = "no", status = 1) } } if (FALSE) { write.table(x = param_df, file = "test-data/params.txt") } ``` ```{r echo = FALSE} ### READ DATA # read.table reads a file in table format and creates a data frame from it. # - note that `quote = ""` means that quotation marks are treated literally. 
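# - `check.names = FALSE` (below) keeps the original column headings rather
#   than converting them to syntactically valid R names, so the sample-name
#   regular expressions are applied to the headings exactly as supplied.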
full_data <- read.table( file = input_file, sep = "\t", header = TRUE, quote = "", check.names = FALSE ) ``` # Extraction of Sample Classes and Names from Input Data ```{r echo = FALSE, results = 'asis'} data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) my_column_names <- names(full_data) if (!fdc_is_integer) { if (length(data_column_indices) > 0) { first_data_column <- data_column_indices[1] } else { cat(paste("failed to convert firstDataColumn:", first_data_column)) param_df_exit() knitr::knit_exit() } } cat( sprintf( paste( "\n\nThe input data file has peptide-intensity data", "in columns %d (\"%s\") through %d (\"%s\")." ), tmp <- min(data_column_indices), my_column_names[tmp], tmp <- max(data_column_indices), my_column_names[tmp] ) ) if (TRUE) { cat0( table_offset(i = 1, new = TRUE), "Sample classes and names are shown in ", table_href(), ".\n\n" ) } else { cat0( "\\newcounter{offset}\n", "\\setcounter{offset}{\\value{table}}\n", "\\stepcounter{offset}\n", "Sample classes and names are shown in ", table_href(), ".\n\n" ) } #TODO remove this unused variable and assignment if (FALSE) { # Write column names as a LaTeX enumerated list. column_name_df <- data.frame( column = seq_len(length(colnames(full_data))), name = paste0("\\verb@", colnames(full_data), "@") ) } ``` ```{r echo = FALSE, results = 'asis'} # extract intensity columns from full_data to quant_data quant_data <- full_data[first_data_column:length(full_data)] quant_data[quant_data == 0] <- NA rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide full_data_names <- colnames(quant_data) # Extract factors and trt-replicates using regular expressions. # Typically: # regex_sample_names is "\\.\\d+[A-Z]$" # regex_sample_grouping is "\\d+" # This would distinguish trt-replicates by terminal letter [A-Z] # in sample names and group them into trts by the preceding digits. # e.g.: # group .1A .1B .1C into group 1; # group .2A .2B .2C, into group 2; # etc. 
m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE) sample_name_matches <- regmatches(names(quant_data), m) colnames(quant_data) <- sample_name_matches write_debug_file(quant_data) rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match)) if (print_nb_messages) nbe(see_variable(smpl_trt, "\n\n")) if (print_nb_messages) nbe(see_variable(sample_name_matches, "\n\n")) if (print_nb_messages) nbe(see_variable(full_data_names, "\n\n")) sample_treatment_df <- save_sample_treatment_df <- data.frame( class = smpl_trt, sample = sample_name_matches, full_sample_names = full_data_names ) if (print_nb_messages) nbe(see_variable(sample_treatment_df, "\n\n")) # reorder data my_order <- with(sample_treatment_df, order(class, sample)) quant_data <- quant_data[, my_order] sample_name_matches <- sample_name_matches[my_order] smpl_trt <- smpl_trt[my_order] sample_treatment_df <- data.frame( class = smpl_trt, sample = sample_name_matches ) # filter smpl_trt as appropriate if (sample_group_filter %in% c("include", "exclude")) { include_sample <- mgrepl( v = sample_group_filter_patterns, s = as.character(smpl_trt), fixed = sample_group_filter_fixed, perl = sample_group_filter_perl, ignore.case = sample_group_filter_nocase ) if (sum(include_sample) < 2) { errmsg <- paste( "ERROR:", sum(include_sample), "samples are too few for analysis;", "check input parameters for sample-name parsing" ) cat0( errmsg, "\\stepcounter{offset}\n", " in ", table_href(), ".\n\n" ) data_frame_tabbing_latex( x = save_sample_treatment_df, tabstops = c(1.25, 1.25), caption = "Sample classes", use_subsubsection_header = FALSE ) data_frame_tabbing_latex( x = param_df[ c("regexSampleNames", "regexSampleGrouping", "groupFilterPatterns", "groupFilter", "groupFilterMode" ), ], tabstops = c(1.75), underscore_whack = TRUE, caption = "Input parameters for sample-name parsing", verbatim = FALSE ) param_df_exit() knitr::knit_exit() return(invisible(-1)) } sample_treatment_df <- if (sample_group_filter == "include") sample_treatment_df[include_sample, ] else sample_treatment_df[!include_sample, ] } else { include_sample <- rep.int(TRUE, length(smpl_trt)) } sample_name_matches <- sample_treatment_df$sample rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match)) sample_treatment_df$class <- smpl_trt # filter quant_data as appropriate number_of_samples <- length(sample_name_matches) quant_data <- quant_data[, sample_name_matches] sample_level_integers <- as.integer(smpl_trt) sample_treatment_levels <- levels(smpl_trt) count_of_treatment_levels <- length(sample_treatment_levels) # for each phosphopeptide, across treatment levels, compute minimum # count of observed (i.e., non-missing) values my_env <- new_env() for (l in sample_treatment_levels) my_env[[as.character(l)]] <- as.vector(rowSums(!is.na(quant_data[l == smpl_trt]))) min_group_obs_count <- row_apply( x = Reduce( f = function(l, r) cbind(l, my_env[[r]]), x = sample_treatment_levels, init = c() ), fun = min ) names(min_group_obs_count) <- rownames(quant_data) rm(my_env) # display (possibly-filtered) results cat("\\newpage\n") if (sum(include_sample) > 1) { data_frame_tabbing_latex( x = sample_treatment_df, tabstops = c(1.25), caption = "Sample classes", use_subsubsection_header = FALSE ) } sample_name_grow <- 10 / (10 + max(nchar(sample_name_matches) + 6)) sample_colsep <- 
transition_positions(as.integer(sample_treatment_df$class)) ``` ```{r echo = FALSE, results = 'asis'} cat("\\newpage\n") ``` ## Are the log-transformed sample distributions similar? ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} quant_data[quant_data == 0] <- NA #replace 0 with NA quant_data_log <- log10(quant_data) rownames(quant_data_log) <- rownames(quant_data) colnames(quant_data_log) <- sample_name_matches write_debug_file(quant_data_log) g_ppep_distrib_ctl <- new_env() g_ppep_distrib_ctl$xlab_line <- 3.5 + 11.86 * (0.67 - sample_name_grow) g_ppep_distrib_ctl$mai_bottom <- (0.5 + 3.95 * (0.67 - sample_name_grow)) g_ppep_distrib_ctl$axis <- (0.6 + 0.925 * (0.67 - sample_name_grow)) my_ppep_distrib_bxp <- function( x , sample_name_grow = sample_name_grow , main , varwidth = FALSE , sub = NULL , xlab , ylab , col = const_boxplot_fill , notch = FALSE , ppep_distrib_ctl = g_ppep_distrib_ctl , ... ) { my_xlab_line <- g_ppep_distrib_ctl$xlab_line my_mai_bottom <- g_ppep_distrib_ctl$mai_bottom my_axis <- g_ppep_distrib_ctl$axis if (print_trace_messages) { cat_variable(my_xlab_line, suffix = "; ") cat_variable(my_mai_bottom, suffix = "; ") cat_variable(my_axis, suffix = "\n\n") } old_par <- par( mai = par("mai") + c(my_mai_bottom, 0, 0, 0), cex.axis = my_axis, cex.lab = 1.2 ) tryCatch( { # Vertical plot boxplot( x , las = 2 , col = col , main = main , sub = NULL , ylab = ylab , xlab = NULL , notch = notch , varwidth = varwidth , ... ) title( sub = sub , cex.sub = 1.0 , line = my_xlab_line + 1 ) title( xlab = xlab , line = my_xlab_line ) }, finally = par(old_par) ) } my_ppep_distrib_bxp( x = quant_data_log , sample_name_grow = sample_name_grow , main = "Peptide intensities for each sample" , varwidth = boxplot_varwidth , sub = "Box widths reflect number of peptides for sample" , xlab = "Sample" , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") , col = const_boxplot_fill , notch = FALSE ) cat("\n\n\n\n") ``` ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} if (nrow(quant_data_log) > 1) { quant_data_log_stack <- stack(quant_data_log) ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) + ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + ggplot2::ylab("Probability density") + ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE) } else { cat("No density plot because there are too few peptides.\n\n") } ``` ## Globally, are peptide intensities are approximately unimodal? <!-- # bquote could be used as an alternative to latex2exp::TeX below particularly # and when plotting math expressions generally, at the expense of mastering # another syntax, which hardly seems worthwhile when I need to use TeX # elsewhere; here's an introduction to bquote: # https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ --> ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} # identify the location of missing values fin <- is.finite(as.numeric(as.matrix(quant_data_log))) logvalues <- as.numeric(as.matrix(quant_data_log))[fin] logvalues_density <- density(logvalues) plot( x = logvalues_density, main = latex2exp::TeX( "Smoothed estimated probability density vs. $log_{10}$(peptide intensity)" ), xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), ylab = "Probability density" ) hist( x = as.numeric(as.matrix(quant_data_log)), xlim = c(min(logvalues_density$x), max(logvalues_density$x)), breaks = 100, main = latex2exp::TeX("Frequency vs. 
$log_{10}$(peptide intensity)"), xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") ) ``` # Characterization of Input Data ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} # determine quantile q1 <- quantile(logvalues, probs = mean_percentile)[1] # 1 = row of matrix (ie, phosphopeptide) sds <- row_apply(quant_data_log, sd_finite) if (sum(!is.na(sds)) > 2) { plot( density(sds, na.rm = TRUE) , main = "Smoothed estimated probability density vs. std. deviation" , sub = "(probability estimation made with Gaussian smoothing)" , ylab = "Probability density" ) } else { cat( "At least two non-missing values are required to plot", "probability density.\n\n" ) } ``` ```{r echo = FALSE} # Determine number of cells to impute temp <- quant_data[is.na(quant_data)] # Determine number of values to impute number_to_impute <- length(temp) # Determine percent of missing values pct_missing_values <- round(length(temp) / (length(logvalues) + length(temp)) * 100) ``` ```{r echo = FALSE} # prep for trt-median based imputation ``` # Imputation of Missing Values ```{r echo = FALSE} imp_smry_pot_peptides_before <- nrow(quant_data_log) imp_smry_missing_values_before <- number_to_impute imp_smry_pct_missing <- pct_missing_values ``` ```{r echo = FALSE} #Determine number of cells to impute ``` ```{r echo = FALSE} # Identify which values are missing and need to be imputed ind <- which(is.na(quant_data), arr.ind = TRUE) ``` ```{r echo = FALSE, results = 'asis'} # Apply imputation switch( imputation_method , "group-median" = { quant_data_imp <- quant_data imputation_method_description <- paste("Substitute missing value with", "median peptide-intensity for sample group.\n" ) # Take the accurate ln(x+1) because the data are log-normally distributed # and because median can involve an average of two measurements. quant_data_imp <- log1p(quant_data_imp) for (i in seq_len(count_of_treatment_levels)) { # Determine the columns for this factor-level level_cols <- i == sample_level_integers # Extract those columns lvlsbst <- quant_data_imp[, level_cols, drop = FALSE] # assign to ind the row-column pairs corresponding to each NA ind <- which(is.na(lvlsbst), arr.ind = TRUE) # No group-median exists if there is only one sample # a given ppep has no measurement; otherwise, proceed. if (ncol(lvlsbst) > 1) { the_centers <- row_apply(lvlsbst, median, na.rm = TRUE) for (j in seq_len(nrow(lvlsbst))) { for (k in seq_len(ncol(lvlsbst))) { if (is.na(lvlsbst[j, k])) { lvlsbst[j, k] <- the_centers[j] } } } quant_data_imp[, level_cols] <- lvlsbst } } # Take the accurate e^x - 1 to match scaling of original input. quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) good_rows <- !is.na(rowMeans(quant_data_imp)) } , "median" = { quant_data_imp <- quant_data imputation_method_description <- paste("Substitute missing value with", "median peptide-intensity across all sample classes.\n" ) # Take the accurate ln(x+1) because the data are log-normally distributed # and because median can involve an average of two measurements. quant_data_imp <- log1p(quant_data_imp) quant_data_imp[ind] <- row_apply(quant_data_imp, median, na.rm = TRUE)[ind[, 1]] # Take the accurate e^x - 1 to match scaling of original input. 
quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) good_rows <- !is.nan(rowMeans(quant_data_imp)) } , "mean" = { quant_data_imp <- quant_data imputation_method_description <- paste("Substitute missing value with", "geometric-mean peptide intensity across all sample classes.\n" ) # Take the accurate ln(x+1) because the data are log-normally distributed, # so arguments to mean should be previously transformed. # this will have to be quant_data_imp <- log1p(quant_data_imp) # Assign to NA cells the mean for the row quant_data_imp[ind] <- row_apply(quant_data_imp, mean, na.rm = TRUE)[ind[, 1]] # Take the accurate e^x - 1 to match scaling of original input. quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) good_rows <- !is.nan(rowMeans(quant_data_imp)) } , "random" = { quant_data_imp <- quant_data # sd to be used is the median sd m1 <- median(sds, na.rm = TRUE) * sd_percentile # If you want results to be reproducible, you will want to call # base::set.seed before calling stats::rnorm imputation_method_description <- paste("Substitute each missing value with random intensity", sprintf( "random intensity $N \\sim (%0.2f, %0.2f)$.\n", q1, m1 ) ) cat(sprintf("mean_percentile (from input parameter) is %2.0f\n\n", 100 * mean_percentile)) cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n", sd_percentile)) quant_data_imp[ind] <- 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) quant_data_imp_ln <- log1p(quant_data_imp) good_rows <- !is.nan(rowMeans(quant_data_imp)) } ) quant_data_imp_log10 <- quant_data_imp_ln * const_log10_e if (length(good_rows) < 1) { print("ERROR: Cannot impute data; there are no good rows!") return(-1) } ``` ```{r echo = FALSE, results = 'asis'} cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description) ``` ```{r echo = FALSE} imp_smry_pot_peptides_after <- sum(good_rows) imp_smry_rejected_after <- sum(!good_rows) imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ])) # From ?`%in%`: # %in% is currently defined # as function(x, table) match(x, table, nomatch = 0) > 0 stock_in <- names(good_rows) %in% names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) if (print_nb_messages) nbe(see_variable(stock_in), "\n") explicit_in <- 0 < match( names(good_rows), names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) ) if (print_nb_messages) nbe(see_variable(explicit_in), "\n") great_enough_row_names <- good_rows[ names(good_rows) %in% names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) ] if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n") great_enough_row_names <- great_enough_row_names[great_enough_row_names] if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n") ``` ```{r echo = FALSE, results = 'asis'} # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf tabular_lines_fmt <- paste( "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) " \\leavevmode", " \\caption{Imputation Results}", " \\centering", # \centering centers the table on the page " \\begin{tabular}{l c c c}", " \\hline\\hline", " \\ & potential peptides & missing values & rejected", " peptides \\\\ [0.5ex]", " \\hline", " before imputation & %d & %d (%d\\%s) & \\\\", " after imputation & %d & %d & %d \\\\", " after keep comparable & %d & & %d \\\\ [1ex]", " \\hline", " \\end{tabular}", #" \\label{table:nonlin}", # may be used to refer this table in the text "\\end{table}", sep = "\n" ) tabular_lines <- sprintf( 
tabular_lines_fmt, imp_smry_pot_peptides_before, imp_smry_missing_values_before, imp_smry_pct_missing, "%", imp_smry_pot_peptides_after, imp_smry_missing_values_after, imp_smry_rejected_after, length(great_enough_row_names), imp_smry_pot_peptides_before - length(great_enough_row_names) ) cat(tabular_lines) ``` ```{r filter_good_rows, echo = FALSE} if (print_nb_messages) nbe( "before name extraction, ", see_variable(length(good_rows)), " ", see_variable(good_rows), "\n") good_rows <- names(good_rows[names(great_enough_row_names)]) if (print_nb_messages) nbe( "after name extraction, ", see_variable(length(good_rows)), see_variable(good_rows), "\n") min_group_obs_count <- min_group_obs_count[good_rows] # Zap rows where imputation was insufficiently effective full_data <- full_data [good_rows, ] quant_data <- quant_data [good_rows, ] quant_data_log <- quant_data_log [good_rows, ] if (print_nb_messages) nbe( "before row filter, ", see_variable(nrow(quant_data_imp)), "\n") quant_data_imp <- quant_data_imp[good_rows, ] if (print_nb_messages) nbe( "after row filter, ", see_variable(nrow(quant_data_imp)), "\n") write_debug_file(quant_data_imp) quant_data_imp_good_rows <- quant_data_imp write_debug_file(quant_data_imp_good_rows) ``` ```{r echo = FALSE, results = 'asis'} can_plot_before_after_imp <- TRUE d_combined <- as.numeric( as.matrix( log10(quant_data_imp) ) ) d_original <- as.numeric( as.matrix( log10(quant_data_imp[!is.na(quant_data)]) ) ) if (sum(!is.na(d_original)) > 2) { d_original <- density(d_original) } else { can_plot_before_after_imp <- FALSE } if (can_plot_before_after_imp && sum(is.na(d_combined)) < 1) { d_combined <- density(d_combined) } else { can_plot_before_after_imp <- FALSE } if (sum(is.na(quant_data)) > 0) { # There ARE missing values d_imputed <- as.numeric( as.matrix( log10(quant_data_imp[is.na(quant_data)]) ) ) if (can_plot_before_after_imp && sum(is.na(d_imputed)) < 1) { d_imputed <- (density(d_imputed)) } else { can_plot_before_after_imp <- FALSE } } else { # There are NO missing values d_imputed <- d_combined } ``` ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} zero_sd_rownames <- rownames(quant_data_imp)[ is.na((row_apply(quant_data_imp, sd, na.rm = TRUE)) == 0) ] if (length(zero_sd_rownames) >= nrow(quant_data_imp)) { cat("All peptides have zero standard deviation. 
Cannot continue.") param_df_exit() knitr::knit_exit() } if (length(zero_sd_rownames) > 0) { cat( sprintf( "%d %s %s", length(zero_sd_rownames), "peptides with zero variance", "were removed from statistical consideration" ) ) zap_named_rows <- function(df, nms) { return(df[!(row.names(df) %in% nms), ]) } quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames) quant_data <- zap_named_rows(quant_data, zero_sd_rownames) full_data <- zap_named_rows(full_data, zero_sd_rownames) min_group_obs_count <- min_group_obs_count[names(min_group_obs_count) %notin% zero_sd_rownames] } if (sum(is.na(quant_data)) > 0) { cat("\\leavevmode\\newpage\n") # Copy quant data to x x <- quant_data # x gets to have values of: # - NA for observed values # - 1 for missing values x[is.na(x)] <- 0 x[x > 1] <- NA x[x == 0] <- 1 # Log-transform imputed data # update variable because rows may have been eliminated from quant_data_imp quant_data_imp_log10 <- log10(quant_data_imp) write_debug_file(quant_data_imp_log10) # Set blue_dots to log of quant data or NA for NA quant data blue_dots <- log10(quant_data) # Set red_dots to log of imputed data or NA for observed quant data red_dots <- quant_data_imp_log10 * x count_red <- sum(!is.na(red_dots)) count_blue <- sum(!is.na(blue_dots)) ylim_save <- ylim <- c( min(red_dots, blue_dots, na.rm = TRUE), max(red_dots, blue_dots, na.rm = TRUE) ) show_stripchart <- 50 > (count_red + count_blue) / length(sample_name_matches) if (show_stripchart) { boxplot_sub <- "Light blue = data before imputation; Red = imputed data;" } else { boxplot_sub <- "" } # Vertical plot colnames(blue_dots) <- sample_name_matches my_ppep_distrib_bxp( x = blue_dots , sample_name_grow = sample_name_grow , main = "Peptide intensities after eliminating unusable peptides" , varwidth = boxplot_varwidth , sub = paste(boxplot_sub, "Box widths reflect number of peptides for sample") , xlab = "Sample" , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") , col = const_boxplot_fill , notch = FALSE , ylim = ylim ) if (show_stripchart) { # Points # ref: https://r-charts.com/distribution/add-points-boxplot/ # NA values are not plotted stripchart( blue_dots, # Data method = "jitter", # Random noise jitter = const_stripchart_jitter, pch = 19, # Pch symbols cex = const_stripchart_cex, # Size of symbols reduced col = "lightblue", # Color of the symbol vertical = TRUE, # Vertical mode add = TRUE # Add it over ) stripchart( red_dots, # Data method = "jitter", # Random noise jitter = const_stripchart_jitter, pch = 19, # Pch symbols cex = const_stripchart_cex, # Size of symbols reduced col = "red", # Color of the symbol vertical = TRUE, # Vertical mode add = TRUE # Add it over ) } } ``` ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} if (sum(is.na(quant_data)) > 0) { # show measured values in blue on left half-violin plot cat("\\leavevmode\n\\quad\n\n\\quad\n\n") old_par <- par( mai = par("mai") + c(g_ppep_distrib_ctl$mai_bottom, 0, 0, 0), cex.axis = g_ppep_distrib_ctl$axis, cex.lab = 1.2 ) tryCatch( { vioplot::vioplot( x = lapply(blue_dots, function(x) x[!is.na(x)]), col = "lightblue1", side = "left", plotCentre = "line", ylim = ylim_save, main = "Distributions of observed and imputed data", sub = NULL, las = 2, xlab = NULL, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") ) title( sub = "Light blue = observed data; Pink = imputed data", cex.sub = 1.0, line = g_ppep_distrib_ctl$xlab_line + 1 ) title( xlab = "Sample", line = g_ppep_distrib_ctl$xlab_line ) red_violins <- lapply(red_dots, function(x) 
x[!is.na(x)]) cols_to_delete <- c() for (ix in seq_len(length(red_violins))) { if (length(red_violins[[ix]]) < 1) { cols_to_delete <- c(cols_to_delete, ix) } } # destroy any unimputable columns if (!is.null(cols_to_delete)) { red_violins <- red_violins[-cols_to_delete] } # plot imputed values in red on right half-violin plot vioplot::vioplot( x = red_violins, col = "lightpink1", side = "right", plotCentre = "line", add = TRUE ) }, finally = par(old_par) ) # density plot cat("\\leavevmode\n\n\n\n\n\n\n") if (can_plot_before_after_imp) { ylim <- c( 0, max( if (is.list(d_combined)) d_combined$y else d_combined, if (is.list(d_original)) d_original$y else d_original, if (is.list(d_imputed)) d_imputed$y else d_imputed, na.rm = TRUE ) ) plot( d_combined, ylim = ylim, sub = paste( "Blue = data before imputation; Red = imputed data;", "Black = combined" ), main = "Density of peptide intensity before and after imputation", xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), ylab = "Probability density" ) lines(d_original, col = "blue") lines(d_imputed, col = "red") } else { cat( "There are too few points to plot the density of peptide intensity", "before and after imputation." ) } cat("\\leavevmode\\newpage\n") } ``` # Quantile Normalization The excellent `normalize.quantiles` function from *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)* performs "quantile normalization" as described Bolstad *et al.* (2003), DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)* and its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html), i.e., it assumes that the goal is to detect subtle differences among grossly similar samples (having similar distributions) by equalizing intra-quantile quantitations^[Unfortunately, one software library upon which `preprocessCore` depends *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)* that requires that a specific, non-concurrent version of the library (`openblas` version $0.3.3$) be installed. The installation command equivalent to what was used to install the library to produce the results presented here is: \linebreak ` conda install bioconductor-preprocesscore openblas=0.3.3`]. <!-- # Apply quantile normalization using preprocessCore::normalize.quantiles # --- # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html # except this: https://support.bioconductor.org/p/122925/#9135989 # says to install it like this: # ``` # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) # ``` # conda installation (necessary because of a bug in recent openblas): # conda install bioconductor-preprocesscore openblas=0.3.3 # ... # --- # normalize.quantiles {preprocessCore} -- Quantile Normalization # # Description: # Using a normalization based upon quantiles, this function normalizes a # matrix of probe level intensities. # # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the # assumption that the data is missing at random. # # Usage: # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) # # Arguments: # # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. # # - copy: Make a copy of matrix before normalizing. 
Usually safer to work with a copy, # but in certain situations not making a copy of the matrix, but instead normalizing # it in place will be more memory friendly. # # - keep.names: Boolean option to preserve matrix row and column names in output. # # Details: # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. # No special allowances are made for outliers. If you make use of quantile normalization # please cite Bolstad et al, Bioinformatics (2003). # # This functions will handle missing data (ie NA values), based on # the assumption that the data is missing at random. # # Note that the current implementation optimizes for better memory usage # at the cost of some additional run-time. # # Value: A normalized matrix. # # Author: Ben Bolstad, bmbolstad.com # # References # # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf # # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 # http://bmbolstad.com/misc/normalize/normalize.html # ... --> ```{r echo = FALSE, results = 'asis'} if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp)), "\n") if (nrow(quant_data_imp) > 0) { quant_data_imp_qn <- preprocessCore::normalize.quantiles( as.matrix(quant_data_imp), keep.names = TRUE ) } else { quant_data_imp_qn <- as.matrix(quant_data_imp) } if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn)), "\n") quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) write_debug_file(quant_data_imp_qn) quant_data_imp_qn_log <- log10(quant_data_imp_qn) write_debug_file(quant_data_imp_qn_log) if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n") if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n") quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) sel <- row_apply(quant_data_imp_qn_ls, any_nan) quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ] quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls) write_debug_file(quant_data_imp_qn_ls) write_debug_file(quant_data_imp_qn_ls2) # Create data.frame used by ANOVA analysis data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) ``` ## Are normalized, imputed, log-transformed sample distributions similar? ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} # Save unimputed quant_data_log for plotting below unimputed_quant_data_log <- quant_data_log # log10 transform (after preparing for zero values, # which should never happen...) 
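# (log10 of zero would be -Inf, so any zero intensities are replaced with a
#  negligibly small positive value before the log-transformation just below)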
quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001 quant_data_log <- log10(quant_data_imp_qn) how_many_peptides <- nrow(quant_data_log) if ((how_many_peptides) > 0) { cat( sprintf( "Intensities for %d peptides:\n\n\n", how_many_peptides ) ) cat("\n\n\n") # data visualization if (TRUE) { my_ppep_distrib_bxp( x = quant_data_log , sample_name_grow = sample_name_grow , main = "Peptide intensities for each sample" , varwidth = boxplot_varwidth , sub = NULL , xlab = "Sample" , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") , col = const_boxplot_fill , notch = boxplot_notch ) } else { old_par <- par( mai = par("mai") + c(0.5, 0, 0, 0) , oma = par("oma") + c(0.5, 0, 0, 0) ) # ref: https://r-charts.com/distribution/add-points-boxplot/ # Vertical plot colnames(quant_data_log) <- sample_name_matches boxplot( quant_data_log , las = 2 , cex.axis = 0.9 * sample_name_grow , col = const_boxplot_fill , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") , xlab = "Sample" , notch = boxplot_notch , varwidth = boxplot_varwidth ) par(old_par) } } else { cat("There are no peptides to plot\n") } cat("\n\n\n") ``` ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} if (nrow(quant_data_log) > 1) { quant_data_log_stack <- stack(quant_data_log) ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) + ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + ggplot2::ylab("Probability density") + ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE) } else { cat("No density plot because there are fewer than two peptides to plot.\n\n") } ``` ```{r echo = FALSE, results = 'asis'} cat("\\leavevmode\\newpage\n") ``` # ANOVA Analysis ## Assignment of $p$-value and quality score For each phosphopeptide, ANOVA analysis was performed to compute a $p$-value representing the evidence against the "null hypothesis" ($H_0$) that the intensity does not vary significantly among sample groups. However, as more and more phosphopeptides are tested, there is an increased probability that, by random chance, a given peptide will have a $p$-value that appears to indicate significance. For this reason, the $p$-values were adjusted by applying the False Discovery Rate (FDR) correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https://doi.org/10.1111/j.2517-6161.1995.tb02031.x). Furthermore, to give more weight to phosphopeptides having fewer missing values, an (arbitrarily defined) quality score was assigned to each, defined as: $$ \textit{quality}_j = \frac{1 + o_{j}}{v_{j}(1 + o_{j}) + 0.005} $$ where: - $o_j$ is the minimum, across all sample groups, of the number of non-missing observations of substrate $j$ in a sample group, and - $v_j$ is the FDR-adjusted ANOVA $p$-value for substrate $j$. ```{r, echo = FALSE, results = 'asis'} cat("\\newpage\n") # Make new data frame containing only Phosphopeptides # to connect preANOVA to ANOVA (connect_df) connect_df <- data.frame( data_table_imp_qn_lt$Phosphopeptide , data_table_imp_qn_lt[, first_data_column] ) colnames(connect_df) <- c("Phosphopeptide", "Intensity") ``` ```{r anova, echo = FALSE, fig.dim = c(10, 12), results = 'asis'} g_can_run_ksea <- FALSE old_oma <- par("oma") # nolint start # squash un-actionable cyclomatic_complexity warning if (count_of_treatment_levels < 2) { # nolint end cat( "ERROR!!!!
Cannot perform ANOVA analysis", "(see next page)\\newpage\n" ) cat( "ERROR: ANOVA analysis", "requires two or more factor levels!\n\n\n" ) cat("\n\n\n\n\n") cat("Unparsed sample names are:\n\n\n", "\\begin{quote}\n", paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"), "\n\\end{quote}\n\n") regex_sample_names <- latex_printable_control_seqs(regex_sample_names) cat("\\leavevmode\n\n\n") cat("Parsing rule for SampleNames is", "\n\n\n", "\\text{'", regex_sample_names, "'}\n\n\n", sep = "" ) cat("\nParsed sample names are:\n", "\\begin{quote}\n", paste(sample_name_matches, collapse = "\n\n\n"), "\n\\end{quote}\n\n") regex_sample_grouping <- latex_printable_control_seqs(regex_sample_grouping) cat("\\leavevmode\n\n\n") cat("Parsing rule for SampleGrouping is", "\n\n\n", "\\text{'", regex_sample_grouping, "'}\n\n\n", sep = "" ) cat("\n\n\n") cat("Sample group assignments are:\n", "\\begin{quote}\n", paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"), "\n\\end{quote}\n\n") } else { if (print_nb_messages) nbe("computing p_value_data_anova_ps\n") if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n") if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n") if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log), "\n") if (print_nb_messages) nbe(see_variable(anova_func), "\n") if (print_nb_messages) nbe(see_variable(smpl_trt), "\n") if (print_nb_messages) nbe(see_variable(one_way_all_categories), "\n") tryCatch( { p_value_data_anova_ps <- row_apply( quant_data_imp_qn_log, anova_func, grouping_factor = smpl_trt, one_way_f = one_way_all_categories ) }, error = function(e) { mesg <- paste("Could not compute ANOVA because", e$message) cat("\n\n", mesg, "\n\n") param_df_noexit(e) sink(stderr()) cat("\n\n", mesg, "\n\n") values <- paste(param_df$parameter, "=", param_df$value, collapse = "\n") cat(values) sink() knitr::knit_exit() exit(code = 1) } ) if (print_nb_messages) nbe(see_variable(p_value_data_anova_ps), "\n") p_value_data_anova_ps_fdr <- p.adjust(p_value_data_anova_ps, method = "fdr") my_ppep_v <- full_data[, 1] p_value_data <- list( phosphopeptide = my_ppep_v, raw_anova_p = p_value_data_anova_ps, fdr_adjusted_anova_p = p_value_data_anova_ps_fdr, missing_values = rowSums(is.na(quant_data)), min_group_obs_count = min_group_obs_count ) p_value_data <- data.frame( phosphopeptide = my_ppep_v, raw_anova_p = p_value_data_anova_ps, fdr_adjusted_anova_p = p_value_data_anova_ps_fdr, missing_values = rowSums(is.na(quant_data)), min_group_obs_count = min_group_obs_count ) p_value_data$quality <- 1.0 / with( p_value_data, fdr_adjusted_anova_p + 0.005 / (1 + min_group_obs_count) ) p_value_data$ranking <- with( p_value_data, switch( g_intensity_hm_criteria, "quality" = order(-quality), "na_count" = order(missing_values, fdr_adjusted_anova_p), # the default is "p_value" order(fdr_adjusted_anova_p) ) ) p_value_data <- p_value_data[p_value_data$ranking, , drop = FALSE] write.table( p_value_data, file = "p_value_data.txt", sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE ) # output ANOVA file to constructed filename, # e.g. 
"Outputfile_pST_ANOVA_STEP5.txt" # becomes "Outputfile_pST_ANOVA_STEP5_FDR0.05.txt" # Re-output datasets to include p-values metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:ncol(p_value_data)]) write.table( cbind(metadata_plus_p, quant_data_imp), file = imputed_data_filename, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE ) write.table( cbind(metadata_plus_p, quant_data_imp_qn_log), file = imp_qn_lt_data_filenm, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE ) first_page_suppress <- 1 number_of_peptides_found <- 0 cutoff <- val_fdr[1] for (cutoff in val_fdr) { #loop through FDR cutoffs if (number_of_peptides_found > g_intensity_hm_rows - 1) { cat("\\leavevmode\n\n\n") break } if (print_trace_messages) { cat_variable(cutoff) cat("\n\n") } bool_1 <- (p_value_data$fdr_adjusted_anova_p < cutoff) bool_2 <- (p_value_data$min_group_obs_count >= g_intensity_min_per_class) g_can_run_ksea <- g_can_run_ksea || (sum(bool_2) > 0) bool_4 <- (p_value_data$quality >= params$minQuality) bool_3 <- as.logical( as.integer(bool_1) * as.integer(bool_2) * as.integer(bool_4) ) if (print_trace_messages) { if (length(bool_1) > 30) { cat_variable(bool_1, force_str = TRUE) cat_variable(bool_2, force_str = TRUE) cat_variable(bool_3, force_str = TRUE) } else { cat_variable(bool_1, suffix = "\n\n") cat_variable(bool_2, suffix = "\n\n") cat_variable(bool_3, suffix = "\n\n") } cat_variable(length(bool_3), digits = 0, suffix = "; ") cat_variable(sum(bool_3), digits = 0, suffix = "\n\n") } filtered_p <- p_value_data[bool_3, , drop = FALSE] filtered_p <- filtered_p[!is.na(filtered_p$phosphopeptide), , drop = FALSE] if (print_trace_messages) cat_variable(filtered_p, force_str = TRUE) have_remaining_peptides <- sum(bool_3, na.rm = TRUE) > 0 filter_result_string <- sprintf( "%s, %s of %0.0f peptides remained having both %s and %s.\n\n", "After filtering for ANOVA results", if (have_remaining_peptides) as.character(sum(bool_3, na.rm = TRUE)) else "none", length(bool_3), adj_p_string <- sprintf("adjusted p-value < %s", as.character(signif(cutoff, 2))), sprintf( "more than %0.0f observations in some groups", max(0, g_intensity_min_per_class - 1) ) ) filtered_data_filtered <- quant_data_imp_qn_log[ rownames(filtered_p), , drop = FALSE ] # order by p-value filtered_data_filtered <- filtered_data_filtered[ order(filtered_p$fdr_adjusted_anova_p), , drop = FALSE ] if (have_remaining_peptides && nrow(filtered_p) > 0 && nrow(filtered_data_filtered) > 0 ) { if (first_page_suppress == 1) { first_page_suppress <- 0 } else { cat("\\newpage\n") } latex_samepage({ cat(filter_result_string) if (nrow(filtered_data_filtered) > 1) { cat(subsection_header(sprintf( "Intensity distributions for %d phosphopeptides, %s\n", nrow(filtered_data_filtered), adj_p_string ))) } else { cat( subsection_header( sprintf( "Intensity distribution for one phosphopeptide, %s\n", adj_p_string ) ), rownames(filtered_data_filtered)[1], "\n" ) } }) # end latex_samepage old_par <- par( mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), oma = old_oma * c(1, 1, 0.3, 1), cex.main = 0.9, cex.axis = 0.7, fin = c(9, 7.25) ) # Vertical plot colnames(filtered_data_filtered) <- sample_name_matches tryCatch( boxplot( filtered_data_filtered, main = "Imputed, normalized intensities", # no line plot las = 2, cex.axis = 0.9 * sample_name_grow, col = const_boxplot_fill, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)"), notch = FALSE, varwidth = boxplot_varwidth ), error = function(e) { print(e) cat_margins() } ) par(old_par) } else { 
cat(sprintf( "%s < %0.2f\n\n\n\n\n", "No peptides were found to have cutoff adjusted p-value", cutoff )) } if (have_remaining_peptides && nrow(filtered_data_filtered) > 0) { # Add Phosphopeptide column to anova_filtered table # The assumption here is that the first intensity is unique; # this is a hokey assumption but almost definitely will # be true in the real world unless there is a computation # error upstream. anova_filtered_merge <- base::merge( x = connect_df, y = filtered_data_filtered, by.x = "Intensity", by.y = 1 ) anova_filtered_merge_order <- rownames(filtered_p) anova_filtered <- data.frame( ppep = anova_filtered_merge$Phosphopeptide, intense = anova_filtered_merge$Intensity, data = anova_filtered_merge[, 2:number_of_samples + 1] ) colnames(anova_filtered) <- c("Phosphopeptide", colnames(filtered_data_filtered)) # Merge qualitative columns into the ANOVA data output_table <- data.frame(anova_filtered$Phosphopeptide) output_table <- base::merge( x = output_table, y = data_table_imp_qn_lt, by.x = "anova_filtered.Phosphopeptide", by.y = "Phosphopeptide" ) # Produce heatmap to visualize significance and the effect of imputation cat_hm_heading <- function(m, cutoff) { if (nrow(m) > g_intensity_hm_rows) { cat("\\clearpage\n") cat(subsection_header( paste( sprintf("Heatmap for the %d most-significant peptides", g_intensity_hm_rows), sprintf("whose adjusted p-value < %0.2f\n", cutoff) ) )) } else { if (nrow(m) < 2) { return(FALSE) } else { cat(subsection_header( paste( sprintf("Heatmap for %d usable peptide genes whose", nrow(m)), sprintf("adjusted p-value < %0.2f\n", cutoff) ) )) } } cat("\n\n\n") cat("\n\n\n") return(TRUE) } # construct matrix with appropriate rownames m <- as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) nrow_m <- nrow(m) ncol_m <- ncol(m) if (nrow_m > 0) { rownames_m <- rownames(m) q <- data.frame(pepname = rownames_m) g <- sqldf::sqldf(" SELECT q.pepname, substr(met.Gene_Name, 1, 30) AS gene_name FROM q, metadata_plus_p AS met WHERE q.pepname = met.Phosphopeptide ORDER BY q.rowid ") tmp <- sapply( X = seq_len(nrow(g)), FUN = function(i) { pre <- strsplit(g$gene_name[i], "; ")[[1]] rslt <- paste(unique(pre), sep = "; ") return(rslt) } ) tmp <- unlist(tmp) tmp <- make.names(tmp, unique = TRUE) tmp <- sub( "No_Gene_Name", "not_found", tmp, fixed = TRUE ) ten_trunc_names <- trunc_ppep(rownames_m) tmp <- sapply( X = seq_len(nrow_m), FUN = function(i) { sprintf( "(%s) %s", tmp[i], ten_trunc_names[i] ) } ) rownames(m) <- tmp } # draw the heading and heatmap if (nrow_m > 0) { number_of_peptides_found <- ppep_heatmap( m = m, cutoff = cutoff, hm_heading_function = cat_hm_heading, hm_main_title = "log(intensities), row-scaled, unimputed, unnormalized", suppress_row_dendrogram = FALSE, master_cex = 0.35, sepcolor = "black", colsep = sample_colsep ) if (number_of_peptides_found > 1) { cat("\\leavevmode\n") } } } } } cat("\\leavevmode\n") if (!g_can_run_ksea) { errmsg <- paste("Cannot proceed with KSEA analysis", "because too many values are missing.") if (FALSE) cat0( errmsg, "\\stepcounter{offset}\n", "\\stepcounter{offset}\n", "\\stepcounter{offset}\n", " in ", table_href(), ".\n\n" ) if (FALSE) { if (print_nb_messages) nbe(see_variable(p_value_data)) } else { if (print_nb_messages) nbe(see_variable(p_value_data)) display_p_value_data <- p_value_data display_p_value_data$raw_anova_p <- sprintf("%0.3g", display_p_value_data$raw_anova_p) display_p_value_data$fdr_adjusted_anova_p <- sprintf("%0.3g", display_p_value_data$fdr_adjusted_anova_p) 
display_p_value_data$quality <- sprintf("%0.3g", display_p_value_data$quality) headers_1st_line <- c("", "Raw ANOVA", "FDR-adj.", "Missing", "Min. #", "", "") headers_2nd_line <- c("Phosphopeptide", "p-value", "p-value", "values", "group-obs", "Quality", "Ranking") data_frame_tabbing_latex( x = display_p_value_data, tabstops = c(2.75, 0.80, 0.80, 0.5, 0.6, 0.60), use_subsubsection_header = FALSE, headings = c(headers_1st_line, headers_2nd_line), caption = "ANOVA results" ) } data_frame_tabbing_latex( x = save_sample_treatment_df, tabstops = c(1.25, 1.25), caption = "Sample classes", use_subsubsection_header = FALSE ) param_df_exit() knitr::knit_exit() return(invisible(-1)) } ``` ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} if (g_can_run_ksea && count_of_treatment_levels > 1) { # Prepare two-way contrasts with adjusted p-values # Strategy: # - use imputed, log-transformed data: # - remember this when computing log2(fold-change) # - each contrast is between a combination of trt levels # - for each contrast, compute samples that are members # - compute one-way test: # - use `oneway.test` (Welch test) if numbers of samples # are not equivalent between trt levels # - otherwise, aov is fine but offers no advantage # - adjust p-value, assuming that # (# of pppeps)*(# of contrasts) tests were performed # Each contrast is between a combination of trt levels m2 <- combn( x = seq_len(length(levels(smpl_trt))), m = 2, simplify = TRUE ) contrast_count <- ncol(m2) # For each contrast, compute samples that are members # - local function to construct a data.frame for each contrast # - the contrast in the first "column" f_m2 <- function(cntrst, lvl1, lvl2) { return( data.frame( contrast = cntrst, level = smpl_trt[ smpl_trt %in% levels(smpl_trt)[c(lvl1, lvl2)] ], label = sample_name_matches[ smpl_trt %in% levels(smpl_trt)[c(lvl1, lvl2)] ] ) ) } # - compute a df for each contrast sample_level_dfs <- lapply( X = 1:contrast_count, FUN = function(i) f_m2(i, m2[1, i], m2[2, i]) ) # - compute a single df for all contrasts combined_contrast_df <- Reduce(f = rbind, x = sample_level_dfs) # - dispose objects to free resources rm(sample_level_dfs) # - write the df to a DB for later join-per-contrast db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) RSQLite::dbWriteTable( conn = db, name = "contrast", value = combined_contrast_df, overwrite = TRUE ) # Create UK for insert ddl_exec(db, " CREATE UNIQUE INDEX IF NOT EXISTS contrast__uk__idx ON contrast(contrast, label); " ) # Create indexes for join ddl_exec(db, " -- index for join in contrast_ppep_smpl_qnlt on a.label < b.label CREATE INDEX IF NOT EXISTS contrast__label__idx ON contrast(label); " ) ddl_exec(db, " -- index for joining two contrast_lvl_ppep_avg_quant on contrast CREATE INDEX IF NOT EXISTS contrast__contrast__idx ON contrast(contrast); " ) ddl_exec(db, " -- index for joining two contrast_lvl_ppep_avg_quant on phophospep CREATE INDEX IF NOT EXISTS contrast__level__idx ON contrast(level); " ) # - dispose objects to free resources rm(combined_contrast_df) # Use imputed, log-transformed data # - remember that this was donoe when computing log2(fold-change) # - melt data matrix for use in later join-per-contrast casted <- cbind( data.frame(vrbl = rownames(quant_data_imp_qn_log)), quant_data_imp_qn_log ) quant_data_imp_qn_log_melted <- reshape2::melt( casted, id.vars = "vrbl" ) colnames(quant_data_imp_qn_log_melted) <- c("phosphopep", "sample", "quant") # - dispose objects to free resources rm(casted) # - write the df to a DB for 
use in later join-per-contrast RSQLite::dbWriteTable( conn = db, name = "ppep_smpl_qnlt", value = quant_data_imp_qn_log_melted, overwrite = TRUE ) # Create UK for insert ddl_exec(db, " CREATE UNIQUE INDEX IF NOT EXISTS ppep_smpl_qnlt__uk__idx ON ppep_smpl_qnlt(phosphopep, sample); " ) # Create index for join ddl_exec(db, " -- index for join in contrast_ppep_smpl_qnlt CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__sample__idx ON ppep_smpl_qnlt(sample); " ) ddl_exec(db, " -- index for joining two contrast_lvl_ppep_avg_quant on phopho.pep CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__phosphopep__idx ON ppep_smpl_qnlt(phosphopep); " ) # - dispose objects to free resources rm(quant_data_imp_qn_log_melted) # - drop views if exist ddl_exec(db, " -- drop view dependent on contrast_lvl_ppep_avg_quant DROP VIEW IF EXISTS v_contrast_log2_fc; " ) ddl_exec(db, " -- drop table dependent on contrast_ppep_smpl_qnlt DROP TABLE IF EXISTS contrast_lvl_ppep_avg_quant; " ) ddl_exec(db, " DROP TABLE IF EXISTS contrast_lvl_lvl_metadata; " ) ddl_exec(db, " DROP VIEW IF EXISTS v_contrast_lvl_metadata; " ) ddl_exec(db, " -- drop view dependent on contrast_ppep_smpl_qnlt DROP VIEW IF EXISTS v_contrast_lvl_ppep_avg_quant; " ) ddl_exec(db, " DROP VIEW IF EXISTS v_contrast_lvl_lvl; " ) ddl_exec(db, " -- drop view upon which other views depend DROP VIEW IF EXISTS contrast_ppep_smpl_qnlt; " ) # - create view dml_no_rows_exec(db, " -- view contrast_ppep_smpl_qnlt is used for each phopshopep to -- compute p-value for test of trt effect for two trt levels CREATE VIEW contrast_ppep_smpl_qnlt AS SELECT contrast, level, phosphopep, sample, quant FROM contrast AS c, ppep_smpl_qnlt AS q WHERE q.sample = c.label ORDER BY contrast, level, phosphopep ; " ) # - create simplification views dml_no_rows_exec(db, " CREATE VIEW v_contrast_lvl_metadata AS SELECT contrast, level, group_concat(label, ';') AS samples FROM contrast GROUP BY contrast, level /* view v_contrast_lvl_metadata is used to simplify creation of table contrast_lvl_lvl_metadata */ ; " ) dml_no_rows_exec(db, " CREATE VIEW v_contrast_lvl_ppep_avg_quant AS SELECT contrast, level, phosphopep, avg(quant) AS avg_quant FROM contrast_ppep_smpl_qnlt GROUP BY contrast, level, phosphopep /* view v_contrast_lvl_ppep_avg_quant is used to simplify view v_contrast_log2_fc */ ; " ) # - create contrast-metadata table if (print_nb_messages) nbe("CREATE TABLE contrast_lvl_lvl_metadata") dml_no_rows_exec(db, " CREATE TABLE contrast_lvl_lvl_metadata AS SELECT DISTINCT a.contrast AS ab_contrast, a.level AS a_level, b.level AS b_level, a.samples AS a_samples, b.samples AS b_samples, 'log2(level_'||a.level||'/level_'||b.level||')' AS fc_description FROM v_contrast_lvl_metadata AS a, v_contrast_lvl_metadata AS b WHERE a.contrast = b.contrast AND a.level > b.level /* view v_contrast_lvl_lvl is used to simplify view v_contrast_log2_fc */ ; " ) # - create pseudo-materialized view table dml_no_rows_exec(db, " CREATE VIEW v_contrast_lvl_lvl AS SELECT DISTINCT a.contrast AS ab_contrast, a.level AS a_level, b.level AS b_level FROM contrast AS a, contrast AS b WHERE a.contrast = b.contrast AND a.level > b.level /* view v_contrast_lvl_lvl is used to simplify view v_contrast_log2_fc */ ; " ) # - create view to compute log2(fold-change) dml_no_rows_exec(db, " CREATE VIEW v_contrast_log2_fc AS SELECT ab.ab_contrast AS contrast, m.a_level AS a_level, c.avg_quant AS a_quant, m.a_samples AS a_samples, ab.b_level AS b_level, d.avg_quant AS b_quant, m.b_samples AS b_samples, m.fc_description AS fc_description, 3.32193 
* ( d.avg_quant - c.avg_quant) AS log2_fc, d.phosphopep AS phosphopep FROM contrast_lvl_lvl_metadata AS m, v_contrast_lvl_ppep_avg_quant AS d, v_contrast_lvl_lvl AS ab INNER JOIN v_contrast_lvl_ppep_avg_quant AS c ON c.contrast = ab.ab_contrast AND c.level = ab.a_level WHERE d.contrast = ab.ab_contrast AND m.ab_contrast = ab.ab_contrast AND d.level = ab.b_level AND d.phosphopep = c.phosphopep /* view to compute log2(fold-change) */ ; " ) # For each contrast, compute samples that are members # compute one-way test: # - use `oneway.test` (Welch test) if numbers of samples # are not equivalent between trt levels # - otherwise, aov is fine but offers no advantage for (contrast in contrast_count:2) { invisible(contrast) } for (contrast in 1:contrast_count) { contrast_df <- sqldf::sqldf( x = paste0(" SELECT level, phosphopep, sample, quant FROM contrast_ppep_smpl_qnlt WHERE contrast = ", contrast, " ORDER BY phosphopep, level, sample "), connection = db ) contrast_cast <- reshape2::dcast( data = contrast_df, formula = phosphopep ~ sample, value.var = "quant" ) contrast_cast_ncol <- ncol(contrast_cast) contrast_cast_data <- contrast_cast[, 2:contrast_cast_ncol] contrast_cast_samples <- colnames(contrast_cast_data) # - order grouping_factor by order of sample columns of contrast_cast_data grouping_factor <- sqldf::sqldf( x = paste0(" SELECT sample, level FROM contrast_ppep_smpl_qnlt WHERE contrast = ", contrast, " ORDER BY phosphopep, level, sample LIMIT ", contrast_cast_ncol - 1 ), connection = db ) rownames(grouping_factor) <- grouping_factor$sample grouping_factor <- grouping_factor[, "level", drop = FALSE] # - run the two-level (one-way) test p_value_data_contrast_ps <- row_apply( x = contrast_cast_data, fun = anova_func, grouping_factor = as.factor(grouping_factor$level), # anova_func arg2 one_way_f = one_way_two_categories, # anova_func arg3 simplify = TRUE # TRUE is the default for simplify ) if (!is.null(p_value_data_contrast_ps)) { contrast_data_adj_p_values <- p.adjust( p = p_value_data_contrast_ps, method = "fdr", n = length(p_value_data_contrast_ps) # this is the default, length(p) ) # - compute the fold-change contrast_p_df <- data.frame( contrast = contrast, phosphopep = contrast_cast$phosphopep, p_value_raw = p_value_data_contrast_ps, p_value_adj = contrast_data_adj_p_values ) db_write_table_overwrite <- (contrast < 2) db_write_table_append <- !db_write_table_overwrite RSQLite::dbWriteTable( conn = db, name = "contrast_ppep_p_val", value = contrast_p_df, append = db_write_table_append ) # Create UK for insert ddl_exec(db, " CREATE UNIQUE INDEX IF NOT EXISTS contrast_ppep_p_val__uk__idx ON contrast_ppep_p_val(phosphopep, contrast); " ) } } # Perhaps this could be done more elegantly using unique keys # or creating the tables before saving data to them, but this # is fast and, if the database exists on disk rather than in # memory, it doesn't stress memory. 
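# Note on the constant 3.32193 used in view v_contrast_log2_fc above: the
# averaged quantities are log10-scale intensities, and
# log2(a/b) = (log10(a) - log10(b)) * log2(10), where log2(10) ~= 3.32193.
# For example, a difference of 0.3 between log10-scale group means corresponds
# to a log2 fold-change of about 1.0, i.e., roughly a two-fold change.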
dml_no_rows_exec(db, " CREATE TEMP table contrast_log2_fc AS SELECT * FROM v_contrast_log2_fc ORDER BY contrast, phosphopep ; " ) dml_no_rows_exec(db, " CREATE TEMP table ppep_p_val AS SELECT p_value_raw, p_value_adj, contrast AS p_val_contrast, phosphopep AS p_val_ppep FROM contrast_ppep_p_val ORDER BY contrast, phosphopep ; " ) dml_no_rows_exec(db, " DROP TABLE IF EXISTS contrast_log2_fc_p_val ; " ) dml_no_rows_exec(db, " CREATE TABLE contrast_log2_fc_p_val AS SELECT a.*, b.p_value_raw, b.p_value_adj, b.p_val_contrast, b.p_val_ppep FROM contrast_log2_fc a, ppep_p_val b WHERE a.rowid = b.rowid AND a.phosphopep = b.p_val_ppep ; " ) # Create UK ddl_exec(db, " CREATE UNIQUE INDEX IF NOT EXISTS contrast_log2_fc_p_val__uk__idx ON contrast_log2_fc_p_val(phosphopep, contrast); " ) # Create indices for future queries ddl_exec(db, " CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__contrast__idx ON contrast_log2_fc_p_val(contrast); " ) ddl_exec(db, " CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__phosphopep__idx ON contrast_log2_fc_p_val(phosphopep); " ) ddl_exec(db, " CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_raw__idx ON contrast_log2_fc_p_val(p_value_raw); " ) ddl_exec(db, " CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_adj__idx ON contrast_log2_fc_p_val(p_value_adj); " ) dml_no_rows_exec(db, " DROP VIEW IF EXISTS v_contrast_log2_fc_p_val ; " ) dml_no_rows_exec(db, " CREATE VIEW v_contrast_log2_fc_p_val AS SELECT contrast, a_level, a_samples, b_level, b_samples, a_quant, b_quant, fc_description, log2_fc, p_value_raw, p_value_adj, phosphopep FROM contrast_log2_fc_p_val ORDER BY contrast, phosphopep ; " ) ddl_exec(db, " DROP TABLE IF EXISTS kseaapp_metadata ; " ) dml_no_rows_exec(db, " CREATE TABLE kseaapp_metadata AS WITH extended(deppep, ppep, gene_name, uniprot_id, phosphoresidue) AS ( SELECT DISTINCT deppep.seq, ppep.seq, GeneName||';', UniProtID||';', PhosphoResidue||';' FROM ppep, deppep, mrgfltr_metadata WHERE mrgfltr_metadata.ppep_id = ppep.id AND ppep.deppep_id = deppep.id ) SELECT ppep AS `ppep`, SUBSTR(uniprot_id, 1, INSTR(uniprot_id,';') - 1 ) AS `Protein`, SUBSTR(gene_name, 1, INSTR(gene_name,';') - 1 ) AS `Gene`, deppep AS `Peptide`, REPLACE( REPLACE( SUBSTR(phosphoresidue, 1, INSTR(phosphoresidue,';') - 1 ), 'p', '' ), ', ', ';' ) AS `Residue.Both` FROM extended ; " ) # Create indexes for join ddl_exec(db, " CREATE INDEX IF NOT EXISTS kseaapp_metadata__ppep__idx ON kseaapp_metadata(ppep); " ) ddl_exec(db, " DROP VIEW IF EXISTS v_kseaapp_contrast ; " ) dml_no_rows_exec(db, " CREATE VIEW v_kseaapp_contrast AS SELECT a.*, b.Protein, b.Gene, b.Peptide, b.`Residue.Both` FROM v_contrast_log2_fc_p_val a, kseaapp_metadata b WHERE b.ppep = a.phosphopep ; " ) ddl_exec(db, " DROP VIEW IF EXISTS v_kseaapp_input ; " ) dml_no_rows_exec(db, " CREATE VIEW v_kseaapp_input AS SELECT v.contrast, v.phosphopep, m.`Protein`, m.`Gene`, m.`Peptide`, m.`Residue.Both`, v.p_value_raw AS `p`, v.log2_fc AS `FC` FROM kseaapp_metadata AS m, v_contrast_log2_fc_p_val AS v WHERE m.ppep = v.phosphopep AND NOT m.`Gene` = 'No_Gene_Name' AND NOT v.log2_fc = 0 ; " ) # We are done with DDL and insertion RSQLite::dbDisconnect(db) } ``` ```{r echo = FALSE, results = 'asis'} cat("\\newpage\n") ``` # KSEA Analysis Summaries Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. 
Enrichments are found by the CRAN `KSEAapp` package:

- The package is available on CRAN, at https://cran.r-project.org/package=KSEAapp
- The method used is described in Casado et al. (2013)
  [doi:10.1126/scisignal.2003573](https://doi.org/10.1126/scisignal.2003573)
  and Wiredja et al. (2017)
  [doi:10.1093/bioinformatics/btx415](https://doi.org/10.1093/bioinformatics/btx415).
- An online alternative (supporting only analysis of human data) is available at
  [https://casecpb.shinyapps.io/ksea/](https://casecpb.shinyapps.io/ksea/).

For each kinase, $i$, and each two-way contrast of treatments, $j$, an
enrichment $z$-score is computed as:

$$ \text{kinase enrichment }z\text{-score}_{j,i} = \frac{(\overline{`r sfc`}_{j,i} - \overline{`r pfc`}_j)\sqrt{m_{j,i}}}{\delta_j} $$

and fold-enrichment is computed as:

$$ \text{Enrichment}_{j,i} = \frac{\overline{`r sfc`}_{j,i}}{\overline{`r pfc`}_j} $$

where:

- $\overline{`r sfc`}_{j,i}$ is the mean `r pfc_txt` in intensities of known
  substrates of kinase $i$ in contrast $j$,
- $\overline{`r pfc`}_j$ is the mean `r pfc_txt` of all phosphosites identified
  in contrast $j$, and
- $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$
  identified in contrast $j$,
- $\delta_j$ is the standard deviation of the $\log_2 (\text{fold-change})$
  for contrast $j$ across all phosphosites in the dataset.
- Note that the absolute value of fold-change is used so that both increased
  and decreased substrates of a kinase will contribute to its enrichment score.

$\text{FDR}_{j,i}$ is the False Discovery Rate-corrected kinase enrichment
score.

Color intensity in heatmaps reflects the magnitude of the $z$-score for
enrichment of the respective kinase in the respective contrast; hue reflects
the sign of the $z$-score (blue, negative; red, positive).
Asterisks in heatmaps mark enrichments that are significant at
`r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`.

- Kinase names are generally as presented at Phospho.ELM
  [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html)
  (when available), although Phospho.ELM data are not yet incorporated into
  this analysis.
- Kinase names having the suffix '(HPRD)' are as presented at
  [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and
  [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are
  as originally reported in Amanchy et al. (2007)
  (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)).
- Kinase-substrate data were also taken from
  [http://networkin.science/download.shtml](http://networkin.science/download.shtml)
  and from PhosphoSitePlus
  [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads).

For each enriched kinase, a heatmap showing the intensities is presented for
up to `r g_intensity_hm_rows` substrates, i.e., those substrates having the
highest "quality". Where possible, a heatmap of the correlations among the
selected substrates is also presented; if correlations cannot be computed
(because of too many missing values), then the covariances are heatmapped
for substrates having a variance greater than 1.
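
To make the computation above concrete, the following minimal sketch (display
only; it is not evaluated when this report is rendered, and all object names
in it are hypothetical) shows how the enrichment $z$-score and fold-enrichment
would be derived for one kinase in one contrast, assuming the
absolute-log2-fold-change option described above:

```r
# Hypothetical per-site log2 fold-changes for one contrast, and the subset of
# sites annotated as substrates of one kinase; in this report the real values
# come from the `v_kseaapp_input` view and the kinase-substrate tables built
# by the preceding code.
site_log2_fc <- c(s1 = 0.8, s2 = -1.2, s3 = 0.1, s4 = 2.0, s5 = -0.4)
substrates_of_kinase <- c("s1", "s2", "s4")

abs_fc <- abs(site_log2_fc)                  # kseaUseAbsoluteLog2FC = TRUE
mean_s <- mean(abs_fc[substrates_of_kinase]) # mean |log2FC| of the substrates
mean_p <- mean(abs_fc)                       # mean |log2FC| of all phosphosites
m      <- length(substrates_of_kinase)       # substrate count for this kinase
delta  <- sd(site_log2_fc)                   # sd of log2FC over all phosphosites

z_score         <- (mean_s - mean_p) * sqrt(m) / delta
fold_enrichment <- mean_s / mean_p
```

In the report itself, these scores are produced by the `ksea_scores` call in
the next code chunk, using the per-contrast data assembled in the
`v_kseaapp_input` view.
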
```{r ksea, echo = FALSE, fig.dim = c(12, 14.5), results = 'asis'} cat("\\clearpage\n") db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) # -- eliminate the table that's about to be defined ddl_exec(db, " DROP TABLE IF EXISTS site_metadata; ") # -- define the site_metadata table ddl_exec(db, " CREATE TABLE site_metadata( id INTEGER PRIMARY KEY , site_type_id INTEGER REFERENCES site_type(id) , full TEXT UNIQUE ON CONFLICT IGNORE , abbrev TEXT , pattern TEXT , motif TEXT ); ") # -- populate the table with initial values ddl_exec(db, " INSERT INTO site_metadata(full, abbrev, site_type_id) SELECT DISTINCT kinase_map, kinase_map, site_type_id FROM ppep_gene_site ORDER BY kinase_map; ") # -- drop bogus KSData view if exists ddl_exec(db, " DROP VIEW IF EXISTS ks_data_v; ") # -- create view to serve as an impostor for KSEAapp::KSData ddl_exec(db, " CREATE VIEW IF NOT EXISTS ks_data_v AS SELECT 'NA' AS KINASE, 'NA' AS KIN_ACC_ID, kinase_map AS GENE, 'NA' AS KIN_ORGANISM, 'NA' AS SUBSTRATE, 0 AS SUB_GENE_ID, 'NA' AS SUB_ACC_ID, gene_names AS SUB_GENE, 'NA' AS SUB_ORGANISM, phospho_peptide AS SUB_MOD_RSD, 0 AS SITE_GROUP_ID, 'NA' AS 'SITE_7AA', 2 AS networkin_score, type_name AS Source FROM ppep_gene_site_view; ") contrast_metadata_df <- sqldf::sqldf("select * from contrast_lvl_lvl_metadata", connection = db) rslt <- new_env() rslt$score_list <- list() rslt$name_list <- list() rslt$longname_list <- list() ddl_exec(db, " DROP TABLE IF EXISTS contrast_ksea_scores; " ) next_index <- 0 err_na_subscr_df_const <- "missing values are not allowed in subscripted assignments of data frames" for (i_cntrst in seq_len(nrow(contrast_metadata_df))) { cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) contrast_longlabel <- ( sprintf( "Class %s -> Class %s", contrast_metadata_df[i_cntrst, "b_level"], contrast_metadata_df[i_cntrst, "a_level"] ) ) kseaapp_input <- sqldf::sqldf( x = sprintf(" SELECT `Protein`,`Gene`,`Peptide`,phosphopep AS `Residue.Both`,`p`,`FC` FROM v_kseaapp_input WHERE contrast = %d ", i_cntrst ), connection = db ) pseudo_ksdata <- dbReadTable(db, "ks_data_v") # This hack is because SQL table has the log2-transformed values kseaapp_input[, "FC"] <- 2 ** kseaapp_input[, "FC", drop = TRUE] main_title <- ( sprintf( "Change from treatment %s to treatment %s", contrast_metadata_df[i_cntrst, "b_level"], contrast_metadata_df[i_cntrst, "a_level"] ) ) sub_title <- contrast_longlabel tryCatch( expr = { ksea_scores_rslt <- ksea_scores( ksdata = pseudo_ksdata, px = kseaapp_input, networkin = TRUE, networkin_cutoff = 2, minimum_substrate_count = ksea_min_substrate_count ) if (FALSE) { ksea_scores_rslt <- ksea_scores_rslt[ ksea_scores_rslt$m >= ksea_min_substrate_count, , drop = FALSE ] } if (FALSE) { if (print_nb_messages) nbe( "Output contents of `ksea_scores_rslt` table\n") cat_variable(ksea_scores_rslt) cat("\n\\clearpage\n") data_frame_tabbing_latex( x = ksea_scores_rslt, tabstops = c(1.8, 0.8, 0.8, 0.3, 0.8, 0.8), caption = paste("KSEA scores for contrast ", cntrst_b_level, "to", cntrst_a_level), use_subsubsection_header = FALSE ) } if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { next_index <- 1 + next_index rslt$score_list[[next_index]] <- ksea_scores_rslt rslt$name_list[[next_index]] <- contrast_label rslt$longname_list[[next_index]] <- contrast_longlabel ksea_low_fdr_print( rslt = rslt, i_cntrst = 
i_cntrst, i = next_index, a_level = cntrst_a_level, b_level = cntrst_b_level, fold_change = cntrst_fold_change, caption = contrast_longlabel ) } }, error = function(e) { str(e) cat_margins() } ) } plotted_kinases <- NULL if (g_can_run_ksea && length(rslt$score_list) > 1) { for (i in seq_len(length(ksea_heatmap_titles))) { hdr <- ksea_heatmap_titles[[i]] which_kinases <- i cat("\\clearpage\n\\begin{center}\n") if (i == const_ksea_astrsk_kinases) { cat(subsection_header(hdr)) } else { cat(subsection_header(hdr)) } cat("\\end{center}\n") plotted_kinases <- ksea_heatmap( # the data frame outputs from the KSEA.Scores() function, in list format score_list = rslt$score_list, # a character vector of all the sample names for heatmap annotation: # - the names must be in the same order as the data in score_list # - please avoid long names, as they may get cropped in the final image sample_labels = rslt$name_list, # character string of either "p.value" or "FDR" indicating the data column # to use for marking statistically significant scores stats = c("p.value", "FDR")[2], # a numeric value between 0 and infinity indicating the min. number of # substrates a kinase must have to be included in the heatmap m_cutoff = 1, # a numeric value between 0 and 1 indicating the p-value/FDR cutoff # for indicating significant kinases in the heatmap p_cutoff = params$kseaCutoffThreshold, # a binary input of TRUE or FALSE, indicating whether or not to perform # hierarchical clustering of the sample columns sample_cluster = TRUE, # a binary input of TRUE or FALSE, indicating whether or not to export # the heatmap as a .png image into the working directory export = FALSE, # additional arguments to gplots::heatmap.2, such as: # - main: main title of plot # - xlab: x-axis label # - ylab: y-axis label xlab = "Contrast", ylab = "Kinase", # print which kinases: # - 1 : all kinases # - 2 : significant kinases # - 3 : non-significant kinases which_kinases = which_kinases, margins = c(7, 15) ) if (!is.null(plotted_kinases)) { cat("\\begin{center}\n") if (which_kinases != const_ksea_nonastrsk_kinases) cat("Asterisks reflect significance.\n") cat("\\end{center}\n") } } # end for (i in ... } # end if (length ... 
``` ```{r kseabar_calc, echo = FALSE, fig.dim = c(9.5, 6), results = 'asis'} ksea_prints <- list() ksea_barplots <- list() for (i_cntrst in seq_len(length(rslt$score_list))) { next_index <- i_cntrst cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) contrast_longlabel <- ( sprintf( "Class %s -> Class %s", contrast_metadata_df[i_cntrst, "b_level"], contrast_metadata_df[i_cntrst, "a_level"] ) ) main_title <- ( sprintf( "Change from treatment %s to treatment %s", contrast_metadata_df[i_cntrst, "b_level"], contrast_metadata_df[i_cntrst, "a_level"] ) ) sub_title <- contrast_longlabel tryCatch( expr = { ksea_scores_rslt <- rslt$score_list[[next_index]] if (print_nb_messages) nbe(see_variable(ksea_scores_rslt)) if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { sink(deferred <- file()) ksea_low_fdr_print( rslt = rslt, i_cntrst = i_cntrst, i = next_index, a_level = cntrst_a_level, b_level = cntrst_b_level, fold_change = cntrst_fold_change, caption = contrast_longlabel, write_db = FALSE, anchor = const_table_anchor_t ) cat("\n") sink() lines <- paste( readLines(deferred, warn = FALSE), collapse = "\n" ) close(deferred) sq_put(ksea_prints, lines) sink(stderr()) cat("\n---\n") cat_variable(ksea_prints) barplot_closure <- ksea_low_fdr_barplot_factory( rslt = rslt, i_cntrst = i_cntrst, i = next_index, a_level = cntrst_a_level, b_level = cntrst_b_level, fold_change = cntrst_fold_change, caption = contrast_longlabel ) if (rlang::is_closure(barplot_closure)) sq_put(ksea_barplots, barplot_closure) else sq_put(ksea_barplots, no_op) str(ksea_barplots) cat("\n...\n") sink() } }, error = function(e) { str(e) cat_margins() } ) } ``` ```{r phosphoelm_kinase_upid_desc, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'} have_kinase_descriptions <- if (!is.null(bzip2df(kinase_uprt_desc_lut, kinase_uprt_desc_lut_bz2)) && !is.null(bzip2df(kinase_name_uprt_lut, kinase_name_uprt_lut_bz2)) ) { rownames(kinase_uprt_desc_lut) <- kinase_uprt_desc_lut$UniProtID kinase_name_to_desc_uprt <- function(s) { rslt <- NULL tryCatch( { which_rows <- eval(s == kinase_name_uprt_lut$kinase) kinase_uprtid <- kinase_name_uprt_lut[which_rows, 2] # filter for first _HUMAN match if any grepl_human <- grepl("_HUMAN$", kinase_uprtid) if (0 < sum(grepl_human)) kinase_uprtid <- kinase_uprtid[grepl_human] # filter for first match if any if (0 < length(kinase_uprtid)) { kinase_uprtid <- kinase_uprtid[1] kinase_desc <- kinase_uprt_desc_lut[kinase_uprtid, 2] if (!is.na(kinase_desc)) rslt <- c(kinase_desc, kinase_uprtid) else rslt <- c(kinase_desc, "") } }, warning = str ) rslt } TRUE } else { kinase_name_to_desc_uprt <- function(s) NULL FALSE } ``` ```{r write_params, echo = FALSE, results = 'asis'} # perhaps this should be moved into the functions section, eventually ... 
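# write_params(): records the input parameters twice, once as the
# "Input parameter settings" table in the rendered report and once as rows
# of the `script_parameter` table in the SQLite output database.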
write_params <- function(db) { # write parameters to report # write parameters to SQLite output mqppep_anova_script_param_df <- data.frame( script = "mqppep_anova_script.Rmd", parameter = names(param_unlist), value = param_unlist ) ddl_exec(db, " DROP TABLE IF EXISTS script_parameter; " ) ddl_exec(db, " CREATE TABLE IF NOT EXISTS script_parameter( script TEXT, parameter TEXT, value ANY, UNIQUE (script, parameter) ON CONFLICT REPLACE ) ; " ) RSQLite::dbWriteTable( conn = db, name = "script_parameter", value = mqppep_anova_script_param_df, append = TRUE ) loaded_packages_df <- sessioninfo::package_info("loaded") loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library) loaded_packages_df <- data.frame( package = loaded_packages_df$package, version = loaded_packages_df$loadedversion, date = loaded_packages_df$date ) cat("\\clearpage\n\\section{Input parameter settings}\n") data_frame_tabbing_latex( x = param_df, tabstops = c(1.75), underscore_whack = TRUE, caption = "Input parameters", verbatim = FALSE ) } if (!have_kinase_descriptions) { write_params(db) # We are done with output RSQLite::dbDisconnect(db) param_df_exit() knitr::knit_exit() } ``` ```{r kseabar, echo = FALSE, fig.dim = c(9.5, 12.3), results = 'asis'} # nolint start # squash un-actionable cyclomatic_complexity warning if (have_kinase_descriptions) { # nolint end my_section_header <- sprintf( "inases whose KSEA %s < %s\n", ksea_cutoff_statistic, signif(ksea_cutoff_threshold, 2) ) # Use enriched kinases to find enriched kinase-substrate pairs enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash)) enriched_kinase_descs <- Reduce( f = function(l, r) { lkup <- kinase_name_to_desc_uprt(r) if (is.null(lkup)) l else r2 <- rbind( l, data.frame( kinase = r, uniprot_id = lkup[2], description = lkup[1] ) ) }, x = enriched_kinases$kinase, init = NULL ) if (length(enriched_kinase_descs) > 0 && nrow(enriched_kinase_descs) > 0) { cat("\n\\clearpage\n") data_frame_tabbing_latex( x = enriched_kinase_descs, tabstops = c(0.9, 1.3), headings = c("Kinase", "UniProt ID", "Description"), caption = paste0("Descriptions of k", my_section_header) ) } if (FALSE) { cat_variable(sqldf::sqldf("SELECT kinase FROM enriched_kinases")) cat_variable(sqldf::sqldf(" SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep FROM pseudo_ksdata WHERE gene IN (SELECT kinase FROM enriched_kinases) ")) data_frame_table_latex( x = sqldf::sqldf(" SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep FROM pseudo_ksdata WHERE gene IN (SELECT kinase FROM enriched_kinases) "), justification = "l l l", centered = TRUE, caption = "substrates of enriched kinases", anchor = c(const_table_anchor_p, const_table_anchor_t), underscore_whack = TRUE ) data_frame_table_latex( x = sqldf::sqldf(" SELECT gene AS kinase, ppep, sub_gene, '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label, fdr_adjusted_anova_p, quality, min_group_obs_count FROM ( SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep FROM pseudo_ksdata WHERE gene IN (SELECT kinase FROM enriched_kinases) ), p_value_data WHERE ppep = phosphopeptide GROUP BY kinase, ppep ORDER BY kinase, p_value_data.quality DESC "), justification = "l l l l l l l", centered = TRUE, caption = "labeled substrates of enriched kinases", anchor = c(const_table_anchor_p, const_table_anchor_t), underscore_whack = TRUE ) } all_enriched_substrates <- sqldf::sqldf(" SELECT gene AS kinase, ppep, sub_gene, '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label, fdr_adjusted_anova_p, quality, min_group_obs_count FROM ( SELECT 
DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep FROM pseudo_ksdata WHERE gene IN (SELECT kinase FROM enriched_kinases) ), p_value_data WHERE ppep = phosphopeptide GROUP BY kinase, ppep ORDER BY kinase, ppep, p_value_data.quality DESC ") all_enriched_substrates <- all_enriched_substrates[ all_enriched_substrates$quality >= params$minQuality, , drop = FALSE ] all_enriched_substrates$sub_gene <- sub(" ///.*", "...", all_enriched_substrates$sub_gene ) all_enriched_substrates$label <- with( all_enriched_substrates, sprintf( "(%s-%s) %s", kinase, trunc_subgene(sub_gene), ppep ) ) # this global is set to TRUE by cat_enriched_heading immediately below g_neednewpage <- FALSE # helper used to label per-kinase substrate enrichment figure cat_enriched_heading <- function(m, cut_args) { cutoff <- cut_args$cutoff kinase <- cut_args$kinase if (g_neednewpage) cat("\\newpage\n") g_neednewpage <- TRUE if (nrow(m) > g_intensity_hm_rows) { cat(subsection_header( sprintf( "Highest-quality %d (of %d) enriched %s-sites", g_intensity_hm_rows, nrow(m), kinase ) )) } else { if (nrow(m) == 0) { return(FALSE) } else { nrow_m <- nrow(m) cat(subsection_header( sprintf( "%d enriched %s-site%s", nrow_m, kinase, if (nrow_m > 1) "s" else "" ) )) } } cat("\n\n\n") cat("\n\n\n") return(TRUE) } # -------------------------------- # hack begin - show all substrates enriched_substrates <- all_enriched_substrates # add "FALSE &&" to prevent listing of substrates if (show_enriched_substrates && nrow(enriched_substrates) > 0) { short_row_names <- sub( "$FAILED_MATCH_GENE_NAME", "not_found", enriched_substrates$sub_gene, fixed = TRUE ) if (print_nb_messages) nbe(see_variable(enriched_substrates)) substrates_df <- with( enriched_substrates, data.frame( kinase = kinase, substrate = sub(" ///*", "...", short_row_names), anova_p_value = signif(fdr_adjusted_anova_p, 2), min_group_obs_count = signif(min_group_obs_count, 0), quality = signif(quality, 3), sequence = trunc_n(30)(ppep) ) ) substrates_df <- substrates_df[ with(substrates_df, order(kinase, -quality)), , drop = FALSE ] if (print_nb_messages) nbe(see_variable(substrates_df)) if (nrow(substrates_df) < 1) substrates_df$sequence <- c() if (print_nb_messages) nbe(see_variable(substrates_df)) names(substrates_df) <- headers_2nd_line <- c("Kinase", "Substrate", "p-value", "per group)", "quality", "Sequence") headers_1st_line <- c("", "", "ANOVA", "min(values", "", "") data_frame_tabbing_latex( x = substrates_df, tabstops = c(1.2, 0.8, 0.5, 0.65, 0.5), headings = c(headers_1st_line, headers_2nd_line), caption = "Details for all enriched substrates of enriched kinases" ) rm( enriched_substrates, substrates_df, short_row_names, headers_1st_line, headers_2nd_line ) } cat("\\clearpage\n") # hack end - show all substrates # -------------------------------- # print deferred tables and graphs for kinases from contrasts for (i_cntrst in seq_len(length(ksea_prints))) { #latex_samepage({ cat(ksea_prints[[i_cntrst]]) cat("\n") ksea_barplots[[i_cntrst]]() cat("\n") cat("\\clearpage\n") #}) } } ``` ```{r enriched, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'} # nolint start # squash un-actionable cyclomatic_complexity warning if (g_can_run_ksea) { # nolint end g_did_enriched_header <- FALSE for (kinase_name in sort(enriched_kinases$kinase)) { enriched_substrates <- all_enriched_substrates[ all_enriched_substrates$kinase == kinase_name, , drop = FALSE ] ten_trunc_ppep <- trunc_enriched_substrate(enriched_substrates$ppep) enriched_substrates$label <- with( enriched_substrates, sprintf( "(%s) 
%s", make.names( sub("$FAILED_MATCH_GENE_NAME", "not_found", sub_gene, fixed = TRUE), unique = TRUE ), ten_trunc_ppep ) ) # Get the intensity values for the heatmap enriched_intensities <- as.matrix(unimputed_quant_data_log[ enriched_substrates$ppep, , drop = FALSE ]) # Remove rows having too many NA values to be relevant good_rows <- (rowSums(enriched_intensities, na.rm = TRUE) != 0) enriched_substrates <- enriched_substrates[good_rows, , drop = FALSE] enriched_intensities <- enriched_intensities[good_rows, , drop = FALSE] # Rename the rows with the display-name for the heatmap short_row_names <- sub( "$FAILED_MATCH_GENE_NAME", "not_found", enriched_substrates$sub_gene, fixed = TRUE ) short_row_names <- make.names(short_row_names, unique = TRUE) long_row_names <- sapply( X = rownames(enriched_intensities), FUN = function(rn) { enriched_substrates[enriched_substrates$ppep == rn, "label"] } ) rownames(enriched_intensities) <- long_row_names # Format as matrix for heatmap m <- as.matrix(enriched_intensities) rownames(m) <- trunc_enriched_substrate(rownames(m)) # Draw the heading and heatmap nrow_m <- nrow(m) if (nrow_m > 0) { if (!g_did_enriched_header) { cat("\n\\clearpage\n") cat(section_header(paste0("K", my_section_header))) g_did_enriched_header <- TRUE } is_na_m <- is.na(m) cellnote_m <- is_na_m cellnote_m[!is_na_m] <- "" cellnote_m[is_na_m] <- params$heatMapNAcellNote cut_args <- new_env() cut_args$cutoff <- cutoff cut_args$kinase <- kinase_name cut_args$statistic <- ksea_cutoff_statistic cut_args$threshold <- ksea_cutoff_threshold number_of_peptides_found <- ppep_heatmap( m = m, cellnote = cellnote_m, cutoff = cut_args, hm_heading_function = cat_enriched_heading, hm_main_title = paste( "Unnormalized (zero-imputed)", "intensities of enriched kinase-substrates"), suppress_row_dendrogram = FALSE, master_cex = 0.35, sepcolor = "black", colsep = sample_colsep, row_scaling = "none" ) if (number_of_peptides_found > 1) { tryCatch( { rownames(m) <- short_row_names cov_heatmap( m = m, kinase_name = kinase_name, top_substrates = nrow_m > g_intensity_hm_rows ) }, error = function(e) { cat( sprintf( "ERROR: %s\n\\newline\n", mget("e") ) ) cat( sprintf( "message: %s\n\\newline\n", e$message ) ) cat_margins() } ) } substrates_df <- with( enriched_substrates, data.frame( substrate = sub(" ///*", "...", short_row_names), sequence = trunc_long_ppep(ppep), anova_p_value = signif(fdr_adjusted_anova_p, 2), min_group_obs_count = signif(min_group_obs_count, 0), quality = signif(quality, 3) ) ) excess_substrates <- nrow(substrates_df) > g_intensity_hm_rows if (excess_substrates) substrates_df <- substrates_df[1:g_intensity_hm_rows, ] names(substrates_df) <- headers_2nd_line <- c("Substrate", "Sequence", "p-value", "per group)", "quality") headers_1st_line <- c("", "", "ANOVA", "min(values", "") if (1 < nrow(enriched_substrates)) cat("\n\\newpage\n") cat(subsubsection_header( sprintf( "Details for %s%s-sites", if (excess_substrates) sprintf( "%s \"highest quality\" ", g_intensity_hm_rows ) else "", kinase_name ) )) substrates_df <- substrates_df[order(-substrates_df$quality), ] data_frame_tabbing_latex( x = substrates_df, tabstops = c(0.8, 3.8, 0.6, 0.8), headings = c(headers_1st_line, headers_2nd_line) ) } else { if (print_nb_messages) nbe(see_variable(nrow_m > 0), "\n") } if (print_nb_messages) nb("end kinase ", kinase_name, "\n") } # Write output tabular files # get kinase, ppep, concat(kinase) tuples for enriched kinases if (print_nb_messages) nb("kinase_ppep_label <- ...\n") if (print_nb_messages) 
nbe("kinase_ppep_label <- ...\n") kinase_ppep_label <- sqldf::sqldf(" WITH t(ppep, label) AS ( SELECT DISTINCT SUB_MOD_RSD AS ppep, group_concat(gene, '; ') AS label FROM pseudo_ksdata WHERE GENE IN (SELECT kinase FROM enriched_kinases) GROUP BY ppep ), k(kinase, ppep_join) AS ( SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join FROM pseudo_ksdata WHERE GENE IN (SELECT kinase FROM enriched_kinases) ) SELECT k.kinase, t.ppep, t.label FROM t, k WHERE t.ppep = k.ppep_join ORDER BY k.kinase, t.ppep ") # extract what we need from full_data impish <- cbind(rownames(quant_data_imp), quant_data_imp) colnames(impish)[1] <- "Phosphopeptide" data_table_imputed_sql <- " SELECT f.*, k.label AS KSEA_enrichments, q.* FROM metadata_plus_p f LEFT JOIN kinase_ppep_label k ON f.Phosphopeptide = k.ppep, impish q WHERE f.Phosphopeptide = q.Phosphopeptide " data_table_imputed <- sqldf::sqldf(data_table_imputed_sql) # Zap the duplicated 'Phosphopeptide' column named 'ppep' data_table_imputed <- data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] # Output imputed, un-normalized data if (print_nb_messages) nb("Output imputed, un-normalized data tabular file\n") if (print_nb_messages) nbe( "Output imputed, un-normalized data tabular file\n") write.table( data_table_imputed , file = imputed_data_filename , sep = "\t" , col.names = TRUE , row.names = FALSE , quote = FALSE ) #output quantile normalized data impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) colnames(impish)[1] <- "Phosphopeptide" data_table_imputed <- sqldf::sqldf(data_table_imputed_sql) # Zap the duplicated 'Phosphopeptide' column named 'ppep' data_table_imputed <- data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] if (print_nb_messages) nb("Output quantile normalized data tabular file\n") if (print_nb_messages) nbe("Output quantile normalized data tabular file\n") write.table( data_table_imputed, file = imp_qn_lt_data_filenm, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE ) ppep_kinase <- sqldf::sqldf(" SELECT DISTINCT k.ppep, k.kinase FROM ( SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep FROM pseudo_ksdata WHERE GENE IN (SELECT kinase FROM enriched_kinases) ) k ORDER BY k.ppep, k.kinase ") RSQLite::dbWriteTable( conn = db, name = "ksea_enriched_ks", value = ppep_kinase, append = FALSE ) } if (print_nb_messages) nb("RSQLite::dbWriteTable anova_signif\n") RSQLite::dbWriteTable( conn = db, name = "anova_signif", value = p_value_data, append = FALSE ) ddl_exec(db, " DROP VIEW IF EXISTS stats_metadata_v; " ) dml_no_rows_exec(db, " CREATE VIEW stats_metadata_v AS SELECT DISTINCT m.*, p.raw_anova_p, p.fdr_adjusted_anova_p, kek.kinase AS ksea_enrichments FROM mrgfltr_metadata_view m LEFT JOIN anova_signif p ON m.phospho_peptide = p.phosphopeptide LEFT JOIN ksea_enriched_ks kek ON m.phospho_peptide = kek.ppep ; " ) if (print_nb_messages) nb( "Output contents of `stats_metadata_v` table to tabular file\n") if (print_nb_messages) nbe( "Output contents of `stats_metadata_v` table to tabular file\n") write.table( dbReadTable(db, "stats_metadata_v"), file = anova_ksea_mtdt_file, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE ) cat("\n\\clearpage\n") ``` # Data-processing summary flowchart ![Flowchart showing ANOVA and KSEA data-processing steps](KSEA_impl_flowchart.pdf) ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} cat("\\leavevmode\n\n\n") write_params(db) # We are done with output RSQLite::dbDisconnect(db) cat("\\clearpage\n\\section{R package versions}\n") 
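# utils::toLatex() has a sessionInfo method, so the next call emits the R and
# attached-package version information as LaTeX markup into this 'asis' chunk.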
utils::toLatex(utils::sessionInfo()) ```