--- title: "MaxQuant Phosphoproteomic Enrichment Pipeline ANOVA/KSEA" author: - "Nick Graham^[ORCiD 0000-0002-6811-1941, University of Southern California: Los Angeles, CA, US]" - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]" - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]" date: - "May 28, 2018" - "; revised June 23, 2022" output: pdf_document: toc: true toc_depth: 3 keep_tex: true header-includes: - \usepackage{longtable} - \newcommand\T{\rule{0pt}{2.6ex}} % Top strut - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut params: alphaFile: "test-data/alpha_levels.tabular" inputFile: "test-data/test_input_for_anova.tabular" preprocDb: "test-data/test_input_for_anova.sqlite" kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] show_toc: true firstDataColumn: "^Intensity[^_]" imputationMethod: !r c("group-median", "median", "mean", "random")[1] meanPercentile: 1 sdPercentile: 1.0 regexSampleNames: "\\.\\d+[A-Z]$" regexSampleGrouping: "\\d+" imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt" oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1] oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3] kseaCutoffStatistic: !r c("p.value", "FDR")[2] kseaCutoffThreshold: !r c( 0.1, 0.05)[2] kseaMinKinaseCount: 1 intensityHeatmapRows: 75 --- <!-- kseaCutoffStatistic: !r c("p.value", "FDR")[2] kseaCutoffThreshold: !r c(0.05, 0.1)[1] alphaFile: "test-data/alpha_levels.tabular" inputFile: "test-data/test_input_for_anova.tabular" preprocDb: "test-data/test_input_for_anova.sqlite" kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] alphaFile: "test-data/alpha_levels.tabular" inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular" preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite" kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2] alphaFile: "test-data/alpha_levels.tabular" inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular" preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite" kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] alphaFile: "test-data/alpha_levels.tabular" inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular" preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite" kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] inputFile: "test-data/density_failure.preproc_tab.tabular" kseaAppPrepDb: !r c(":memory:", "mqppep.sqlite")[2] latex_document: default --> ```{r setup, include = FALSE} #ref for debugging: https://yihui.org/tinytex/r/#debugging options(tinytex.verbose = TRUE) # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) # freeze the random number generator so the same results will be produced # from run to run set.seed(28571) ### LIBRARIES library(gplots) library(DBI) library(RSQLite) # Suppress "Warning: no DISPLAY variable so Tk is not available" suppressWarnings(suppressMessages(library(sqldf))) # required but not added to search list: # - DBI # - RSQLite # - ggplot2 # - knitr # - latex2exp # - preprocessCore # - reshape2 # - vioplot ### CONSTANTS const_parfin <- par("fin") 
const_boxplot_fill <- "grey94" const_stripchart_cex <- 0.5 const_stripsmall_cex <- sqrt(const_stripchart_cex * const_stripchart_cex / 2) const_stripchart_jitter <- 0.3 const_write_debug_files <- FALSE const_table_anchor_bp <- "bp" const_table_anchor_ht <- "ht" const_table_anchor_p <- "p" const_table_anchor_tbp <- "tbp" const_ksea_astrsk_kinases <- 1 const_ksea_nonastrsk_kinases <- 2 const_ksea_all_kinases <- 3 const_log10_e <- log10(exp(1)) ### FUNCTIONS # from `demo(error.catching)` ##' Catch *and* save both errors and warnings, and in the case of ##' a warning, also keep the computed result. ##' ##' @title tryCatch both warnings (with value) and errors ##' @param expr an \R expression to evaluate ##' @return a list with 'value' and 'warning', where ##' 'value' may be an error caught. ##' @author Martin Maechler; ##' Copyright (C) 2010-2012 The R Core Team try_catch_w_e <- function(expr) { wrn <- NULL # warning handler w_handler <- function(w) { wrn <<- w invokeRestart("muffleWarning") } list( value = withCallingHandlers( tryCatch( expr, error = function(e) e ), warning = w_handler ), warning = wrn ) } write_debug_file <- function(s) { if (const_write_debug_files) { s_path <- sprintf("test-data/%s.txt", deparse(substitute(s))) print(sprintf("DEBUG writing file %s", spath)) write.table( s, file = s_path, sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE ) } } # ref: http://adv-r.had.co.nz/Environments.html # "When creating your own environment, note that you should set its parent # environment to be the empty environment. This ensures you don't # accidentally inherit objects from somewhere else." # Caution: this prevents `with(my_env, expr)` from working when `expr` # contains anything from the global environment, even operators! # Hence, `x <- 1; get("x", new_env())` fails by design. 
new_env <- function() { new.env(parent = emptyenv()) } ### numerical/statistical helper functions any_nan <- function(x) { !any(x == "NaN") } # determine standard deviation of quantile to impute sd_finite <- function(x) { ok <- is.finite(x) sd(x[ok]) } anova_func <- function(x, grouping_factor, one_way_f) { subject <- data.frame( intensity = x ) x_aov <- one_way_f( formula = intensity ~ grouping_factor, data = subject ) pvalue <- if (identical(one_way_f, aov)) summary(x_aov)[[1]][["Pr(>F)"]][1] else pvalue <- x_aov$p.value pvalue } ### LaTeX functions latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) { v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v cat( paste0( v_sub, collapse = collapse_string ) ) } latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { cat("\\begin{itemize}\n\\item ") latex_collapsed_vector(collapse_string, v, underscore_whack) cat("\n\\end{itemize}\n") } latex_itemized_list <- function(v, underscore_whack = TRUE) { latex_itemized_collapsed("\n\\item ", v, underscore_whack) } latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { cat("\\begin{enumerate}\n\\item ") latex_collapsed_vector(collapse_string, v, underscore_whack) cat("\n\\end{enumerate}\n") } latex_enumerated_list <- function(v) { latex_enumerated_collapsed("\n\\item ", v) } latex_table_row <- function(v, extra = "", underscore_whack = TRUE) { latex_collapsed_vector(" & ", v, underscore_whack) cat(extra) cat(" \\\\\n") } # Use this like print.data.frame, from which it is adapted: data_frame_latex <- function( x, ..., # digits to pass to format.data.frame digits = NULL, # TRUE -> right-justify columns; FALSE -> left-justify right = TRUE, # maximumn number of rows to print max = NULL, # string with justification of each column justification = NULL, # TRUE to center on page centered = TRUE, # optional caption caption = NULL, # h(inline); b(bottom); t (top) or p (separate page) anchor = "h", # set underscore_whack to TRUE to escape underscores underscore_whack = TRUE ) { if (is.null(justification)) justification <- Reduce( f = paste, x = rep_len(if (right) "r" else "l", length(colnames(x))) ) n <- length(rownames(x)) if (length(x) == 0L) { cat( sprintf( # if n is one, use singular 'row', else use plural 'rows' ngettext( n, "data frame with 0 columns and %d row", "data frame with 0 columns and %d rows" ), n ), "\n", sep = "" ) } else if (n == 0L) { cat("0 rows for:\n") latex_itemized_list( v = names(x), underscore_whack = underscore_whack ) } else { if (is.null(max)) max <- getOption("max.print", 99999L) if (!is.finite(max)) stop("invalid 'max' / getOption(\"max.print\"): ", max) omit <- (n0 <- max %/% length(x)) < n m <- as.matrix( format.data.frame( if (omit) x[seq_len(n0), , drop = FALSE] else x, digits = digits, na.encode = FALSE ) ) cat( # h(inline); b(bottom); t (top) or p (separate page) paste0("\\begin{table}[", anchor, "]\n") ) if (!is.null(caption)) cat(paste0(" \\caption{", caption, "}")) if (centered) cat("\\centering\n") cat( paste( " \\begin{tabular}{", justification, "}\n", sep = "" ) ) # ref: https://tex.stackexchange.com/a/50353 # Describes use of \rule{0pt}{3ex} if (!is.null(caption)) cat("\\B \\\\ \\hline\\hline\n") # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 latex_table_row( v = colnames(m), extra = "\\T\\B", underscore_whack = underscore_whack ) cat("\\hline\n") for (i in seq_len(length(m[, 1]))) { latex_table_row( v = m[i, ], underscore_whack = underscore_whack ) } 
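    # Close the LaTeX tabular and table environments opened above.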
cat( paste( " \\end{tabular}", "\\end{table}", sep = "\n" ) ) if (omit) cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", n - n0, "rows ]\n") } invisible(x) } hypersub <- function(s) { hyper <- tolower(s) hyper <- gsub("[^a-z0-9]+", "-", hyper) hyper <- gsub("[-]+", "-", hyper) hyper <- sub("^[-]", "", hyper) hyper <- sub("[-]$", "", hyper) return(hyper) } subsection_header <- function(s) { hyper <- hypersub(s) cat( sprintf( "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n", hyper, s, hyper ) ) } subsubsection_header <- function(s) { hyper <- hypersub(s) cat( sprintf( "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n", hyper, s, hyper ) ) } ### SQLite functions ddl_exec <- function(db, sql) { discard <- DBI::dbExecute(conn = db, statement = sql) if (FALSE && discard != 0) { need_newpage <- TRUE if (need_newpage) { need_newpage <<- FALSE cat("\\newpage\n") } o_file <- stdout() cat("\n\\begin{verbatim}\n") cat(sql, file = o_file) cat(sprintf("\n%d rows affected by DDL\n", discard), file = o_file) cat("\n\\end{verbatim}\n") } } dml_no_rows_exec <- function(db, sql) { discard <- DBI::dbExecute(conn = db, statement = sql) if (discard != 0) { need_newpage <- TRUE if (need_newpage) { need_newpage <<- FALSE cat("\\newpage\n") } cat("\n\\begin{verbatim}\n") o_file <- stdout() cat(sql, file = o_file) cat(sprintf("\n%d rows affected by DML\n", discard), file = o_file) cat("\n\\end{verbatim}\n") } } ### KSEA functions and helpers # Adapted from KSEAapp::KSEA.Scores to allow retrieval of: # - maximum log2(FC) ksea_scores <- function( # For human data, typically, ksdata = KSEAapp::ksdata ksdata, # Input data file having columns: # - Protein : abbreviated protein name # - Gene : HUGO gene name # - Peptide : peptide sequence without indications of phosphorylation # - Reside.Both : position(s) of phosphorylation within Gene sequence # - First letter designates AA that is modified # - Numbers indicate position within Gene # - Multiple values are separated by semicolons # - p : p-value # - FC : fold-change px, # A binary input of TRUE or FALSE, indicating whether or not to include # NetworKIN predictions networkin, # A numeric value between 1 and infinity setting the minimum NetworKIN # score (can be left out if networkin = FALSE) networkin_cutoff ) { if (length(grep(";", px$Residue.Both)) == 0) { # There are no Residue.Both entries having semicolons, so new is # simply px except two columns are renamed and a column is added # for log2(abs(fold-change)) new <- px colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD") new$log2_fc <- log2(abs(as.numeric(as.character(new$FC)))) new <- new[complete.cases(new$log2_fc), ] } else { # Split each row having semicolons in Residue.Both into rows that are # duplicated in all respects except that each row has a single # member of the set "split-on-semicolon-Residue.Both" px_double <- px[grep(";", px$Residue.Both), ] residues <- as.character(px_double$Residue.Both) residues <- as.matrix(residues, ncol = 1) split <- strsplit(residues, split = ";") # x gets count of residues in each row, # i.e., 1 + count of semicolons x <- sapply(split, length) # Here is the set of split rows px_single <- data.frame( Protein = rep(px_double$Protein, x), Gene = rep(px_double$Gene, x), Peptide = rep(px_double$Peptide, x), Residue.Both = unlist(split), p = rep(px_double$p, x), FC = rep(px_double$FC, x) ) # new first gets the split rows new <- px[-grep(";", px$Residue.Both), ] # to new, append the rows that didn't need splitting in the first place new <- rbind(new, 
px_single) # map Gene to SUB_GENE # map Residue.Both to SUB_MOD_RSD colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD") # Eliminate any non-positive values to prevent introduction of # infinite or NaN values new[(0 <= new$log2_fc), "log2_fc"] <- NA # Because of preceding step, there is no need for abs in the next line new$log2_fc <- log2(as.numeric(as.character(new$FC))) # Convert any illegal values from NaN to NA new[is.nan(new$log2_fc), "log2_fc"] <- NA # Eliminate rows having missing values (e.g., non-imputed data) new <- new[complete.cases(new$log2_fc), ] } if (networkin == TRUE) { # When NetworKIN is true, filter on NetworKIN.cutoff which includes # PhosphoSitePlus data *because its networkin_score is set to Inf* ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ] ksdata_filtered <- ksdata_filtered[ (ksdata_filtered$networkin_score >= networkin_cutoff), ] } else { # Otherwise, simply use PhosphSitePlus rows ksdata_filtered <- ksdata[ grep("PhosphoSitePlus", ksdata$Source), ] } # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD # colnames of ksdata_filtered: # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID" # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID" # "SITE_...7_AA" "networkin_score" "Source" # colnames of new: # "Protein" "SUB_GENE" "Peptide" "SUB_MOD_RSD" "p" "FC" "log2_fc" # Equivalent to: # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc # FROM ksdata_filtered a # INNER JOIN new b # ON a.SUB_GENE = b.SUB_GENE # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD ksdata_dataset <- base::merge(ksdata_filtered, new) # colnames of ksdata_dataset: # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein" # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform) # Re-order dataset; prior to accounting for isoforms ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] # Extract non-isoform accession in UniProtKB ksdata_dataset$uniprot_no_isoform <- sapply( ksdata_dataset$KIN_ACC_ID, function(x) unlist(strsplit(as.character(x), split = "-"))[1] ) # Discard previous results while selecting interesting columns ... ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] # Column names are now: # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p" # "FC" "log2_fc" "Source" # Make column names human-readable colnames(ksdata_dataset_abbrev) <- c( "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p", "FC", "log2FC", "Source" ) # SELECT * FROM ksdata_dataset_abbrev # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p ksdata_dataset_abbrev <- ksdata_dataset_abbrev[ order( ksdata_dataset_abbrev$Kinase.Gene, ksdata_dataset_abbrev$Substrate.Gene, ksdata_dataset_abbrev$Substrate.Mod, ksdata_dataset_abbrev$p), ] # First aggregation step to account for multiply phosphorylated peptides # and differing peptide sequences; the goal here is to combine results # for all measurements of the same substrate. 
# SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, # `Source`, avg(log2FC) AS log2FC # FROM ksdata_dataset_abbrev # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, # `Source` # ORDER BY `Kinase.Gene`; # in two steps: # (1) compute average log_2(fold-change) ksdata_dataset_abbrev <- aggregate( log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source, data = ksdata_dataset_abbrev, FUN = mean ) # (2) order by Kinase.Gene ksdata_dataset_abbrev <- ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] # SELECT `Kinase.Gene`, count(*) # FROM ksdata_dataset_abbrev # GROUP BY `Kinase.Gene`; # in two steps: # (1) Extract the list of Kinase.Gene names kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev, # named by Kinase.Gene kinase_list <- as.matrix(table(kinase_list)) # Second aggregation step to account for all substrates per kinase # CREATE TABLE mean_fc # AS # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC # FROM ksdata_dataset_abbrev # GROUP BY `Kinase.Gene` mean_fc <- aggregate( log2FC ~ Kinase.Gene, data = ksdata_dataset_abbrev, FUN = mean ) # mean_fc columns: "Kinase.Gene", "log2FC" if (FALSE) { # I need to re-think this; I was trying to find the most-represented # peptide, but that horse has already left the barn # SELECT `Kinase.Gene`, max(abs(log2FC)) AS log2FC # FROM ksdata_dataset_abbrev # GROUP BY `Kinase.Gene` max_fc <- aggregate( log2FC ~ Kinase.Gene, data = ksdata_dataset_abbrev, FUN = function(r) max(abs(r)) ) } # Create column 3: mS mean_fc$m_s <- mean_fc[, 2] # Create column 4: Enrichment mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE)) # Create column 5: m, count of substrates mean_fc$m <- kinase_list # Create column 6: z-score mean_fc$z_score <- ( (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) * sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE) # Create column 7: p-value, deduced from z-score mean_fc$p_value <- pnorm(-abs(mean_fc$z_score)) # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr") # Remove log2FC column, which is duplicated as mS mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2] # Correct the column names which we had to hack because of the linter... 
colnames(mean_fc) <- c( "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" ) return(mean_fc) } low_fdr_barplot <- function( rslt, i_cntrst, i, a_level, b_level, fold_change, caption ) { rslt_score_list_i <- rslt$score_list[[i]] if (!is.null(rslt_score_list_i)) { rslt_score_list_i_nrow <- nrow(rslt_score_list_i) k <- data.frame( contrast = as.integer(i_cntrst), a_level = rep.int(a_level, rslt_score_list_i_nrow), b_level = rep.int(b_level, rslt_score_list_i_nrow), kinase_gene = rslt_score_list_i$Kinase.Gene, mean_log2_fc = rslt_score_list_i$mS, enrichment = rslt_score_list_i$Enrichment, substrate_count = rslt_score_list_i$m, z_score = rslt_score_list_i$z.score, p_value = rslt_score_list_i$p.value, fdr = rslt_score_list_i$FDR ) selector <- switch( ksea_cutoff_statistic, "FDR" = { k$fdr }, "p.value" = { k$p_value }, stop( sprintf( "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", ksea_cutoff_statistic ) ) ) k <- k[selector < ksea_cutoff_threshold, ] if (nrow(k) > 1) { op <- par(mai = c(1, 1.5, 0.4, 0.4)) numeric_z_score <- as.numeric(k$z_score) z_score_order <- order(numeric_z_score) kinase_name <- k$kinase_gene long_caption <- sprintf( "Kinase z-score, %s < %s, %s", ksea_cutoff_statistic, ksea_cutoff_threshold, caption ) my_cex_caption <- 65.0 / max(65.0, nchar(long_caption)) cat("\n\\clearpage\n") barplot( height = numeric_z_score[z_score_order], border = NA, xpd = FALSE, cex.names = 1.0, cex.axis = 1.0, main = long_caption, cex.main = my_cex_caption, names.arg = kinase_name[z_score_order], horiz = TRUE, srt = 45, las = 1) par(op) } } } # note that this adds elements to the global variable `ksea_asterisk_hash` low_fdr_print <- function( rslt, i_cntrst, i, a_level, b_level, fold_change, caption ) { rslt_score_list_i <- rslt$score_list[[i]] if (!is.null(rslt_score_list_i)) { rslt_score_list_i_nrow <- nrow(rslt_score_list_i) k <- contrast_ksea_scores <- data.frame( contrast = as.integer(i_cntrst), a_level = rep.int(a_level, rslt_score_list_i_nrow), b_level = rep.int(b_level, rslt_score_list_i_nrow), kinase_gene = rslt_score_list_i$Kinase.Gene, mean_log2_fc = rslt_score_list_i$mS, enrichment = rslt_score_list_i$Enrichment, substrate_count = rslt_score_list_i$m, z_score = rslt_score_list_i$z.score, p_value = rslt_score_list_i$p.value, fdr = rslt_score_list_i$FDR ) selector <- switch( ksea_cutoff_statistic, "FDR" = { k$fdr }, "p.value" = { k$p_value }, stop( sprintf( "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", ksea_cutoff_statistic ) ) ) k <- k[selector < ksea_cutoff_threshold, ] # save kinase names to ksea_asterisk_hash for (kinase_name in k$kinase_gene) { ksea_asterisk_hash[[kinase_name]] <- 1 } db_write_table_overwrite <- (i_cntrst < 2) db_write_table_append <- !db_write_table_overwrite RSQLite::dbWriteTable( conn = db, name = "contrast_ksea_scores", value = contrast_ksea_scores, append = db_write_table_append ) selector <- switch( ksea_cutoff_statistic, "FDR" = { contrast_ksea_scores$fdr }, "p.value" = { contrast_ksea_scores$p_value }, stop( sprintf( "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", ksea_cutoff_statistic ) ) ) output_df <- contrast_ksea_scores[ selector < ksea_cutoff_threshold, c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count", "z_score", "p_value", "fdr") ] output_order <- with(output_df, order(mean_log2_fc, kinase_gene)) output_df <- output_df[output_order, ] colnames(output_df) <- c( colnames(output_df)[1], colnames(output_df)[2], "enrichment", "m_s", "z_score", "p_value", "fdr" ) 
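  # Format the numeric columns as fixed-precision strings so that
  # data_frame_latex() renders them compactly in the PDF table below.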
output_df$fdr <- sprintf("%0.4f", output_df$fdr) output_df$p_value <- sprintf("%0.2e", output_df$p_value) output_df$z_score <- sprintf("%0.2f", output_df$z_score) output_df$m_s <- sprintf("%d", output_df$m_s) output_df$enrichment <- sprintf("%0.2f", output_df$enrichment) output_ncol <- ncol(output_df) colnames(output_df) <- c( "Kinase", "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)", "Enrichment", "Substrates", "z-score", "p-value", "FDR" ) selector <- switch( ksea_cutoff_statistic, "FDR" = { rslt$score_list[[i]]$FDR }, "p.value" = { rslt$score_list[[i]]$p.value }, stop( sprintf( "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", ksea_cutoff_statistic ) ) ) if (sum(selector < ksea_cutoff_threshold) > 0) { math_caption <- gsub("{", "\\{", caption, fixed = TRUE) math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE) data_frame_latex( x = output_df, justification = "l c c c c c c", centered = TRUE, caption = sprintf( "\\text{%s}, %s < %s", math_caption, ksea_cutoff_statistic, ksea_cutoff_threshold ), anchor = const_table_anchor_p ) } else { cat( sprintf( "\\break No kinases had \\(\\text{%s}_\\text{enrichment} < %s\\) for contrast %s\\hfill\\break\n", ksea_cutoff_statistic, ksea_cutoff_threshold, caption ) ) } } } # create_breaks is a helper for ksea_heatmap create_breaks <- function(merged_scores) { if (min(merged_scores, na.rm = TRUE) < -1.6) { breaks_neg <- seq(-1.6, 0, length.out = 30) breaks_neg <- append( seq(min(merged_scores, na.rm = TRUE), -1.6, length.out = 10), breaks_neg ) breaks_neg <- sort(unique(breaks_neg)) } else { breaks_neg <- seq(-1.6, 0, length.out = 30) } if (max(merged_scores, na.rm = TRUE) > 1.6) { breaks_pos <- seq(0, 1.6, length.out = 30) breaks_pos <- append( breaks_pos, seq(1.6, max(merged_scores, na.rm = TRUE), length.out = 10) ) breaks_pos <- sort(unique(breaks_pos)) } else { breaks_pos <- seq(0, 1.6, length.out = 30) } breaks_all <- unique(append(breaks_neg, breaks_pos)) mycol_neg <- gplots::colorpanel(n = length(breaks_neg), low = "blue", high = "white") mycol_pos <- gplots::colorpanel(n = length(breaks_pos) - 1, low = "white", high = "red") mycol <- unique(append(mycol_neg, mycol_pos)) color_breaks <- list(breaks_all, mycol) return(color_breaks) } # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap draw_kseaapp_summary_heatmap <- function( x, sample_cluster, merged_asterisk, my_cex_row, color_breaks, margins, ... ) { merged_scores <- x if (!is.matrix(x)) { cat( paste0( "No plot because \\texttt{typeof(x)} is '", typeof(x), "' rather than 'matrix'.\n\n" ) ) } else if (nrow(x) < 2) { cat("No plot because matrix x has ", nrow(x), " rows.\n\n") cat("\\begin{verbatim}\n") str(x) cat("\\end{verbatim}\n") } else if (ncol(x) < 2) { cat("No plot because matrix x has ", ncol(x), " columns.\n\n") cat("\\begin{verbatim}\n") str(x) cat("\\end{verbatim}\n") } else { gplots::heatmap.2( x = merged_scores, Colv = sample_cluster, scale = "none", cellnote = merged_asterisk, notecol = "white", cexCol = 0.9, # Heuristically assign size of row labels cexRow = min(1.0, ((3 * my_cex_row) ^ 1.7) / 2.25), srtCol = 45, srtRow = 45, notecex = 3 * my_cex_row, col = color_breaks[[2]], density.info = "none", trace = "none", breaks = color_breaks[[1]], lmat = rbind(c(0, 3), c(2, 1), c(0, 4)), lhei = c(0.4, 8.0, 1.1), lwid = c(0.5, 3), key = FALSE, margins = margins, ... 
) } } # Adapted from KSEAapp::KSEA.Heatmap ksea_heatmap <- function( # the data frame outputs from the KSEA.Scores() function, in list format score_list, # a character vector of all the sample names for heatmap annotation: # - the names must be in the same order as the data in score_list # - please avoid long names, as they may get cropped in the final image sample_labels, # character string of either "p.value" or "FDR" indicating the data column # to use for marking statistically significant scores stats, # a numeric value between 0 and infinity indicating the min. number of # substrates a kinase must have to be included in the heatmap m_cutoff, # a numeric value between 0 and 1 indicating the p-value/FDR cutoff # for indicating significant kinases in the heatmap p_cutoff = stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"), # a binary input of TRUE or FALSE, indicating whether or not to perform # hierarchical clustering of the sample columns sample_cluster, # a binary input of TRUE or FALSE, indicating whether or not to export # the heatmap as a .png image into the working directory export = FALSE, # bottom and right margins; adjust as needed if contrast names are too long margins = c(6, 20), # print which kinases? # - Mandatory argument, must be one of const_ksea_.*_kinases which_kinases, # additional arguments to gplots::heatmap.2, such as: # - main: main title of plot # - xlab: x-axis label # - ylab: y-axis label ... ) { filter_m <- function(dataset, m_cutoff) { filtered <- dataset[(dataset$m >= m_cutoff), ] return(filtered) } score_list_m <- lapply(score_list, function(...) filter_m(..., m_cutoff)) for (i in seq_len(length(score_list_m))) { names <- colnames(score_list_m[[i]])[c(2:7)] colnames(score_list_m[[i]])[c(2:7)] <- paste(names, i, sep = ".") } master <- Reduce( f = function(...) 
{ base::merge(..., by = "Kinase.Gene", all = FALSE) }, x = score_list_m ) row.names(master) <- master$Kinase.Gene columns <- as.character(colnames(master)) merged_scores <- as.matrix(master[, grep("z.score", columns), drop = FALSE]) colnames(merged_scores) <- sample_labels merged_stats <- as.matrix(master[, grep(stats, columns)]) asterisk <- function(mtrx, p_cutoff) { new <- data.frame() for (i in seq_len(nrow(mtrx))) { for (j in seq_len(ncol(mtrx))) { my_value <- mtrx[i, j] if (!is.na(my_value) && my_value < p_cutoff) { new[i, j] <- "*" } else { new[i, j] <- "" } } } return(new) } merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff)) # begin hack to print only significant rows asterisk_rows <- rowSums(merged_asterisk == "*") > 0 all_rows <- rownames(merged_stats) names(asterisk_rows) <- all_rows non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE]) asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE]) merged_scores_asterisk <- merged_scores[names(asterisk_rows), ] merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), ] # end hack to print only significant rows row_list <- list() row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows row_list[[const_ksea_all_kinases]] <- all_rows row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows i <- which_kinases my_row_names <- row_list[[i]] scrs <- merged_scores[my_row_names, ] stts <- merged_stats[my_row_names, ] merged_asterisk <- as.matrix(asterisk(stts, p_cutoff)) color_breaks <- create_breaks(scrs) plot_height <- nrow(scrs) ^ 0.55 plot_width <- ncol(scrs) ^ 0.7 my_cex_row <- 0.25 * 16 / plot_height if (export == "TRUE") { png( "KSEA.Merged.Heatmap.png", width = plot_width * 300, height = 2 * plot_height * 300, res = 300, pointsize = 14 ) } draw_kseaapp_summary_heatmap( x = scrs, sample_cluster = sample_cluster, merged_asterisk = merged_asterisk, my_cex_row = my_cex_row, color_breaks = color_breaks, margins = margins ) if (export == "TRUE") { dev.off() } return(my_row_names) } # helper for heatmaps of phosphopeptide intensities draw_intensity_heatmap <- function( m, # matrix with rownames already formatted cutoff, # cutoff used by hm_heading_function hm_heading_function, # construct and cat heading from m and cutoff hm_main_title, # main title for plot (drawn below heading) suppress_row_dendrogram = TRUE, # set to false to show dendrogram max_peptide_count # experimental: = intensity_hm_rows, # values of 50 and 75 worked well ... # passthru parameters for heatmap ) { peptide_count <- 0 # emit the heading for the heatmap if (hm_heading_function(m, cutoff)) { peptide_count <- min(max_peptide_count, nrow(m)) if (nrow(m) > 1) { m_margin <- m[peptide_count:1, ] # Margin setting was heuristically derived margins <- c(0.5, # col max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row ) } if (nrow(m) > 1) { tryCatch( { old_oma <- par("oma") par(cex.main = 0.6) # Heuristically determined character size adjustment formula char_contractor <- 250000 / ( max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows ) heatmap( m[peptide_count:1, ], Rowv = if (suppress_row_dendrogram) NA else NULL, Colv = NA, cexRow = char_contractor, cexCol = char_contractor * 50 / max_peptide_count, scale = "row", margins = margins, main = "Unimputed, unnormalized log(intensities)", xlab = "", las = 1, ... ) }, error = function(e) { cat( sprintf( "\nCould not draw heatmap, possibly because of too many missing values. 
Internal message: %s\n", e$message ) ) }, finally = par(old_oma) ) } } return(peptide_count) } ``` ```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} cat("\\listoftables\n") ``` # Purpose To perform for phosphopeptides: - imputation of missing values, - quantile normalization, - ANOVA (using the R stats::`r params$oneWayManyCategories` function), and - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases: - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org) - The Human Proteome Database [http://hprd.org](http://hprd.org) - NetworKIN [http://networkin.science/](http://networkin.science/) - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx) ```{r include = FALSE} ### GLOBAL VARIABLES # parameters for KSEA ksea_cutoff_statistic <- params$kseaCutoffStatistic ksea_cutoff_threshold <- params$kseaCutoffThreshold ksea_min_kinase_count <- params$kseaMinKinaseCount ksea_heatmap_titles <- list() ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <- sprintf( "Summary for all kinases enriched in one or more contrasts at %s < %s", ksea_cutoff_statistic, ksea_cutoff_threshold ) ksea_heatmap_titles[[const_ksea_all_kinases]] <- "Summary figure for all contrasts and all kinases" ksea_heatmap_titles[[const_ksea_nonastrsk_kinases]] <- sprintf( "Summary for all kinases not enriched at %s < %s in any contrast", ksea_cutoff_statistic, ksea_cutoff_threshold ) # hash to hold names of significantly enriched kinases ksea_asterisk_hash <- new_env() # READ PARAMETERS (mostly) intensity_hm_rows <- params$intensityHeatmapRows # Input Filename input_file <- params$inputFile # First data column - ideally, this could be detected via regexSampleNames, # but for now leave it as is. first_data_column <- params$firstDataColumn fdc_is_integer <- is.integer(first_data_column) if (fdc_is_integer) { first_data_column <- as.integer(params$firstDataColumn) } # False discovery rate adjustment for ANOVA # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 val_fdr <- read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "") if ( ncol(val_fdr) != 1 || sum(!is.numeric(val_fdr[, 1])) || sum(val_fdr[, 1] < 0) || sum(val_fdr[, 1] > 1) ) { stop("alphaFile should be one column of numbers within the range [0.0,1.0]") } val_fdr <- val_fdr[, 1] #Imputed Data filename imputed_data_filename <- params$imputedDataFilename imp_qn_lt_data_filenm <- params$imputedQNLTDataFile anova_ksea_mtdt_file <- params$anovaKseaMetadata ``` ```{r echo = FALSE} # Imputation method, should be one of # "random", "group-median", "median", or "mean" imputation_method <- params$imputationMethod # Selection of percentile of logvalue data to set the mean for random number # generation when using random imputation mean_percentile <- params$meanPercentile / 100.0 # deviation adjustment-factor for random values; real number. sd_percentile <- params$sdPercentile # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" regex_sample_names <- params$regexSampleNames # Regular expression to extract Sample Grouping from Sample Name; # if error occurs, compare sample_treatment_levels vs. 
sample_name_matches # to see if groupings/pairs line up
# e.g., "(\\d+)"
regex_sample_grouping <- params$regexSampleGrouping

one_way_all_categories_fname <- params$oneWayManyCategories
one_way_all_categories <- try_catch_w_e(
  match.fun(one_way_all_categories_fname))
if (!is.function(one_way_all_categories$value)) {
  write("fatal error for parameter oneWayManyCategories:", stderr())
  write(one_way_all_categories$value$message, stderr())
  if (sys.nframe() > 0) quit(save = "no", status = 1)
  stop("Cannot continue. Goodbye.")
}
one_way_all_categories <- one_way_all_categories$value

one_way_two_categories_fname <- params$oneWayTwoCategories
one_way_two_categories <- try_catch_w_e(
  match.fun(one_way_two_categories_fname))
if (!is.function(one_way_two_categories$value)) {
  cat("fatal error for parameter oneWayTwoCategories: \n")
  cat(one_way_two_categories$value$message, fill = TRUE)
  if (sys.nframe() > 0) quit(save = "no", status = 1)
  stop("Cannot continue. Goodbye.")
}
one_way_two_categories <- one_way_two_categories$value

preproc_db <- params$preprocDb
ksea_app_prep_db <- params$kseaAppPrepDb
result <- file.copy(
  from = preproc_db,
  to = ksea_app_prep_db,
  overwrite = TRUE
)
if (!result) {
  write(
    sprintf(
      "fatal error copying initial database '%s' to output '%s'",
      preproc_db,
      ksea_app_prep_db
    ),
    stderr()
  )
  if (sys.nframe() > 0) quit(save = "no", status = 1)
  stop("Cannot continue. Goodbye.")
}
```

```{r echo = FALSE}
### READ DATA
# read.table reads a file in table format and creates a data frame from it.
# - note that `quote = ""` means that quotation marks are treated literally.
full_data <- read.table(
  file = input_file,
  sep = "\t",
  header = TRUE,
  quote = "",
  check.names = FALSE
)
```

# Extract Sample Names and Treatment Levels

Column names parsed from the input file are shown in Table 1; sample names and treatment levels, in Table 2.

```{r echo = FALSE, results = 'asis'}
data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE)
if (!fdc_is_integer) {
  if (length(data_column_indices) > 0) {
    first_data_column <- data_column_indices[1]
  } else {
    stop(paste("failed to convert firstDataColumn:", first_data_column))
  }
}
cat(
  sprintf(
    paste(
      "\n\nThe input data file has peptide-intensity data for each sample",
      "in one of columns %d through %d.\n\n"
    ),
    min(data_column_indices),
    max(data_column_indices)
  )
)

# Write column names as a LaTeX table.
column_name_df <- data.frame(
  column = seq_len(length(colnames(full_data))),
  name = paste0("\\verb@", colnames(full_data), "@")
)
data_frame_latex(
  x = column_name_df,
  justification = "l l",
  centered = TRUE,
  caption = "Input data column names",
  anchor = const_table_anchor_bp,
  underscore_whack = FALSE
)
```

```{r echo = FALSE, results = 'asis'}
quant_data <- full_data[first_data_column:length(full_data)]
quant_data[quant_data == 0] <- NA
rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide

# Extract factors and trt-replicates using regular expressions.
# Typically:
#   regex_sample_names is "\\.\\d+[A-Z]$"
#   regex_sample_grouping is "\\d+"
# This would distinguish trt-replicates by terminal letter [A-Z]
# in sample names and group them into trts by the preceding digits.
# e.g.:
#   group .1A .1B .1C into group 1;
#   group .2A .2B .2C, into group 2;
#   etc.
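# Worked sketch of that parsing (illustrative only, never executed; the
# column names below are hypothetical but match the default firstDataColumn,
# regexSampleNames, and regexSampleGrouping parameters in the YAML header):
if (FALSE) {
  cols <- c("Intensity.1A", "Intensity.1B", "Intensity.2A", "Intensity.2B")
  m_demo <- regexpr("\\.\\d+[A-Z]$", cols, perl = TRUE)
  samples <- regmatches(cols, m_demo)                # ".1A" ".1B" ".2A" ".2B"
  g_demo <- regexpr("\\d+", samples, perl = TRUE)
  groups <- as.factor(regmatches(samples, g_demo))   # levels "1" and "2"
}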
m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE) sample_name_matches <- regmatches(names(quant_data), m) colnames(quant_data) <- sample_name_matches write_debug_file(quant_data) rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match)) number_of_samples <- length(sample_name_matches) sample_treatment_df <- data.frame( level = sample_treatment_levels, sample = sample_name_matches ) data_frame_latex( x = sample_treatment_df, justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}", centered = TRUE, caption = "Treatment levels", anchor = const_table_anchor_tbp, underscore_whack = FALSE ) ``` ```{r echo = FALSE, results = 'asis'} cat("\\newpage\n") ``` ## Are the log-transformed sample distributions similar? ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} quant_data[quant_data == 0] <- NA #replace 0 with NA quant_data_log <- log10(quant_data) rownames(quant_data_log) <- rownames(quant_data) colnames(quant_data_log) <- sample_name_matches write_debug_file(quant_data_log) # data visualization old_par <- par( mai = par("mai") + c(0.5, 0, 0, 0) ) # ref: https://r-charts.com/distribution/add-points-boxplot/ # Vertical plot boxplot( quant_data_log , las = 1 , col = const_boxplot_fill , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") , xlab = "Sample" ) par(old_par) cat("\n\n\n") cat("\n\n\n") ``` ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} if (nrow(quant_data_log) > 1) { quant_data_log_stack <- stack(quant_data_log) ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) + ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + ggplot2::ylab("Probability density") + ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE) } else { cat("No density plot because there are too few peptides.\n\n") } ``` ## Globally, are peptide intensities are approximately unimodal? <!-- # bquote could be used as an alternative to latex2exp::TeX below particularly # and when plotting math expressions generally, at the expense of mastering # another syntax, which hardly seems worthwhile when I need to use TeX # elsewhere; here's an introduction to bquote: # https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ --> ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} # identify the location of missing values fin <- is.finite(as.numeric(as.matrix(quant_data_log))) logvalues <- as.numeric(as.matrix(quant_data_log))[fin] logvalues_density <- density(logvalues) plot( x = logvalues_density, main = latex2exp::TeX( "Smoothed estimated probability density vs. $log_{10}$(peptide intensity)" ), xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), ylab = "Probability density" ) hist( x = as.numeric(as.matrix(quant_data_log)), xlim = c(min(logvalues_density$x), max(logvalues_density$x)), breaks = 100, main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"), xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") ) ``` ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} # determine quantile q1 <- quantile(logvalues, probs = mean_percentile)[1] # 1 = row of matrix (ie, phosphopeptide) sds <- apply(quant_data_log, 1, sd_finite) if (sum(!is.na(sds)) > 2) { plot( density(sds, na.rm = TRUE) , main = "Smoothed estimated probability density vs. 
std. deviation" , sub = "(probability estimation made with Gaussian smoothing)" , ylab = "Probability density" ) } else { cat( "At least two non-missing values are required to plot", "probability density.\n\n" ) } ``` ```{r echo = FALSE} # Determine number of cells to impute temp <- quant_data[is.na(quant_data)] # Determine number of values to impute number_to_impute <- length(temp) # Determine percent of missing values pct_missing_values <- round(length(temp) / (length(logvalues) + length(temp)) * 100) ``` ```{r echo = FALSE} # prep for trt-median based imputation ``` # Impute Missing Values ```{r echo = FALSE} imp_smry_pot_peptides_before <- nrow(quant_data_log) imp_smry_missing_values_before <- number_to_impute imp_smry_pct_missing <- pct_missing_values ``` ```{r echo = FALSE} #Determine number of cells to impute ``` ```{r echo = FALSE} # Identify which values are missing and need to be imputed ind <- which(is.na(quant_data), arr.ind = TRUE) ``` ```{r echo = FALSE, results = 'asis'} # Apply imputation switch( imputation_method , "group-median" = { quant_data_imp <- quant_data imputation_method_description <- paste("Substitute missing value with", "median peptide-intensity for sample group.\n" ) sample_level_integers <- as.integer(sample_treatment_levels) # Take the accurate ln(x+1) because the data are log-normally distributed # and because median can involve an average of two measurements. quant_data_imp <- log1p(quant_data_imp) for (i in seq_len(length(levels(sample_treatment_levels)))) { # Determine the columns for this factor-level level_cols <- i == sample_level_integers # Extract those columns lvlsbst <- quant_data_imp[, level_cols, drop = FALSE] # assign to ind the row-column pairs corresponding to each NA ind <- which(is.na(lvlsbst), arr.ind = TRUE) # No group-median exists if there is only one sample # a given ppep has no measurement; otherwise, proceed. if (ncol(lvlsbst) > 1) { the_centers <- apply(lvlsbst, 1, median, na.rm = TRUE) for (j in seq_len(nrow(lvlsbst))) { for (k in seq_len(ncol(lvlsbst))) { if (is.na(lvlsbst[j, k])) { lvlsbst[j, k] <- the_centers[j] } } } quant_data_imp[, level_cols] <- lvlsbst } } # Take the accurate e^x - 1 to match scaling of original input. quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) good_rows <- !is.na(rowMeans(quant_data_imp)) } , "median" = { quant_data_imp <- quant_data imputation_method_description <- paste("Substitute missing value with", "median peptide-intensity across all sample classes.\n" ) # Take the accurate ln(x+1) because the data are log-normally distributed # and because median can involve an average of two measurements. quant_data_imp <- log1p(quant_data_imp) quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]] # Take the accurate e^x - 1 to match scaling of original input. quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) good_rows <- !is.nan(rowMeans(quant_data_imp)) } , "mean" = { quant_data_imp <- quant_data imputation_method_description <- paste("Substitute missing value with", "geometric-mean peptide intensity across all sample classes.\n" ) # Take the accurate ln(x+1) because the data are log-normally distributed, # so arguments to mean should be previously transformed. # this will have to be quant_data_imp <- log1p(quant_data_imp) # Assign to NA cells the mean for the row quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]] # Take the accurate e^x - 1 to match scaling of original input. 
quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) good_rows <- !is.nan(rowMeans(quant_data_imp)) } , "random" = { quant_data_imp <- quant_data m1 <- median(sds, na.rm = TRUE) * sd_percentile #sd to be used is the median sd # If you want results to be reproducible, you will want to call # base::set.seed before calling stats::rnorm imputation_method_description <- paste("Substitute each missing value with random intensity", sprintf( "random intensity $N \\sim (%0.2f, %0.2f)$.\n", q1, m1 ) ) cat(sprintf("mean_percentile (from input parameter) is %2.0f\n\n", 100 * mean_percentile)) cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n", sd_percentile)) quant_data_imp[ind] <- 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) quant_data_imp_ln <- log1p(quant_data_imp) good_rows <- !is.nan(rowMeans(quant_data_imp)) } ) quant_data_imp_log10 <- quant_data_imp_ln * const_log10_e if (length(good_rows) < 1) { print("ERROR: Cannot impute data; there are no good rows!") return(-1) } ``` ```{r echo = FALSE, results = 'asis'} cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description) ``` ```{r echo = FALSE} imp_smry_pot_peptides_after <- sum(good_rows) imp_smry_rejected_after <- sum(!good_rows) imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ])) ``` ```{r echo = FALSE, results = 'asis'} # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf tabular_lines_fmt <- paste( "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) " \\caption{Imputation Results}", " \\centering", # \centering centers the table on the page " \\begin{tabular}{l c c c}", " \\hline\\hline", " \\ & potential peptides & missing values & rejected", " peptides \\\\ [0.5ex]", " \\hline", " before imputation & %d & %d (%d\\%s) & \\\\", " after imputation & %d & %d & %d \\\\ [1ex]", " \\hline", " \\end{tabular}", #" \\label{table:nonlin}", # may be used to refer this table in the text "\\end{table}", sep = "\n" ) tabular_lines <- sprintf( tabular_lines_fmt, imp_smry_pot_peptides_before, imp_smry_missing_values_before, imp_smry_pct_missing, "%", imp_smry_pot_peptides_after, imp_smry_missing_values_after, imp_smry_rejected_after ) cat(tabular_lines) ``` ```{r echo = FALSE} # Zap rows where imputation was ineffective full_data <- full_data [good_rows, ] quant_data <- quant_data [good_rows, ] quant_data_imp <- quant_data_imp[good_rows, ] write_debug_file(quant_data_imp) quant_data_imp_good_rows <- quant_data_imp write_debug_file(quant_data_imp_good_rows) ``` ```{r echo = FALSE, results = 'asis'} can_plot_before_after_imp <- TRUE d_combined <- as.numeric( as.matrix( log10(quant_data_imp) ) ) d_original <- as.numeric( as.matrix( log10(quant_data_imp[!is.na(quant_data)]) ) ) if (sum(!is.na(d_original)) > 2) { d_original <- density(d_original) } else { can_plot_before_after_imp <- FALSE } if (can_plot_before_after_imp && sum(is.na(d_combined)) < 1) { d_combined <- density(d_combined) } else { can_plot_before_after_imp <- FALSE } if (sum(is.na(quant_data)) > 0) { # There ARE missing values d_imputed <- as.numeric( as.matrix( log10(quant_data_imp[is.na(quant_data)]) ) ) if (can_plot_before_after_imp && sum(is.na(d_imputed)) < 1) { d_imputed <- (density(d_imputed)) } else { can_plot_before_after_imp <- FALSE } } else { # There are NO missing values d_imputed <- d_combined } ``` ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} zero_sd_rownames <- rownames(quant_data_imp)[ is.na((apply(quant_data_imp, 1, sd, na.rm = TRUE)) == 0) ] if (length(zero_sd_rownames) >= 
nrow(quant_data_imp)) { stop("All peptides have zero standard deviation. Cannot continue.") } if (length(zero_sd_rownames) > 0) { cat( sprintf("%d peptides with zero variance were removed from statistical consideration", length(zero_sd_rownames) ) ) zap_named_rows <- function(df, nms) { return(df[!(row.names(df) %in% nms), ]) } quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames) quant_data <- zap_named_rows(quant_data, zero_sd_rownames) full_data <- zap_named_rows(full_data, zero_sd_rownames) } if (sum(is.na(quant_data)) > 0) { cat("\\leavevmode\\newpage\n") # data visualization old_par <- par( mai = par("mai") + c(0.5, 0, 0, 0) ) # Copy quant data to x x <- quant_data # x gets to have values of: # - NA for observed values # - 1 for missing values x[is.na(x)] <- 0 x[x > 1] <- NA x[x == 0] <- 1 # Log-transform imputed data # update variable because rows may have been eliminated from quant_data_imp quant_data_imp_log10 <- log10(quant_data_imp) write_debug_file(quant_data_imp_log10) # Set blue_dots to log of quant data or NA for NA quant data blue_dots <- log10(quant_data) # Set red_dots to log of imputed data or NA for observed quant data red_dots <- quant_data_imp_log10 * x count_red <- sum(!is.na(red_dots)) count_blue <- sum(!is.na(blue_dots)) ylim_save <- ylim <- c( min(red_dots, blue_dots, na.rm = TRUE), max(red_dots, blue_dots, na.rm = TRUE) ) show_stripchart <- 50 > (count_red + count_blue) / length(sample_name_matches) if (show_stripchart) { boxplot_sub <- "Light blue = data before imputation; Red = imputed data" } else { boxplot_sub <- "" } # Vertical plot colnames(blue_dots) <- sample_name_matches boxplot( blue_dots , las = 1 # "always horizontal" , col = const_boxplot_fill , ylim = ylim , main = "Peptide intensities after eliminating unusable peptides" , sub = boxplot_sub , xlab = "Sample" , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") ) if (show_stripchart) { # Points # ref: https://r-charts.com/distribution/add-points-boxplot/ # NA values are not plotted stripchart( blue_dots, # Data method = "jitter", # Random noise jitter = const_stripchart_jitter, pch = 19, # Pch symbols cex = const_stripsmall_cex, # Size of symbols reduced col = "lightblue", # Color of the symbol vertical = TRUE, # Vertical mode add = TRUE # Add it over ) stripchart( red_dots, # Data method = "jitter", # Random noise jitter = const_stripchart_jitter, pch = 19, # Pch symbols cex = const_stripsmall_cex, # Size of symbols reduced col = "red", # Color of the symbol vertical = TRUE, # Vertical mode add = TRUE # Add it over ) } if (TRUE) { # show measured values in blue on left half-violin plot cat("\\leavevmode\n\\quad\n\n\\quad\n\n") vioplot::vioplot( x = lapply(blue_dots, function(x) x[!is.na(x)]), col = "lightblue1", side = "left", plotCentre = "line", ylim = ylim_save, main = "Distributions of observed and imputed data", sub = "Light blue = observed data; Pink = imputed data", xlab = "Sample", ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") ) red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) cols_to_delete <- c() for (ix in seq_len(length(red_violins))) { if (length(red_violins[[ix]]) < 1) { cols_to_delete <- c(cols_to_delete, ix) } } # destroy any unimputable columns if (!is.null(cols_to_delete)) { red_violins <- red_violins[-cols_to_delete] } # plot imputed values in red on right half-violin plot vioplot::vioplot( x = red_violins, col = "lightpink1", side = "right", plotCentre = "line", add = TRUE ) } par(old_par) # density plot cat("\\leavevmode\n\n\n\n\n\n\n") if 
(can_plot_before_after_imp) { ylim <- c( 0, max( if (is.list(d_combined)) d_combined$y else d_combined, if (is.list(d_original)) d_original$y else d_original, if (is.list(d_imputed)) d_imputed$y else d_imputed, na.rm = TRUE ) ) plot( d_combined, ylim = ylim, sub = paste( "Blue = data before imputation; Red = imputed data;", "Black = combined" ), main = "Density of peptide intensity before and after imputation", xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), ylab = "Probability density" ) lines(d_original, col = "blue") lines(d_imputed, col = "red") } else { cat( "There are too few points to plot the density of peptide intensity", "before and after imputation." ) } cat("\\leavevmode\\newpage\n") } ``` # Perform Quantile Normalization The excellent `normalize.quantiles` function from *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)* performs "quantile normalization" as described Bolstad *et al.* (2003), DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)* and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*, i.e., it assumes that the goal is to detect subtle differences among grossly similar samples (having similar distributions) by equailzing intra-quantile quantitations. Unfortunately, one software library upon which it depends *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)* that requires that a specific, non-concurrent version of the library be installed. The installation command equivalent to what was used to install the library to produce the results presented here is: ``` conda install bioconductor-preprocesscore openblas=0.3.3 ``` <!-- # Apply quantile normalization using preprocessCore::normalize.quantiles # --- # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html # except this: https://support.bioconductor.org/p/122925/#9135989 # says to install it like this: # ``` # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) # ``` # conda installation (necessary because of a bug in recent openblas): # conda install bioconductor-preprocesscore openblas=0.3.3 # ... # --- # normalize.quantiles {preprocessCore} -- Quantile Normalization # # Description: # Using a normalization based upon quantiles, this function normalizes a # matrix of probe level intensities. # # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the # assumption that the data is missing at random. # # Usage: # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) # # Arguments: # # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. # # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, # but in certain situations not making a copy of the matrix, but instead normalizing # it in place will be more memory friendly. # # - keep.names: Boolean option to preserve matrix row and column names in output. # # Details: # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. # No special allowances are made for outliers. If you make use of quantile normalization # please cite Bolstad et al, Bioinformatics (2003). # # This functions will handle missing data (ie NA values), based on # the assumption that the data is missing at random. 
# # Note that the current implementation optimizes for better memory usage # at the cost of some additional run-time. # # Value: A normalized matrix. # # Author: Ben Bolstad, bmbolstad.com # # References # # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf # # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 # http://bmbolstad.com/misc/normalize/normalize.html # ... --> ```{r echo = FALSE, results = 'asis'} if (nrow(quant_data_imp) > 0) { quant_data_imp_qn <- preprocessCore::normalize.quantiles( as.matrix(quant_data_imp), keep.names = TRUE ) } else { quant_data_imp_qn <- as.matrix(quant_data_imp) } quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) write_debug_file(quant_data_imp_qn) quant_data_imp_qn_log <- log10(quant_data_imp_qn) write_debug_file(quant_data_imp_qn_log) quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) sel <- apply(quant_data_imp_qn_ls, 1, any_nan) quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ] quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls) write_debug_file(quant_data_imp_qn_ls) write_debug_file(quant_data_imp_qn_ls2) # Create data.frame used by ANOVA analysis data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) ``` <!-- ACE insertion begin --> ## Are normalized, imputed, log-transformed sample distributions similar? ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} # Save unimputed quant_data_log for plotting below unimputed_quant_data_log <- quant_data_log # log10 transform (after preparing for zero values, # which should never happen...) 
quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001 quant_data_log <- log10(quant_data_imp_qn) how_many_peptides <- nrow(quant_data_log) if ((how_many_peptides) > 0) { cat( sprintf( "Intensities for %d peptides:\n\n\n", how_many_peptides ) ) cat("\n\n\n") # data visualization old_par <- par( mai = par("mai") + c(0.5, 0, 0, 0) , oma = par("oma") + c(0.5, 0, 0, 0) ) # ref: https://r-charts.com/distribution/add-points-boxplot/ # Vertical plot colnames(quant_data_log) <- sample_name_matches boxplot( quant_data_log , las = 1 , col = const_boxplot_fill , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") , xlab = "Sample" ) par(old_par) } else { cat("There are no peptides to plot\n") } cat("\n\n\n") ``` ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} if (nrow(quant_data_log) > 1) { quant_data_log_stack <- stack(quant_data_log) ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) + ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + ggplot2::ylab("Probability density") + ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE) } else { cat("No density plot because there are fewer than two peptides to plot.\n\n") } ``` ```{r echo = FALSE, results = 'asis'} cat("\\leavevmode\\newpage\n") ``` # ANOVA Analysis ```{r, echo = FALSE} # Make new data frame containing only Phosphopeptides # to connect preANOVA to ANOVA (connect_df) connect_df <- data.frame( data_table_imp_qn_lt$Phosphopeptide , data_table_imp_qn_lt[, first_data_column] ) colnames(connect_df) <- c("Phosphopeptide", "Intensity") ``` ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} count_of_treatment_levels <- length(levels(sample_treatment_levels)) if (count_of_treatment_levels < 2) { nuke_control_sequences <- function(s) { s <- gsub("[\\]", "xyzzy_plugh", s) s <- gsub("[$]", "\\\\$", s) s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) return(s) } cat( "ERROR!!!! Cannot perform ANOVA analysis", "(see next page)\\newpage\n" ) cat( "ERROR: ANOVA analysis", "requires two or more factor levels!\n\n\n" ) cat("\n\n\n\n\n") cat("Unparsed sample names are:\n\n\n", "\\begin{quote}\n", paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"), "\n\\end{quote}\n\n") regex_sample_names <- nuke_control_sequences(regex_sample_names) cat("\\leavevmode\n\n\n") cat("Parsing rule for SampleNames is", "\n\n\n", "\\text{'", regex_sample_names, "'}\n\n\n", sep = "" ) cat("\nParsed sample names are:\n", "\\begin{quote}\n", paste(sample_name_matches, collapse = "\n\n\n"), "\n\\end{quote}\n\n") regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping) cat("\\leavevmode\n\n\n") cat("Parsing rule for SampleGrouping is", "\n\n\n", "\\text{'", regex_sample_grouping, "'}\n\n\n", sep = "" ) cat("\n\n\n") cat("Sample group assignments are:\n", "\\begin{quote}\n", paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"), "\n\\end{quote}\n\n") } else { p_value_data_anova_ps <- apply( quant_data_imp_qn_log, 1, anova_func, grouping_factor = sample_treatment_levels, one_way_f = one_way_all_categories ) p_value_data_anova_ps_fdr <- p.adjust(p_value_data_anova_ps, method = "fdr") p_value_data <- data.frame( phosphopeptide = full_data[, 1] , raw_anova_p = p_value_data_anova_ps , fdr_adjusted_anova_p = p_value_data_anova_ps_fdr ) # output ANOVA file to constructed filename, # e.g. 
"Outputfile_pST_ANOVA_STEP5.txt" # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" # Re-output datasets to include p-values metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3]) write.table( cbind(metadata_plus_p, quant_data_imp), file = imputed_data_filename, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE ) write.table( cbind(metadata_plus_p, quant_data_imp_qn_log), file = imp_qn_lt_data_filenm, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE ) p_value_data <- p_value_data[order(p_value_data$fdr_adjusted_anova_p), ] first_page_suppress <- 1 number_of_peptides_found <- 0 cutoff <- val_fdr[1] for (cutoff in val_fdr) { if (number_of_peptides_found > 49) { cat("\\leavevmode\n\n\n") break } #loop through FDR cutoffs filtered_p <- p_value_data[ which(p_value_data$fdr_adjusted_anova_p < cutoff), , drop = FALSE ] filtered_data_filtered <- quant_data_imp_qn_log[ rownames(filtered_p), , drop = FALSE ] filtered_data_filtered <- filtered_data_filtered[ order(filtered_p$fdr_adjusted_anova_p), , drop = FALSE ] # <!-- ACE insertion start --> if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) { if (first_page_suppress == 1) { first_page_suppress <- 0 } else { cat("\\newpage\n") } if (nrow(filtered_data_filtered) > 1) { subsection_header(sprintf( "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n", nrow(filtered_data_filtered), cutoff )) } else { subsection_header(sprintf( "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n", rownames(filtered_data_filtered)[1], cutoff )) } cat("\n\n\n") cat("\n\n\n") old_oma <- par("oma") old_par <- par( mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), oma = old_oma * c(1, 1, 0.3, 1), cex.main = 0.9, cex.axis = 0.7, fin = c(9, 7.25) ) # ref: https://r-charts.com/distribution/add-points-boxplot/ # Vertical plot colnames(filtered_data_filtered) <- sample_name_matches tryCatch( boxplot( filtered_data_filtered, main = "Imputed, normalized intensities", # no line plot las = 1, col = const_boxplot_fill, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") ), error = function(e) print(e) ) par(old_par) } else { cat(sprintf( "%s < %0.2f\n\n\n\n\n", "No peptides were found to have cutoff adjusted p-value", cutoff )) } if (nrow(filtered_data_filtered) > 0) { # Add Phosphopeptide column to anova_filtered table # The assumption here is that the first intensity is unique; # this is a hokey assumption but almost definitely will # be true in the real world unless there is a computation # error upstream. 
anova_filtered_merge <- base::merge( x = connect_df, y = filtered_data_filtered, by.x = "Intensity", by.y = 1 ) anova_filtered_merge_order <- rownames(filtered_p) anova_filtered <- data.frame( ppep = anova_filtered_merge$Phosphopeptide, intense = anova_filtered_merge$Intensity, data = anova_filtered_merge[, 2:number_of_samples + 1] ) colnames(anova_filtered) <- c("Phosphopeptide", colnames(filtered_data_filtered)) # Merge qualitative columns into the ANOVA data output_table <- data.frame(anova_filtered$Phosphopeptide) output_table <- base::merge( x = output_table, y = data_table_imp_qn_lt, by.x = "anova_filtered.Phosphopeptide", by.y = "Phosphopeptide" ) # Produce heatmap to visualize significance and the effect of imputation anova_filtered_merge_format <- sapply( X = filtered_p$fdr_adjusted_anova_p , FUN = function(x) { if (x > 0.0001) paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s") else paste0("(%0.4e) %s") } ) cat_hm_heading <- function(m, cutoff) { cat("\\newpage\n") if (nrow(m) > intensity_hm_rows) { subsection_header( paste( sprintf("Heatmap for the %d most-significant peptides", intensity_hm_rows), sprintf("whose adjusted p-value < %0.2f\n", cutoff) ) ) } else { if (nrow(m) == 1) { return(FALSE) } else { subsection_header( paste( sprintf("Heatmap for %d usable peptides whose", nrow(m)), sprintf("adjusted p-value < %0.2f\n", cutoff) ) ) } } cat("\n\n\n") cat("\n\n\n") return(TRUE) } # construct matrix with appropriate rownames m <- as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) if (nrow(m) > 0) { rownames_m <- rownames(m) rownames(m) <- sapply( X = seq_len(nrow(m)) , FUN = function(i) { sprintf( anova_filtered_merge_format[i], filtered_p$fdr_adjusted_anova_p[i], rownames_m[i] ) } ) } # draw the heading and heatmap if (nrow(m) > 0) { number_of_peptides_found <- draw_intensity_heatmap( m = m, cutoff = cutoff, hm_heading_function = cat_hm_heading, hm_main_title = "Unimputed, unnormalized log(intensities)", suppress_row_dendrogram = FALSE ) } } } } cat("\\leavevmode\n\n\n") ``` ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} if (count_of_treatment_levels > 1) { # Prepare two-way contrasts with adjusted p-values # Strategy: # - use imputed, log-transformed data: # - remember this when computing log2(fold-change) # - each contrast is between a combination of trt levels # - for each contrast, compute samples that are members # - compute one-way test: # - use `oneway.test` (Welch test) if numbers of samples # are not equivalent between trt levels # - otherwise, aov is fine but offers no advantage # - adjust p-value, assuming that # (# of pppeps)*(# of contrasts) tests were performed # Each contrast is between a combination of trt levels m2 <- combn( x = seq_len(length(levels(sample_treatment_levels))), m = 2, simplify = TRUE ) contrast_count <- ncol(m2) # For each contrast, compute samples that are members # - local function to construct a data.frame for each contrast # - the contrast in the first "column" f_m2 <- function(cntrst, lvl1, lvl2) { return( data.frame( contrast = cntrst, level = sample_treatment_levels[ sample_treatment_levels %in% levels(sample_treatment_levels)[c(lvl1, lvl2)] ], label = sample_name_matches[ sample_treatment_levels %in% levels(sample_treatment_levels)[c(lvl1, lvl2)] ] ) ) } # - compute a df for each contrast sample_level_dfs <- lapply( X = 1:contrast_count, FUN = function(i) f_m2(i, m2[1, i], m2[2, i]) ) # - compute a single df for all contrasts combined_contrast_df <- Reduce(f = rbind, x = sample_level_dfs) # - dispose 
objects to free resources rm(sample_level_dfs) # - write the df to a DB for later join-per-contrast db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) RSQLite::dbWriteTable( conn = db, name = "contrast", value = combined_contrast_df, overwrite = TRUE ) # Create UK for insert ddl_exec(db, " CREATE UNIQUE INDEX IF NOT EXISTS contrast__uk__idx ON contrast(contrast, label); " ) # Create indexes for join ddl_exec(db, " -- index for join in contrast_ppep_smpl_qnlt on a.label < b.label CREATE INDEX IF NOT EXISTS contrast__label__idx ON contrast(label); " ) ddl_exec(db, " -- index for joining two contrast_lvl_ppep_avg_quant on contrast CREATE INDEX IF NOT EXISTS contrast__contrast__idx ON contrast(contrast); " ) ddl_exec(db, " -- index for joining two contrast_lvl_ppep_avg_quant on phophospep CREATE INDEX IF NOT EXISTS contrast__level__idx ON contrast(level); " ) # - dispose objects to free resources rm(combined_contrast_df) # Use imputed, log-transformed data # - remember that this was donoe when computing log2(fold-change) # - melt data matrix for use in later join-per-contrast casted <- cbind( data.frame(vrbl = rownames(quant_data_imp_qn_log)), quant_data_imp_qn_log ) quant_data_imp_qn_log_melted <- reshape2::melt( casted, id.vars = "vrbl" ) colnames(quant_data_imp_qn_log_melted) <- c("phosphopep", "sample", "quant") # - dispose objects to free resources rm(casted) # - write the df to a DB for use in later join-per-contrast RSQLite::dbWriteTable( conn = db, name = "ppep_smpl_qnlt", value = quant_data_imp_qn_log_melted, overwrite = TRUE ) # Create UK for insert ddl_exec(db, " CREATE UNIQUE INDEX IF NOT EXISTS ppep_smpl_qnlt__uk__idx ON ppep_smpl_qnlt(phosphopep, sample); " ) # Create index for join ddl_exec(db, " -- index for join in contrast_ppep_smpl_qnlt CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__sample__idx ON ppep_smpl_qnlt(sample); " ) ddl_exec(db, " -- index for joining two contrast_lvl_ppep_avg_quant on phopho.pep CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__phosphopep__idx ON ppep_smpl_qnlt(phosphopep); " ) # - dispose objects to free resources rm(quant_data_imp_qn_log_melted) # - drop views if exist ddl_exec(db, " -- drop view dependent on contrast_lvl_ppep_avg_quant DROP VIEW IF EXISTS v_contrast_log2_fc; " ) ddl_exec(db, " -- drop table dependent on contrast_ppep_smpl_qnlt DROP TABLE IF EXISTS contrast_lvl_ppep_avg_quant; " ) ddl_exec(db, " DROP TABLE IF EXISTS contrast_lvl_lvl_metadata; " ) ddl_exec(db, " DROP VIEW IF EXISTS v_contrast_lvl_metadata; " ) ddl_exec(db, " -- drop view dependent on contrast_ppep_smpl_qnlt DROP VIEW IF EXISTS v_contrast_lvl_ppep_avg_quant; " ) ddl_exec(db, " DROP VIEW IF EXISTS v_contrast_lvl_lvl; " ) ddl_exec(db, " -- drop view upon which other views depend DROP VIEW IF EXISTS contrast_ppep_smpl_qnlt; " ) # - create view dml_no_rows_exec(db, " -- view contrast_ppep_smpl_qnlt is used for each phopshopep to -- compute p-value for test of trt effect for two trt levels CREATE VIEW contrast_ppep_smpl_qnlt AS SELECT contrast, level, phosphopep, sample, quant FROM contrast AS c, ppep_smpl_qnlt AS q WHERE q.sample = c.label ORDER BY contrast, level, phosphopep ; " ) # - create simplification views dml_no_rows_exec(db, " CREATE VIEW v_contrast_lvl_metadata AS SELECT contrast, level, group_concat(label, ';') AS samples FROM contrast GROUP BY contrast, level /* view v_contrast_lvl_metadata is used to simplify creation of table contrast_lvl_lvl_metadata */ ; " ) dml_no_rows_exec(db, " CREATE VIEW v_contrast_lvl_ppep_avg_quant AS SELECT contrast, level, 
phosphopep, avg(quant) AS avg_quant FROM contrast_ppep_smpl_qnlt GROUP BY contrast, level, phosphopep /* view v_contrast_lvl_ppep_avg_quant is used to simplify view v_contrast_log2_fc */ ; " ) # - create contrast-metadata table dml_no_rows_exec(db, " CREATE TABLE contrast_lvl_lvl_metadata AS SELECT DISTINCT a.contrast AS ab_contrast, a.level AS a_level, b.level AS b_level, a.samples AS a_samples, b.samples AS b_samples, 'log2(level_'||a.level||'/level_'||b.level||')' AS fc_description FROM v_contrast_lvl_metadata AS a, v_contrast_lvl_metadata AS b WHERE a.contrast = b.contrast AND a.level > b.level /* view v_contrast_lvl_lvl is used to simplify view v_contrast_log2_fc */ ; " ) # - create pseudo-materialized view table dml_no_rows_exec(db, " CREATE VIEW v_contrast_lvl_lvl AS SELECT DISTINCT a.contrast AS ab_contrast, a.level AS a_level, b.level AS b_level FROM contrast AS a, contrast AS b WHERE a.contrast = b.contrast AND a.level > b.level /* view v_contrast_lvl_lvl is used to simplify view v_contrast_log2_fc */ ; " ) # - create view to compute log2(fold-change) dml_no_rows_exec(db, " CREATE VIEW v_contrast_log2_fc AS SELECT ab.ab_contrast AS contrast, m.a_level AS a_level, c.avg_quant AS a_quant, m.a_samples AS a_samples, ab.b_level AS b_level, d.avg_quant AS b_quant, m.b_samples AS b_samples, m.fc_description AS fc_description, 3.32193 * ( d.avg_quant - c.avg_quant) AS log2_fc, d.phosphopep AS phosphopep FROM contrast_lvl_lvl_metadata AS m, v_contrast_lvl_ppep_avg_quant AS d, v_contrast_lvl_lvl AS ab INNER JOIN v_contrast_lvl_ppep_avg_quant AS c ON c.contrast = ab.ab_contrast AND c.level = ab.a_level WHERE d.contrast = ab.ab_contrast AND m.ab_contrast = ab.ab_contrast AND d.level = ab.b_level AND d.phosphopep = c.phosphopep /* view to compute log2(fold-change) */ ; " ) # For each contrast, compute samples that are members # compute one-way test: # - use `oneway.test` (Welch test) if numbers of samples # are not equivalent between trt levels # - otherwise, aov is fine but offers no advantage for (contrast in contrast_count:2) { invisible(contrast) } for (contrast in 1:contrast_count) { contrast_df <- sqldf::sqldf( x = paste0(" SELECT level, phosphopep, sample, quant FROM contrast_ppep_smpl_qnlt WHERE contrast = ", contrast, " ORDER BY phosphopep, level, sample "), connection = db ) contrast_cast <- reshape2::dcast( data = contrast_df, formula = phosphopep ~ sample, value.var = "quant" ) contrast_cast_ncol <- ncol(contrast_cast) contrast_cast_data <- contrast_cast[, 2:contrast_cast_ncol] contrast_cast_samples <- colnames(contrast_cast_data) # - order grouping_factor by order of sample columns of contrast_cast_data grouping_factor <- sqldf::sqldf( x = paste0(" SELECT sample, level FROM contrast_ppep_smpl_qnlt WHERE contrast = ", contrast, " ORDER BY phosphopep, level, sample LIMIT ", contrast_cast_ncol - 1 ), connection = db ) rownames(grouping_factor) <- grouping_factor$sample grouping_factor <- grouping_factor[, "level", drop = FALSE] # - run the two-level (one-way) test p_value_data_contrast_ps <- apply( X = contrast_cast_data, MARGIN = 1, # apply to rows FUN = anova_func, grouping_factor = as.factor(as.numeric(grouping_factor$level)), # anova_func arg2 one_way_f = one_way_two_categories, # anova_func arg3 simplify = TRUE # TRUE is the default for simplify ) contrast_data_adj_p_values <- p.adjust( p = p_value_data_contrast_ps, method = "fdr", n = length(p_value_data_contrast_ps) # this is the default, length(p) ) # - compute the fold-change contrast_p_df <- data.frame( contrast = 
contrast, phosphopep = contrast_cast$phosphopep, p_value_raw = p_value_data_contrast_ps, p_value_adj = contrast_data_adj_p_values ) db_write_table_overwrite <- (contrast < 2) db_write_table_append <- !db_write_table_overwrite RSQLite::dbWriteTable( conn = db, name = "contrast_ppep_p_val", value = contrast_p_df, append = db_write_table_append ) # Create UK for insert ddl_exec(db, " CREATE UNIQUE INDEX IF NOT EXISTS contrast_ppep_p_val__uk__idx ON contrast_ppep_p_val(phosphopep, contrast); " ) } # Perhaps this could be done more elegantly using unique keys # or creating the tables before saving data to them, but this # is fast and, if the database exists on disk rather than in # memory, it doesn't stress memory. dml_no_rows_exec(db, " CREATE TEMP table contrast_log2_fc AS SELECT * FROM v_contrast_log2_fc ORDER BY contrast, phosphopep ; " ) dml_no_rows_exec(db, " CREATE TEMP table ppep_p_val AS SELECT p_value_raw, p_value_adj, contrast AS p_val_contrast, phosphopep AS p_val_ppep FROM contrast_ppep_p_val ORDER BY contrast, phosphopep ; " ) dml_no_rows_exec(db, " DROP TABLE IF EXISTS contrast_log2_fc_p_val ; " ) dml_no_rows_exec(db, " CREATE TABLE contrast_log2_fc_p_val AS SELECT a.*, b.p_value_raw, b.p_value_adj, b.p_val_contrast, b.p_val_ppep FROM contrast_log2_fc a, ppep_p_val b WHERE a.rowid = b.rowid AND a.phosphopep = b.p_val_ppep ; " ) # Create UK ddl_exec(db, " CREATE UNIQUE INDEX IF NOT EXISTS contrast_log2_fc_p_val__uk__idx ON contrast_log2_fc_p_val(phosphopep, contrast); " ) # Create indices for future queries ddl_exec(db, " CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__contrast__idx ON contrast_log2_fc_p_val(contrast); " ) ddl_exec(db, " CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__phosphopep__idx ON contrast_log2_fc_p_val(phosphopep); " ) ddl_exec(db, " CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_raw__idx ON contrast_log2_fc_p_val(p_value_raw); " ) ddl_exec(db, " CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_adj__idx ON contrast_log2_fc_p_val(p_value_adj); " ) dml_no_rows_exec(db, " DROP VIEW IF EXISTS v_contrast_log2_fc_p_val ; " ) dml_no_rows_exec(db, " CREATE VIEW v_contrast_log2_fc_p_val AS SELECT contrast, a_level, a_samples, b_level, b_samples, a_quant, b_quant, fc_description, log2_fc, p_value_raw, p_value_adj, phosphopep FROM contrast_log2_fc_p_val ORDER BY contrast, phosphopep ; " ) ddl_exec(db, " DROP TABLE IF EXISTS kseaapp_metadata ; " ) dml_no_rows_exec(db, " CREATE TABLE kseaapp_metadata AS WITH extended(deppep, ppep, gene_name, uniprot_id, phosphoresidue) AS ( SELECT DISTINCT deppep.seq, ppep.seq, GeneName||';', UniProtID||';', PhosphoResidue||';' FROM ppep, deppep, mrgfltr_metadata WHERE mrgfltr_metadata.ppep_id = ppep.id AND ppep.deppep_id = deppep.id ) SELECT ppep AS `ppep`, SUBSTR(uniprot_id, 1, INSTR(uniprot_id,';') - 1 ) AS `Protein`, SUBSTR(gene_name, 1, INSTR(gene_name,';') - 1 ) AS `Gene`, deppep AS `Peptide`, REPLACE( REPLACE( SUBSTR(phosphoresidue, 1, INSTR(phosphoresidue,';') - 1 ), 'p', '' ), ', ', ';' ) AS `Residue.Both` FROM extended ; " ) # Create indexes for join ddl_exec(db, " CREATE INDEX IF NOT EXISTS kseaapp_metadata__ppep__idx ON kseaapp_metadata(ppep); " ) ddl_exec(db, " DROP VIEW IF EXISTS v_kseaapp_contrast ; " ) dml_no_rows_exec(db, " CREATE VIEW v_kseaapp_contrast AS SELECT a.*, b.Protein, b.Gene, b.Peptide, b.`Residue.Both` FROM v_contrast_log2_fc_p_val a, kseaapp_metadata b WHERE b.ppep = a.phosphopep ; " ) ddl_exec(db, " DROP VIEW IF EXISTS v_kseaapp_input ; " ) dml_no_rows_exec(db, " CREATE VIEW 
v_kseaapp_input AS
      SELECT
        v.contrast, v.phosphopep,
        m.`Protein`, m.`Gene`, m.`Peptide`, m.`Residue.Both`,
        v.p_value_raw AS `p`,
        v.log2_fc AS `FC`
      FROM kseaapp_metadata AS m, v_contrast_log2_fc_p_val AS v
      WHERE m.ppep = v.phosphopep
        AND NOT m.`Gene` = 'No_Gene_Name'
        AND NOT v.log2_fc = 0
      ;
    "
  )
}
```

```{r echo = FALSE, results = 'asis'}
cat("\\newpage\n")
```

# KSEA Analysis

Results of Kinase-Substrate Enrichment Analysis are presented here, if the
substrates of any kinases are relatively enriched. Enrichments are found by
the CRAN `KSEAapp` package:

- The package is available on CRAN, at
  https://cran.r-project.org/package=KSEAapp
- The method used is described in Casado et al. (2013)
  [doi:10.1126/scisignal.2003573](https://doi.org/10.1126/scisignal.2003573)
  and Wiredja et al. (2017)
  [doi:10.1093/bioinformatics/btx415](https://doi.org/10.1093/bioinformatics/btx415).
- An online alternative (supporting only analysis of human data) is available
  at [https://casecpb.shinyapps.io/ksea/](https://casecpb.shinyapps.io/ksea/).

For each kinase, $i$, and each two-way contrast of treatments, $j$, an
enrichment $z$-score is computed as:

$$
\text{kinase enrichment score}_{j,i} =
  \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j}
$$

and fold-enrichment is computed as:

$$
\text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j}
$$

where:

- $\overline{s}_{j,i}$ is the mean $\log_2(|\text{fold-change}|)$ in
  intensities (for contrast $j$) of known substrates of the kinase $i$,
- $\overline{p}_j$ is the mean $\log_2(|\text{fold-change}|)$ of all
  phosphosites identified in contrast $j$,
- $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$
  identified in contrast $j$, and
- $\delta_j$ is the standard deviation of the $\log_2(|\text{fold-change}|)$
  for contrast $j$ across all phosphosites in the dataset.
- Note that the absolute value of the fold-change is used so that both
  increased and decreased substrates of a kinase will contribute to its
  enrichment score.

$\text{FDR}_{j,i}$ is computed from the $p$-value for the $z$-score using the
R `stats::p.adjust` function, applying the False Discovery Rate correction
from Benjamini and Hochberg (1995)
[doi:10.1111/j.2517-6161.1995.tb02031.x](https://doi.org/10.1111/j.2517-6161.1995.tb02031.x).

Color intensity in heatmaps reflects the magnitude of the $z$-score for
enrichment of the respective kinase in the respective contrast; hue reflects
the sign of the $z$-score (blue, negative; red, positive). Asterisks in
heatmaps mark enrichments that are significant at
`r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`.

- Kinase names are generally as presented at Phospho.ELM
  [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html)
  (when available), although Phospho.ELM data are not yet incorporated into
  this analysis.
- Kinase names having the suffix '(HPRD)' are as presented at
  [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and
  [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are
  as originally reported by Amanchy et al. (2007)
  (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)).
- Kinase-substrate data were also taken from NetworKIN
  [http://networkin.science/download.shtml](http://networkin.science/download.shtml)
  and from PhosphoSitePlus
  [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads).
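
As a concrete illustration of the formulas above, the sketch below computes
these statistics for two hypothetical kinases from simulated data. It is not
part of the pipeline: the names `log2_abs_fc`, `kinase_substrates`, `KIN_A`,
and `KIN_B` are invented for the example, and the `ksea_scores` implementation
invoked below may differ in details such as the sidedness of the $p$-value.

```r
# Illustrative sketch only (not executed by this report).
set.seed(1)
log2_abs_fc <- abs(rnorm(500))        # log2(|fold-change|) for all phosphosites
kinase_substrates <- list(            # hypothetical substrate row indices
  KIN_A = sample(seq_along(log2_abs_fc), 15),
  KIN_B = sample(seq_along(log2_abs_fc), 40)
)

p_bar <- mean(log2_abs_fc)            # mean over all phosphosites in the contrast
delta <- sd(log2_abs_fc)              # standard deviation over all phosphosites

scores <- t(sapply(
  kinase_substrates,
  function(rows) {
    s_bar <- mean(log2_abs_fc[rows])  # mean over this kinase's substrates
    m     <- length(rows)             # number of substrates identified
    z     <- (s_bar - p_bar) * sqrt(m) / delta   # kinase enrichment score
    c(
      Enrichment = s_bar / p_bar,     # fold-enrichment
      z.score    = z,
      p.value    = 2 * stats::pnorm(-abs(z))  # two-sided normal p-value (assumption)
    )
  }
))
# Adjust the per-kinase p-values for the contrast (Benjamini-Hochberg):
cbind(scores, FDR = stats::p.adjust(scores[, "p.value"], method = "fdr"))
```
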
```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) # -- eliminate the table that's about to be defined ddl_exec(db, " DROP TABLE IF EXISTS site_metadata; ") # -- define the site_metadata table ddl_exec(db, " CREATE TABLE site_metadata( id INTEGER PRIMARY KEY , site_type_id INTEGER REFERENCES site_type(id) , full TEXT UNIQUE ON CONFLICT IGNORE , abbrev TEXT , pattern TEXT , motif TEXT ); ") # -- populate the table with initial values ddl_exec(db, " INSERT INTO site_metadata(full, abbrev, site_type_id) SELECT DISTINCT kinase_map, kinase_map, site_type_id FROM ppep_gene_site ORDER BY kinase_map; ") # -- drop bogus KSData view if exists ddl_exec(db, " DROP VIEW IF EXISTS ks_data_v; ") # -- create view to serve as an impostor for KSEAapp::KSData ddl_exec(db, " CREATE VIEW IF NOT EXISTS ks_data_v AS SELECT 'NA' AS KINASE, 'NA' AS KIN_ACC_ID, kinase_map AS GENE, 'NA' AS KIN_ORGANISM, 'NA' AS SUBSTRATE, 0 AS SUB_GENE_ID, 'NA' AS SUB_ACC_ID, gene_names AS SUB_GENE, 'NA' AS SUB_ORGANISM, phospho_peptide AS SUB_MOD_RSD, 0 AS SITE_GROUP_ID, 'NA' AS 'SITE_7AA', 2 AS networkin_score, type_name AS Source FROM ppep_gene_site_view; ") contrast_metadata_df <- sqldf::sqldf("select * from contrast_lvl_lvl_metadata", connection = db) rslt <- new_env() rslt$score_list <- list() rslt$name_list <- list() rslt$longname_list <- list() ddl_exec(db, " DROP TABLE IF EXISTS contrast_ksea_scores; " ) next_index <- 0 err_na_subscr_df_const <- "missing values are not allowed in subscripted assignments of data frames" for (i_cntrst in seq_len(nrow(contrast_metadata_df))) { cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) contrast_longlabel <- ( sprintf( "Trt %s {%s} -> Trt %s {%s}", contrast_metadata_df[i_cntrst, "b_level"], gsub( pattern = ";", replacement = ", ", x = contrast_metadata_df[i_cntrst, "b_samples"], fixed = TRUE ), contrast_metadata_df[i_cntrst, "a_level"], gsub( pattern = ";", replacement = ", ", x = contrast_metadata_df[i_cntrst, "a_samples"], fixed = TRUE ) ) ) kseaapp_input <- sqldf::sqldf( x = sprintf(" SELECT `Protein`, `Gene`, `Peptide`, phosphopep AS `Residue.Both`, `p`, `FC` FROM v_kseaapp_input WHERE contrast = %d ", i_cntrst ), connection = db ) pseudo_ksdata <- dbReadTable(db, "ks_data_v") # This hack is because SQL table has the log2-transformed values kseaapp_input[, "FC"] <- 2 ** kseaapp_input[, "FC", drop = TRUE] main_title <- ( sprintf( "Change from treatment %s to treatment %s", contrast_metadata_df[i_cntrst, "b_level"], contrast_metadata_df[i_cntrst, "a_level"] ) ) sub_title <- contrast_longlabel tryCatch( expr = { ksea_scores_rslt <- ksea_scores( ksdata = pseudo_ksdata, # KSEAapp::KSData, px = kseaapp_input, networkin = TRUE, networkin_cutoff = 2 ) if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { next_index <- 1 + next_index rslt$score_list[[next_index]] <- ksea_scores_rslt rslt$name_list[[next_index]] <- contrast_label rslt$longname_list[[next_index]] <- contrast_longlabel low_fdr_print( rslt = rslt, i_cntrst = i_cntrst, i = next_index, a_level = cntrst_a_level, b_level = cntrst_b_level, fold_change = cntrst_fold_change, caption = contrast_longlabel ) } }, error = function(e) str(e) ) } plotted_kinases <- NULL if (length(rslt$score_list) > 1) { for (i in seq_len(length(ksea_heatmap_titles))) { hdr <- ksea_heatmap_titles[[i]] 
    which_kinases <- i
    cat("\\clearpage\n\\begin{center}\n")
    subsection_header(hdr)
    cat("\\end{center}\n")
    plotted_kinases <- ksea_heatmap(
      # the data frame outputs from the KSEA.Scores() function, in list format
      score_list = rslt$score_list,
      # a character vector of all the sample names for heatmap annotation:
      # - the names must be in the same order as the data in score_list
      # - please avoid long names, as they may get cropped in the final image
      sample_labels = rslt$name_list,
      # character string of either "p.value" or "FDR" indicating the data column
      # to use for marking statistically significant scores
      stats = c("p.value", "FDR")[2],
      # a numeric value between 0 and infinity indicating the minimum number of
      # substrates a kinase must have to be included in the heatmap
      m_cutoff = 1,
      # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
      # for indicating significant kinases in the heatmap
      p_cutoff = 0.05,
      # a binary input of TRUE or FALSE, indicating whether or not to perform
      # hierarchical clustering of the sample columns
      sample_cluster = TRUE,
      # a binary input of TRUE or FALSE, indicating whether or not to export
      # the heatmap as a .png image into the working directory
      export = FALSE,
      # additional arguments to gplots::heatmap.2, such as:
      # - main: main title of plot
      # - xlab: x-axis label
      # - ylab: y-axis label
      xlab = "Contrast",
      ylab = "Kinase",
      # print which kinases:
      # - 1 : all kinases
      # - 2 : significant kinases
      # - 3 : non-significant kinases
      which_kinases = which_kinases
    )
    cat("\\begin{center}\n")
    cat("Color intensity reflects $z$-score magnitude; hue reflects $z$-score sign. Asterisks mark significant enrichments.\n")
    cat("\\end{center}\n")
  } # end for (i in ...
} # end if (length ...
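
# For each contrast for which KSEA scores were computed above, re-use the
# cached results in rslt to rebuild the contrast labels and draw a bar plot
# of kinase enrichments via low_fdr_barplot (skipped when every FDR is NaN).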
for (i_cntrst in seq_len(length(rslt$score_list))) { next_index <- i_cntrst cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) contrast_longlabel <- ( sprintf( "Trt %s {%s} -> Trt %s {%s}", contrast_metadata_df[i_cntrst, "b_level"], gsub( pattern = ";", replacement = ", ", x = contrast_metadata_df[i_cntrst, "b_samples"], fixed = TRUE ), contrast_metadata_df[i_cntrst, "a_level"], gsub( pattern = ";", replacement = ", ", x = contrast_metadata_df[i_cntrst, "a_samples"], fixed = TRUE ) ) ) main_title <- ( sprintf( "Change from treatment %s to treatment %s", contrast_metadata_df[i_cntrst, "b_level"], contrast_metadata_df[i_cntrst, "a_level"] ) ) sub_title <- contrast_longlabel tryCatch( expr = { ksea_scores_rslt <- rslt$score_list[[next_index]] if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { low_fdr_barplot( rslt = rslt, i_cntrst = i_cntrst, i = next_index, a_level = cntrst_a_level, b_level = cntrst_b_level, fold_change = cntrst_fold_change, caption = contrast_longlabel ) } }, error = function(e) str(e) ) } ``` ```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} # Use enriched kinases to find enriched kinase-substrate pairs enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash)) all_enriched_substrates <- sqldf(" SELECT gene AS kinase, ppep, '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label FROM ( SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep FROM pseudo_ksdata WHERE GENE IN (SELECT kinase FROM enriched_kinases) ) GROUP BY ppep ") # helper used to label per-kinase substrate enrichment figure cat_enriched_heading <- function(m, cut_args) { cutoff <- cut_args$cutoff kinase <- cut_args$kinase statistic <- cut_args$statistic threshold <- cut_args$threshold cat("\\newpage\n") if (nrow(m) > intensity_hm_rows) { subsection_header( paste( sprintf( "Lowest p-valued %d (of %d) enriched %s-substrates,", intensity_hm_rows, nrow(m), kinase ), sprintf(" KSEA %s < %0.2f\n", statistic, threshold) ) ) } else { if (nrow(m) == 1) { return(FALSE) } else { subsection_header( paste( sprintf( "%d enriched %s-substrates,", nrow(m), kinase ), sprintf( " KSEA %s < %0.2f\n", statistic, threshold ) ) ) } } cat("\n\n\n") cat("\n\n\n") return(TRUE) } # Disabling heatmaps for substrates pending decision whether to eliminate them altogether if (FALSE) for (kinase_name in sort(enriched_kinases$kinase)) { enriched_substrates <- all_enriched_substrates[ all_enriched_substrates$kinase == kinase_name, , drop = FALSE ] # Get the intensity values for the heatmap enriched_intensities <- as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE]) # Remove rows having too many NA values to be relevant na_counter <- is.na(enriched_intensities) na_counts <- apply(na_counter, 1, sum) enriched_intensities <- enriched_intensities[na_counts < ncol(enriched_intensities) / 2, , drop = FALSE] # Rename the rows with the display-name for the heatmap rownames(enriched_intensities) <- sapply( X = rownames(enriched_intensities), FUN = function(rn) { enriched_substrates[enriched_substrates$ppep == rn, "label"] } ) # Format as matrix for heatmap m <- as.matrix(enriched_intensities) # Draw the heading and heatmap if (nrow(m) > 0) { cut_args <- new_env() cut_args$cutoff <- cutoff cut_args$kinase <- kinase_name cut_args$statistic <- ksea_cutoff_statistic cut_args$threshold <- ksea_cutoff_threshold 
number_of_peptides_found <- draw_intensity_heatmap( m = m, cutoff = cut_args, hm_heading_function = cat_enriched_heading, hm_main_title = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates", suppress_row_dendrogram = FALSE ) } } # Write output tabular files # get kinase, ppep, concat(kinase) tuples for enriched kinases kinase_ppep_label <- sqldf(" WITH t(ppep, label) AS ( SELECT DISTINCT SUB_MOD_RSD AS ppep, group_concat(gene, '; ') AS label FROM pseudo_ksdata WHERE GENE IN (SELECT kinase FROM enriched_kinases) GROUP BY ppep ), k(kinase, ppep_join) AS ( SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join FROM pseudo_ksdata WHERE GENE IN (SELECT kinase FROM enriched_kinases) ) SELECT k.kinase, t.ppep, t.label FROM t, k WHERE t.ppep = k.ppep_join ORDER BY k.kinase, t.ppep ") # extract what we need from full_data impish <- cbind(rownames(quant_data_imp), quant_data_imp) colnames(impish)[1] <- "Phosphopeptide" data_table_imputed_sql <- " SELECT f.*, k.label AS KSEA_enrichments, q.* FROM metadata_plus_p f LEFT JOIN kinase_ppep_label k ON f.Phosphopeptide = k.ppep, impish q WHERE f.Phosphopeptide = q.Phosphopeptide " data_table_imputed <- sqldf(data_table_imputed_sql) # Zap the duplicated 'Phosphopeptide' column named 'ppep' data_table_imputed <- data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] # Output with imputed, un-normalized data write.table( data_table_imputed , file = imputed_data_filename , sep = "\t" , col.names = TRUE , row.names = FALSE , quote = FALSE ) #output quantile normalized data impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) colnames(impish)[1] <- "Phosphopeptide" data_table_imputed <- sqldf(data_table_imputed_sql) # Zap the duplicated 'Phosphopeptide' column named 'ppep' data_table_imputed <- data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] write.table( data_table_imputed, file = imp_qn_lt_data_filenm, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE ) ppep_kinase <- sqldf(" SELECT DISTINCT k.ppep, k.kinase FROM ( SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep FROM pseudo_ksdata WHERE GENE IN (SELECT kinase FROM enriched_kinases) ) k ORDER BY k.ppep, k.kinase ") RSQLite::dbWriteTable( conn = db, name = "ksea_enriched_ks", value = ppep_kinase, append = FALSE ) RSQLite::dbWriteTable( conn = db, name = "anova_signif", value = p_value_data, append = FALSE ) ddl_exec(db, " DROP VIEW IF EXISTS stats_metadata_v; " ) dml_no_rows_exec(db, " CREATE VIEW stats_metadata_v AS SELECT DISTINCT m.*, p.raw_anova_p, p.fdr_adjusted_anova_p, kek.kinase AS ksea_enrichments FROM mrgfltr_metadata_view m LEFT JOIN anova_signif p ON m.phospho_peptide = p.phosphopeptide LEFT JOIN ksea_enriched_ks kek ON m.phospho_peptide = kek.ppep ; " ) write.table( dbReadTable(db, "stats_metadata_v"), file = anova_ksea_mtdt_file, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE ) ``` ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} cat("\\leavevmode\n\n\n") # write parameters to report param_unlist <- unlist(as.list(params)) param_df <- data.frame( parameter = paste0("\\verb@", names(param_unlist), "@"), value = paste0("\\verb@", gsub("$", "\\$", param_unlist, fixed = TRUE), "@") ) data_frame_latex( x = param_df, justification = "p{0.35\\linewidth} p{0.6\\linewidth}", centered = TRUE, caption = "Input parameters", anchor = const_table_anchor_bp, underscore_whack = FALSE ) # write parameters to SQLite output mqppep_anova_script_param_df <- data.frame( script = "mqppep_anova_script.Rmd", parameter 
= names(param_unlist), value = param_unlist ) ddl_exec(db, " DROP TABLE IF EXISTS script_parameter; " ) ddl_exec(db, " CREATE TABLE IF NOT EXISTS script_parameter( script TEXT, parameter TEXT, value ANY, UNIQUE (script, parameter) ON CONFLICT REPLACE ) ; " ) RSQLite::dbWriteTable( conn = db, name = "script_parameter", value = mqppep_anova_script_param_df, append = TRUE ) # We are done with output RSQLite::dbDisconnect(db) ``` <!-- There's gotta be a better way... loaded_packages_df <- sessioninfo::package_info("loaded") loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library) loaded_packages_df <- data.frame( package = loaded_packages_df$package, version = loaded_packages_df$loadedversion, date = loaded_packages_df$date ) data_frame_latex( x = loaded_packages_df, justification = "l | l l", centered = FALSE, caption = "Loaded R packages", anchor = const_table_anchor_bp ) -->