Mercurial > repos > galaxyp > mqppep_preproc

diff mqppep_anova_script.Rmd @ 0:8dfd5d2b5903 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author: galaxyp
date: Mon, 11 Jul 2022 19:22:54 +0000
children: b76c75521d91
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mqppep_anova_script.Rmd	Mon Jul 11 19:22:54 2022 +0000
@@ -0,0 +1,3536 @@
+---
+title: "MaxQuant Phosphoproteomic Enrichment Pipeline ANOVA/KSEA"
+author:
+- "Nick Graham^[ORCiD 0000-0002-6811-1941, University of Southern California: Los Angeles, CA, US]"
+- "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]"
+- "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]"
+date:
+- "May 28, 2018"
+- "; revised June 23, 2022"
+output:
+  pdf_document:
+    toc: true
+    toc_depth: 3
+    keep_tex: true
+header-includes:
+  - \usepackage{longtable}
+  - \newcommand\T{\rule{0pt}{2.6ex}}       % Top strut
+  - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut
+params:
+  alphaFile:            "test-data/alpha_levels.tabular"
+  inputFile:            "test-data/test_input_for_anova.tabular"
+  preprocDb:            "test-data/test_input_for_anova.sqlite"
+  kseaAppPrepDb:        !r c(":memory:", "test-data/mqppep.sqlite")[2]
+  show_toc:             true
+  firstDataColumn:      "^Intensity[^_]"
+  imputationMethod:     !r c("group-median", "median", "mean", "random")[1]
+  meanPercentile:       1
+  sdPercentile:         1.0
+  regexSampleNames:     "\\.\\d+[A-Z]$"
+  regexSampleGrouping:  "\\d+"
+  imputedDataFilename:  "test-data/limbo/imputedDataFilename.txt"
+  imputedQNLTDataFile:  "test-data/limbo/imputedQNLTDataFile.txt"
+  anovaKseaMetadata:    "test-data/limbo/anovaKseaMetadata.txt"
+  oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1]
+  oneWayTwoCategories:  !r c("aov", "kruskal.test", "oneway.test")[3]
+  kseaCutoffStatistic:  !r c("p.value", "FDR")[2]
+  kseaCutoffThreshold:  !r c( 0.1, 0.05)[2]
+  kseaMinKinaseCount:   1
+  intensityHeatmapRows: 75
+---
+<!--
+  kseaCutoffStatistic:  !r c("p.value", "FDR")[2]
+  kseaCutoffThreshold:  !r c(0.05, 0.1)[1]
+
+  alphaFile:            "test-data/alpha_levels.tabular"
+  inputFile:            "test-data/test_input_for_anova.tabular"
+  preprocDb:            "test-data/test_input_for_anova.sqlite"
+  kseaAppPrepDb:        !r c(":memory:", "test-data/mqppep.sqlite")[2]
+
+  alphaFile:            "test-data/alpha_levels.tabular"
+  inputFile:            "test-data/UT_phospho_ST_sites.preproc.tabular"
+  preprocDb:            "test-data/UT_phospho_ST_sites.preproc.sqlite"
+  kseaAppPrepDb:        !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2]
+
+  alphaFile:            "test-data/alpha_levels.tabular"
+  inputFile:            "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular"
+  preprocDb:            "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite"
+  kseaAppPrepDb:        !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2]
+
+  alphaFile:            "test-data/alpha_levels.tabular"
+  inputFile:            "test-data/pST_Sites_NancyDu.txt.preproc.tabular"
+  preprocDb:            "test-data/pST_Sites_NancyDu.txt.preproc.sqlite"
+  kseaAppPrepDb:        !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2]
+
+  inputFile:            "test-data/density_failure.preproc_tab.tabular"
+  kseaAppPrepDb:        !r c(":memory:", "mqppep.sqlite")[2]
+  latex_document: default
+-->
+```{r setup, include = FALSE}
+#ref for debugging: https://yihui.org/tinytex/r/#debugging
+options(tinytex.verbose = TRUE)
+
+# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
+# ref for top and bottom struts: https://tex.stackexchange.com/a/50355
+knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10))
+
+# freeze the random number generator so the same results will be produced
+#  from run to run
+set.seed(28571)
+
+### LIBRARIES
+library(gplots)
+library(DBI)
+library(RSQLite)
+# Suppress "Warning: no DISPLAY variable so Tk is not available"
+suppressWarnings(suppressMessages(library(sqldf)))
+
+# required but not added to search list:
+# - DBI
+# - RSQLite
+# - ggplot2
+# - knitr
+# - latex2exp
+# - preprocessCore
+# - reshape2
+# - vioplot
+
+### CONSTANTS
+
+const_parfin <- par("fin")
+const_boxplot_fill <- "grey94"
+const_stripchart_cex <- 0.5
+const_stripsmall_cex <-
+  sqrt(const_stripchart_cex * const_stripchart_cex / 2)
+const_stripchart_jitter <- 0.3
+const_write_debug_files <- FALSE
+const_table_anchor_bp <- "bp"
+const_table_anchor_ht <- "ht"
+const_table_anchor_p <- "p"
+const_table_anchor_tbp <- "tbp"
+
+
+const_ksea_astrsk_kinases    <- 1
+const_ksea_nonastrsk_kinases <- 2
+const_ksea_all_kinases       <- 3
+
+const_log10_e <- log10(exp(1))
+
+### FUNCTIONS
+
+# from `demo(error.catching)`
+##' Catch *and* save both errors and warnings, and in the case of
+##' a warning, also keep the computed result.
+##'
+##' @title tryCatch both warnings (with value) and errors
+##' @param expr an \R expression to evaluate
+##' @return a list with 'value' and 'warning', where
+##'   'value' may be an error caught.
+##' @author Martin Maechler;
+##' Copyright (C) 2010-2012  The R Core Team
+try_catch_w_e <- function(expr) {
+  wrn <- NULL
+  # warning handler
+  w_handler <- function(w) {
+    wrn <<- w
+    invokeRestart("muffleWarning")
+  }
+  list(
+    value = withCallingHandlers(
+      tryCatch(
+        expr,
+        error = function(e) e
+      ),
+      warning = w_handler
+    ),
+    warning = wrn
+  )
+}
+
+
+write_debug_file <- function(s) {
+  if (const_write_debug_files) {
+    s_path <- sprintf("test-data/%s.txt", deparse(substitute(s)))
+    print(sprintf("DEBUG writing file %s", spath))
+    write.table(
+      s,
+      file = s_path,
+      sep = "\t",
+      col.names = TRUE,
+      row.names = TRUE,
+      quote = FALSE
+    )
+  }
+}
+
+# ref: http://adv-r.had.co.nz/Environments.html
+# "When creating your own environment, note that you should set its parent
+#   environment to be the empty environment. This ensures you don't
+#   accidentally inherit objects from somewhere else."
+# Caution: this prevents `with(my_env, expr)` from working when `expr`
+#   contains anything from the global environment, even operators!
+#   Hence, `x <- 1; get("x", new_env())` fails by design.
+new_env <- function() {
+  new.env(parent = emptyenv())
+}
+
+### numerical/statistical helper functions
+
+any_nan <- function(x) {
+  !any(x == "NaN")
+}
+
+# determine standard deviation of quantile to impute
+sd_finite <- function(x) {
+  ok <- is.finite(x)
+  sd(x[ok])
+}
+
+anova_func <- function(x, grouping_factor, one_way_f) {
+  subject <- data.frame(
+    intensity = x
+  )
+  x_aov <-
+    one_way_f(
+      formula = intensity ~ grouping_factor,
+      data = subject
+      )
+  pvalue <-
+    if (identical(one_way_f, aov))
+      summary(x_aov)[[1]][["Pr(>F)"]][1]
+    else
+      pvalue <- x_aov$p.value
+  pvalue
+}
+
+
+### LaTeX functions
+
+latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) {
+  v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v
+  cat(
+    paste0(
+      v_sub,
+      collapse = collapse_string
+      )
+    )
+}
+
+latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
+  cat("\\begin{itemize}\n\\item ")
+  latex_collapsed_vector(collapse_string, v, underscore_whack)
+  cat("\n\\end{itemize}\n")
+}
+
+latex_itemized_list <- function(v, underscore_whack = TRUE) {
+  latex_itemized_collapsed("\n\\item ", v, underscore_whack)
+}
+
+latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
+  cat("\\begin{enumerate}\n\\item ")
+  latex_collapsed_vector(collapse_string, v, underscore_whack)
+  cat("\n\\end{enumerate}\n")
+}
+
+latex_enumerated_list <- function(v) {
+  latex_enumerated_collapsed("\n\\item ", v)
+}
+
+latex_table_row <- function(v, extra = "", underscore_whack = TRUE) {
+  latex_collapsed_vector(" & ", v, underscore_whack)
+  cat(extra)
+  cat(" \\\\\n")
+}
+
+# Use this like print.data.frame, from which it is adapted:
+data_frame_latex <-
+  function(
+    x,
+    ...,
+    # digits to pass to format.data.frame
+    digits = NULL,
+    # TRUE -> right-justify columns; FALSE -> left-justify
+    right = TRUE,
+    # maximumn number of rows to print
+    max = NULL,
+    # string with justification of each column
+    justification = NULL,
+    # TRUE to center on page
+    centered = TRUE,
+    # optional caption
+    caption = NULL,
+    # h(inline); b(bottom); t (top) or p (separate page)
+    anchor = "h",
+    # set underscore_whack to TRUE to escape underscores
+    underscore_whack = TRUE
+  ) {
+    if (is.null(justification))
+      justification <-
+        Reduce(
+          f = paste,
+          x = rep_len(if (right) "r" else "l", length(colnames(x)))
+          )
+    n <- length(rownames(x))
+    if (length(x) == 0L) {
+      cat(
+        sprintf(
+          # if n is one, use singular 'row', else use plural 'rows'
+          ngettext(
+            n,
+            "data frame with 0 columns and %d row",
+            "data frame with 0 columns and %d rows"
+            ),
+          n
+          ),
+        "\n",
+        sep = ""
+        )
+    } else if (n == 0L) {
+      cat("0 rows for:\n")
+      latex_itemized_list(
+        v = names(x),
+        underscore_whack = underscore_whack
+        )
+    } else {
+      if (is.null(max))
+        max <- getOption("max.print", 99999L)
+      if (!is.finite(max))
+        stop("invalid 'max' / getOption(\"max.print\"): ",
+          max)
+      omit <- (n0 <- max %/% length(x)) < n
+      m <- as.matrix(
+        format.data.frame(
+          if (omit) x[seq_len(n0), , drop = FALSE] else x,
+          digits = digits,
+          na.encode = FALSE
+          )
+        )
+      cat(
+        # h(inline); b(bottom); t (top) or p (separate page)
+        paste0("\\begin{table}[", anchor, "]\n")
+        )
+      if (!is.null(caption))
+        cat(paste0(" \\caption{", caption, "}"))
+      if (centered) cat("\\centering\n")
+      cat(
+        paste(
+          " \\begin{tabular}{",
+          justification,
+          "}\n",
+          sep = ""
+          )
+        )
+      # ref: https://tex.stackexchange.com/a/50353
+      #   Describes use of \rule{0pt}{3ex}
+      if (!is.null(caption))
+        cat("\\B \\\\ \\hline\\hline\n")
+      # ref for top and bottom struts: https://tex.stackexchange.com/a/50355
+      latex_table_row(
+        v = colnames(m),
+        extra = "\\T\\B",
+        underscore_whack = underscore_whack
+        )
+      cat("\\hline\n")
+      for (i in seq_len(length(m[, 1]))) {
+        latex_table_row(
+        v = m[i, ],
+        underscore_whack = underscore_whack
+        )
+      }
+      cat(
+        paste(
+          " \\end{tabular}",
+          "\\end{table}",
+          sep = "\n"
+          )
+        )
+      if (omit)
+        cat(" [ reached 'max' / getOption(\"max.print\") -- omitted",
+          n - n0, "rows ]\n")
+    }
+    invisible(x)
+  }
+
+hypersub <-
+  function(s) {
+    hyper <- tolower(s)
+    hyper <- gsub("[^a-z0-9]+", "-", hyper)
+    hyper <- gsub("[-]+",       "-", hyper)
+    hyper <- sub("^[-]",        "",  hyper)
+    hyper <- sub("[-]$",        "",  hyper)
+    return(hyper)
+  }
+
+subsection_header <-
+  function(s) {
+    hyper <- hypersub(s)
+    cat(
+      sprintf(
+        "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n",
+        hyper, s, hyper
+        )
+      )
+  }
+
+subsubsection_header <-
+  function(s) {
+    hyper <- hypersub(s)
+    cat(
+      sprintf(
+        "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n",
+        hyper, s, hyper
+        )
+      )
+  }
+
+### SQLite functions
+
+ddl_exec <- function(db, sql) {
+  discard <- DBI::dbExecute(conn = db, statement = sql)
+  if (FALSE && discard != 0) {
+    need_newpage <- TRUE
+    if (need_newpage) {
+      need_newpage <<- FALSE
+      cat("\\newpage\n")
+    }
+    o_file <- stdout()
+    cat("\n\\begin{verbatim}\n")
+    cat(sql, file = o_file)
+    cat(sprintf("\n%d rows affected by DDL\n", discard), file = o_file)
+    cat("\n\\end{verbatim}\n")
+  }
+}
+
+dml_no_rows_exec <- function(db, sql) {
+  discard <- DBI::dbExecute(conn = db, statement = sql)
+  if (discard != 0) {
+    need_newpage <- TRUE
+    if (need_newpage) {
+      need_newpage <<- FALSE
+      cat("\\newpage\n")
+    }
+    cat("\n\\begin{verbatim}\n")
+    o_file <- stdout()
+    cat(sql, file = o_file)
+    cat(sprintf("\n%d rows affected by DML\n", discard), file = o_file)
+    cat("\n\\end{verbatim}\n")
+  }
+}
+
+### KSEA functions and helpers
+
+# Adapted from KSEAapp::KSEA.Scores to allow retrieval of:
+# - maximum log2(FC)
+ksea_scores <- function(
+
+  # For human data, typically, ksdata = KSEAapp::ksdata
+  ksdata,
+
+  # Input data file having columns:
+  # - Protein     : abbreviated protein name
+  # - Gene        : HUGO gene name
+  # - Peptide     : peptide sequence without indications of phosphorylation
+  # - Reside.Both : position(s) of phosphorylation within Gene sequence
+  #                 - First letter designates AA that is modified
+  #                 - Numbers indicate position within Gene
+  #                 - Multiple values are separated by semicolons
+  #   - p         : p-value
+  #   - FC        : fold-change
+  px,
+
+  # A binary input of TRUE or FALSE, indicating whether or not to include
+  #   NetworKIN predictions
+  networkin,
+
+  # A numeric value between 1 and infinity setting the minimum NetworKIN
+  #   score (can be left out if networkin = FALSE)
+  networkin_cutoff
+
+) {
+  if (length(grep(";", px$Residue.Both)) == 0) {
+    # There are no Residue.Both entries having semicolons, so new is
+    #   simply px except two columns are renamed and a column is added
+    #   for log2(abs(fold-change))
+    new <- px
+    colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
+    new$log2_fc <- log2(abs(as.numeric(as.character(new$FC))))
+    new <- new[complete.cases(new$log2_fc), ]
+  } else {
+    # Split each row having semicolons in Residue.Both into rows that are
+    #   duplicated in all respects except that each row has a single
+    #   member of the set "split-on-semicolon-Residue.Both"
+    px_double <- px[grep(";", px$Residue.Both), ]
+    residues <- as.character(px_double$Residue.Both)
+    residues <- as.matrix(residues, ncol = 1)
+    split <- strsplit(residues, split = ";")
+    # x gets count of residues in each row,
+    #   i.e., 1 + count of semicolons
+    x <- sapply(split, length)
+    # Here is the set of split rows
+    px_single <- data.frame(
+      Protein      = rep(px_double$Protein, x),
+      Gene         = rep(px_double$Gene,    x),
+      Peptide      = rep(px_double$Peptide, x),
+      Residue.Both = unlist(split),
+      p            = rep(px_double$p,       x),
+      FC           = rep(px_double$FC,      x)
+      )
+    # new first gets the split rows
+    new <- px[-grep(";", px$Residue.Both), ]
+    # to new, append the rows that didn't need splitting in the first place
+    new <- rbind(new, px_single)
+    # map Gene         to SUB_GENE
+    # map Residue.Both to SUB_MOD_RSD
+    colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
+    # Eliminate any non-positive values to prevent introduction of
+    #   infinite or NaN values
+    new[(0 <= new$log2_fc), "log2_fc"] <- NA
+    # Because of preceding step, there is no need for abs in the next line
+    new$log2_fc <- log2(as.numeric(as.character(new$FC)))
+    # Convert any illegal values from NaN to NA
+    new[is.nan(new$log2_fc), "log2_fc"] <- NA
+    # Eliminate rows having missing values (e.g., non-imputed data)
+    new <- new[complete.cases(new$log2_fc), ]
+  }
+  if (networkin == TRUE) {
+    # When NetworKIN is true, filter on NetworKIN.cutoff which includes
+    #   PhosphoSitePlus data *because its networkin_score is set to Inf*
+    ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ]
+    ksdata_filtered <- ksdata_filtered[
+      (ksdata_filtered$networkin_score >= networkin_cutoff), ]
+  } else {
+    # Otherwise, simply use PhosphSitePlus rows
+    ksdata_filtered <- ksdata[
+      grep("PhosphoSitePlus", ksdata$Source), ]
+  }
+  # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD
+  #   colnames of ksdata_filtered:
+  #     "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID"
+  #     "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID"
+  #     "SITE_...7_AA" "networkin_score" "Source"
+  #   colnames of new:
+  #     "Protein" "SUB_GENE" "Peptide" "SUB_MOD_RSD" "p" "FC" "log2_fc"
+  # Equivalent to:
+  #   SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc
+  #     FROM ksdata_filtered a
+  #       INNER JOIN new b
+  #         ON a.SUB_GENE = b.SUB_GENE
+  #           AND a.SUB_MOD_RSD = b.SUB_MOD_RSD
+  ksdata_dataset <- base::merge(ksdata_filtered, new)
+  #   colnames of ksdata_dataset:
+  #     "KINASE"      "KIN_ACC_ID"   "GENE"       "KIN_ORGANISM" "SUBSTRATE"
+  #     "SUB_GENE_ID" "SUB_ACC_ID"   "SUB_GENE"   "SUB_ORGANISM" "SUB_MOD_RSD"
+  #     "SITE_GRP_ID" "SITE_...7_AA" "networkin_score"  "Source" "Protein"
+  #     "Peptide"     "p"            "FC"         "log2_fc" (uniprot_no_isoform)
+  # Re-order dataset; prior to accounting for isoforms
+  ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ]
+  # Extract non-isoform accession in UniProtKB
+  ksdata_dataset$uniprot_no_isoform <- sapply(
+    ksdata_dataset$KIN_ACC_ID,
+    function(x) unlist(strsplit(as.character(x), split = "-"))[1]
+    )
+  # Discard previous results while selecting interesting columns ...
+  ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)]
+  # Column names are now:
+  #   "GENE"       "SUB_GENE"        "SUB_MOD_RSD"    "Peptide" "p"
+  #   "FC" "log2_fc" "Source"
+  # Make column names human-readable
+  colnames(ksdata_dataset_abbrev) <- c(
+    "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p",
+    "FC", "log2FC", "Source"
+    )
+  # SELECT * FROM ksdata_dataset_abbrev
+  #   ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p
+  ksdata_dataset_abbrev <-
+    ksdata_dataset_abbrev[
+      order(
+        ksdata_dataset_abbrev$Kinase.Gene,
+        ksdata_dataset_abbrev$Substrate.Gene,
+        ksdata_dataset_abbrev$Substrate.Mod,
+        ksdata_dataset_abbrev$p),
+      ]
+  # First aggregation step to account for multiply phosphorylated peptides
+  #   and differing peptide sequences; the goal here is to combine results
+  #   for all measurements of the same substrate.
+  # SELECT  `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
+  #         `Source`, avg(log2FC) AS log2FC
+  #   FROM  ksdata_dataset_abbrev
+  #   GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
+  #         `Source`
+  #   ORDER BY `Kinase.Gene`;
+  # in two steps:
+  # (1) compute average log_2(fold-change)
+  ksdata_dataset_abbrev <- aggregate(
+    log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source,
+    data = ksdata_dataset_abbrev,
+    FUN = mean
+    )
+  # (2) order by Kinase.Gene
+  ksdata_dataset_abbrev <-
+    ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ]
+  # SELECT  `Kinase.Gene`, count(*)
+  #   FROM  ksdata_dataset_abbrev
+  #  GROUP BY `Kinase.Gene`;
+  # in two steps:
+  # (1) Extract the list of Kinase.Gene names
+  kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene)
+  # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev,
+  #   named by Kinase.Gene
+  kinase_list <- as.matrix(table(kinase_list))
+  # Second aggregation step to account for all substrates per kinase
+  # CREATE TABLE mean_fc
+  #   AS
+  # SELECT  `Kinase.Gene`, avg(log2FC) AS log2FC
+  #   FROM  ksdata_dataset_abbrev
+  #   GROUP BY `Kinase.Gene`
+  mean_fc <- aggregate(
+    log2FC ~ Kinase.Gene,
+    data = ksdata_dataset_abbrev,
+    FUN = mean
+    )
+  # mean_fc columns: "Kinase.Gene", "log2FC"
+  if (FALSE) {
+    # I need to re-think this; I was trying to find the most-represented
+    #   peptide, but that horse has already left the barn
+    # SELECT  `Kinase.Gene`, max(abs(log2FC)) AS log2FC
+    #   FROM  ksdata_dataset_abbrev
+    #   GROUP BY `Kinase.Gene`
+    max_fc <- aggregate(
+      log2FC ~ Kinase.Gene,
+      data = ksdata_dataset_abbrev,
+      FUN = function(r) max(abs(r))
+      )
+  }
+
+  # Create column 3: mS
+  mean_fc$m_s <- mean_fc[, 2]
+  # Create column 4: Enrichment
+  mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE))
+  # Create column 5: m, count of substrates
+  mean_fc$m <- kinase_list
+  # Create column 6: z-score
+  mean_fc$z_score <- (
+    (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) *
+      sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE)
+  # Create column 7: p-value, deduced from z-score
+  mean_fc$p_value <- pnorm(-abs(mean_fc$z_score))
+  # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value
+  mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr")
+
+  # Remove log2FC column, which is duplicated as mS
+  mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2]
+  # Correct the column names which we had to hack because of the linter...
+  colnames(mean_fc) <- c(
+    "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
+    )
+  return(mean_fc)
+}
+
+low_fdr_barplot <- function(
+  rslt,
+  i_cntrst,
+  i,
+  a_level,
+  b_level,
+  fold_change,
+  caption
+) {
+  rslt_score_list_i <- rslt$score_list[[i]]
+  if (!is.null(rslt_score_list_i)) {
+    rslt_score_list_i_nrow <- nrow(rslt_score_list_i)
+    k <- data.frame(
+      contrast = as.integer(i_cntrst),
+      a_level = rep.int(a_level, rslt_score_list_i_nrow),
+      b_level = rep.int(b_level, rslt_score_list_i_nrow),
+      kinase_gene = rslt_score_list_i$Kinase.Gene,
+      mean_log2_fc = rslt_score_list_i$mS,
+      enrichment = rslt_score_list_i$Enrichment,
+      substrate_count = rslt_score_list_i$m,
+      z_score = rslt_score_list_i$z.score,
+      p_value = rslt_score_list_i$p.value,
+      fdr = rslt_score_list_i$FDR
+    )
+    selector <- switch(
+      ksea_cutoff_statistic,
+      "FDR" = {
+        k$fdr
+        },
+      "p.value" = {
+        k$p_value
+        },
+      stop(
+        sprintf(
+          "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+          ksea_cutoff_statistic
+          )
+        )
+      )
+
+    k <- k[selector < ksea_cutoff_threshold, ]
+
+    if (nrow(k) > 1) {
+      op <- par(mai = c(1, 1.5, 0.4, 0.4))
+      numeric_z_score <- as.numeric(k$z_score)
+      z_score_order <- order(numeric_z_score)
+      kinase_name <- k$kinase_gene
+      long_caption <-
+        sprintf(
+          "Kinase z-score, %s < %s, %s",
+          ksea_cutoff_statistic,
+          ksea_cutoff_threshold,
+          caption
+          )
+      my_cex_caption <- 65.0 / max(65.0, nchar(long_caption))
+      cat("\n\\clearpage\n")
+      barplot(
+        height = numeric_z_score[z_score_order],
+        border = NA,
+        xpd = FALSE,
+        cex.names = 1.0,
+        cex.axis = 1.0,
+        main = long_caption,
+        cex.main = my_cex_caption,
+        names.arg = kinase_name[z_score_order],
+        horiz = TRUE,
+        srt = 45,
+        las = 1)
+      par(op)
+    }
+  }
+}
+
+# note that this adds elements to the global variable `ksea_asterisk_hash`
+
+low_fdr_print <- function(
+  rslt,
+  i_cntrst,
+  i,
+  a_level,
+  b_level,
+  fold_change,
+  caption
+) {
+  rslt_score_list_i <- rslt$score_list[[i]]
+  if (!is.null(rslt_score_list_i)) {
+    rslt_score_list_i_nrow <- nrow(rslt_score_list_i)
+    k <- contrast_ksea_scores <- data.frame(
+      contrast = as.integer(i_cntrst),
+      a_level = rep.int(a_level, rslt_score_list_i_nrow),
+      b_level = rep.int(b_level, rslt_score_list_i_nrow),
+      kinase_gene = rslt_score_list_i$Kinase.Gene,
+      mean_log2_fc = rslt_score_list_i$mS,
+      enrichment = rslt_score_list_i$Enrichment,
+      substrate_count = rslt_score_list_i$m,
+      z_score = rslt_score_list_i$z.score,
+      p_value = rslt_score_list_i$p.value,
+      fdr = rslt_score_list_i$FDR
+    )
+
+    selector <- switch(
+      ksea_cutoff_statistic,
+      "FDR" = {
+        k$fdr
+        },
+      "p.value" = {
+        k$p_value
+        },
+      stop(
+        sprintf(
+          "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+          ksea_cutoff_statistic
+          )
+        )
+      )
+
+    k <- k[selector < ksea_cutoff_threshold, ]
+    # save kinase names to ksea_asterisk_hash
+    for (kinase_name in k$kinase_gene) {
+      ksea_asterisk_hash[[kinase_name]] <- 1
+    }
+
+    db_write_table_overwrite <- (i_cntrst < 2)
+    db_write_table_append    <- !db_write_table_overwrite
+    RSQLite::dbWriteTable(
+      conn = db,
+      name = "contrast_ksea_scores",
+      value = contrast_ksea_scores,
+      append = db_write_table_append
+      )
+    selector <- switch(
+      ksea_cutoff_statistic,
+      "FDR" = {
+        contrast_ksea_scores$fdr
+        },
+      "p.value" = {
+        contrast_ksea_scores$p_value
+        },
+      stop(
+        sprintf(
+          "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+          ksea_cutoff_statistic
+          )
+        )
+      )
+    output_df <- contrast_ksea_scores[
+      selector < ksea_cutoff_threshold,
+      c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count",
+        "z_score", "p_value", "fdr")
+      ]
+    output_order <- with(output_df, order(mean_log2_fc, kinase_gene))
+    output_df <- output_df[output_order, ]
+    colnames(output_df) <-
+      c(
+        colnames(output_df)[1],
+        colnames(output_df)[2],
+        "enrichment",
+        "m_s",
+        "z_score",
+        "p_value",
+        "fdr"
+      )
+    output_df$fdr <- sprintf("%0.4f", output_df$fdr)
+    output_df$p_value <- sprintf("%0.2e", output_df$p_value)
+    output_df$z_score <- sprintf("%0.2f", output_df$z_score)
+    output_df$m_s <- sprintf("%d", output_df$m_s)
+    output_df$enrichment <- sprintf("%0.2f", output_df$enrichment)
+    output_ncol <- ncol(output_df)
+    colnames(output_df) <-
+      c(
+        "Kinase",
+        "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)",
+        "Enrichment",
+        "Substrates",
+        "z-score",
+        "p-value",
+        "FDR"
+      )
+    selector <- switch(
+      ksea_cutoff_statistic,
+      "FDR" = {
+        rslt$score_list[[i]]$FDR
+        },
+      "p.value" = {
+        rslt$score_list[[i]]$p.value
+        },
+      stop(
+        sprintf(
+          "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+          ksea_cutoff_statistic
+          )
+        )
+      )
+    if (sum(selector < ksea_cutoff_threshold) > 0) {
+      math_caption <- gsub("{", "\\{", caption,      fixed = TRUE)
+      math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE)
+      data_frame_latex(
+        x = output_df,
+        justification = "l c c c c c c",
+        centered = TRUE,
+        caption = sprintf(
+          "\\text{%s}, %s < %s",
+          math_caption,
+          ksea_cutoff_statistic,
+          ksea_cutoff_threshold
+          ),
+        anchor = const_table_anchor_p
+        )
+    } else {
+      cat(
+        sprintf(
+          "\\break
+          No kinases had
+          \\(\\text{%s}_\\text{enrichment} < %s\\)
+          for contrast %s\\hfill\\break\n",
+          ksea_cutoff_statistic,
+          ksea_cutoff_threshold,
+          caption
+        )
+      )
+    }
+  }
+}
+
+# create_breaks is a helper for ksea_heatmap
+create_breaks <- function(merged_scores) {
+  if (min(merged_scores, na.rm = TRUE) < -1.6) {
+    breaks_neg <- seq(-1.6, 0, length.out = 30)
+    breaks_neg <-
+      append(
+        seq(min(merged_scores, na.rm = TRUE), -1.6, length.out = 10),
+        breaks_neg
+        )
+    breaks_neg <- sort(unique(breaks_neg))
+  } else {
+    breaks_neg <- seq(-1.6, 0, length.out = 30)
+  }
+  if (max(merged_scores, na.rm = TRUE) > 1.6) {
+    breaks_pos <- seq(0, 1.6, length.out = 30)
+    breaks_pos <-
+      append(
+        breaks_pos,
+        seq(1.6, max(merged_scores, na.rm = TRUE),
+        length.out = 10)
+        )
+    breaks_pos <- sort(unique(breaks_pos))
+  } else {
+    breaks_pos <- seq(0, 1.6, length.out = 30)
+  }
+  breaks_all <- unique(append(breaks_neg, breaks_pos))
+  mycol_neg <-
+    gplots::colorpanel(n = length(breaks_neg),
+               low = "blue",
+               high = "white")
+  mycol_pos <-
+    gplots::colorpanel(n = length(breaks_pos) - 1,
+               low = "white",
+               high = "red")
+  mycol <- unique(append(mycol_neg, mycol_pos))
+  color_breaks <- list(breaks_all, mycol)
+  return(color_breaks)
+}
+
+# draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap
+draw_kseaapp_summary_heatmap <- function(
+    x,
+    sample_cluster,
+    merged_asterisk,
+    my_cex_row,
+    color_breaks,
+    margins,
+    ...
+) {
+  merged_scores <- x
+  if (!is.matrix(x)) {
+    cat(
+      paste0(
+        "No plot because \\texttt{typeof(x)} is '",
+        typeof(x),
+        "' rather than 'matrix'.\n\n"
+        )
+      )
+  } else if (nrow(x) < 2) {
+    cat("No plot because matrix x has ", nrow(x), " rows.\n\n")
+    cat("\\begin{verbatim}\n")
+    str(x)
+    cat("\\end{verbatim}\n")
+  } else if (ncol(x) < 2) {
+    cat("No plot because matrix x has ", ncol(x), " columns.\n\n")
+    cat("\\begin{verbatim}\n")
+    str(x)
+    cat("\\end{verbatim}\n")
+  } else {
+    gplots::heatmap.2(
+      x            = merged_scores,
+      Colv         = sample_cluster,
+      scale        = "none",
+      cellnote     = merged_asterisk,
+      notecol      = "white",
+      cexCol       = 0.9,
+      # Heuristically assign size of row labels
+      cexRow       = min(1.0, ((3 * my_cex_row) ^ 1.7) / 2.25),
+      srtCol       = 45,
+      srtRow       = 45,
+      notecex      = 3 * my_cex_row,
+      col          = color_breaks[[2]],
+      density.info = "none",
+      trace        = "none",
+      breaks       = color_breaks[[1]],
+      lmat         = rbind(c(0, 3), c(2, 1), c(0, 4)),
+      lhei         = c(0.4, 8.0, 1.1),
+      lwid         = c(0.5, 3),
+      key          = FALSE,
+      margins      = margins,
+      ...
+    )
+  }
+}
+
+# Adapted from KSEAapp::KSEA.Heatmap
+ksea_heatmap <- function(
+  # the data frame outputs from the KSEA.Scores() function, in list format
+  score_list,
+  # a character vector of all the sample names for heatmap annotation:
+  # - the names must be in the same order as the data in score_list
+  # - please avoid long names, as they may get cropped in the final image
+  sample_labels,
+  # character string of either "p.value" or "FDR" indicating the data column
+  #   to use for marking statistically significant scores
+  stats,
+  # a numeric value between 0 and infinity indicating the min. number of
+  #   substrates a kinase must have to be included in the heatmap
+  m_cutoff,
+  # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
+  #   for indicating significant kinases in the heatmap
+  p_cutoff =
+    stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"),
+  # a binary input of TRUE or FALSE, indicating whether or not to perform
+  #   hierarchical clustering of the sample columns
+  sample_cluster,
+  # a binary input of TRUE or FALSE, indicating whether or not to export
+  #   the heatmap as a .png image into the working directory
+  export = FALSE,
+  # bottom and right margins; adjust as needed if contrast names are too long
+  margins = c(6, 20),
+  # print which kinases?
+  # - Mandatory argument, must be one of const_ksea_.*_kinases
+  which_kinases,
+  # additional arguments to gplots::heatmap.2, such as:
+  # - main: main title of plot
+  # - xlab: x-axis label
+  # - ylab: y-axis label
+  ...
+) {
+  filter_m <- function(dataset, m_cutoff) {
+    filtered <- dataset[(dataset$m >= m_cutoff), ]
+    return(filtered)
+  }
+  score_list_m <- lapply(score_list, function(...) filter_m(..., m_cutoff))
+  for (i in seq_len(length(score_list_m))) {
+    names <- colnames(score_list_m[[i]])[c(2:7)]
+    colnames(score_list_m[[i]])[c(2:7)] <-
+      paste(names, i, sep = ".")
+  }
+  master <-
+    Reduce(
+      f = function(...) {
+        base::merge(..., by = "Kinase.Gene", all = FALSE)
+      },
+      x = score_list_m
+      )
+
+  row.names(master) <- master$Kinase.Gene
+  columns <- as.character(colnames(master))
+  merged_scores <- as.matrix(master[, grep("z.score", columns), drop = FALSE])
+  colnames(merged_scores) <- sample_labels
+  merged_stats <- as.matrix(master[, grep(stats, columns)])
+  asterisk <- function(mtrx, p_cutoff) {
+    new <- data.frame()
+    for (i in seq_len(nrow(mtrx))) {
+      for (j in seq_len(ncol(mtrx))) {
+        my_value <- mtrx[i, j]
+        if (!is.na(my_value) && my_value < p_cutoff) {
+          new[i, j] <- "*"
+        } else {
+          new[i, j] <- ""
+        }
+      }
+    }
+    return(new)
+  }
+  merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff))
+
+  # begin hack to print only significant rows
+  asterisk_rows <- rowSums(merged_asterisk == "*") > 0
+  all_rows <- rownames(merged_stats)
+  names(asterisk_rows) <- all_rows
+  non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE])
+  asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE])
+  merged_scores_asterisk <- merged_scores[names(asterisk_rows), ]
+  merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), ]
+  # end hack to print only significant rows
+
+  row_list <- list()
+  row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows
+  row_list[[const_ksea_all_kinases]] <- all_rows
+  row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows
+
+  i <- which_kinases
+  my_row_names <- row_list[[i]]
+  scrs <- merged_scores[my_row_names, ]
+  stts <- merged_stats[my_row_names, ]
+  merged_asterisk <- as.matrix(asterisk(stts, p_cutoff))
+
+  color_breaks <- create_breaks(scrs)
+  plot_height <- nrow(scrs) ^ 0.55
+  plot_width <- ncol(scrs) ^ 0.7
+  my_cex_row <- 0.25 * 16 / plot_height
+  if (export == "TRUE") {
+    png(
+      "KSEA.Merged.Heatmap.png",
+      width = plot_width * 300,
+      height = 2 * plot_height * 300,
+      res = 300,
+      pointsize = 14
+    )
+  }
+  draw_kseaapp_summary_heatmap(
+    x               = scrs,
+    sample_cluster  = sample_cluster,
+    merged_asterisk = merged_asterisk,
+    my_cex_row      = my_cex_row,
+    color_breaks    = color_breaks,
+    margins         = margins
+  )
+  if (export == "TRUE") {
+    dev.off()
+  }
+  return(my_row_names)
+}
+
+# helper for heatmaps of phosphopeptide intensities
+
+draw_intensity_heatmap <-
+  function(
+    m,                              # matrix with rownames already formatted
+    cutoff,                         # cutoff used by hm_heading_function
+    hm_heading_function,            # construct and cat heading from m and cutoff
+    hm_main_title,                  # main title for plot (drawn below heading)
+    suppress_row_dendrogram = TRUE, # set to false to show dendrogram
+    max_peptide_count               # experimental:
+          = intensity_hm_rows,      #   values of 50 and 75 worked well
+    ...                             # passthru parameters for heatmap
+  ) {
+    peptide_count <- 0
+    # emit the heading for the heatmap
+    if (hm_heading_function(m, cutoff)) {
+      peptide_count <- min(max_peptide_count, nrow(m))
+      if (nrow(m) > 1) {
+        m_margin <- m[peptide_count:1, ]
+        # Margin setting was heuristically derived
+        margins <-
+          c(0.5, # col
+            max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16  # row
+          )
+        }
+      if (nrow(m) > 1) {
+        tryCatch(
+          {
+            old_oma <- par("oma")
+            par(cex.main = 0.6)
+            # Heuristically determined character size adjustment formula
+            char_contractor <-
+              250000 / (
+                max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows
+                )
+            heatmap(
+              m[peptide_count:1, ],
+              Rowv = if (suppress_row_dendrogram) NA else NULL,
+              Colv = NA,
+              cexRow = char_contractor,
+              cexCol = char_contractor * 50 / max_peptide_count,
+              scale = "row",
+              margins = margins,
+              main =
+                "Unimputed, unnormalized log(intensities)",
+              xlab = "",
+              las = 1,
+              ...
+              )
+          },
+          error = function(e) {
+            cat(
+              sprintf(
+                "\nCould not draw heatmap, possibly because of too many missing values.  Internal message: %s\n",
+                e$message
+                )
+              )
+            },
+          finally = par(old_oma)
+        )
+      }
+    }
+    return(peptide_count)
+  }
+```
+
+```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+cat("\\listoftables\n")
+```
+# Purpose
+
+To perform for phosphopeptides:
+
+- imputation of missing values,
+- quantile normalization,
+- ANOVA (using the R stats::`r params$oneWayManyCategories` function), and
+- KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases:
+  - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org)
+  - The Human Proteome Database [http://hprd.org](http://hprd.org)
+  - NetworKIN [http://networkin.science/](http://networkin.science/)
+  - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx)
+
+```{r include = FALSE}
+
+### GLOBAL VARIABLES
+
+# parameters for KSEA
+
+ksea_cutoff_statistic <- params$kseaCutoffStatistic
+ksea_cutoff_threshold <- params$kseaCutoffThreshold
+ksea_min_kinase_count <- params$kseaMinKinaseCount
+
+ksea_heatmap_titles <- list()
+ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <-
+  sprintf(
+    "Summary for all kinases enriched in one or more contrasts at %s < %s",
+    ksea_cutoff_statistic,
+    ksea_cutoff_threshold
+    )
+ksea_heatmap_titles[[const_ksea_all_kinases]] <-
+  "Summary figure for all contrasts and all kinases"
+ksea_heatmap_titles[[const_ksea_nonastrsk_kinases]] <-
+  sprintf(
+    "Summary for all kinases not enriched at %s < %s in any contrast",
+    ksea_cutoff_statistic,
+    ksea_cutoff_threshold
+    )
+# hash to hold names of significantly enriched kinases
+ksea_asterisk_hash <- new_env()
+
+# READ PARAMETERS (mostly)
+
+intensity_hm_rows <- params$intensityHeatmapRows
+# Input Filename
+input_file <- params$inputFile
+
+# First data column - ideally, this could be detected via regexSampleNames,
+#   but for now leave it as is.
+first_data_column <- params$firstDataColumn
+fdc_is_integer <- is.integer(first_data_column)
+if (fdc_is_integer) {
+  first_data_column <- as.integer(params$firstDataColumn)
+}
+
+# False discovery rate adjustment for ANOVA
+#  Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05
+val_fdr <-
+  read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "")
+
+if (
+  ncol(val_fdr) != 1 ||
+  sum(!is.numeric(val_fdr[, 1])) ||
+  sum(val_fdr[, 1] < 0) ||
+  sum(val_fdr[, 1] > 1)
+) {
+  stop("alphaFile should be one column of numbers within the range [0.0,1.0]")
+}
+val_fdr <- val_fdr[, 1]
+
+#Imputed Data filename
+imputed_data_filename <- params$imputedDataFilename
+imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
+anova_ksea_mtdt_file  <- params$anovaKseaMetadata
+
+```
+
+```{r echo = FALSE}
+# Imputation method, should be one of
+#   "random", "group-median", "median", or "mean"
+imputation_method <- params$imputationMethod
+
+# Selection of percentile of logvalue data to set the mean for random number
+#   generation when using random imputation
+mean_percentile <- params$meanPercentile / 100.0
+
+# deviation adjustment-factor for random values; real number.
+sd_percentile <- params$sdPercentile
+
+# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
+regex_sample_names <- params$regexSampleNames
+
+# Regular expression to extract Sample Grouping from Sample Name;
+#   if error occurs, compare sample_treatment_levels vs. sample_name_matches
+#   to see if groupings/pairs line up
+#   e.g., "(\\d+)"
+regex_sample_grouping <- params$regexSampleGrouping
+
+one_way_all_categories_fname <- params$oneWayManyCategories
+one_way_all_categories <- try_catch_w_e(
+  match.fun(one_way_all_categories_fname))
+if (!is.function(one_way_all_categories$value)) {
+  write("fatal error for parameter oneWayManyCategories:", stderr())
+  write(one_way_all_categories$value$message,             stderr())
+  if (sys.nframe() > 0) quit(save = "no", status = 1)
+  stop("Cannot continue. Goodbye.")
+}
+one_way_all_categories <- one_way_all_categories$value
+
+one_way_two_categories_fname <- params$oneWayManyCategories
+one_way_two_categories <- try_catch_w_e(
+  match.fun(one_way_two_categories_fname))
+if (!is.function(one_way_two_categories$value)) {
+  cat("fatal error for parameter oneWayTwoCategories: \n")
+  cat(one_way_two_categories$value$message, fill = TRUE)
+  if (sys.nframe() > 0) quit(save = "no", status = 1)
+  stop("Cannot continue. Goodbye.")
+}
+one_way_two_categories <- one_way_two_categories$value
+
+preproc_db       <- params$preprocDb
+ksea_app_prep_db <- params$kseaAppPrepDb
+result <- file.copy(
+  from      = preproc_db,
+  to        = ksea_app_prep_db,
+  overwrite = TRUE
+  )
+if (!result) {
+  write(
+    sprintf(
+      "fatal error copying initial database '%s' to output '%s'",
+      preproc_db,
+      ksea_app_prep_db,
+    ),
+    stderr()
+  )
+  if (sys.nframe() > 0) quit(save = "no", status = 1)
+  stop("Cannot continue. Goodbye.")
+}
+```
+
+```{r echo = FALSE}
+### READ DATA
+
+# read.table reads a file in table format and creates a data frame from it.
+#   - note that `quote = ""` means that quotation marks are treated literally.
+full_data <- read.table(
+  file = input_file,
+  sep = "\t",
+  header = TRUE,
+  quote = "",
+  check.names = FALSE
+  )
+```
+
+# Extract Sample Names and Treatment Levels
+
+Column names parsed from input file are shown in Table 1; sample names and treatment levels, in Table 2.
+
+```{r echo = FALSE, results = 'asis'}
+
+data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE)
+
+if (!fdc_is_integer) {
+  if (length(data_column_indices) > 0) {
+    first_data_column <- data_column_indices[1]
+  } else {
+    stop(paste("failed to convert firstDataColumn:", first_data_column))
+  }
+}
+
+cat(
+  sprintf(
+    paste(
+      "\n\nThe input data file has peptide-intensity data for each sample",
+      "in one of columns %d through %d.\n\n"
+      ),
+    min(data_column_indices),
+    max(data_column_indices)
+    )
+  )
+
+# Write column names as a LaTeX enumerated list.
+column_name_df <- data.frame(
+  column = seq_len(length(colnames(full_data))),
+  name = paste0("\\verb@", colnames(full_data), "@")
+  )
+data_frame_latex(
+  x = column_name_df,
+  justification = "l l",
+  centered = TRUE,
+  caption = "Input data column names",
+  anchor = const_table_anchor_bp,
+  underscore_whack = FALSE
+  )
+
+```
+
+```{r echo = FALSE, results = 'asis'}
+quant_data <- full_data[first_data_column:length(full_data)]
+quant_data[quant_data == 0] <- NA
+rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide
+# Extract factors and trt-replicates using regular expressions.
+# Typically:
+#   regex_sample_names    is "\\.\\d+[A-Z]$"
+#   regex_sample_grouping is "\\d+"
+# This would distinguish trt-replicates by terminal letter [A-Z]
+#   in sample names and group them into trts by the preceding digits.
+#   e.g.:
+#     group .1A .1B .1C into group 1;
+#     group .2A .2B .2C, into group 2;
+#     etc.
+m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE)
+sample_name_matches <- regmatches(names(quant_data), m)
+colnames(quant_data) <- sample_name_matches
+
+write_debug_file(quant_data)
+
+rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
+sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match))
+number_of_samples <- length(sample_name_matches)
+sample_treatment_df <- data.frame(
+  level = sample_treatment_levels,
+  sample = sample_name_matches
+  )
+data_frame_latex(
+  x = sample_treatment_df,
+  justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}",
+  centered = TRUE,
+  caption = "Treatment levels",
+  anchor = const_table_anchor_tbp,
+  underscore_whack = FALSE
+  )
+```
+
+```{r echo = FALSE, results = 'asis'}
+cat("\\newpage\n")
+```
+
+## Are the log-transformed sample distributions similar?
+
+```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
+
+quant_data[quant_data == 0] <- NA  #replace 0 with NA
+quant_data_log <- log10(quant_data)
+
+rownames(quant_data_log) <- rownames(quant_data)
+colnames(quant_data_log) <- sample_name_matches
+
+write_debug_file(quant_data_log)
+
+# data visualization
+old_par <- par(
+  mai = par("mai") + c(0.5, 0, 0, 0)
+)
+# ref: https://r-charts.com/distribution/add-points-boxplot/
+# Vertical plot
+boxplot(
+  quant_data_log
+, las = 1
+, col = const_boxplot_fill
+, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+, xlab = "Sample"
+)
+par(old_par)
+
+
+
+cat("\n\n\n")
+cat("\n\n\n")
+
+```
+
+```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
+if (nrow(quant_data_log) > 1) {
+  quant_data_log_stack <- stack(quant_data_log)
+  ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) +
+    ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
+    ggplot2::ylab("Probability density") +
+    ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE)
+} else {
+  cat("No density plot because there are too few peptides.\n\n")
+}
+```
+
+## Globally, are peptide intensities are approximately unimodal?
+
+<!--
+# bquote could be used as an alternative to latex2exp::TeX below particularly
+#   and when plotting math expressions generally, at the expense of mastering
+#   another syntax, which hardly seems worthwhile when I need to use TeX
+#   elsewhere; here's an introduction to bquote:
+#   https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/
+-->
+```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
+
+# identify the location of missing values
+fin <- is.finite(as.numeric(as.matrix(quant_data_log)))
+
+logvalues <- as.numeric(as.matrix(quant_data_log))[fin]
+logvalues_density <- density(logvalues)
+plot(
+  x = logvalues_density,
+  main = latex2exp::TeX(
+    "Smoothed estimated probability density vs. $log_{10}$(peptide intensity)"
+    ),
+  xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
+  ylab = "Probability density"
+  )
+hist(
+  x = as.numeric(as.matrix(quant_data_log)),
+  xlim = c(min(logvalues_density$x), max(logvalues_density$x)),
+  breaks = 100,
+  main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"),
+  xlab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+)
+```
+
+## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values
+
+```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
+# determine quantile
+q1 <- quantile(logvalues, probs = mean_percentile)[1]
+
+# 1 = row of matrix (ie, phosphopeptide)
+sds <- apply(quant_data_log, 1, sd_finite)
+if (sum(!is.na(sds)) > 2) {
+  plot(
+    density(sds, na.rm = TRUE)
+  , main = "Smoothed estimated probability density vs. std. deviation"
+  , sub = "(probability estimation made with Gaussian smoothing)"
+  , ylab = "Probability density"
+  )
+} else {
+  cat(
+    "At least two non-missing values are required to plot",
+    "probability density.\n\n"
+    )
+}
+
+```
+
+```{r echo = FALSE}
+# Determine number of cells to impute
+temp <- quant_data[is.na(quant_data)]
+
+# Determine number of values to impute
+number_to_impute <- length(temp)
+
+# Determine percent of missing values
+pct_missing_values <-
+  round(length(temp) / (length(logvalues) + length(temp)) * 100)
+```
+
+```{r echo = FALSE}
+
+# prep for trt-median based imputation
+
+```
+# Impute Missing Values
+
+```{r echo = FALSE}
+
+imp_smry_pot_peptides_before <- nrow(quant_data_log)
+imp_smry_missing_values_before   <- number_to_impute
+imp_smry_pct_missing      <- pct_missing_values
+
+```
+
+```{r echo = FALSE}
+#Determine number of cells to impute
+
+```
+```{r echo = FALSE}
+
+# Identify which values are missing and need to be imputed
+ind <- which(is.na(quant_data), arr.ind = TRUE)
+
+```
+```{r echo = FALSE, results = 'asis'}
+
+# Apply imputation
+switch(
+  imputation_method
+, "group-median" = {
+    quant_data_imp <- quant_data
+    imputation_method_description <-
+      paste("Substitute missing value with",
+        "median peptide-intensity for sample group.\n"
+        )
+    sample_level_integers <- as.integer(sample_treatment_levels)
+    # Take the accurate ln(x+1) because the data are log-normally distributed
+    #   and because median can involve an average of two measurements.
+    quant_data_imp <- log1p(quant_data_imp)
+    for (i in seq_len(length(levels(sample_treatment_levels)))) {
+      # Determine the columns for this factor-level
+      level_cols <- i == sample_level_integers
+      # Extract those columns
+      lvlsbst <- quant_data_imp[, level_cols, drop = FALSE]
+      # assign to ind the row-column pairs corresponding to each NA
+      ind <- which(is.na(lvlsbst), arr.ind = TRUE)
+      # No group-median exists if there is only one sample
+      #   a given ppep has no measurement; otherwise, proceed.
+      if (ncol(lvlsbst) > 1) {
+        the_centers <-
+          apply(lvlsbst, 1, median, na.rm = TRUE)
+        for (j in seq_len(nrow(lvlsbst))) {
+          for (k in seq_len(ncol(lvlsbst))) {
+            if (is.na(lvlsbst[j, k])) {
+              lvlsbst[j, k] <- the_centers[j]
+            }
+          }
+        }
+        quant_data_imp[, level_cols] <- lvlsbst
+      }
+    }
+    # Take the accurate e^x - 1 to match scaling of original input.
+    quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
+    good_rows <- !is.na(rowMeans(quant_data_imp))
+  }
+, "median" = {
+    quant_data_imp <- quant_data
+    imputation_method_description <-
+      paste("Substitute missing value with",
+        "median peptide-intensity across all sample classes.\n"
+        )
+    # Take the accurate ln(x+1) because the data are log-normally distributed
+    #   and because median can involve an average of two measurements.
+    quant_data_imp <- log1p(quant_data_imp)
+    quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]]
+    # Take the accurate e^x - 1 to match scaling of original input.
+    quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
+    good_rows <- !is.nan(rowMeans(quant_data_imp))
+  }
+, "mean" = {
+    quant_data_imp <- quant_data
+    imputation_method_description <-
+      paste("Substitute missing value with",
+        "geometric-mean peptide intensity across all sample classes.\n"
+        )
+    # Take the accurate ln(x+1) because the data are log-normally distributed,
+    #   so arguments to mean should be previously transformed.
+    #   this will have to be
+    quant_data_imp <- log1p(quant_data_imp)
+    # Assign to NA cells the mean for the row
+    quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]]
+    # Take the accurate e^x - 1 to match scaling of original input.
+    quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
+    good_rows <- !is.nan(rowMeans(quant_data_imp))
+  }
+, "random" = {
+    quant_data_imp <- quant_data
+    m1 <- median(sds, na.rm = TRUE) * sd_percentile #sd to be used is the median sd
+    # If you want results to be reproducible, you will want to call
+    #   base::set.seed before calling stats::rnorm
+    imputation_method_description <-
+      paste("Substitute each missing value with random intensity",
+        sprintf(
+          "random intensity $N \\sim (%0.2f, %0.2f)$.\n",
+          q1, m1
+          )
+        )
+    cat(sprintf("mean_percentile (from input parameter) is %2.0f\n\n",
+      100 * mean_percentile))
+    cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n",
+      sd_percentile))
+    quant_data_imp[ind] <-
+      10 ^ rnorm(number_to_impute, mean = q1, sd = m1)
+    quant_data_imp_ln <- log1p(quant_data_imp)
+    good_rows <- !is.nan(rowMeans(quant_data_imp))
+  }
+)
+quant_data_imp_log10 <- quant_data_imp_ln * const_log10_e
+
+if (length(good_rows) < 1) {
+  print("ERROR: Cannot impute data; there are no good rows!")
+  return(-1)
+  }
+```
+
+```{r echo = FALSE, results = 'asis'}
+cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description)
+```
+
+```{r echo = FALSE}
+
+imp_smry_pot_peptides_after <- sum(good_rows)
+imp_smry_rejected_after  <- sum(!good_rows)
+imp_smry_missing_values_after   <- sum(is.na(quant_data_imp[good_rows, ]))
+```
+```{r echo = FALSE, results = 'asis'}
+# ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf
+tabular_lines_fmt <- paste(
+  "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page)
+  " \\caption{Imputation Results}",
+  " \\centering", # \centering centers the table on the page
+  " \\begin{tabular}{l c c c}",
+  "  \\hline\\hline",
+  "  \\  & potential peptides   & missing values & rejected",
+  "    peptides \\\\ [0.5ex]",
+  "  \\hline",
+  "  before imputation & %d     & %d (%d\\%s)    &    \\\\",
+  "  after imputation  & %d     & %d             & %d \\\\ [1ex]",
+  "  \\hline",
+  " \\end{tabular}",
+  #" \\label{table:nonlin}", # may be used to refer this table in the text
+  "\\end{table}",
+  sep = "\n"
+  )
+tabular_lines <-
+  sprintf(
+    tabular_lines_fmt,
+    imp_smry_pot_peptides_before,
+    imp_smry_missing_values_before,
+    imp_smry_pct_missing,
+    "%",
+    imp_smry_pot_peptides_after,
+    imp_smry_missing_values_after,
+    imp_smry_rejected_after
+    )
+cat(tabular_lines)
+```
+```{r echo = FALSE}
+
+
+# Zap rows where imputation was ineffective
+full_data         <- full_data        [good_rows, ]
+quant_data        <- quant_data       [good_rows, ]
+
+quant_data_imp <- quant_data_imp[good_rows, ]
+write_debug_file(quant_data_imp)
+quant_data_imp_good_rows <- quant_data_imp
+
+write_debug_file(quant_data_imp_good_rows)
+```
+
+```{r echo = FALSE, results = 'asis'}
+
+can_plot_before_after_imp <- TRUE
+d_combined <-
+  as.numeric(
+    as.matrix(
+      log10(quant_data_imp)
+      )
+    )
+d_original <-
+  as.numeric(
+    as.matrix(
+      log10(quant_data_imp[!is.na(quant_data)])
+      )
+    )
+
+if (sum(!is.na(d_original)) > 2) {
+  d_original <- density(d_original)
+} else {
+  can_plot_before_after_imp <- FALSE
+}
+if (can_plot_before_after_imp && sum(is.na(d_combined)) < 1) {
+  d_combined <- density(d_combined)
+} else {
+  can_plot_before_after_imp <- FALSE
+}
+
+if (sum(is.na(quant_data)) > 0) {
+  # There ARE missing values
+  d_imputed <-
+    as.numeric(
+      as.matrix(
+        log10(quant_data_imp[is.na(quant_data)])
+        )
+      )
+  if (can_plot_before_after_imp && sum(is.na(d_imputed)) < 1) {
+    d_imputed <- (density(d_imputed))
+  } else {
+    can_plot_before_after_imp <- FALSE
+  }
+} else {
+  # There are NO missing values
+  d_imputed <- d_combined
+}
+
+```
+
+```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
+zero_sd_rownames <-
+  rownames(quant_data_imp)[
+    is.na((apply(quant_data_imp, 1, sd, na.rm = TRUE)) == 0)
+  ]
+
+if (length(zero_sd_rownames) >= nrow(quant_data_imp)) {
+  stop("All peptides have zero standard deviation.  Cannot continue.")
+}
+if (length(zero_sd_rownames) > 0) {
+  cat(
+    sprintf("%d peptides with zero variance were removed from statistical consideration",
+      length(zero_sd_rownames)
+    )
+  )
+  zap_named_rows <- function(df, nms) {
+    return(df[!(row.names(df) %in% nms), ])
+  }
+  quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames)
+  quant_data     <- zap_named_rows(quant_data,     zero_sd_rownames)
+  full_data      <- zap_named_rows(full_data,      zero_sd_rownames)
+}
+
+if (sum(is.na(quant_data)) > 0) {
+  cat("\\leavevmode\\newpage\n")
+  # data visualization
+  old_par <- par(
+    mai = par("mai") + c(0.5, 0, 0, 0)
+  )
+  # Copy quant data to x
+  x <- quant_data
+  # x gets to have values of:
+  #  - NA for observed values
+  #  - 1 for missing values
+  x[is.na(x)] <- 0
+  x[x > 1] <- NA
+  x[x == 0] <- 1
+  # Log-transform imputed data
+  # update variable because rows may have been eliminated from quant_data_imp
+  quant_data_imp_log10 <- log10(quant_data_imp)
+
+  write_debug_file(quant_data_imp_log10)
+
+  # Set blue_dots to log of quant data or NA for NA quant data
+  blue_dots <- log10(quant_data)
+  # Set red_dots to log of imputed data or NA for observed quant data
+  red_dots <- quant_data_imp_log10 * x
+
+  count_red <- sum(!is.na(red_dots))
+  count_blue <- sum(!is.na(blue_dots))
+  ylim_save <- ylim <- c(
+    min(red_dots, blue_dots, na.rm = TRUE),
+    max(red_dots, blue_dots, na.rm = TRUE)
+    )
+  show_stripchart <-
+    50 > (count_red + count_blue) / length(sample_name_matches)
+  if (show_stripchart) {
+    boxplot_sub <- "Light blue = data before imputation; Red = imputed data"
+  } else {
+    boxplot_sub <- ""
+  }
+
+  # Vertical plot
+  colnames(blue_dots) <- sample_name_matches
+  boxplot(
+      blue_dots
+    , las = 1 # "always horizontal"
+    , col = const_boxplot_fill
+    , ylim = ylim
+    , main = "Peptide intensities after eliminating unusable peptides"
+    , sub = boxplot_sub
+    , xlab = "Sample"
+    , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+    )
+
+  if (show_stripchart) {
+    # Points
+    # ref: https://r-charts.com/distribution/add-points-boxplot/
+    # NA values are not plotted
+    stripchart(
+      blue_dots,                 # Data
+      method = "jitter",          # Random noise
+      jitter = const_stripchart_jitter,
+      pch = 19,                   # Pch symbols
+      cex = const_stripsmall_cex, # Size of symbols reduced
+      col = "lightblue",          # Color of the symbol
+      vertical = TRUE,            # Vertical mode
+      add = TRUE                  # Add it over
+      )
+    stripchart(
+      red_dots,                   # Data
+      method = "jitter",          # Random noise
+      jitter = const_stripchart_jitter,
+      pch = 19,                   # Pch symbols
+      cex = const_stripsmall_cex, # Size of symbols reduced
+      col = "red",                # Color of the symbol
+      vertical = TRUE,            # Vertical mode
+      add = TRUE                  # Add it over
+      )
+
+  }
+  if (TRUE) {
+    # show measured values in blue on left half-violin plot
+    cat("\\leavevmode\n\\quad\n\n\\quad\n\n")
+    vioplot::vioplot(
+      x = lapply(blue_dots, function(x) x[!is.na(x)]),
+      col = "lightblue1",
+      side = "left",
+      plotCentre = "line",
+      ylim = ylim_save,
+      main = "Distributions of observed and imputed data",
+      sub = "Light blue = observed data; Pink = imputed data",
+      xlab = "Sample",
+      ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+      )
+    red_violins <- lapply(red_dots, function(x) x[!is.na(x)])
+    cols_to_delete <- c()
+    for (ix in seq_len(length(red_violins))) {
+      if (length(red_violins[[ix]]) < 1) {
+        cols_to_delete <- c(cols_to_delete, ix)
+      }
+    }
+    # destroy any unimputable columns
+    if (!is.null(cols_to_delete)) {
+      red_violins <- red_violins[-cols_to_delete]
+    }
+    # plot imputed values in red on right half-violin plot
+    vioplot::vioplot(
+      x = red_violins,
+      col = "lightpink1",
+      side = "right",
+      plotCentre = "line",
+      add = TRUE
+      )
+  }
+
+  par(old_par)
+
+  # density plot
+  cat("\\leavevmode\n\n\n\n\n\n\n")
+  if (can_plot_before_after_imp) {
+    ylim <- c(
+      0,
+      max(
+        if (is.list(d_combined)) d_combined$y else d_combined,
+        if (is.list(d_original)) d_original$y else d_original,
+        if (is.list(d_imputed)) d_imputed$y else d_imputed,
+        na.rm = TRUE
+      )
+    )
+    plot(
+      d_combined,
+      ylim = ylim,
+      sub =
+        paste(
+          "Blue = data before imputation; Red = imputed data;",
+          "Black = combined"
+          ),
+      main = "Density of peptide intensity before and after imputation",
+      xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
+      ylab = "Probability density"
+    )
+    lines(d_original, col = "blue")
+    lines(d_imputed, col = "red")
+  } else {
+    cat(
+      "There are too few points to plot the density of peptide intensity",
+      "before and after imputation."
+      )
+  }
+  cat("\\leavevmode\\newpage\n")
+}
+```
+
+# Perform Quantile Normalization
+
+The excellent `normalize.quantiles` function from
+*[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)*
+performs "quantile normalization" as described Bolstad *et al.* (2003),
+DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)*
+and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*,
+i.e., it assumes that the goal is to detect
+subtle differences among grossly similar samples (having similar distributions)
+by equailzing intra-quantile quantitations.
+Unfortunately, one software library upon which it depends
+*[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)*
+that requires that a specific, non-concurrent version of the library be
+installed.  The installation command equivalent to what was used to install the library to produce the results presented here is:
+```
+    conda install bioconductor-preprocesscore openblas=0.3.3
+```
+
+
+<!--
+# Apply quantile normalization using preprocessCore::normalize.quantiles
+# ---
+# tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html
+#   except this: https://support.bioconductor.org/p/122925/#9135989
+#   says to install it like this:
+#     ```
+#     BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1])
+#     ```
+# conda installation (necessary because of a bug in recent openblas):
+#   conda install bioconductor-preprocesscore openblas=0.3.3
+# ...
+# ---
+# normalize.quantiles {preprocessCore} --  Quantile Normalization
+#
+# Description:
+#   Using a normalization based upon quantiles, this function normalizes a
+#     matrix of probe level intensities.
+#
+#   THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the
+#     assumption that the data is missing at random.
+#
+# Usage:
+#   normalize.quantiles(x, copy = TRUE, keep.names = FALSE)
+#
+# Arguments:
+#
+#   - x: A matrix of intensities where each column corresponds to a chip and each row is a probe.
+#
+#   - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy,
+#       but in certain situations not making a copy of the matrix, but instead normalizing
+#       it in place will be more memory friendly.
+#
+#   - keep.names: Boolean option to preserve matrix row and column names in output.
+#
+# Details:
+#   This method is based upon the concept of a quantile-quantile plot extended to n dimensions.
+#     No special allowances are made for outliers. If you make use of quantile normalization
+#     please cite Bolstad et al, Bioinformatics (2003).
+#
+#   This functions will handle missing data (ie NA values), based on
+#     the assumption that the data is missing at random.
+#
+#   Note that the current implementation optimizes for better memory usage
+#     at the cost of some additional run-time.
+#
+# Value: A normalized matrix.
+#
+# Author: Ben Bolstad, bmbolstad.com
+#
+# References
+#
+#   - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide
+#       Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf
+#
+#   - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of
+#       Normalization Methods for High Density Oligonucleotide Array Data Based on Bias
+#       and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185
+#       http://bmbolstad.com/misc/normalize/normalize.html
+# ...
+-->
+```{r echo = FALSE, results = 'asis'}
+
+if (nrow(quant_data_imp) > 0) {
+  quant_data_imp_qn <- preprocessCore::normalize.quantiles(
+     as.matrix(quant_data_imp), keep.names = TRUE
+   )
+} else {
+  quant_data_imp_qn <- as.matrix(quant_data_imp)
+}
+
+quant_data_imp_qn <- as.data.frame(quant_data_imp_qn)
+
+write_debug_file(quant_data_imp_qn)
+
+quant_data_imp_qn_log <- log10(quant_data_imp_qn)
+
+write_debug_file(quant_data_imp_qn_log)
+
+quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn))))
+
+sel <- apply(quant_data_imp_qn_ls, 1, any_nan)
+quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls
+
+quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ]
+quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2)
+
+quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls)
+
+write_debug_file(quant_data_imp_qn_ls)
+write_debug_file(quant_data_imp_qn_ls2)
+
+# Create data.frame used by ANOVA analysis
+data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log)
+```
+
+<!-- ACE insertion begin -->
+## Are normalized, imputed, log-transformed sample distributions similar?
+
+```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
+
+# Save unimputed quant_data_log for plotting below
+unimputed_quant_data_log <- quant_data_log
+
+# log10 transform (after preparing for zero values,
+#   which should never happen...)
+quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001
+quant_data_log <- log10(quant_data_imp_qn)
+
+how_many_peptides <- nrow(quant_data_log)
+
+if ((how_many_peptides) > 0) {
+  cat(
+    sprintf(
+      "Intensities for %d peptides:\n\n\n",
+      how_many_peptides
+      )
+    )
+  cat("\n\n\n")
+
+
+  # data visualization
+  old_par <- par(
+    mai = par("mai") + c(0.5, 0, 0, 0)
+  , oma = par("oma") + c(0.5, 0, 0, 0)
+  )
+  # ref: https://r-charts.com/distribution/add-points-boxplot/
+  # Vertical plot
+  colnames(quant_data_log) <- sample_name_matches
+  boxplot(
+    quant_data_log
+  , las = 1
+  , col = const_boxplot_fill
+  , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+  , xlab = "Sample"
+  )
+  par(old_par)
+} else {
+  cat("There are no peptides to plot\n")
+}
+
+cat("\n\n\n")
+
+```
+
+```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
+if (nrow(quant_data_log) > 1) {
+  quant_data_log_stack <- stack(quant_data_log)
+  ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) +
+    ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
+    ggplot2::ylab("Probability density") +
+    ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE)
+} else {
+  cat("No density plot because there are fewer than two peptides to plot.\n\n")
+}
+```
+```{r echo = FALSE, results = 'asis'}
+cat("\\leavevmode\\newpage\n")
+```
+
+# ANOVA Analysis
+
+```{r, echo = FALSE}
+# Make new data frame containing only Phosphopeptides
+#   to connect preANOVA to ANOVA (connect_df)
+connect_df <- data.frame(
+    data_table_imp_qn_lt$Phosphopeptide
+  , data_table_imp_qn_lt[, first_data_column]
+  )
+colnames(connect_df) <- c("Phosphopeptide", "Intensity")
+```
+
+```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+count_of_treatment_levels <- length(levels(sample_treatment_levels))
+if (count_of_treatment_levels < 2) {
+  nuke_control_sequences <-
+    function(s) {
+      s <- gsub("[\\]", "xyzzy_plugh", s)
+      s <- gsub("[$]", "\\\\$", s)
+      s <- gsub("xyzzy_plugh", "$\\\\backslash$", s)
+      return(s)
+    }
+  cat(
+    "ERROR!!!! Cannot perform ANOVA analysis",
+    "(see next page)\\newpage\n"
+  )
+  cat(
+    "ERROR: ANOVA analysis",
+    "requires two or more factor levels!\n\n\n"
+  )
+
+  cat("\n\n\n\n\n")
+  cat("Unparsed sample names are:\n\n\n",
+    "\\begin{quote}\n",
+    paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"),
+    "\n\\end{quote}\n\n")
+
+  regex_sample_names <- nuke_control_sequences(regex_sample_names)
+
+  cat("\\leavevmode\n\n\n")
+  cat("Parsing rule for SampleNames is",
+    "\n\n\n",
+    "\\text{'",
+    regex_sample_names,
+    "'}\n\n\n",
+    sep = ""
+    )
+
+  cat("\nParsed sample names are:\n",
+    "\\begin{quote}\n",
+    paste(sample_name_matches, collapse = "\n\n\n"),
+    "\n\\end{quote}\n\n")
+
+  regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping)
+
+  cat("\\leavevmode\n\n\n")
+  cat("Parsing rule for SampleGrouping is",
+    "\n\n\n",
+    "\\text{'",
+    regex_sample_grouping,
+    "'}\n\n\n",
+    sep = ""
+    )
+
+  cat("\n\n\n")
+  cat("Sample group assignments are:\n",
+    "\\begin{quote}\n",
+    paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"),
+    "\n\\end{quote}\n\n")
+
+} else {
+
+  p_value_data_anova_ps <-
+    apply(
+      quant_data_imp_qn_log,
+      1,
+      anova_func,
+      grouping_factor = sample_treatment_levels,
+      one_way_f = one_way_all_categories
+      )
+
+  p_value_data_anova_ps_fdr <-
+    p.adjust(p_value_data_anova_ps, method = "fdr")
+  p_value_data <- data.frame(
+    phosphopeptide = full_data[, 1]
+    ,
+    raw_anova_p = p_value_data_anova_ps
+    ,
+    fdr_adjusted_anova_p = p_value_data_anova_ps_fdr
+  )
+
+  # output ANOVA file to constructed filename,
+  #   e.g.    "Outputfile_pST_ANOVA_STEP5.txt"
+  #   becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt"
+
+  # Re-output datasets to include p-values
+  metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3])
+  write.table(
+    cbind(metadata_plus_p, quant_data_imp),
+    file = imputed_data_filename,
+    sep = "\t",
+    col.names = TRUE,
+    row.names = FALSE,
+    quote = FALSE
+    )
+
+  write.table(
+    cbind(metadata_plus_p, quant_data_imp_qn_log),
+    file = imp_qn_lt_data_filenm,
+    sep = "\t",
+    col.names = TRUE,
+    row.names = FALSE,
+    quote = FALSE
+    )
+
+
+  p_value_data <-
+    p_value_data[order(p_value_data$fdr_adjusted_anova_p), ]
+
+  first_page_suppress <- 1
+  number_of_peptides_found <- 0
+  cutoff <- val_fdr[1]
+  for (cutoff in val_fdr) {
+    if (number_of_peptides_found > 49) {
+      cat("\\leavevmode\n\n\n")
+      break
+      }
+
+    #loop through FDR cutoffs
+
+    filtered_p <-
+      p_value_data[
+        which(p_value_data$fdr_adjusted_anova_p < cutoff),
+        , drop = FALSE
+        ]
+    filtered_data_filtered <-
+      quant_data_imp_qn_log[
+        rownames(filtered_p),
+        , drop = FALSE
+        ]
+    filtered_data_filtered <-
+      filtered_data_filtered[
+        order(filtered_p$fdr_adjusted_anova_p),
+        , drop = FALSE
+        ]
+
+    # <!-- ACE insertion start -->
+
+    if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) {
+      if (first_page_suppress == 1) {
+        first_page_suppress <- 0
+        } else {
+          cat("\\newpage\n")
+        }
+      if (nrow(filtered_data_filtered) > 1) {
+        subsection_header(sprintf(
+          "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n",
+          nrow(filtered_data_filtered),
+          cutoff
+        ))
+      } else {
+        subsection_header(sprintf(
+          "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n",
+          rownames(filtered_data_filtered)[1],
+          cutoff
+        ))
+      }
+      cat("\n\n\n")
+      cat("\n\n\n")
+
+      old_oma <- par("oma")
+      old_par <- par(
+        mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1),
+        oma = old_oma * c(1, 1, 0.3, 1),
+        cex.main = 0.9,
+        cex.axis = 0.7,
+        fin = c(9, 7.25)
+        )
+      # ref: https://r-charts.com/distribution/add-points-boxplot/
+      # Vertical plot
+      colnames(filtered_data_filtered) <- sample_name_matches
+      tryCatch(
+        boxplot(
+          filtered_data_filtered,
+          main = "Imputed, normalized intensities", # no line plot
+          las = 1,
+          col = const_boxplot_fill,
+          ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+        ),
+        error = function(e) print(e)
+      )
+      par(old_par)
+    } else {
+      cat(sprintf(
+        "%s < %0.2f\n\n\n\n\n",
+        "No peptides were found to have cutoff adjusted p-value",
+        cutoff
+      ))
+    }
+
+    if (nrow(filtered_data_filtered) > 0) {
+      # Add Phosphopeptide column to anova_filtered table
+      # The assumption here is that the first intensity is unique;
+      #   this is a hokey assumption but almost definitely will
+      #   be true in the real world unless there is a computation
+      #   error upstream.
+      anova_filtered_merge <- base::merge(
+        x = connect_df,
+        y = filtered_data_filtered,
+        by.x = "Intensity",
+        by.y = 1
+      )
+      anova_filtered_merge_order <- rownames(filtered_p)
+
+      anova_filtered <- data.frame(
+        ppep    = anova_filtered_merge$Phosphopeptide,
+        intense = anova_filtered_merge$Intensity,
+        data    = anova_filtered_merge[, 2:number_of_samples + 1]
+      )
+      colnames(anova_filtered) <-
+        c("Phosphopeptide", colnames(filtered_data_filtered))
+
+      # Merge qualitative columns into the ANOVA data
+      output_table <- data.frame(anova_filtered$Phosphopeptide)
+      output_table <- base::merge(
+        x = output_table,
+        y = data_table_imp_qn_lt,
+        by.x = "anova_filtered.Phosphopeptide",
+        by.y = "Phosphopeptide"
+      )
+
+      # Produce heatmap to visualize significance and the effect of imputation
+
+      anova_filtered_merge_format <- sapply(
+        X = filtered_p$fdr_adjusted_anova_p
+        ,
+        FUN = function(x) {
+          if (x > 0.0001)
+            paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s")
+          else
+            paste0("(%0.4e) %s")
+        }
+      )
+
+      cat_hm_heading <- function(m, cutoff) {
+        cat("\\newpage\n")
+        if (nrow(m) > intensity_hm_rows) {
+          subsection_header(
+            paste(
+              sprintf("Heatmap for the %d most-significant peptides",
+                intensity_hm_rows),
+              sprintf("whose adjusted p-value < %0.2f\n", cutoff)
+            )
+          )
+        } else {
+          if (nrow(m) == 1) {
+            return(FALSE)
+          } else {
+            subsection_header(
+              paste(
+                sprintf("Heatmap for %d usable peptides whose", nrow(m)),
+                sprintf("adjusted p-value < %0.2f\n", cutoff)
+              )
+            )
+          }
+        }
+        cat("\n\n\n")
+        cat("\n\n\n")
+        return(TRUE)
+      }
+
+      # construct matrix with appropriate rownames
+      m <-
+        as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ])
+      if (nrow(m) > 0) {
+        rownames_m <- rownames(m)
+        rownames(m) <- sapply(
+          X = seq_len(nrow(m))
+          ,
+          FUN = function(i) {
+            sprintf(
+              anova_filtered_merge_format[i],
+              filtered_p$fdr_adjusted_anova_p[i],
+              rownames_m[i]
+            )
+          }
+        )
+      }
+      # draw the heading and heatmap
+      if (nrow(m) > 0) {
+          number_of_peptides_found <-
+            draw_intensity_heatmap(
+              m                       = m,
+              cutoff                  = cutoff,
+              hm_heading_function     = cat_hm_heading,
+              hm_main_title           = "Unimputed, unnormalized log(intensities)",
+              suppress_row_dendrogram = FALSE
+            )
+      }
+    }
+  }
+}
+cat("\\leavevmode\n\n\n")
+```
+
+```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+
+if (count_of_treatment_levels > 1) {
+  # Prepare two-way contrasts with adjusted p-values
+  # Strategy:
+  # - use imputed, log-transformed data:
+  #   - remember this when computing log2(fold-change)
+  # - each contrast is between a combination of trt levels
+  # - for each contrast, compute samples that are members
+  # - compute one-way test:
+  #   - use `oneway.test` (Welch test) if numbers of samples
+  #     are not equivalent between trt levels
+  #   - otherwise, aov is fine but offers no advantage
+  # - adjust p-value, assuming that
+  #   (# of pppeps)*(# of contrasts) tests were performed
+
+  # Each contrast is between a combination of trt levels
+  m2 <- combn(
+    x = seq_len(length(levels(sample_treatment_levels))),
+    m = 2,
+    simplify = TRUE
+  )
+  contrast_count <- ncol(m2)
+
+  # For each contrast, compute samples that are members
+  # - local function to construct a data.frame for each contrast
+  #   - the contrast in the first "column"
+  f_m2 <-
+    function(cntrst, lvl1, lvl2) {
+      return(
+        data.frame(
+          contrast = cntrst,
+          level = sample_treatment_levels[
+              sample_treatment_levels %in%
+                levels(sample_treatment_levels)[c(lvl1, lvl2)]
+            ],
+          label = sample_name_matches[
+              sample_treatment_levels %in%
+                levels(sample_treatment_levels)[c(lvl1, lvl2)]
+            ]
+        )
+      )
+    }
+  # - compute a df for each contrast
+  sample_level_dfs <- lapply(
+    X = 1:contrast_count,
+    FUN = function(i) f_m2(i, m2[1, i], m2[2, i])
+  )
+  # - compute a single df for all contrasts
+  combined_contrast_df <- Reduce(f = rbind, x = sample_level_dfs)
+
+  # - dispose objects to free resources
+  rm(sample_level_dfs)
+
+  # - write the df to a DB for later join-per-contrast
+  db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db)
+
+  RSQLite::dbWriteTable(
+    conn = db,
+    name = "contrast",
+    value = combined_contrast_df,
+    overwrite = TRUE
+  )
+
+  # Create UK for insert
+  ddl_exec(db, "
+    CREATE UNIQUE INDEX IF NOT EXISTS contrast__uk__idx
+      ON contrast(contrast, label);
+    "
+  )
+  # Create indexes for join
+  ddl_exec(db, "
+    -- index for join in contrast_ppep_smpl_qnlt on a.label < b.label
+    CREATE INDEX IF NOT EXISTS contrast__label__idx
+      ON contrast(label);
+    "
+  )
+  ddl_exec(db, "
+    -- index for joining two contrast_lvl_ppep_avg_quant on contrast
+    CREATE INDEX IF NOT EXISTS contrast__contrast__idx
+      ON contrast(contrast);
+    "
+  )
+  ddl_exec(db, "
+    -- index for joining two contrast_lvl_ppep_avg_quant on phophospep
+    CREATE INDEX IF NOT EXISTS contrast__level__idx
+      ON contrast(level);
+    "
+  )
+  # - dispose objects to free resources
+  rm(combined_contrast_df)
+
+  # Use imputed, log-transformed data
+  # - remember that this was donoe when computing log2(fold-change)
+  # - melt data matrix for use in later join-per-contrast
+  casted <- cbind(
+    data.frame(vrbl = rownames(quant_data_imp_qn_log)),
+    quant_data_imp_qn_log
+  )
+  quant_data_imp_qn_log_melted <- reshape2::melt(
+    casted,
+    id.vars = "vrbl"
+  )
+  colnames(quant_data_imp_qn_log_melted) <-
+    c("phosphopep", "sample", "quant")
+  # - dispose objects to free resources
+  rm(casted)
+
+  # - write the df to a DB for use in later join-per-contrast
+  RSQLite::dbWriteTable(
+    conn = db,
+    name = "ppep_smpl_qnlt",
+    value = quant_data_imp_qn_log_melted,
+    overwrite = TRUE
+  )
+  # Create UK for insert
+  ddl_exec(db, "
+    CREATE UNIQUE INDEX IF NOT EXISTS ppep_smpl_qnlt__uk__idx
+      ON ppep_smpl_qnlt(phosphopep, sample);
+    "
+  )
+  # Create index for join
+  ddl_exec(db, "
+    -- index for join in contrast_ppep_smpl_qnlt
+    CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__sample__idx
+      ON ppep_smpl_qnlt(sample);
+    "
+  )
+  ddl_exec(db, "
+    -- index for joining two contrast_lvl_ppep_avg_quant on phopho.pep
+    CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__phosphopep__idx
+      ON ppep_smpl_qnlt(phosphopep);
+    "
+  )
+  # - dispose objects to free resources
+  rm(quant_data_imp_qn_log_melted)
+
+  # - drop views if exist
+  ddl_exec(db, "
+    -- drop view dependent on contrast_lvl_ppep_avg_quant
+    DROP VIEW IF EXISTS v_contrast_log2_fc;
+    "
+  )
+  ddl_exec(db, "
+    -- drop table dependent on contrast_ppep_smpl_qnlt
+    DROP TABLE IF EXISTS contrast_lvl_ppep_avg_quant;
+    "
+  )
+  ddl_exec(db, "
+    DROP TABLE IF EXISTS contrast_lvl_lvl_metadata;
+    "
+  )
+  ddl_exec(db, "
+    DROP VIEW IF EXISTS v_contrast_lvl_metadata;
+    "
+  )
+  ddl_exec(db, "
+    -- drop view dependent on contrast_ppep_smpl_qnlt
+    DROP VIEW IF EXISTS v_contrast_lvl_ppep_avg_quant;
+    "
+  )
+  ddl_exec(db, "
+    DROP VIEW IF EXISTS v_contrast_lvl_lvl;
+    "
+  )
+  ddl_exec(db, "
+    -- drop view upon which other views depend
+    DROP VIEW IF EXISTS contrast_ppep_smpl_qnlt;
+    "
+  )
+  # - create view
+  dml_no_rows_exec(db, "
+      -- view contrast_ppep_smpl_qnlt is used for each phopshopep to
+      --   compute p-value for test of trt effect for two trt levels
+      CREATE VIEW contrast_ppep_smpl_qnlt
+        AS
+      SELECT  contrast,
+              level,
+              phosphopep,
+              sample,
+              quant
+        FROM  contrast AS c,
+              ppep_smpl_qnlt AS q
+        WHERE q.sample = c.label
+        ORDER BY contrast, level, phosphopep
+      ;
+    "
+  )
+  # - create simplification views
+  dml_no_rows_exec(db, "
+      CREATE VIEW v_contrast_lvl_metadata
+        AS
+      SELECT  contrast,
+              level,
+              group_concat(label, ';') AS samples
+        FROM  contrast
+        GROUP BY contrast, level
+        /* view v_contrast_lvl_metadata is used
+           to simplify creation of table contrast_lvl_lvl_metadata */
+      ;
+    "
+  )
+  dml_no_rows_exec(db, "
+      CREATE VIEW v_contrast_lvl_ppep_avg_quant
+        AS
+      SELECT  contrast,
+              level,
+              phosphopep,
+              avg(quant)                AS avg_quant
+        FROM  contrast_ppep_smpl_qnlt
+        GROUP BY contrast, level, phosphopep
+        /* view v_contrast_lvl_ppep_avg_quant is used
+           to simplify view v_contrast_log2_fc */
+      ;
+    "
+  )
+
+  # - create contrast-metadata table
+  dml_no_rows_exec(db, "
+      CREATE TABLE contrast_lvl_lvl_metadata
+        AS
+      SELECT DISTINCT
+              a.contrast              AS ab_contrast,
+              a.level                 AS a_level,
+              b.level                 AS b_level,
+              a.samples               AS a_samples,
+              b.samples               AS b_samples,
+              'log2(level_'||a.level||'/level_'||b.level||')'
+                                      AS fc_description
+        FROM  v_contrast_lvl_metadata AS a,
+              v_contrast_lvl_metadata AS b
+        WHERE a.contrast = b.contrast
+          AND a.level > b.level
+        /* view v_contrast_lvl_lvl is used
+           to simplify view v_contrast_log2_fc */
+      ;
+    "
+  )
+  # - create pseudo-materialized view table
+  dml_no_rows_exec(db, "
+      CREATE VIEW v_contrast_lvl_lvl
+        AS
+      SELECT DISTINCT
+              a.contrast AS ab_contrast,
+              a.level    AS a_level,
+              b.level    AS b_level
+        FROM  contrast   AS a,
+              contrast   AS b
+        WHERE a.contrast = b.contrast
+          AND a.level > b.level
+        /* view v_contrast_lvl_lvl is used
+           to simplify view v_contrast_log2_fc */
+      ;
+    "
+  )
+
+  # - create view to compute log2(fold-change)
+  dml_no_rows_exec(db, "
+      CREATE VIEW v_contrast_log2_fc
+        AS
+      SELECT  ab.ab_contrast                         AS contrast,
+              m.a_level                              AS a_level,
+              c.avg_quant                            AS a_quant,
+              m.a_samples                            AS a_samples,
+              ab.b_level                             AS b_level,
+              d.avg_quant                            AS b_quant,
+              m.b_samples                            AS b_samples,
+              m.fc_description                       AS fc_description,
+              3.32193 * ( d.avg_quant - c.avg_quant) AS log2_fc,
+              d.phosphopep                           AS phosphopep
+        FROM  contrast_lvl_lvl_metadata                  AS m,
+              v_contrast_lvl_ppep_avg_quant              AS d,
+              v_contrast_lvl_lvl AS ab
+                INNER JOIN v_contrast_lvl_ppep_avg_quant AS c
+                  ON c.contrast = ab.ab_contrast
+                  AND c.level   = ab.a_level
+        WHERE d.contrast    = ab.ab_contrast
+          AND m.ab_contrast = ab.ab_contrast
+          AND d.level       = ab.b_level
+          AND d.phosphopep  = c.phosphopep
+        /* view to compute log2(fold-change) */
+        ;
+    "
+  )
+
+  # For each contrast, compute samples that are members
+  # compute one-way test:
+  # - use `oneway.test` (Welch test) if numbers of samples
+  #   are not equivalent between trt levels
+  # - otherwise, aov is fine but offers no advantage
+  for (contrast in contrast_count:2) {
+    invisible(contrast)
+  }
+  for (contrast in 1:contrast_count) {
+    contrast_df <- sqldf::sqldf(
+      x = paste0("
+        SELECT level, phosphopep, sample, quant
+          FROM contrast_ppep_smpl_qnlt
+          WHERE contrast = ", contrast, "
+          ORDER BY phosphopep, level, sample
+      "),
+      connection = db
+    )
+    contrast_cast <- reshape2::dcast(
+      data = contrast_df,
+      formula = phosphopep ~ sample,
+      value.var = "quant"
+    )
+    contrast_cast_ncol <- ncol(contrast_cast)
+    contrast_cast_data <- contrast_cast[, 2:contrast_cast_ncol]
+    contrast_cast_samples <- colnames(contrast_cast_data)
+
+    # - order grouping_factor by order of sample columns of contrast_cast_data
+    grouping_factor <- sqldf::sqldf(
+      x = paste0("
+        SELECT sample, level
+          FROM contrast_ppep_smpl_qnlt
+          WHERE contrast = ", contrast, "
+          ORDER BY phosphopep, level, sample
+          LIMIT ", contrast_cast_ncol - 1
+      ),
+      connection = db
+    )
+    rownames(grouping_factor) <- grouping_factor$sample
+    grouping_factor <- grouping_factor[, "level", drop = FALSE]
+
+    # - run the two-level (one-way) test
+    p_value_data_contrast_ps <-
+      apply(
+        X = contrast_cast_data,
+        MARGIN = 1, # apply to rows
+        FUN = anova_func,
+        grouping_factor =
+          as.factor(as.numeric(grouping_factor$level)), # anova_func arg2
+        one_way_f = one_way_two_categories, # anova_func arg3
+        simplify = TRUE # TRUE is the default for simplify
+        )
+    contrast_data_adj_p_values <- p.adjust(
+        p = p_value_data_contrast_ps,
+        method = "fdr",
+        n = length(p_value_data_contrast_ps) # this is the default, length(p)
+      )
+    # - compute the fold-change
+    contrast_p_df <-
+      data.frame(
+        contrast = contrast,
+        phosphopep = contrast_cast$phosphopep,
+        p_value_raw = p_value_data_contrast_ps,
+        p_value_adj = contrast_data_adj_p_values
+      )
+    db_write_table_overwrite <- (contrast < 2)
+    db_write_table_append <- !db_write_table_overwrite
+    RSQLite::dbWriteTable(
+      conn = db,
+      name = "contrast_ppep_p_val",
+      value = contrast_p_df,
+      append = db_write_table_append
+      )
+    # Create UK for insert
+    ddl_exec(db, "
+      CREATE UNIQUE INDEX IF NOT EXISTS contrast_ppep_p_val__uk__idx
+        ON contrast_ppep_p_val(phosphopep, contrast);
+      "
+    )
+  }
+  # Perhaps this could be done more elegantly using unique keys
+  #   or creating the tables before saving data to them, but this
+  #   is fast and, if the database exists on disk rather than in
+  #   memory, it doesn't stress memory.
+  dml_no_rows_exec(db, "
+    CREATE TEMP table contrast_log2_fc
+      AS
+    SELECT *
+      FROM  v_contrast_log2_fc
+      ORDER BY contrast, phosphopep
+    ;
+    "
+  )
+  dml_no_rows_exec(db, "
+    CREATE TEMP table ppep_p_val
+      AS
+    SELECT  p_value_raw,
+            p_value_adj,
+            contrast   AS p_val_contrast,
+            phosphopep AS p_val_ppep
+      FROM  contrast_ppep_p_val
+      ORDER BY contrast, phosphopep
+    ;
+    "
+  )
+  dml_no_rows_exec(db, "
+    DROP TABLE IF EXISTS contrast_log2_fc_p_val
+    ;
+    "
+  )
+  dml_no_rows_exec(db, "
+    CREATE TABLE contrast_log2_fc_p_val
+      AS
+    SELECT  a.*,
+            b.p_value_raw,
+            b.p_value_adj,
+            b.p_val_contrast,
+            b.p_val_ppep
+      FROM  contrast_log2_fc a, ppep_p_val b
+      WHERE a.rowid = b.rowid
+        AND a.phosphopep = b.p_val_ppep
+    ;
+    "
+  )
+  # Create UK
+  ddl_exec(db, "
+    CREATE UNIQUE INDEX IF NOT EXISTS contrast_log2_fc_p_val__uk__idx
+      ON contrast_log2_fc_p_val(phosphopep, contrast);
+    "
+  )
+  # Create indices for future queries
+  ddl_exec(db, "
+    CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__contrast__idx
+      ON contrast_log2_fc_p_val(contrast);
+    "
+  )
+  ddl_exec(db, "
+    CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__phosphopep__idx
+      ON contrast_log2_fc_p_val(phosphopep);
+    "
+  )
+  ddl_exec(db, "
+    CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_raw__idx
+      ON contrast_log2_fc_p_val(p_value_raw);
+    "
+  )
+  ddl_exec(db, "
+    CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_adj__idx
+      ON contrast_log2_fc_p_val(p_value_adj);
+    "
+  )
+  dml_no_rows_exec(db, "
+    DROP VIEW IF EXISTS v_contrast_log2_fc_p_val
+    ;
+    "
+  )
+  dml_no_rows_exec(db, "
+      CREATE VIEW v_contrast_log2_fc_p_val
+        AS
+      SELECT  contrast,
+              a_level,
+              a_samples,
+              b_level,
+              b_samples,
+              a_quant,
+              b_quant,
+              fc_description,
+              log2_fc,
+              p_value_raw,
+              p_value_adj,
+              phosphopep
+        FROM  contrast_log2_fc_p_val
+        ORDER BY contrast, phosphopep
+        ;
+    "
+  )
+  ddl_exec(db, "
+    DROP TABLE IF EXISTS kseaapp_metadata
+    ;
+    "
+  )
+  dml_no_rows_exec(db, "
+    CREATE TABLE kseaapp_metadata
+      AS
+    WITH extended(deppep, ppep, gene_name, uniprot_id, phosphoresidue) AS (
+              SELECT DISTINCT
+                  deppep.seq,
+                  ppep.seq,
+                  GeneName||';',
+                  UniProtID||';',
+                  PhosphoResidue||';'
+              FROM
+                ppep, deppep, mrgfltr_metadata
+              WHERE
+                  mrgfltr_metadata.ppep_id = ppep.id
+                AND
+                  ppep.deppep_id = deppep.id
+        )
+    SELECT
+        ppep                                                  AS `ppep`,
+        SUBSTR(uniprot_id,     1, INSTR(uniprot_id,';') - 1 ) AS `Protein`,
+        SUBSTR(gene_name,      1, INSTR(gene_name,';')  - 1 ) AS `Gene`,
+        deppep                                                AS `Peptide`,
+        REPLACE(
+          REPLACE(
+            SUBSTR(phosphoresidue, 1, INSTR(phosphoresidue,';') - 1 ),
+            'p',
+            ''
+            ),
+          ', ',
+          ';'
+          )                                                   AS `Residue.Both`
+      FROM extended
+      ;
+    "
+  )
+  # Create indexes for join
+  ddl_exec(db, "
+    CREATE INDEX IF NOT EXISTS kseaapp_metadata__ppep__idx
+      ON kseaapp_metadata(ppep);
+    "
+  )
+  ddl_exec(db, "
+    DROP VIEW IF EXISTS v_kseaapp_contrast
+    ;
+    "
+  )
+  dml_no_rows_exec(db, "
+    CREATE VIEW v_kseaapp_contrast
+      AS
+    SELECT  a.*, b.Protein, b.Gene, b.Peptide, b.`Residue.Both`
+      FROM  v_contrast_log2_fc_p_val a, kseaapp_metadata b
+      WHERE b.ppep = a.phosphopep
+      ;
+      "
+  )
+  ddl_exec(db, "
+    DROP VIEW IF EXISTS v_kseaapp_input
+    ;
+    "
+  )
+  dml_no_rows_exec(db, "
+    CREATE VIEW v_kseaapp_input
+      AS
+    SELECT  v.contrast,
+            v.phosphopep,
+            m.`Protein`,
+            m.`Gene`,
+            m.`Peptide`,
+            m.`Residue.Both`,
+            v.p_value_raw AS `p`,
+            v.log2_fc AS `FC`
+      FROM  kseaapp_metadata AS m,
+            v_contrast_log2_fc_p_val AS v
+      WHERE m.ppep = v.phosphopep
+        AND NOT m.`Gene` = 'No_Gene_Name'
+        AND NOT v.log2_fc = 0
+      ;
+      "
+  )
+}
+```
+
+```{r echo = FALSE, results = 'asis'}
+cat("\\newpage\n")
+```
+
+# KSEA Analysis
+
+Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched.   Enrichments are found by the CRAN `KSEAapp` package:
+
+- The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp
+- The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415).
+- An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/).
+
+For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as:
+
+$$
+\text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j}
+$$
+
+and fold-enrichment is computed as:
+
+$$
+\text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j}
+$$
+
+where:
+
+- $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change|})$ in intensities (for contrast $j$) of known substrates of the kinase $i$,
+- $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, and
+- $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$,
+- $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset.
+- Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score.
+
+$\text{FDR}_{j,i}$ is computed from the $p$-value for the z-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x)
+
+Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive).
+
+Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`.
+
+- Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis.
+- Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)).
+- Kinase-strate deata were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads).
+
+```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+
+db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db)
+
+# -- eliminate the table that's about to be defined
+ddl_exec(db, "
+DROP TABLE IF EXISTS site_metadata;
+")
+
+# -- define the site_metadata table
+ddl_exec(db, "
+CREATE TABLE site_metadata(
+  id            INTEGER PRIMARY KEY
+, site_type_id  INTEGER REFERENCES site_type(id)
+, full          TEXT    UNIQUE ON CONFLICT IGNORE
+, abbrev        TEXT
+, pattern       TEXT
+, motif         TEXT
+);
+")
+
+# -- populate the table with initial values
+ddl_exec(db, "
+INSERT INTO site_metadata(full, abbrev, site_type_id)
+  SELECT  DISTINCT kinase_map, kinase_map, site_type_id
+    FROM  ppep_gene_site
+    ORDER BY kinase_map;
+")
+
+# -- drop bogus KSData view if exists
+ddl_exec(db, "
+DROP VIEW IF EXISTS ks_data_v;
+")
+
+# -- create view to serve as an impostor for  KSEAapp::KSData
+ddl_exec(db, "
+CREATE VIEW IF NOT EXISTS ks_data_v
+AS
+SELECT
+  'NA' AS KINASE,
+  'NA' AS KIN_ACC_ID,
+  kinase_map AS GENE,
+  'NA' AS KIN_ORGANISM,
+  'NA' AS SUBSTRATE,
+  0 AS SUB_GENE_ID,
+  'NA' AS SUB_ACC_ID,
+  gene_names AS SUB_GENE,
+  'NA' AS SUB_ORGANISM,
+  phospho_peptide AS SUB_MOD_RSD,
+  0 AS SITE_GROUP_ID,
+  'NA' AS 'SITE_7AA',
+  2 AS networkin_score,
+  type_name AS Source
+FROM ppep_gene_site_view;
+")
+
+contrast_metadata_df <-
+  sqldf::sqldf("select * from contrast_lvl_lvl_metadata", connection = db)
+rslt <- new_env()
+rslt$score_list <- list()
+rslt$name_list  <- list()
+rslt$longname_list  <- list()
+
+ddl_exec(db, "
+  DROP TABLE IF EXISTS contrast_ksea_scores;
+  "
+)
+
+next_index <- 0
+err_na_subscr_df_const <-
+ "missing values are not allowed in subscripted assignments of data frames"
+
+for (i_cntrst in seq_len(nrow(contrast_metadata_df))) {
+  cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
+  cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
+  cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
+  contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
+  contrast_longlabel <- (
+    sprintf(
+      "Trt %s {%s} -> Trt %s {%s}",
+      contrast_metadata_df[i_cntrst, "b_level"],
+      gsub(
+        pattern = ";",
+        replacement = ", ",
+        x = contrast_metadata_df[i_cntrst, "b_samples"],
+        fixed = TRUE
+      ),
+      contrast_metadata_df[i_cntrst, "a_level"],
+      gsub(
+        pattern = ";",
+        replacement = ", ",
+        x = contrast_metadata_df[i_cntrst, "a_samples"],
+        fixed = TRUE
+      )
+    )
+  )
+  kseaapp_input <-
+    sqldf::sqldf(
+      x = sprintf("
+        SELECT `Protein`, `Gene`, `Peptide`, phosphopep AS `Residue.Both`, `p`, `FC`
+          FROM v_kseaapp_input
+          WHERE contrast = %d
+        ",
+        i_cntrst
+      ),
+    connection = db
+    )
+
+  pseudo_ksdata <- dbReadTable(db, "ks_data_v")
+
+  # This hack is because SQL table has the log2-transformed values
+  kseaapp_input[, "FC"] <- 2 ** kseaapp_input[, "FC", drop = TRUE]
+  main_title <- (
+      sprintf(
+      "Change from treatment %s to treatment %s",
+      contrast_metadata_df[i_cntrst, "b_level"],
+      contrast_metadata_df[i_cntrst, "a_level"]
+    )
+  )
+  sub_title <- contrast_longlabel
+  tryCatch(
+    expr = {
+        ksea_scores_rslt <-
+          ksea_scores(
+            ksdata           = pseudo_ksdata, # KSEAapp::KSData,
+            px               = kseaapp_input,
+            networkin        = TRUE,
+            networkin_cutoff = 2
+            )
+
+        if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
+          next_index <- 1 + next_index
+          rslt$score_list[[next_index]] <- ksea_scores_rslt
+          rslt$name_list[[next_index]] <- contrast_label
+          rslt$longname_list[[next_index]] <- contrast_longlabel
+          low_fdr_print(
+            rslt = rslt,
+            i_cntrst = i_cntrst,
+            i = next_index,
+            a_level = cntrst_a_level,
+            b_level = cntrst_b_level,
+            fold_change = cntrst_fold_change,
+            caption = contrast_longlabel
+            )
+        }
+      },
+    error = function(e) str(e)
+  )
+}
+
+plotted_kinases <- NULL
+if (length(rslt$score_list) > 1) {
+  for (i in seq_len(length(ksea_heatmap_titles))) {
+    hdr <- ksea_heatmap_titles[[i]]
+    which_kinases <- i
+
+    cat("\\clearpage\n\\begin{center}\n")
+    if (i == const_ksea_astrsk_kinases) {
+      subsection_header(hdr)
+    } else {
+      subsection_header(hdr)
+    }
+    cat("\\end{center}\n")
+
+    plotted_kinases <- ksea_heatmap(
+      # the data frame outputs from the KSEA.Scores() function, in list format
+      score_list = rslt$score_list,
+      # a character vector of all the sample names for heatmap annotation:
+      # - the names must be in the same order as the data in score_list
+      # - please avoid long names, as they may get cropped in the final image
+      sample_labels = rslt$name_list,
+      # character string of either "p.value" or "FDR" indicating the data column
+      #   to use for marking statistically significant scores
+      stats = c("p.value", "FDR")[2],
+      # a numeric value between 0 and infinity indicating the min. number of
+      #   substrates a kinase must have to be included in the heatmap
+      m_cutoff = 1,
+      # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
+      #   for indicating significant kinases in the heatmap
+      p_cutoff = 0.05,
+      # a binary input of TRUE or FALSE, indicating whether or not to perform
+      #   hierarchical clustering of the sample columns
+      sample_cluster = TRUE,
+      # a binary input of TRUE or FALSE, indicating whether or not to export
+      #   the heatmap as a .png image into the working directory
+      export = FALSE,
+      # additional arguments to gplots::heatmap.2, such as:
+      # - main: main title of plot
+      # - xlab: x-axis label
+      # - ylab: y-axis label
+      xlab = "Contrast",
+      ylab = "Kinase",
+      # print which kinases:
+      # - 1 : all kinases
+      # - 2 : significant kinases
+      # - 3 : non-significant kinases
+      which_kinases = which_kinases
+    )
+    cat("\\begin{center}\n")
+    cat("Color intensities reflects $z$-score magnitudes; hue reflects $z$-score sign.  Asterisks reflect significance.\n")
+    cat("\\end{center}\n")
+  } # end for (i in ...
+} # end if (length ...
+
+for (i_cntrst in seq_len(length(rslt$score_list))) {
+  next_index <- i_cntrst
+  cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
+  cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
+  cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
+  contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
+  contrast_longlabel <- (
+    sprintf(
+      "Trt %s {%s} -> Trt %s {%s}",
+      contrast_metadata_df[i_cntrst, "b_level"],
+      gsub(
+        pattern = ";",
+        replacement = ", ",
+        x = contrast_metadata_df[i_cntrst, "b_samples"],
+        fixed = TRUE
+      ),
+      contrast_metadata_df[i_cntrst, "a_level"],
+      gsub(
+        pattern = ";",
+        replacement = ", ",
+        x = contrast_metadata_df[i_cntrst, "a_samples"],
+        fixed = TRUE
+      )
+    )
+  )
+  main_title <- (
+      sprintf(
+      "Change from treatment %s to treatment %s",
+      contrast_metadata_df[i_cntrst, "b_level"],
+      contrast_metadata_df[i_cntrst, "a_level"]
+    )
+  )
+  sub_title <- contrast_longlabel
+  tryCatch(
+    expr = {
+        ksea_scores_rslt <- rslt$score_list[[next_index]]
+
+        if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
+          low_fdr_barplot(
+            rslt = rslt,
+            i_cntrst = i_cntrst,
+            i = next_index,
+            a_level = cntrst_a_level,
+            b_level = cntrst_b_level,
+            fold_change = cntrst_fold_change,
+            caption = contrast_longlabel
+            )
+        }
+      },
+    error = function(e) str(e)
+  )
+}
+```
+
+```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+
+# Use enriched kinases to find enriched kinase-substrate pairs
+enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash))
+all_enriched_substrates <- sqldf("
+  SELECT
+    gene AS kinase,
+    ppep,
+    '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label
+  FROM (
+    SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
+      FROM pseudo_ksdata
+      WHERE GENE IN (SELECT kinase FROM enriched_kinases)
+    )
+  GROUP BY ppep
+  ")
+
+# helper used to label per-kinase substrate enrichment figure
+cat_enriched_heading <- function(m, cut_args) {
+  cutoff <- cut_args$cutoff
+  kinase <- cut_args$kinase
+  statistic <- cut_args$statistic
+  threshold <- cut_args$threshold
+  cat("\\newpage\n")
+  if (nrow(m) > intensity_hm_rows) {
+    subsection_header(
+      paste(
+        sprintf(
+          "Lowest p-valued %d (of %d) enriched %s-substrates,",
+          intensity_hm_rows,
+          nrow(m),
+          kinase
+        ),
+        sprintf(" KSEA %s < %0.2f\n", statistic, threshold)
+      )
+    )
+  } else {
+    if (nrow(m) == 1) {
+      return(FALSE)
+    } else {
+      subsection_header(
+        paste(
+          sprintf(
+            "%d enriched %s-substrates,",
+            nrow(m),
+            kinase
+            ),
+          sprintf(
+            " KSEA %s < %0.2f\n",
+            statistic,
+            threshold
+            )
+        )
+      )
+    }
+  }
+  cat("\n\n\n")
+  cat("\n\n\n")
+  return(TRUE)
+}
+
+# Disabling heatmaps for substrates pending decision whether to eliminate them altogether
+if (FALSE)
+  for (kinase_name in sort(enriched_kinases$kinase)) {
+    enriched_substrates <-
+      all_enriched_substrates[
+        all_enriched_substrates$kinase == kinase_name,
+        ,
+        drop = FALSE
+        ]
+    # Get the intensity values for the heatmap
+    enriched_intensities <-
+      as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE])
+    # Remove rows having too many NA values to be relevant
+    na_counter <- is.na(enriched_intensities)
+    na_counts <- apply(na_counter, 1, sum)
+    enriched_intensities <-
+      enriched_intensities[na_counts < ncol(enriched_intensities) / 2, , drop = FALSE]
+    # Rename the rows with the display-name for the heatmap
+    rownames(enriched_intensities) <-
+      sapply(
+        X = rownames(enriched_intensities),
+        FUN = function(rn) {
+          enriched_substrates[enriched_substrates$ppep == rn, "label"]
+        }
+      )
+    # Format as matrix for heatmap
+    m <- as.matrix(enriched_intensities)
+    # Draw the heading and heatmap
+    if (nrow(m) > 0) {
+      cut_args <- new_env()
+      cut_args$cutoff <- cutoff
+      cut_args$kinase <- kinase_name
+      cut_args$statistic <- ksea_cutoff_statistic
+      cut_args$threshold <- ksea_cutoff_threshold
+      number_of_peptides_found <-
+        draw_intensity_heatmap(
+          m                       = m,
+          cutoff                  = cut_args,
+          hm_heading_function     = cat_enriched_heading,
+          hm_main_title
+            = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates",
+          suppress_row_dendrogram = FALSE
+        )
+    }
+  }
+
+# Write output tabular files
+
+# get kinase, ppep, concat(kinase) tuples for enriched kinases
+
+kinase_ppep_label <- sqldf("
+  WITH
+    t(ppep, label) AS
+      (
+        SELECT DISTINCT
+            SUB_MOD_RSD AS ppep,
+            group_concat(gene, '; ') AS label
+          FROM pseudo_ksdata
+          WHERE GENE IN (SELECT kinase FROM enriched_kinases)
+          GROUP BY ppep
+      ),
+    k(kinase, ppep_join) AS
+      (
+      SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join
+        FROM pseudo_ksdata
+        WHERE GENE IN (SELECT kinase FROM enriched_kinases)
+      )
+  SELECT k.kinase, t.ppep, t.label
+    FROM  t,  k
+    WHERE t.ppep = k.ppep_join
+    ORDER BY k.kinase, t.ppep
+  ")
+
+# extract what we need from full_data
+impish <- cbind(rownames(quant_data_imp), quant_data_imp)
+colnames(impish)[1] <- "Phosphopeptide"
+data_table_imputed_sql <- "
+  SELECT
+    f.*,
+    k.label AS KSEA_enrichments,
+    q.*
+  FROM
+    metadata_plus_p f
+      LEFT JOIN kinase_ppep_label k
+        ON f.Phosphopeptide = k.ppep,
+    impish q
+  WHERE
+    f.Phosphopeptide = q.Phosphopeptide
+  "
+data_table_imputed <- sqldf(data_table_imputed_sql)
+# Zap the duplicated 'Phosphopeptide' column named 'ppep'
+data_table_imputed <-
+    data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
+
+# Output with imputed, un-normalized data
+
+write.table(
+    data_table_imputed
+  , file = imputed_data_filename
+  , sep = "\t"
+  , col.names = TRUE
+  , row.names = FALSE
+  , quote = FALSE
+  )
+
+
+#output quantile normalized data
+impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log)
+colnames(impish)[1] <- "Phosphopeptide"
+data_table_imputed <- sqldf(data_table_imputed_sql)
+# Zap the duplicated 'Phosphopeptide' column named 'ppep'
+data_table_imputed <-
+    data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
+write.table(
+  data_table_imputed,
+  file = imp_qn_lt_data_filenm,
+  sep = "\t",
+  col.names = TRUE,
+  row.names = FALSE,
+  quote = FALSE
+)
+
+ppep_kinase <- sqldf("
+  SELECT DISTINCT k.ppep, k.kinase
+    FROM (
+      SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep
+        FROM pseudo_ksdata
+        WHERE GENE IN (SELECT kinase FROM enriched_kinases)
+      ) k
+    ORDER BY k.ppep, k.kinase
+  ")
+
+RSQLite::dbWriteTable(
+  conn = db,
+  name = "ksea_enriched_ks",
+  value = ppep_kinase,
+  append = FALSE
+  )
+
+RSQLite::dbWriteTable(
+  conn = db,
+  name = "anova_signif",
+  value = p_value_data,
+  append = FALSE
+  )
+
+  ddl_exec(db, "
+    DROP VIEW IF EXISTS stats_metadata_v;
+    "
+  )
+  dml_no_rows_exec(db, "
+      CREATE VIEW stats_metadata_v
+        AS
+      SELECT DISTINCT  m.*,
+          p.raw_anova_p,
+          p.fdr_adjusted_anova_p,
+          kek.kinase AS ksea_enrichments
+        FROM
+          mrgfltr_metadata_view m
+            LEFT JOIN anova_signif p
+              ON m.phospho_peptide = p.phosphopeptide
+            LEFT JOIN ksea_enriched_ks kek
+              ON m.phospho_peptide = kek.ppep
+      ;
+    "
+  )
+
+write.table(
+  dbReadTable(db, "stats_metadata_v"),
+  file = anova_ksea_mtdt_file,
+  sep = "\t",
+  col.names = TRUE,
+  row.names = FALSE,
+  quote = FALSE
+  )
+
+
+```
+
+```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+cat("\\leavevmode\n\n\n")
+
+# write parameters to report
+
+param_unlist <- unlist(as.list(params))
+param_df <- data.frame(
+  parameter = paste0("\\verb@", names(param_unlist), "@"),
+  value = paste0("\\verb@", gsub("$", "\\$", param_unlist, fixed = TRUE), "@")
+  )
+
+data_frame_latex(
+  x = param_df,
+  justification = "p{0.35\\linewidth} p{0.6\\linewidth}",
+  centered = TRUE,
+  caption = "Input parameters",
+  anchor = const_table_anchor_bp,
+  underscore_whack = FALSE
+  )
+
+# write parameters to SQLite output
+
+mqppep_anova_script_param_df <- data.frame(
+  script    = "mqppep_anova_script.Rmd",
+  parameter = names(param_unlist),
+  value     = param_unlist
+  )
+ddl_exec(db, "
+  DROP TABLE IF EXISTS script_parameter;
+  "
+)
+ddl_exec(db, "
+  CREATE TABLE IF NOT EXISTS script_parameter(
+    script    TEXT,
+    parameter TEXT,
+    value     ANY,
+    UNIQUE (script, parameter) ON CONFLICT REPLACE
+    )
+    ;
+  "
+)
+RSQLite::dbWriteTable(
+  conn = db,
+  name = "script_parameter",
+  value = mqppep_anova_script_param_df,
+  append = TRUE
+)
+
+# We are done with output
+RSQLite::dbDisconnect(db)
+```
+<!--
+There's gotta be a better way...
+
+loaded_packages_df <-  sessioninfo::package_info("loaded")
+loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library)
+loaded_packages_df <- data.frame(
+  package = loaded_packages_df$package,
+  version = loaded_packages_df$loadedversion,
+  date    = loaded_packages_df$date
+  )
+data_frame_latex(
+  x = loaded_packages_df,
+  justification = "l | l l",
+  centered = FALSE,
+  caption = "Loaded R packages",
+  anchor = const_table_anchor_bp
+  )
+-->
author	galaxyp
date	Mon, 11 Jul 2022 19:22:54 +0000
parents
children	b76c75521d91