diff mqppep_anova.R @ 0:8dfd5d2b5903 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author galaxyp
date Mon, 11 Jul 2022 19:22:54 +0000
parents
children b76c75521d91
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mqppep_anova.R	Mon Jul 11 19:22:54 2022 +0000
@@ -0,0 +1,297 @@
+#!/usr/bin/env Rscript
+# libraries
+library(optparse)
+library(data.table)
+library(stringr)
+
+# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
+
+# parse options
+option_list <- list(
+  make_option(
+    c("-i", "--inputFile"),
+    action = "store",
+    default = NA,
+    type = "character",
+    help = "Phosphopeptide Intensities sparse input file path"
+  ),
+  make_option(
+    c("-a", "--alphaFile"),
+    action = "store",
+    default = NA,
+    type = "character",
+    help = paste0("List of alpha cutoff values for significance testing;",
+             " path to text file having one column and no header")
+  ),
+  make_option(
+    c("-S", "--preproc_sqlite"),
+    action = "store",
+    default = NA,
+    type = "character",
+    help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`"
+  ),
+  make_option(
+    c("-K", "--ksea_sqlite"),
+    action = "store",
+    default = NA,
+    type = "character",
+    help = "Path to 'ksea_sqlite' output produced by this tool"
+  ),
+  make_option(
+    c("-f", "--firstDataColumn"),
+    action = "store",
+    default = "^Intensity[^_]",
+    type = "character",
+    help = "First column of intensity values"
+  ),
+  make_option(
+    c("-m", "--imputationMethod"),
+    action = "store",
+    default = "random",
+    type = "character",
+    help = paste0("Method for missing-value imputation,",
+             " one of c('group-median','median','mean','random')")
+  ),
+  make_option(
+    c("-p", "--meanPercentile"),
+    action = "store",
+    default = 3,
+    type = "integer",
+    help = paste0("Mean percentile for randomly generated imputed values;",
+              ", range [1,99]")
+  ),
+  make_option(
+    c("-d", "--sdPercentile"),
+    action = "store",
+    default = 3,
+    type = "double",
+    help = paste0("Adjustment value for standard deviation of",
+              " randomly generated imputed values; real")
+  ),
+  make_option(
+    c("-s", "--regexSampleNames"),
+    action = "store",
+    default = "\\.(\\d+)[A-Z]$",
+    type = "character",
+    help = "Regular expression extracting sample-names"
+  ),
+  make_option(
+    c("-g", "--regexSampleGrouping"),
+    action = "store",
+    default = "(\\d+)",
+    type = "character",
+    help = paste0("Regular expression extracting sample-group",
+             " from an extracted sample-name")
+  ),
+  make_option(
+    c("-o", "--imputedDataFile"),
+    action = "store",
+    default = "output_imputed.tsv",
+    type = "character",
+    help = "Imputed Phosphopeptide Intensities output file path"
+  ),
+  make_option(
+    c("-n", "--imputedQNLTDataFile"),
+    action = "store",
+    default = "output_imp_qn_lt.tsv",
+    type = "character",
+    help =
+      paste(
+        "Imputed, Quantile-Normalized Log-Transformed Phosphopeptide",
+        "Intensities output file path"
+        )
+  ),
+  make_option(
+    c("-r", "--reportFile"),
+    action = "store",
+    default = "QuantDataProcessingScript.html",
+    type = "character",
+    help = "HTML report file path"
+  ),
+  make_option(
+    c("-k", "--ksea_cutoff_statistic"),
+    action = "store",
+    default = "FDR",
+    type = "character",
+    help = paste0("Method for missing-value imputation,",
+             " one of c('FDR','p.value'), but don't expect 'p.value' to work well.")
+  ),
+  make_option(
+    c("-t", "--ksea_cutoff_threshold"),
+    action = "store",
+    default = 0.05,
+    type = "double",
+    help = paste0("Maximum score to be used to score a kinase enrichment as significant")
+  ),
+  make_option(
+    c("-M", "--anova_ksea_metadata"),
+    action = "store",
+    default = "anova_ksea_metadata.tsv",
+    type = "character",
+    help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments"
+  )
+)
+args <- parse_args(OptionParser(option_list = option_list))
+print("args is:")
+cat(str(args))
+
+# Check parameter values
+
+if (! file.exists(args$inputFile)) {
+  stop((paste("Input file", args$inputFile, "does not exist")))
+}
+input_file             <- args$inputFile
+alpha_file             <- args$alphaFile
+preproc_sqlite         <- args$preproc_sqlite
+imputed_data_file_name <- args$imputedDataFile
+imp_qn_lt_data_filenm  <- args$imputedQNLTDataFile
+anova_ksea_metadata    <- args$anova_ksea_metadata
+report_file_name       <- args$reportFile
+ksea_sqlite            <- args$ksea_sqlite
+ksea_cutoff_statistic  <- args$ksea_cutoff_statistic
+ksea_cutoff_threshold  <- args$ksea_cutoff_threshold
+if (
+  sum(
+    grepl(
+      pattern = ksea_cutoff_statistic,
+      x = c("FDR", "p.value")
+      )
+    ) < 1
+  ) {
+    print(sprintf("bad ksea_cutoff_statistic argument: %s", ksea_cutoff_statistic))
+    return(-1)
+    }
+
+imputation_method <- args$imputationMethod
+if (
+  sum(
+    grepl(
+      pattern = imputation_method,
+      x = c("group-median", "median", "mean", "random")
+      )
+    ) < 1
+  ) {
+    print(sprintf("bad imputationMethod argument: %s", imputation_method))
+    return(-1)
+    }
+
+# read with default values, when applicable
+mean_percentile <- args$meanPercentile
+sd_percentile   <- args$sdPercentile
+# in the case of 'random" these values are ignored by the client script
+if (imputation_method == "random") {
+  print("mean_percentile is:")
+  cat(str(mean_percentile))
+
+  print("sd_percentile is:")
+  cat(str(mean_percentile))
+}
+
+# convert string parameters that are passed in via config files:
+#  - firstDataColumn
+#  - regexSampleNames
+#  - regexSampleGrouping
+read_config_file_string <- function(fname, limit) {
+  # eliminate any leading whitespace
+  result    <- gsub("^[ \t\n]*", "", readChar(fname, limit))
+  # eliminate any trailing whitespace
+  result    <- gsub("[ \t\n]*$", "", result)
+  # substitute characters escaped by Galaxy sanitizer
+  result <- gsub("__lt__", "<",  result)
+  result <- gsub("__le__", "<=", result)
+  result <- gsub("__eq__", "==", result)
+  result <- gsub("__ne__", "!=", result)
+  result <- gsub("__gt__", ">",  result)
+  result <- gsub("__ge__", ">=", result)
+  result <- gsub("__sq__", "'",  result)
+  result <- gsub("__dq__", '"',  result)
+  result <- gsub("__ob__", "[",  result)
+  result <- gsub("__cb__", "]",  result)
+}
+cat(paste0("first_data_column file: ", args$firstDataColumn, "\n"))
+cat(paste0("regex_sample_names file: ", args$regexSampleNames, "\n"))
+cat(paste0("regex_sample_grouping file: ", args$regexSampleGrouping, "\n"))
+nc <- 1000
+regex_sample_names <- read_config_file_string(args$regexSampleNames, nc)
+regex_sample_grouping <- read_config_file_string(args$regexSampleGrouping, nc)
+first_data_column <- read_config_file_string(args$firstDataColumn,  nc)
+cat(paste0("first_data_column: ",     first_data_column,     "\n"))
+cat(paste0("regex_sample_names: ",    regex_sample_names,    "\n"))
+cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\n"))
+
+# from: https://github.com/molgenis/molgenis-pipelines/wiki/
+#   How-to-source-another_file.R-from-within-your-R-script
+# Function location_of_this_script returns the location of this .R script
+#   (may be needed to source other files in same dir)
+location_of_this_script <- function() {
+    this_file <- NULL
+    # This file may be 'sourced'
+    for (i in - (1:sys.nframe())) {
+        if (identical(sys.function(i), base::source)) {
+            this_file <- (normalizePath(sys.frame(i)$ofile))
+        }
+    }
+
+    if (!is.null(this_file)) return(dirname(this_file))
+
+    # But it may also be called from the command line
+    cmd_args <- commandArgs(trailingOnly = FALSE)
+    cmd_args_trailing <- commandArgs(trailingOnly = TRUE)
+    cmd_args <- cmd_args[
+      seq.int(
+        from = 1,
+        length.out = length(cmd_args) - length(cmd_args_trailing)
+        )
+      ]
+    res <- gsub("^(?:--file=(.*)|.*)$", "\\1", cmd_args)
+
+    # If multiple --file arguments are given, R uses the last one
+    res <- tail(res[res != ""], 1)
+    if (0 < length(res)) return(dirname(res))
+
+    # Both are not the case. Maybe we are in an R GUI?
+    return(NULL)
+}
+
+script_dir <-  location_of_this_script()
+
+rmarkdown_params <- list(
+    inputFile = input_file
+  , alphaFile = alpha_file
+  , preprocDb = preproc_sqlite
+  , firstDataColumn = first_data_column
+  , imputationMethod = imputation_method
+  , meanPercentile = mean_percentile
+  , sdPercentile = sd_percentile
+  , regexSampleNames = regex_sample_names
+  , regexSampleGrouping = regex_sample_grouping
+  , imputedDataFilename = imputed_data_file_name
+  , imputedQNLTDataFile = imp_qn_lt_data_filenm
+  , anovaKseaMetadata = anova_ksea_metadata
+  , kseaAppPrepDb = ksea_sqlite
+  , kseaCutoffThreshold = ksea_cutoff_threshold
+  , kseaCutoffStatistic = ksea_cutoff_statistic
+  )
+
+print("rmarkdown_params")
+str(rmarkdown_params)
+
+# freeze the random number generator so the same results will be produced
+#  from run to run
+set.seed(28571)
+
+# BUG (or "opportunity")
+# To render as PDF for the time being requires installing the conda
+# package `r-texlive` until this issue in `texlive-core` is resolved:
+#   https://github.com/conda-forge/texlive-core-feedstock/issues/19
+# This workaround is detailed in the fourth comment of:
+#   https://github.com/conda-forge/texlive-core-feedstock/issues/61
+
+library(tinytex)
+tinytex::install_tinytex()
+rmarkdown::render(
+  input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/")
+, output_format = rmarkdown::pdf_document(toc = TRUE)
+, output_file = report_file_name
+, params = rmarkdown_params
+)