Mercurial > repos > galaxyp > mqppep_preproc
diff mqppep_anova.R @ 0:8dfd5d2b5903 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author | galaxyp |
---|---|
date | Mon, 11 Jul 2022 19:22:54 +0000 |
parents | |
children | b76c75521d91 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mqppep_anova.R Mon Jul 11 19:22:54 2022 +0000 @@ -0,0 +1,297 @@ +#!/usr/bin/env Rscript +# libraries +library(optparse) +library(data.table) +library(stringr) + +# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 + +# parse options +option_list <- list( + make_option( + c("-i", "--inputFile"), + action = "store", + default = NA, + type = "character", + help = "Phosphopeptide Intensities sparse input file path" + ), + make_option( + c("-a", "--alphaFile"), + action = "store", + default = NA, + type = "character", + help = paste0("List of alpha cutoff values for significance testing;", + " path to text file having one column and no header") + ), + make_option( + c("-S", "--preproc_sqlite"), + action = "store", + default = NA, + type = "character", + help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`" + ), + make_option( + c("-K", "--ksea_sqlite"), + action = "store", + default = NA, + type = "character", + help = "Path to 'ksea_sqlite' output produced by this tool" + ), + make_option( + c("-f", "--firstDataColumn"), + action = "store", + default = "^Intensity[^_]", + type = "character", + help = "First column of intensity values" + ), + make_option( + c("-m", "--imputationMethod"), + action = "store", + default = "random", + type = "character", + help = paste0("Method for missing-value imputation,", + " one of c('group-median','median','mean','random')") + ), + make_option( + c("-p", "--meanPercentile"), + action = "store", + default = 3, + type = "integer", + help = paste0("Mean percentile for randomly generated imputed values;", + ", range [1,99]") + ), + make_option( + c("-d", "--sdPercentile"), + action = "store", + default = 3, + type = "double", + help = paste0("Adjustment value for standard deviation of", + " randomly generated imputed values; real") + ), + make_option( + c("-s", "--regexSampleNames"), + action = "store", + default = "\\.(\\d+)[A-Z]$", + type = "character", + help = "Regular expression extracting sample-names" + ), + make_option( + c("-g", "--regexSampleGrouping"), + action = "store", + default = "(\\d+)", + type = "character", + help = paste0("Regular expression extracting sample-group", + " from an extracted sample-name") + ), + make_option( + c("-o", "--imputedDataFile"), + action = "store", + default = "output_imputed.tsv", + type = "character", + help = "Imputed Phosphopeptide Intensities output file path" + ), + make_option( + c("-n", "--imputedQNLTDataFile"), + action = "store", + default = "output_imp_qn_lt.tsv", + type = "character", + help = + paste( + "Imputed, Quantile-Normalized Log-Transformed Phosphopeptide", + "Intensities output file path" + ) + ), + make_option( + c("-r", "--reportFile"), + action = "store", + default = "QuantDataProcessingScript.html", + type = "character", + help = "HTML report file path" + ), + make_option( + c("-k", "--ksea_cutoff_statistic"), + action = "store", + default = "FDR", + type = "character", + help = paste0("Method for missing-value imputation,", + " one of c('FDR','p.value'), but don't expect 'p.value' to work well.") + ), + make_option( + c("-t", "--ksea_cutoff_threshold"), + action = "store", + default = 0.05, + type = "double", + help = paste0("Maximum score to be used to score a kinase enrichment as significant") + ), + make_option( + c("-M", "--anova_ksea_metadata"), + action = "store", + default = "anova_ksea_metadata.tsv", + type = "character", + help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments" + ) +) +args <- parse_args(OptionParser(option_list = option_list)) +print("args is:") +cat(str(args)) + +# Check parameter values + +if (! file.exists(args$inputFile)) { + stop((paste("Input file", args$inputFile, "does not exist"))) +} +input_file <- args$inputFile +alpha_file <- args$alphaFile +preproc_sqlite <- args$preproc_sqlite +imputed_data_file_name <- args$imputedDataFile +imp_qn_lt_data_filenm <- args$imputedQNLTDataFile +anova_ksea_metadata <- args$anova_ksea_metadata +report_file_name <- args$reportFile +ksea_sqlite <- args$ksea_sqlite +ksea_cutoff_statistic <- args$ksea_cutoff_statistic +ksea_cutoff_threshold <- args$ksea_cutoff_threshold +if ( + sum( + grepl( + pattern = ksea_cutoff_statistic, + x = c("FDR", "p.value") + ) + ) < 1 + ) { + print(sprintf("bad ksea_cutoff_statistic argument: %s", ksea_cutoff_statistic)) + return(-1) + } + +imputation_method <- args$imputationMethod +if ( + sum( + grepl( + pattern = imputation_method, + x = c("group-median", "median", "mean", "random") + ) + ) < 1 + ) { + print(sprintf("bad imputationMethod argument: %s", imputation_method)) + return(-1) + } + +# read with default values, when applicable +mean_percentile <- args$meanPercentile +sd_percentile <- args$sdPercentile +# in the case of 'random" these values are ignored by the client script +if (imputation_method == "random") { + print("mean_percentile is:") + cat(str(mean_percentile)) + + print("sd_percentile is:") + cat(str(mean_percentile)) +} + +# convert string parameters that are passed in via config files: +# - firstDataColumn +# - regexSampleNames +# - regexSampleGrouping +read_config_file_string <- function(fname, limit) { + # eliminate any leading whitespace + result <- gsub("^[ \t\n]*", "", readChar(fname, limit)) + # eliminate any trailing whitespace + result <- gsub("[ \t\n]*$", "", result) + # substitute characters escaped by Galaxy sanitizer + result <- gsub("__lt__", "<", result) + result <- gsub("__le__", "<=", result) + result <- gsub("__eq__", "==", result) + result <- gsub("__ne__", "!=", result) + result <- gsub("__gt__", ">", result) + result <- gsub("__ge__", ">=", result) + result <- gsub("__sq__", "'", result) + result <- gsub("__dq__", '"', result) + result <- gsub("__ob__", "[", result) + result <- gsub("__cb__", "]", result) +} +cat(paste0("first_data_column file: ", args$firstDataColumn, "\n")) +cat(paste0("regex_sample_names file: ", args$regexSampleNames, "\n")) +cat(paste0("regex_sample_grouping file: ", args$regexSampleGrouping, "\n")) +nc <- 1000 +regex_sample_names <- read_config_file_string(args$regexSampleNames, nc) +regex_sample_grouping <- read_config_file_string(args$regexSampleGrouping, nc) +first_data_column <- read_config_file_string(args$firstDataColumn, nc) +cat(paste0("first_data_column: ", first_data_column, "\n")) +cat(paste0("regex_sample_names: ", regex_sample_names, "\n")) +cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\n")) + +# from: https://github.com/molgenis/molgenis-pipelines/wiki/ +# How-to-source-another_file.R-from-within-your-R-script +# Function location_of_this_script returns the location of this .R script +# (may be needed to source other files in same dir) +location_of_this_script <- function() { + this_file <- NULL + # This file may be 'sourced' + for (i in - (1:sys.nframe())) { + if (identical(sys.function(i), base::source)) { + this_file <- (normalizePath(sys.frame(i)$ofile)) + } + } + + if (!is.null(this_file)) return(dirname(this_file)) + + # But it may also be called from the command line + cmd_args <- commandArgs(trailingOnly = FALSE) + cmd_args_trailing <- commandArgs(trailingOnly = TRUE) + cmd_args <- cmd_args[ + seq.int( + from = 1, + length.out = length(cmd_args) - length(cmd_args_trailing) + ) + ] + res <- gsub("^(?:--file=(.*)|.*)$", "\\1", cmd_args) + + # If multiple --file arguments are given, R uses the last one + res <- tail(res[res != ""], 1) + if (0 < length(res)) return(dirname(res)) + + # Both are not the case. Maybe we are in an R GUI? + return(NULL) +} + +script_dir <- location_of_this_script() + +rmarkdown_params <- list( + inputFile = input_file + , alphaFile = alpha_file + , preprocDb = preproc_sqlite + , firstDataColumn = first_data_column + , imputationMethod = imputation_method + , meanPercentile = mean_percentile + , sdPercentile = sd_percentile + , regexSampleNames = regex_sample_names + , regexSampleGrouping = regex_sample_grouping + , imputedDataFilename = imputed_data_file_name + , imputedQNLTDataFile = imp_qn_lt_data_filenm + , anovaKseaMetadata = anova_ksea_metadata + , kseaAppPrepDb = ksea_sqlite + , kseaCutoffThreshold = ksea_cutoff_threshold + , kseaCutoffStatistic = ksea_cutoff_statistic + ) + +print("rmarkdown_params") +str(rmarkdown_params) + +# freeze the random number generator so the same results will be produced +# from run to run +set.seed(28571) + +# BUG (or "opportunity") +# To render as PDF for the time being requires installing the conda +# package `r-texlive` until this issue in `texlive-core` is resolved: +# https://github.com/conda-forge/texlive-core-feedstock/issues/19 +# This workaround is detailed in the fourth comment of: +# https://github.com/conda-forge/texlive-core-feedstock/issues/61 + +library(tinytex) +tinytex::install_tinytex() +rmarkdown::render( + input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/") +, output_format = rmarkdown::pdf_document(toc = TRUE) +, output_file = report_file_name +, params = rmarkdown_params +)