comparison mqppep_anova.R @ 0:8dfd5d2b5903 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author galaxyp
date Mon, 11 Jul 2022 19:22:54 +0000
parents
children b76c75521d91
comparison
equal deleted inserted replaced
-1:000000000000 0:8dfd5d2b5903
1 #!/usr/bin/env Rscript
2 # libraries
3 library(optparse)
4 library(data.table)
5 library(stringr)
6
7 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
8
# parse options
#
# Command-line interface of this driver script.  Every option is declared
# with optparse::make_option(); parse_args() returns a list whose elements
# are named after the long option names.
#
# NOTE: further down in this script the values of --firstDataColumn,
# --regexSampleNames, and --regexSampleGrouping are passed to
# read_config_file_string(), i.e. they are treated as paths of files that
# contain the pattern rather than as the pattern itself.
option_list <- list(
  make_option(
    c("-i", "--inputFile"),
    action = "store",
    default = NA,
    type = "character",
    help = "Phosphopeptide Intensities sparse input file path"
  ),
  make_option(
    c("-a", "--alphaFile"),
    action = "store",
    default = NA,
    type = "character",
    help = paste0("List of alpha cutoff values for significance testing;",
             " path to text file having one column and no header")
  ),
  make_option(
    c("-S", "--preproc_sqlite"),
    action = "store",
    default = NA,
    type = "character",
    help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`"
  ),
  make_option(
    c("-K", "--ksea_sqlite"),
    action = "store",
    default = NA,
    type = "character",
    help = "Path to 'ksea_sqlite' output produced by this tool"
  ),
  make_option(
    c("-f", "--firstDataColumn"),
    action = "store",
    default = "^Intensity[^_]",
    type = "character",
    help = "First column of intensity values"
  ),
  make_option(
    c("-m", "--imputationMethod"),
    action = "store",
    default = "random",
    type = "character",
    help = paste0("Method for missing-value imputation,",
             " one of c('group-median','median','mean','random')")
  ),
  make_option(
    c("-p", "--meanPercentile"),
    action = "store",
    default = 3,
    type = "integer",
    # fixed: the two fragments used to join as "values;, range"
    help = paste0("Mean percentile for randomly generated imputed values;",
             " range [1,99]")
  ),
  make_option(
    c("-d", "--sdPercentile"),
    action = "store",
    default = 3,
    type = "double",
    help = paste0("Adjustment value for standard deviation of",
             " randomly generated imputed values; real")
  ),
  make_option(
    c("-s", "--regexSampleNames"),
    action = "store",
    default = "\\.(\\d+)[A-Z]$",
    type = "character",
    help = "Regular expression extracting sample-names"
  ),
  make_option(
    c("-g", "--regexSampleGrouping"),
    action = "store",
    default = "(\\d+)",
    type = "character",
    help = paste0("Regular expression extracting sample-group",
             " from an extracted sample-name")
  ),
  make_option(
    c("-o", "--imputedDataFile"),
    action = "store",
    default = "output_imputed.tsv",
    type = "character",
    help = "Imputed Phosphopeptide Intensities output file path"
  ),
  make_option(
    c("-n", "--imputedQNLTDataFile"),
    action = "store",
    default = "output_imp_qn_lt.tsv",
    type = "character",
    help =
      paste(
        "Imputed, Quantile-Normalized Log-Transformed Phosphopeptide",
        "Intensities output file path"
      )
  ),
  make_option(
    c("-r", "--reportFile"),
    action = "store",
    default = "QuantDataProcessingScript.html",
    type = "character",
    # fixed: the report is rendered with rmarkdown::pdf_document, not HTML
    help = "Report file path (the report is rendered as PDF)"
  ),
  make_option(
    c("-k", "--ksea_cutoff_statistic"),
    action = "store",
    default = "FDR",
    type = "character",
    # fixed: this help text used to describe the imputation method
    help = paste0("Statistic used as cutoff for KSEA significance,",
             " one of c('FDR','p.value'),",
             " but don't expect 'p.value' to work well.")
  ),
  make_option(
    c("-t", "--ksea_cutoff_threshold"),
    action = "store",
    default = 0.05,
    type = "double",
    help = paste0("Maximum score to be used to score",
             " a kinase enrichment as significant")
  ),
  make_option(
    c("-M", "--anova_ksea_metadata"),
    action = "store",
    default = "anova_ksea_metadata.tsv",
    type = "character",
    # fixed typo: "enribhments"
    help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enrichments"
  )
)
args <- parse_args(OptionParser(option_list = option_list))
print("args is:")
# str() prints the structure itself and returns NULL, so no cat() is needed
str(args)
137
# Check parameter values
#
# Copy the parsed options into the script-level variables used by the rest
# of this script and validate them.  Any invalid value aborts the script
# through stop(), which makes Rscript exit with a non-zero status.

input_file <- args$inputFile
# --inputFile defaults to NA and file.exists() raises an obscure
# "invalid 'file' argument" error for a logical NA, so test for a missing
# value explicitly before touching the file system.
if (is.null(input_file) || is.na(input_file) || !file.exists(input_file)) {
  stop(paste("Input file", input_file, "does not exist"), call. = FALSE)
}
alpha_file <- args$alphaFile
preproc_sqlite <- args$preproc_sqlite
imputed_data_file_name <- args$imputedDataFile
imp_qn_lt_data_filenm <- args$imputedQNLTDataFile
anova_ksea_metadata <- args$anova_ksea_metadata
report_file_name <- args$reportFile
ksea_sqlite <- args$ksea_sqlite
ksea_cutoff_statistic <- args$ksea_cutoff_statistic
ksea_cutoff_threshold <- args$ksea_cutoff_threshold
# Exact membership test: matching the argument as a regular expression
# would accept fragments such as "F" or "." as valid choices.
if (!isTRUE(ksea_cutoff_statistic %in% c("FDR", "p.value"))) {
  stop(
    sprintf("bad ksea_cutoff_statistic argument: %s", ksea_cutoff_statistic),
    call. = FALSE
  )
}

imputation_method <- args$imputationMethod
if (
  !isTRUE(
    imputation_method %in% c("group-median", "median", "mean", "random")
  )
) {
  stop(
    sprintf("bad imputationMethod argument: %s", imputation_method),
    call. = FALSE
  )
}

# read with default values, when applicable
mean_percentile <- args$meanPercentile
sd_percentile <- args$sdPercentile
# Per the option help text these two values describe the randomly
# generated imputed values, so they are only reported for "random".
if (imputation_method == "random") {
  print("mean_percentile is:")
  str(mean_percentile)

  print("sd_percentile is:")
  # fixed: this used to print mean_percentile a second time
  str(sd_percentile)
}
189
190 # convert string parameters that are passed in via config files:
191 # - firstDataColumn
192 # - regexSampleNames
193 # - regexSampleGrouping
# Read one short parameter string from a config file.
#
# Arguments:
#   fname: path of the file to read (handed to readChar)
#   limit: maximum number of characters to read from the file
# Value (returned invisibly): the text that was read, with spaces, tabs,
#   and newlines removed from both ends and with the tokens produced by
#   the Galaxy sanitizer replaced by the characters they stand for.
read_config_file_string <- function(fname, limit) {
  raw_text <- readChar(fname, limit)
  # strip whitespace from the front, then from the back
  text <- gsub("[ \t\n]*$", "", gsub("^[ \t\n]*", "", raw_text))
  # sanitizer token -> original character(s); the substitutions are
  # applied one after another in the order listed here
  unescape <- c(
    "__lt__" = "<",
    "__le__" = "<=",
    "__eq__" = "==",
    "__ne__" = "!=",
    "__gt__" = ">",
    "__ge__" = ">=",
    "__sq__" = "'",
    "__dq__" = '"',
    "__ob__" = "[",
    "__cb__" = "]"
  )
  for (token in names(unescape)) {
    text <- gsub(token, unescape[[token]], text)
  }
  invisible(text)
}
# Report the config-file paths given for the three pattern parameters,
# read each pattern from its file, then report the patterns that were read.
cat("first_data_column file: ", args$firstDataColumn, "\n", sep = "")
cat("regex_sample_names file: ", args$regexSampleNames, "\n", sep = "")
cat("regex_sample_grouping file: ", args$regexSampleGrouping, "\n", sep = "")
# maximum number of characters read from each config file
nc <- 1000
regex_sample_names <- read_config_file_string(
  fname = args$regexSampleNames,
  limit = nc
)
regex_sample_grouping <- read_config_file_string(
  fname = args$regexSampleGrouping,
  limit = nc
)
first_data_column <- read_config_file_string(
  fname = args$firstDataColumn,
  limit = nc
)
cat("first_data_column: ", first_data_column, "\n", sep = "")
cat("regex_sample_names: ", regex_sample_names, "\n", sep = "")
cat("regex_sample_grouping: ", regex_sample_grouping, "\n", sep = "")
221
222 # from: https://github.com/molgenis/molgenis-pipelines/wiki/
223 # How-to-source-another_file.R-from-within-your-R-script
224 # Function location_of_this_script returns the location of this .R script
225 # (may be needed to source other files in same dir)
# Find the directory that holds this script, so that companion files in
# the same directory can be located.
#
# Value: the directory name (character), or NULL when it can be derived
#   neither from the call stack nor from the command line.
location_of_this_script <- function() {
  sourced_path <- NULL
  # Case 1: the file is being run through source().  Every calling frame
  # is inspected; when more than one belongs to base::source, the last
  # one visited determines the result.
  for (offset in -seq_len(sys.nframe())) {
    if (identical(sys.function(offset), base::source)) {
      sourced_path <- normalizePath(sys.frame(offset)$ofile)
    }
  }
  if (!is.null(sourced_path)) {
    return(dirname(sourced_path))
  }

  # Case 2: the script was started from the command line.  Only the
  # arguments that precede the trailing (user-supplied) ones are examined.
  all_args <- commandArgs(trailingOnly = FALSE)
  n_leading <- length(all_args) - length(commandArgs(trailingOnly = TRUE))
  leading_args <- head(all_args, n_leading)
  # keep the text following "--file=" and blank out every other argument
  file_args <- gsub("^(?:--file=(.*)|.*)$", "\\1", leading_args)
  file_args <- file_args[file_args != ""]

  # when several --file arguments are given, R uses the last one
  if (length(file_args) > 0) {
    return(dirname(tail(file_args, 1)))
  }

  # Neither case applies (perhaps an R GUI session).
  NULL
}
255
# directory of this script; the R Markdown template is looked up there
script_dir <- location_of_this_script()

# values passed as `params` to the R Markdown document
rmarkdown_params <- list(
  inputFile = input_file,
  alphaFile = alpha_file,
  preprocDb = preproc_sqlite,
  firstDataColumn = first_data_column,
  imputationMethod = imputation_method,
  meanPercentile = mean_percentile,
  sdPercentile = sd_percentile,
  regexSampleNames = regex_sample_names,
  regexSampleGrouping = regex_sample_grouping,
  imputedDataFilename = imputed_data_file_name,
  imputedQNLTDataFile = imp_qn_lt_data_filenm,
  anovaKseaMetadata = anova_ksea_metadata,
  kseaAppPrepDb = ksea_sqlite,
  kseaCutoffThreshold = ksea_cutoff_threshold,
  kseaCutoffStatistic = ksea_cutoff_statistic
)

print("rmarkdown_params")
str(rmarkdown_params)

# fix the state of the random number generator so that repeated runs
# produce the same results
set.seed(seed = 28571)

# BUG (or "opportunity")
# Rendering as PDF currently requires the conda package `r-texlive`
#   until this issue in `texlive-core` is resolved:
#     https://github.com/conda-forge/texlive-core-feedstock/issues/19
# The workaround is described in the fourth comment of:
#     https://github.com/conda-forge/texlive-core-feedstock/issues/61
#
# NOTE(review): the report is rendered as a PDF document although the
#   default value of --reportFile ends in ".html" - confirm the intent.

library(tinytex)
tinytex::install_tinytex()
rmarkdown::render(
  input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/"),
  output_format = rmarkdown::pdf_document(toc = TRUE),
  output_file = report_file_name,
  params = rmarkdown_params
)