Mercurial > repos > melpetera > intensity_checks
diff wrapper_intensity_check.R @ 7:00181e66c50f draft default tip
planemo upload for repository https://github.com/workflow4metabolomics/tools-metabolomics commit 689ab2a3db06eeed6a190f3cfa1618b9d9b7b07b
author | workflow4metabolomics |
---|---|
date | Wed, 09 Jul 2025 15:43:29 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wrapper_intensity_check.R Wed Jul 09 15:43:29 2025 +0000 @@ -0,0 +1,535 @@ +suppressPackageStartupMessages(library(argparse)) + +check.err <- function(err.stock) { + # err.stock = vector of results returned by check functions + if (length(err.stock) != 0) { + stop("\n- - - - - - - - -\n", err.stock, "\n- - - - - - - - -\n") + } +} + + +# To check if the 3 standard tables match regarding identifiers +table_match <- function(dataMatrix, sampleMetadata, variableMetadata) { + err.stock <- character() + + # Sample identifiers + dm_samples <- colnames(dataMatrix)[-1] + sm_samples <- as.character(sampleMetadata[[1]]) + # Variable identifiers + dm_vars <- as.character(dataMatrix[[1]]) + vm_vars <- as.character(variableMetadata[[1]]) + + # Check sample IDs + missing_in_sm <- setdiff(dm_samples, sm_samples) + missing_in_dm <- setdiff(sm_samples, dm_samples) + if (length(missing_in_sm) > 0 || length(missing_in_dm) > 0) { + err.stock <- c( + err.stock, + "\nData matrix and sample metadata do not match regarding sample identifiers." + ) + if (length(missing_in_sm) > 0) { + prefix <- if (length(missing_in_sm) < 4) { + "\n The " + } else { + "\n For example, the " + } + err.stock <- c( + err.stock, + prefix, + "following identifiers found in the data matrix\n", + " do not appear in the sample metadata file:\n", + " ", + paste(head(missing_in_sm, 3), collapse = "\n "), + "\n" + ) + } + if (length(missing_in_dm) > 0) { + prefix <- if (length(missing_in_dm) < 4) { + "\n The " + } else { + "\n For example, the " + } + err.stock <- c( + err.stock, + prefix, + "following identifiers found in the sample metadata file\n", + " do not appear in the data matrix:\n", + " ", + paste(head(missing_in_dm, 3), collapse = "\n "), + "\n" + ) + } + } + + # Check variable IDs + missing_in_vm <- setdiff(dm_vars, vm_vars) + missing_in_dm_var <- setdiff(vm_vars, dm_vars) + if (length(missing_in_vm) > 0 || length(missing_in_dm_var) > 0) { + err.stock <- c( + err.stock, + "\nData matrix and variable metadata do not match regarding variable identifiers." + ) + if (length(missing_in_vm) > 0) { + prefix <- if (length(missing_in_vm) < 4) { + "\n The " + } else { + "\n For example, the " + } + err.stock <- c( + err.stock, + prefix, + "following identifiers found in the data matrix\n", + " do not appear in the variable metadata file:\n", + " ", + paste(head(missing_in_vm, 3), collapse = "\n "), + "\n" + ) + } + if (length(missing_in_dm_var) > 0) { + prefix <- if (length(missing_in_dm_var) < 4) { + "\n The " + } else { + "\n For example, the " + } + err.stock <- c( + err.stock, + prefix, + "following identifiers found in the variable metadata file\n", + " do not appear in the data matrix:\n", + " ", + paste(head(missing_in_dm_var, 3), collapse = "\n "), + "\n" + ) + } + } + + if (length(err.stock) > 0) { + err.stock <- c(err.stock, "\nPlease check your data.\n") + } + + return(err.stock) +} + + +intens_check <- function( + DM.name, + SM.name, + VM.name, + method, + chosen.stat, + class.col, + test.fold, + class1, + fold.frac, + logarithm, + VM.output, + graphs.output) { + # Read input tables + DM <- read.table(DM.name, header = TRUE, sep = "\t", check.names = FALSE) + SM <- read.table(SM.name, header = TRUE, sep = "\t", check.names = FALSE) + VM <- read.table(VM.name, header = TRUE, sep = "\t", check.names = FALSE) + + # Table match check + table.check <- table_match(DM, SM, VM) + check.err(table.check) + + # Transpose and align DM + rownames(DM) <- DM[, 1] + DM <- data.frame(t(DM[, -1])) + DM <- merge(x = cbind(1:nrow(SM), SM), y = DM, by.x = 2, by.y = 0) + DM <- DM[order(DM[, 2]), ] + rownames(DM) <- DM[, 1] + DM <- DM[, -c(1:(ncol(SM) + 1))] + + stat.list <- strsplit(chosen.stat, ",")[[1]] + + # Class assignment + if (method == "no_class") { + c_class <- rep("global", nrow(DM)) + classnames <- "global" + nb_class <- 1 + test.fold <- "No" + } else { + class.col <- colnames(SM)[as.numeric(class.col)] + if (!(class.col %in% colnames(SM))) { + stop("The column ", class.col, " is not in sample metadata") + } + c_class <- as.factor(SM[, class.col]) + nb_class <- nlevels(c_class) + classnames <- levels(c_class) + if (nb_class < 2 && test.fold == "Yes") { + cat( + "The '", + class.col, + "' column contains only one class, fold calculation could not be executed.\n" + ) + } + if (nb_class > (nrow(SM)) / 3 && method == "each_class") { + cat("There are too many classes, consider reducing the number.\n") + } + if (method == "one_class") { + if (!(class1 %in% classnames)) { + stop("The '", class1, "' class does not appear in the '", class.col, "'column.") + } + c_class <- factor( + ifelse(c_class == class1, class1, "Other"), + levels = c(class1, "Other") + ) + nb_class <- nlevels(c_class) + classnames <- levels(c_class) + } + } + + # Check numeric + if (!is.numeric(as.matrix(DM))) { + findchar <- function(myval) { + ifelse( + is.na(myval), + "ok", + ifelse(is.na(as.numeric(as.character(myval))), "char", "ok") + ) + } + chardiag <- suppressWarnings(apply(DM, 2, vapply, findchar, "character")) + charlist <- which(chardiag == "char") + err.stock <- paste( + "\n- - - - - - - - -\nYour dataMatrix contains", + length(charlist), + "non-numeric value(s).\n" + ) + stop(c( + err.stock, + "The dataMatrix file is supposed to contain only numeric values.\n- - - - - - - - -\n" + )) + } + + DM <- cbind(c_class, DM) + stat.res <- t(DM[0, -1, drop = FALSE]) + names <- NULL + + mean.res <- sd.res <- med.res <- quart.res <- dec.res <- NA.res <- pct_NA.res <- fold.res <- NULL + mean.names <- sd.names <- med.names <- quart.names <- dec.names <- NA.names <- pct_NA.names <- fold.names <- NULL + graphs <- if (("NA" %in% stat.list) || (test.fold == "Yes")) 1 else 0 + data_bp <- data.frame() + + for (j in 1:nb_class) { + idx <- which(DM$c_class == classnames[j]) + # Mean + if ("mean" %in% stat.list) { + mean.res <- cbind(mean.res, colMeans(DM[idx, -1], na.rm = TRUE)) + mean.names <- cbind(mean.names, paste("Mean", classnames[j], sep = "_")) + if (j == nb_class) { + stat.res <- cbind(stat.res, mean.res) + names <- cbind(names, mean.names) + } + } + # SD + if ("sd" %in% stat.list) { + sd.res <- cbind(sd.res, apply(DM[idx, -1], 2, sd, na.rm = TRUE)) + sd.names <- cbind(sd.names, paste("Sd", classnames[j], sep = "_")) + if (j == nb_class) { + stat.res <- cbind(stat.res, sd.res) + names <- cbind(names, sd.names) + } + } + # Median + if (("median" %in% stat.list) && (!("quartile" %in% stat.list))) { + med.res <- cbind(med.res, apply(DM[idx, -1], 2, median, na.rm = TRUE)) + med.names <- cbind(med.names, paste("Median", classnames[j], sep = "_")) + if (j == nb_class) { + stat.res <- cbind(stat.res, med.res) + names <- cbind(names, med.names) + } + } + # Quartiles + if ("quartile" %in% stat.list) { + quart.res <- cbind( + quart.res, + t(apply(DM[idx, -1], 2, quantile, na.rm = TRUE)) + ) + quart.names <- cbind( + quart.names, + paste("Min", classnames[j], sep = "_"), + paste("Q1", classnames[j], sep = "_"), + paste("Median", classnames[j], sep = "_"), + paste("Q3", classnames[j], sep = "_"), + paste("Max", classnames[j], sep = "_") + ) + if (j == nb_class) { + stat.res <- cbind(stat.res, quart.res) + names <- cbind(names, quart.names) + } + } + # Deciles + if ("decile" %in% stat.list) { + dec.res <- cbind( + dec.res, + t(apply(DM[idx, -1], 2, quantile, na.rm = TRUE, seq(0, 1, 0.1))) + ) + dec.names <- cbind( + dec.names, + t(matrix(paste("D", seq(0, 10, 1), sep = "_", classnames[j]))) + ) + if (j == nb_class) { + stat.res <- cbind(stat.res, dec.res) + names <- cbind(names, dec.names) + } + } + # Missing values + if ("NA" %in% stat.list) { + nb_NA <- apply(DM[idx, -1], 2, function(x) sum(is.na(x))) + pct_NA <- round(nb_NA / nrow(DM[idx, -1]) * 100, 4) + NA.res <- cbind(NA.res, nb_NA) + pct_NA.res <- cbind(pct_NA.res, pct_NA) + NA.names <- cbind(NA.names, paste("NA", classnames[j], sep = "_")) + pct_NA.names <- cbind( + pct_NA.names, + paste("Pct_NA", classnames[j], sep = "_") + ) + if (j == nb_class) { + stat.res <- cbind(stat.res, NA.res, pct_NA.res) + names <- cbind(names, NA.names, pct_NA.names) + } + # Barplot data + bins <- cut(pct_NA, breaks = c(-1, 20, 40, 60, 80, 100), labels = FALSE) + for (b in 1:5) data_bp[b, j] <- sum(bins == b) + rownames(data_bp) <- c( + "0%-20%", + "20%-40%", + "40%-60%", + "60%-80%", + "80%-100%" + ) + if (j == nb_class) { + if (sum(NA.res) == 0) cat("Data Matrix contains no NA.\n") + colnames(data_bp) <- classnames + data_bp <- as.matrix(data_bp) + } + } + # Mean fold change + if (test.fold == "Yes" && nb_class >= 2 && j != nb_class) { + if (method == "each_class") fold.frac <- "Top" + for (k in (j + 1):nb_class) { + ratio1 <- if (fold.frac == "Bottom") classnames[k] else classnames[j] + ratio2 <- if (fold.frac == "Bottom") classnames[j] else classnames[k] + fold <- colMeans(DM[which(DM$c_class == ratio1), -1], na.rm = TRUE) / + colMeans(DM[which(DM$c_class == ratio2), -1], na.rm = TRUE) + if (logarithm == "log2") { + fold <- log2(fold) + } else if ( + logarithm == "log10" + ) { + fold <- log10(fold) + } + fold.res <- cbind(fold.res, fold) + fname <- if (logarithm == "none") { + paste("fold", ratio1, "VS", ratio2, sep = "_") + } else { + paste(logarithm, "fold", ratio1, "VS", ratio2, sep = "_") + } + fold.names <- cbind(fold.names, fname) + } + if (j == nb_class) { + stat.res <- cbind(stat.res, fold.res) + names <- cbind(names, fold.names) + } + } + } + + # Ensure unique column names + VM.names <- colnames(VM) + for (i in seq_along(VM.names)) { + for (j in seq_along(names)) { + if (VM.names[i] == names[j]) names[j] <- paste(names[j], "2", sep = "_") + } + } + colnames(stat.res) <- names + + # Output + VM <- cbind(VM, stat.res) + write.table(VM, VM.output, sep = "\t", quote = FALSE, row.names = FALSE) + + # Graphics + pdf(graphs.output) + if (graphs == 1) { + if ("NA" %in% stat.list) { + graph.colors <- c("green3", "palegreen3", "lightblue", "orangered", "red") + par(mar = c(5.1, 4.1, 4.1, 8.1), xpd = TRUE) + bp <- barplot( + data_bp, + col = graph.colors, + main = "Proportion of NA", + xlab = "Classes", + ylab = "Variables" + ) + legend( + "topright", + fill = graph.colors, + rownames(data_bp), + inset = c(-0.3, 0) + ) + stock <- 0 + for (i in 1:nrow(data_bp)) { + text( + bp, + stock + data_bp[i, ] / 2, + data_bp[i, ], + col = "white", + cex = 0.7 + ) + stock <- stock + data_bp[i, ] + } + } + if ((test.fold == "Yes") && (nb_class >= 2)) { + clean_fold <- fold.res + clean_fold[is.infinite(clean_fold)] <- NA + for (j in 1:ncol(clean_fold)) { + boxplot(clean_fold[, j], main = fold.names[j]) + } + } + } else { + plot.new() + legend("center", "You did not select any option with graphical output.") + } + dev.off() +} + + +parser <- ArgumentParser(description = "Intensity Check Tool") + +parser$add_argument( + "--dataMatrix_in", + required = TRUE, + help = "Input data matrix file" +) +parser$add_argument( + "--sampleMetadata_in", + required = TRUE, + help = "Input sample metadata file" +) +parser$add_argument( + "--variableMetadata_in", + required = TRUE, + help = "Input variable metadata file" +) +parser$add_argument("--method", required = TRUE, help = "Computation method") +parser$add_argument( + "--chosen_stat", + required = TRUE, + help = "Statistics to compute" +) +parser$add_argument( + "--class_col", + default = NULL, + help = "Class column (optional)" +) +parser$add_argument( + "--test_fold", + default = NULL, + help = "Test fold (optional)" +) +parser$add_argument("--class1", default = NULL, help = "Class1 (optional)") +parser$add_argument( + "--fold_frac", + default = NULL, + help = "Fold fraction (optional)" +) +parser$add_argument( + "--logarithm", + default = NULL, + help = "Logarithm (optional)" +) +parser$add_argument( + "--variableMetadata_out", + required = TRUE, + help = "Output variable metadata file" +) +parser$add_argument( + "--graphs_out", + required = TRUE, + help = "Output graphs file" +) + +args <- parser$parse_args() + +print(args) + +if (length(args) < 7) { + stop("NOT enough argument !!!") +} + +cat( + "\nJob starting time:\n", + format(Sys.time(), "%a %d %b %Y %X"), + "\n\n--------------------------------------------------------------------", + "\nIntensity Check parameters:\n\n" +) +print(args) +cat("--------------------------------------------------------------------\n\n") + +class_col <- NULL +test_fold <- NULL +class1 <- NULL +fold_frac <- NULL +logarithm <- NULL +if (args$method == "each_class") { + class_col <- args$class_col + test_fold <- args$test_fold + if (args$test_fold == "Yes") { + logarithm <- args$logarithm + } +} +if (args$method == "one_class") { + class_col <- args$class_col + class1 <- args$class1 + test_fold <- args$test_fold + if (args$test_fold == "Yes") { + fold_frac <- args$fold_frac + logarithm <- args$logarithm + } +} + +err_no_option <- NULL + +if ( + ((args$method == "no_class") && (args$chosen_stat == "None")) || + ((args$method != "no_class") && + (args$chosen_stat == "None") && + (test_fold == "No")) +) { + err_no_option <- "You did not select any computational option. Program can not be executed." + stop("\n- - - - - - - - -\n", err_no_option, "\n- - - - - - - - -\n") +} + + +if (is.null(err_no_option)) { + intens_check( + args$dataMatrix_in, + args$sampleMetadata_in, + args$variableMetadata_in, + args$method, + args$chosen_stat, + class_col, + test_fold, + class1, + fold_frac, + logarithm, + args$variableMetadata_out, + args$graphs_out + ) +} + + +cat( + "\n--------------------------------------------------------------------", + "\nInformation about R (version, Operating System, attached or loaded packages):\n\n" +) +sessionInfo() +cat( + "--------------------------------------------------------------------\n", + "\nJob ending time:\n", + format(Sys.time(), "%a %d %b %Y %X") +) + + +# delete the parameters to avoid the passage to the next tool in .RData image +rm(args)