diff wrapper_intensity_check.R @ 7:00181e66c50f draft default tip

planemo upload for repository https://github.com/workflow4metabolomics/tools-metabolomics commit 689ab2a3db06eeed6a190f3cfa1618b9d9b7b07b
author workflow4metabolomics
date Wed, 09 Jul 2025 15:43:29 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/wrapper_intensity_check.R	Wed Jul 09 15:43:29 2025 +0000
@@ -0,0 +1,535 @@
+suppressPackageStartupMessages(library(argparse))
+
+check.err <- function(err.stock) {
+    # err.stock = vector of results returned by check functions
+    if (length(err.stock) != 0) {
+        stop("\n- - - - - - - - -\n", err.stock, "\n- - - - - - - - -\n")
+    }
+}
+
+
+# To check if the 3 standard tables match regarding identifiers
+table_match <- function(dataMatrix, sampleMetadata, variableMetadata) {
+    err.stock <- character()
+
+    # Sample identifiers
+    dm_samples <- colnames(dataMatrix)[-1]
+    sm_samples <- as.character(sampleMetadata[[1]])
+    # Variable identifiers
+    dm_vars <- as.character(dataMatrix[[1]])
+    vm_vars <- as.character(variableMetadata[[1]])
+
+    # Check sample IDs
+    missing_in_sm <- setdiff(dm_samples, sm_samples)
+    missing_in_dm <- setdiff(sm_samples, dm_samples)
+    if (length(missing_in_sm) > 0 || length(missing_in_dm) > 0) {
+        err.stock <- c(
+            err.stock,
+            "\nData matrix and sample metadata do not match regarding sample identifiers."
+        )
+        if (length(missing_in_sm) > 0) {
+            prefix <- if (length(missing_in_sm) < 4) {
+                "\n    The "
+            } else {
+                "\n    For example, the "
+            }
+            err.stock <- c(
+                err.stock,
+                prefix,
+                "following identifiers found in the data matrix\n",
+                "    do not appear in the sample metadata file:\n",
+                "    ",
+                paste(head(missing_in_sm, 3), collapse = "\n    "),
+                "\n"
+            )
+        }
+        if (length(missing_in_dm) > 0) {
+            prefix <- if (length(missing_in_dm) < 4) {
+                "\n    The "
+            } else {
+                "\n    For example, the "
+            }
+            err.stock <- c(
+                err.stock,
+                prefix,
+                "following identifiers found in the sample metadata file\n",
+                "    do not appear in the data matrix:\n",
+                "    ",
+                paste(head(missing_in_dm, 3), collapse = "\n    "),
+                "\n"
+            )
+        }
+    }
+
+    # Check variable IDs
+    missing_in_vm <- setdiff(dm_vars, vm_vars)
+    missing_in_dm_var <- setdiff(vm_vars, dm_vars)
+    if (length(missing_in_vm) > 0 || length(missing_in_dm_var) > 0) {
+        err.stock <- c(
+            err.stock,
+            "\nData matrix and variable metadata do not match regarding variable identifiers."
+        )
+        if (length(missing_in_vm) > 0) {
+            prefix <- if (length(missing_in_vm) < 4) {
+                "\n    The "
+            } else {
+                "\n    For example, the "
+            }
+            err.stock <- c(
+                err.stock,
+                prefix,
+                "following identifiers found in the data matrix\n",
+                "    do not appear in the variable metadata file:\n",
+                "    ",
+                paste(head(missing_in_vm, 3), collapse = "\n    "),
+                "\n"
+            )
+        }
+        if (length(missing_in_dm_var) > 0) {
+            prefix <- if (length(missing_in_dm_var) < 4) {
+                "\n    The "
+            } else {
+                "\n    For example, the "
+            }
+            err.stock <- c(
+                err.stock,
+                prefix,
+                "following identifiers found in the variable metadata file\n",
+                "    do not appear in the data matrix:\n",
+                "    ",
+                paste(head(missing_in_dm_var, 3), collapse = "\n    "),
+                "\n"
+            )
+        }
+    }
+
+    if (length(err.stock) > 0) {
+        err.stock <- c(err.stock, "\nPlease check your data.\n")
+    }
+
+    return(err.stock)
+}
+
+
+intens_check <- function(
+    DM.name,
+    SM.name,
+    VM.name,
+    method,
+    chosen.stat,
+    class.col,
+    test.fold,
+    class1,
+    fold.frac,
+    logarithm,
+    VM.output,
+    graphs.output) {
+    # Read input tables
+    DM <- read.table(DM.name, header = TRUE, sep = "\t", check.names = FALSE)
+    SM <- read.table(SM.name, header = TRUE, sep = "\t", check.names = FALSE)
+    VM <- read.table(VM.name, header = TRUE, sep = "\t", check.names = FALSE)
+
+    # Table match check
+    table.check <- table_match(DM, SM, VM)
+    check.err(table.check)
+
+    # Transpose and align DM
+    rownames(DM) <- DM[, 1]
+    DM <- data.frame(t(DM[, -1]))
+    DM <- merge(x = cbind(1:nrow(SM), SM), y = DM, by.x = 2, by.y = 0)
+    DM <- DM[order(DM[, 2]), ]
+    rownames(DM) <- DM[, 1]
+    DM <- DM[, -c(1:(ncol(SM) + 1))]
+
+    stat.list <- strsplit(chosen.stat, ",")[[1]]
+
+    # Class assignment
+    if (method == "no_class") {
+        c_class <- rep("global", nrow(DM))
+        classnames <- "global"
+        nb_class <- 1
+        test.fold <- "No"
+    } else {
+        class.col <- colnames(SM)[as.numeric(class.col)]
+        if (!(class.col %in% colnames(SM))) {
+            stop("The column ", class.col, " is not in sample metadata")
+        }
+        c_class <- as.factor(SM[, class.col])
+        nb_class <- nlevels(c_class)
+        classnames <- levels(c_class)
+        if (nb_class < 2 && test.fold == "Yes") {
+            cat(
+                "The '",
+                class.col,
+                "' column contains only one class, fold calculation could not be executed.\n"
+            )
+        }
+        if (nb_class > (nrow(SM)) / 3 && method == "each_class") {
+            cat("There are too many classes, consider reducing the number.\n")
+        }
+        if (method == "one_class") {
+            if (!(class1 %in% classnames)) {
+                stop("The '", class1, "' class does not appear in the '", class.col, "'column.")
+            }
+            c_class <- factor(
+                ifelse(c_class == class1, class1, "Other"),
+                levels = c(class1, "Other")
+            )
+            nb_class <- nlevels(c_class)
+            classnames <- levels(c_class)
+        }
+    }
+
+    # Check numeric
+    if (!is.numeric(as.matrix(DM))) {
+        findchar <- function(myval) {
+            ifelse(
+                is.na(myval),
+                "ok",
+                ifelse(is.na(as.numeric(as.character(myval))), "char", "ok")
+            )
+        }
+        chardiag <- suppressWarnings(apply(DM, 2, vapply, findchar, "character"))
+        charlist <- which(chardiag == "char")
+        err.stock <- paste(
+            "\n- - - - - - - - -\nYour dataMatrix contains",
+            length(charlist),
+            "non-numeric value(s).\n"
+        )
+        stop(c(
+            err.stock,
+            "The dataMatrix file is supposed to contain only numeric values.\n- - - - - - - - -\n"
+        ))
+    }
+
+    DM <- cbind(c_class, DM)
+    stat.res <- t(DM[0, -1, drop = FALSE])
+    names <- NULL
+
+    mean.res <- sd.res <- med.res <- quart.res <- dec.res <- NA.res <- pct_NA.res <- fold.res <- NULL
+    mean.names <- sd.names <- med.names <- quart.names <- dec.names <- NA.names <- pct_NA.names <- fold.names <- NULL
+    graphs <- if (("NA" %in% stat.list) || (test.fold == "Yes")) 1 else 0
+    data_bp <- data.frame()
+
+    for (j in 1:nb_class) {
+        idx <- which(DM$c_class == classnames[j])
+        # Mean
+        if ("mean" %in% stat.list) {
+            mean.res <- cbind(mean.res, colMeans(DM[idx, -1], na.rm = TRUE))
+            mean.names <- cbind(mean.names, paste("Mean", classnames[j], sep = "_"))
+            if (j == nb_class) {
+                stat.res <- cbind(stat.res, mean.res)
+                names <- cbind(names, mean.names)
+            }
+        }
+        # SD
+        if ("sd" %in% stat.list) {
+            sd.res <- cbind(sd.res, apply(DM[idx, -1], 2, sd, na.rm = TRUE))
+            sd.names <- cbind(sd.names, paste("Sd", classnames[j], sep = "_"))
+            if (j == nb_class) {
+                stat.res <- cbind(stat.res, sd.res)
+                names <- cbind(names, sd.names)
+            }
+        }
+        # Median
+        if (("median" %in% stat.list) && (!("quartile" %in% stat.list))) {
+            med.res <- cbind(med.res, apply(DM[idx, -1], 2, median, na.rm = TRUE))
+            med.names <- cbind(med.names, paste("Median", classnames[j], sep = "_"))
+            if (j == nb_class) {
+                stat.res <- cbind(stat.res, med.res)
+                names <- cbind(names, med.names)
+            }
+        }
+        # Quartiles
+        if ("quartile" %in% stat.list) {
+            quart.res <- cbind(
+                quart.res,
+                t(apply(DM[idx, -1], 2, quantile, na.rm = TRUE))
+            )
+            quart.names <- cbind(
+                quart.names,
+                paste("Min", classnames[j], sep = "_"),
+                paste("Q1", classnames[j], sep = "_"),
+                paste("Median", classnames[j], sep = "_"),
+                paste("Q3", classnames[j], sep = "_"),
+                paste("Max", classnames[j], sep = "_")
+            )
+            if (j == nb_class) {
+                stat.res <- cbind(stat.res, quart.res)
+                names <- cbind(names, quart.names)
+            }
+        }
+        # Deciles
+        if ("decile" %in% stat.list) {
+            dec.res <- cbind(
+                dec.res,
+                t(apply(DM[idx, -1], 2, quantile, na.rm = TRUE, seq(0, 1, 0.1)))
+            )
+            dec.names <- cbind(
+                dec.names,
+                t(matrix(paste("D", seq(0, 10, 1), sep = "_", classnames[j])))
+            )
+            if (j == nb_class) {
+                stat.res <- cbind(stat.res, dec.res)
+                names <- cbind(names, dec.names)
+            }
+        }
+        # Missing values
+        if ("NA" %in% stat.list) {
+            nb_NA <- apply(DM[idx, -1], 2, function(x) sum(is.na(x)))
+            pct_NA <- round(nb_NA / nrow(DM[idx, -1]) * 100, 4)
+            NA.res <- cbind(NA.res, nb_NA)
+            pct_NA.res <- cbind(pct_NA.res, pct_NA)
+            NA.names <- cbind(NA.names, paste("NA", classnames[j], sep = "_"))
+            pct_NA.names <- cbind(
+                pct_NA.names,
+                paste("Pct_NA", classnames[j], sep = "_")
+            )
+            if (j == nb_class) {
+                stat.res <- cbind(stat.res, NA.res, pct_NA.res)
+                names <- cbind(names, NA.names, pct_NA.names)
+            }
+            # Barplot data
+            bins <- cut(pct_NA, breaks = c(-1, 20, 40, 60, 80, 100), labels = FALSE)
+            for (b in 1:5) data_bp[b, j] <- sum(bins == b)
+            rownames(data_bp) <- c(
+                "0%-20%",
+                "20%-40%",
+                "40%-60%",
+                "60%-80%",
+                "80%-100%"
+            )
+            if (j == nb_class) {
+                if (sum(NA.res) == 0) cat("Data Matrix contains no NA.\n")
+                colnames(data_bp) <- classnames
+                data_bp <- as.matrix(data_bp)
+            }
+        }
+        # Mean fold change
+        if (test.fold == "Yes" && nb_class >= 2 && j != nb_class) {
+            if (method == "each_class") fold.frac <- "Top"
+            for (k in (j + 1):nb_class) {
+                ratio1 <- if (fold.frac == "Bottom") classnames[k] else classnames[j]
+                ratio2 <- if (fold.frac == "Bottom") classnames[j] else classnames[k]
+                fold <- colMeans(DM[which(DM$c_class == ratio1), -1], na.rm = TRUE) /
+                    colMeans(DM[which(DM$c_class == ratio2), -1], na.rm = TRUE)
+                if (logarithm == "log2") {
+                    fold <- log2(fold)
+                } else if (
+                    logarithm == "log10"
+                ) {
+                    fold <- log10(fold)
+                }
+                fold.res <- cbind(fold.res, fold)
+                fname <- if (logarithm == "none") {
+                    paste("fold", ratio1, "VS", ratio2, sep = "_")
+                } else {
+                    paste(logarithm, "fold", ratio1, "VS", ratio2, sep = "_")
+                }
+                fold.names <- cbind(fold.names, fname)
+            }
+            if (j == nb_class) {
+                stat.res <- cbind(stat.res, fold.res)
+                names <- cbind(names, fold.names)
+            }
+        }
+    }
+
+    # Ensure unique column names
+    VM.names <- colnames(VM)
+    for (i in seq_along(VM.names)) {
+        for (j in seq_along(names)) {
+            if (VM.names[i] == names[j]) names[j] <- paste(names[j], "2", sep = "_")
+        }
+    }
+    colnames(stat.res) <- names
+
+    # Output
+    VM <- cbind(VM, stat.res)
+    write.table(VM, VM.output, sep = "\t", quote = FALSE, row.names = FALSE)
+
+    # Graphics
+    pdf(graphs.output)
+    if (graphs == 1) {
+        if ("NA" %in% stat.list) {
+            graph.colors <- c("green3", "palegreen3", "lightblue", "orangered", "red")
+            par(mar = c(5.1, 4.1, 4.1, 8.1), xpd = TRUE)
+            bp <- barplot(
+                data_bp,
+                col = graph.colors,
+                main = "Proportion of NA",
+                xlab = "Classes",
+                ylab = "Variables"
+            )
+            legend(
+                "topright",
+                fill = graph.colors,
+                rownames(data_bp),
+                inset = c(-0.3, 0)
+            )
+            stock <- 0
+            for (i in 1:nrow(data_bp)) {
+                text(
+                    bp,
+                    stock + data_bp[i, ] / 2,
+                    data_bp[i, ],
+                    col = "white",
+                    cex = 0.7
+                )
+                stock <- stock + data_bp[i, ]
+            }
+        }
+        if ((test.fold == "Yes") && (nb_class >= 2)) {
+            clean_fold <- fold.res
+            clean_fold[is.infinite(clean_fold)] <- NA
+            for (j in 1:ncol(clean_fold)) {
+                boxplot(clean_fold[, j], main = fold.names[j])
+            }
+        }
+    } else {
+        plot.new()
+        legend("center", "You did not select any option with graphical output.")
+    }
+    dev.off()
+}
+
+
+parser <- ArgumentParser(description = "Intensity Check Tool")
+
+parser$add_argument(
+    "--dataMatrix_in",
+    required = TRUE,
+    help = "Input data matrix file"
+)
+parser$add_argument(
+    "--sampleMetadata_in",
+    required = TRUE,
+    help = "Input sample metadata file"
+)
+parser$add_argument(
+    "--variableMetadata_in",
+    required = TRUE,
+    help = "Input variable metadata file"
+)
+parser$add_argument("--method", required = TRUE, help = "Computation method")
+parser$add_argument(
+    "--chosen_stat",
+    required = TRUE,
+    help = "Statistics to compute"
+)
+parser$add_argument(
+    "--class_col",
+    default = NULL,
+    help = "Class column (optional)"
+)
+parser$add_argument(
+    "--test_fold",
+    default = NULL,
+    help = "Test fold (optional)"
+)
+parser$add_argument("--class1", default = NULL, help = "Class1 (optional)")
+parser$add_argument(
+    "--fold_frac",
+    default = NULL,
+    help = "Fold fraction (optional)"
+)
+parser$add_argument(
+    "--logarithm",
+    default = NULL,
+    help = "Logarithm (optional)"
+)
+parser$add_argument(
+    "--variableMetadata_out",
+    required = TRUE,
+    help = "Output variable metadata file"
+)
+parser$add_argument(
+    "--graphs_out",
+    required = TRUE,
+    help = "Output graphs file"
+)
+
+args <- parser$parse_args()
+
+print(args)
+
+if (length(args) < 7) {
+    stop("NOT enough argument !!!")
+}
+
+cat(
+    "\nJob starting time:\n",
+    format(Sys.time(), "%a %d %b %Y %X"),
+    "\n\n--------------------------------------------------------------------",
+    "\nIntensity Check parameters:\n\n"
+)
+print(args)
+cat("--------------------------------------------------------------------\n\n")
+
+class_col <- NULL
+test_fold <- NULL
+class1 <- NULL
+fold_frac <- NULL
+logarithm <- NULL
+if (args$method == "each_class") {
+    class_col <- args$class_col
+    test_fold <- args$test_fold
+    if (args$test_fold == "Yes") {
+        logarithm <- args$logarithm
+    }
+}
+if (args$method == "one_class") {
+    class_col <- args$class_col
+    class1 <- args$class1
+    test_fold <- args$test_fold
+    if (args$test_fold == "Yes") {
+        fold_frac <- args$fold_frac
+        logarithm <- args$logarithm
+    }
+}
+
+err_no_option <- NULL
+
+if (
+    ((args$method == "no_class") && (args$chosen_stat == "None")) ||
+        ((args$method != "no_class") &&
+            (args$chosen_stat == "None") &&
+            (test_fold == "No"))
+) {
+    err_no_option <- "You did not select any computational option. Program can not be executed."
+    stop("\n- - - - - - - - -\n", err_no_option, "\n- - - - - - - - -\n")
+}
+
+
+if (is.null(err_no_option)) {
+    intens_check(
+        args$dataMatrix_in,
+        args$sampleMetadata_in,
+        args$variableMetadata_in,
+        args$method,
+        args$chosen_stat,
+        class_col,
+        test_fold,
+        class1,
+        fold_frac,
+        logarithm,
+        args$variableMetadata_out,
+        args$graphs_out
+    )
+}
+
+
+cat(
+    "\n--------------------------------------------------------------------",
+    "\nInformation about R (version, Operating System, attached or loaded packages):\n\n"
+)
+sessionInfo()
+cat(
+    "--------------------------------------------------------------------\n",
+    "\nJob ending time:\n",
+    format(Sys.time(), "%a %d %b %Y %X")
+)
+
+
+# delete the parameters to avoid the passage to the next tool in .RData image
+rm(args)