view wrapper_intensity_check.R @ 7:00181e66c50f draft default tip

planemo upload for repository https://github.com/workflow4metabolomics/tools-metabolomics commit 689ab2a3db06eeed6a190f3cfa1618b9d9b7b07b
author workflow4metabolomics
date Wed, 09 Jul 2025 15:43:29 +0000
parents
children
line wrap: on
line source

suppressPackageStartupMessages(library(argparse))

check.err <- function(err.stock) {
    # err.stock = vector of results returned by check functions
    if (length(err.stock) != 0) {
        stop("\n- - - - - - - - -\n", err.stock, "\n- - - - - - - - -\n")
    }
}


# To check if the 3 standard tables match regarding identifiers
table_match <- function(dataMatrix, sampleMetadata, variableMetadata) {
    err.stock <- character()

    # Sample identifiers
    dm_samples <- colnames(dataMatrix)[-1]
    sm_samples <- as.character(sampleMetadata[[1]])
    # Variable identifiers
    dm_vars <- as.character(dataMatrix[[1]])
    vm_vars <- as.character(variableMetadata[[1]])

    # Check sample IDs
    missing_in_sm <- setdiff(dm_samples, sm_samples)
    missing_in_dm <- setdiff(sm_samples, dm_samples)
    if (length(missing_in_sm) > 0 || length(missing_in_dm) > 0) {
        err.stock <- c(
            err.stock,
            "\nData matrix and sample metadata do not match regarding sample identifiers."
        )
        if (length(missing_in_sm) > 0) {
            prefix <- if (length(missing_in_sm) < 4) {
                "\n    The "
            } else {
                "\n    For example, the "
            }
            err.stock <- c(
                err.stock,
                prefix,
                "following identifiers found in the data matrix\n",
                "    do not appear in the sample metadata file:\n",
                "    ",
                paste(head(missing_in_sm, 3), collapse = "\n    "),
                "\n"
            )
        }
        if (length(missing_in_dm) > 0) {
            prefix <- if (length(missing_in_dm) < 4) {
                "\n    The "
            } else {
                "\n    For example, the "
            }
            err.stock <- c(
                err.stock,
                prefix,
                "following identifiers found in the sample metadata file\n",
                "    do not appear in the data matrix:\n",
                "    ",
                paste(head(missing_in_dm, 3), collapse = "\n    "),
                "\n"
            )
        }
    }

    # Check variable IDs
    missing_in_vm <- setdiff(dm_vars, vm_vars)
    missing_in_dm_var <- setdiff(vm_vars, dm_vars)
    if (length(missing_in_vm) > 0 || length(missing_in_dm_var) > 0) {
        err.stock <- c(
            err.stock,
            "\nData matrix and variable metadata do not match regarding variable identifiers."
        )
        if (length(missing_in_vm) > 0) {
            prefix <- if (length(missing_in_vm) < 4) {
                "\n    The "
            } else {
                "\n    For example, the "
            }
            err.stock <- c(
                err.stock,
                prefix,
                "following identifiers found in the data matrix\n",
                "    do not appear in the variable metadata file:\n",
                "    ",
                paste(head(missing_in_vm, 3), collapse = "\n    "),
                "\n"
            )
        }
        if (length(missing_in_dm_var) > 0) {
            prefix <- if (length(missing_in_dm_var) < 4) {
                "\n    The "
            } else {
                "\n    For example, the "
            }
            err.stock <- c(
                err.stock,
                prefix,
                "following identifiers found in the variable metadata file\n",
                "    do not appear in the data matrix:\n",
                "    ",
                paste(head(missing_in_dm_var, 3), collapse = "\n    "),
                "\n"
            )
        }
    }

    if (length(err.stock) > 0) {
        err.stock <- c(err.stock, "\nPlease check your data.\n")
    }

    return(err.stock)
}


intens_check <- function(
    DM.name,
    SM.name,
    VM.name,
    method,
    chosen.stat,
    class.col,
    test.fold,
    class1,
    fold.frac,
    logarithm,
    VM.output,
    graphs.output) {
    # Read input tables
    DM <- read.table(DM.name, header = TRUE, sep = "\t", check.names = FALSE)
    SM <- read.table(SM.name, header = TRUE, sep = "\t", check.names = FALSE)
    VM <- read.table(VM.name, header = TRUE, sep = "\t", check.names = FALSE)

    # Table match check
    table.check <- table_match(DM, SM, VM)
    check.err(table.check)

    # Transpose and align DM
    rownames(DM) <- DM[, 1]
    DM <- data.frame(t(DM[, -1]))
    DM <- merge(x = cbind(1:nrow(SM), SM), y = DM, by.x = 2, by.y = 0)
    DM <- DM[order(DM[, 2]), ]
    rownames(DM) <- DM[, 1]
    DM <- DM[, -c(1:(ncol(SM) + 1))]

    stat.list <- strsplit(chosen.stat, ",")[[1]]

    # Class assignment
    if (method == "no_class") {
        c_class <- rep("global", nrow(DM))
        classnames <- "global"
        nb_class <- 1
        test.fold <- "No"
    } else {
        class.col <- colnames(SM)[as.numeric(class.col)]
        if (!(class.col %in% colnames(SM))) {
            stop("The column ", class.col, " is not in sample metadata")
        }
        c_class <- as.factor(SM[, class.col])
        nb_class <- nlevels(c_class)
        classnames <- levels(c_class)
        if (nb_class < 2 && test.fold == "Yes") {
            cat(
                "The '",
                class.col,
                "' column contains only one class, fold calculation could not be executed.\n"
            )
        }
        if (nb_class > (nrow(SM)) / 3 && method == "each_class") {
            cat("There are too many classes, consider reducing the number.\n")
        }
        if (method == "one_class") {
            if (!(class1 %in% classnames)) {
                stop("The '", class1, "' class does not appear in the '", class.col, "'column.")
            }
            c_class <- factor(
                ifelse(c_class == class1, class1, "Other"),
                levels = c(class1, "Other")
            )
            nb_class <- nlevels(c_class)
            classnames <- levels(c_class)
        }
    }

    # Check numeric
    if (!is.numeric(as.matrix(DM))) {
        findchar <- function(myval) {
            ifelse(
                is.na(myval),
                "ok",
                ifelse(is.na(as.numeric(as.character(myval))), "char", "ok")
            )
        }
        chardiag <- suppressWarnings(apply(DM, 2, vapply, findchar, "character"))
        charlist <- which(chardiag == "char")
        err.stock <- paste(
            "\n- - - - - - - - -\nYour dataMatrix contains",
            length(charlist),
            "non-numeric value(s).\n"
        )
        stop(c(
            err.stock,
            "The dataMatrix file is supposed to contain only numeric values.\n- - - - - - - - -\n"
        ))
    }

    DM <- cbind(c_class, DM)
    stat.res <- t(DM[0, -1, drop = FALSE])
    names <- NULL

    mean.res <- sd.res <- med.res <- quart.res <- dec.res <- NA.res <- pct_NA.res <- fold.res <- NULL
    mean.names <- sd.names <- med.names <- quart.names <- dec.names <- NA.names <- pct_NA.names <- fold.names <- NULL
    graphs <- if (("NA" %in% stat.list) || (test.fold == "Yes")) 1 else 0
    data_bp <- data.frame()

    for (j in 1:nb_class) {
        idx <- which(DM$c_class == classnames[j])
        # Mean
        if ("mean" %in% stat.list) {
            mean.res <- cbind(mean.res, colMeans(DM[idx, -1], na.rm = TRUE))
            mean.names <- cbind(mean.names, paste("Mean", classnames[j], sep = "_"))
            if (j == nb_class) {
                stat.res <- cbind(stat.res, mean.res)
                names <- cbind(names, mean.names)
            }
        }
        # SD
        if ("sd" %in% stat.list) {
            sd.res <- cbind(sd.res, apply(DM[idx, -1], 2, sd, na.rm = TRUE))
            sd.names <- cbind(sd.names, paste("Sd", classnames[j], sep = "_"))
            if (j == nb_class) {
                stat.res <- cbind(stat.res, sd.res)
                names <- cbind(names, sd.names)
            }
        }
        # Median
        if (("median" %in% stat.list) && (!("quartile" %in% stat.list))) {
            med.res <- cbind(med.res, apply(DM[idx, -1], 2, median, na.rm = TRUE))
            med.names <- cbind(med.names, paste("Median", classnames[j], sep = "_"))
            if (j == nb_class) {
                stat.res <- cbind(stat.res, med.res)
                names <- cbind(names, med.names)
            }
        }
        # Quartiles
        if ("quartile" %in% stat.list) {
            quart.res <- cbind(
                quart.res,
                t(apply(DM[idx, -1], 2, quantile, na.rm = TRUE))
            )
            quart.names <- cbind(
                quart.names,
                paste("Min", classnames[j], sep = "_"),
                paste("Q1", classnames[j], sep = "_"),
                paste("Median", classnames[j], sep = "_"),
                paste("Q3", classnames[j], sep = "_"),
                paste("Max", classnames[j], sep = "_")
            )
            if (j == nb_class) {
                stat.res <- cbind(stat.res, quart.res)
                names <- cbind(names, quart.names)
            }
        }
        # Deciles
        if ("decile" %in% stat.list) {
            dec.res <- cbind(
                dec.res,
                t(apply(DM[idx, -1], 2, quantile, na.rm = TRUE, seq(0, 1, 0.1)))
            )
            dec.names <- cbind(
                dec.names,
                t(matrix(paste("D", seq(0, 10, 1), sep = "_", classnames[j])))
            )
            if (j == nb_class) {
                stat.res <- cbind(stat.res, dec.res)
                names <- cbind(names, dec.names)
            }
        }
        # Missing values
        if ("NA" %in% stat.list) {
            nb_NA <- apply(DM[idx, -1], 2, function(x) sum(is.na(x)))
            pct_NA <- round(nb_NA / nrow(DM[idx, -1]) * 100, 4)
            NA.res <- cbind(NA.res, nb_NA)
            pct_NA.res <- cbind(pct_NA.res, pct_NA)
            NA.names <- cbind(NA.names, paste("NA", classnames[j], sep = "_"))
            pct_NA.names <- cbind(
                pct_NA.names,
                paste("Pct_NA", classnames[j], sep = "_")
            )
            if (j == nb_class) {
                stat.res <- cbind(stat.res, NA.res, pct_NA.res)
                names <- cbind(names, NA.names, pct_NA.names)
            }
            # Barplot data
            bins <- cut(pct_NA, breaks = c(-1, 20, 40, 60, 80, 100), labels = FALSE)
            for (b in 1:5) data_bp[b, j] <- sum(bins == b)
            rownames(data_bp) <- c(
                "0%-20%",
                "20%-40%",
                "40%-60%",
                "60%-80%",
                "80%-100%"
            )
            if (j == nb_class) {
                if (sum(NA.res) == 0) cat("Data Matrix contains no NA.\n")
                colnames(data_bp) <- classnames
                data_bp <- as.matrix(data_bp)
            }
        }
        # Mean fold change
        if (test.fold == "Yes" && nb_class >= 2 && j != nb_class) {
            if (method == "each_class") fold.frac <- "Top"
            for (k in (j + 1):nb_class) {
                ratio1 <- if (fold.frac == "Bottom") classnames[k] else classnames[j]
                ratio2 <- if (fold.frac == "Bottom") classnames[j] else classnames[k]
                fold <- colMeans(DM[which(DM$c_class == ratio1), -1], na.rm = TRUE) /
                    colMeans(DM[which(DM$c_class == ratio2), -1], na.rm = TRUE)
                if (logarithm == "log2") {
                    fold <- log2(fold)
                } else if (
                    logarithm == "log10"
                ) {
                    fold <- log10(fold)
                }
                fold.res <- cbind(fold.res, fold)
                fname <- if (logarithm == "none") {
                    paste("fold", ratio1, "VS", ratio2, sep = "_")
                } else {
                    paste(logarithm, "fold", ratio1, "VS", ratio2, sep = "_")
                }
                fold.names <- cbind(fold.names, fname)
            }
            if (j == nb_class) {
                stat.res <- cbind(stat.res, fold.res)
                names <- cbind(names, fold.names)
            }
        }
    }

    # Ensure unique column names
    VM.names <- colnames(VM)
    for (i in seq_along(VM.names)) {
        for (j in seq_along(names)) {
            if (VM.names[i] == names[j]) names[j] <- paste(names[j], "2", sep = "_")
        }
    }
    colnames(stat.res) <- names

    # Output
    VM <- cbind(VM, stat.res)
    write.table(VM, VM.output, sep = "\t", quote = FALSE, row.names = FALSE)

    # Graphics
    pdf(graphs.output)
    if (graphs == 1) {
        if ("NA" %in% stat.list) {
            graph.colors <- c("green3", "palegreen3", "lightblue", "orangered", "red")
            par(mar = c(5.1, 4.1, 4.1, 8.1), xpd = TRUE)
            bp <- barplot(
                data_bp,
                col = graph.colors,
                main = "Proportion of NA",
                xlab = "Classes",
                ylab = "Variables"
            )
            legend(
                "topright",
                fill = graph.colors,
                rownames(data_bp),
                inset = c(-0.3, 0)
            )
            stock <- 0
            for (i in 1:nrow(data_bp)) {
                text(
                    bp,
                    stock + data_bp[i, ] / 2,
                    data_bp[i, ],
                    col = "white",
                    cex = 0.7
                )
                stock <- stock + data_bp[i, ]
            }
        }
        if ((test.fold == "Yes") && (nb_class >= 2)) {
            clean_fold <- fold.res
            clean_fold[is.infinite(clean_fold)] <- NA
            for (j in 1:ncol(clean_fold)) {
                boxplot(clean_fold[, j], main = fold.names[j])
            }
        }
    } else {
        plot.new()
        legend("center", "You did not select any option with graphical output.")
    }
    dev.off()
}


parser <- ArgumentParser(description = "Intensity Check Tool")

parser$add_argument(
    "--dataMatrix_in",
    required = TRUE,
    help = "Input data matrix file"
)
parser$add_argument(
    "--sampleMetadata_in",
    required = TRUE,
    help = "Input sample metadata file"
)
parser$add_argument(
    "--variableMetadata_in",
    required = TRUE,
    help = "Input variable metadata file"
)
parser$add_argument("--method", required = TRUE, help = "Computation method")
parser$add_argument(
    "--chosen_stat",
    required = TRUE,
    help = "Statistics to compute"
)
parser$add_argument(
    "--class_col",
    default = NULL,
    help = "Class column (optional)"
)
parser$add_argument(
    "--test_fold",
    default = NULL,
    help = "Test fold (optional)"
)
parser$add_argument("--class1", default = NULL, help = "Class1 (optional)")
parser$add_argument(
    "--fold_frac",
    default = NULL,
    help = "Fold fraction (optional)"
)
parser$add_argument(
    "--logarithm",
    default = NULL,
    help = "Logarithm (optional)"
)
parser$add_argument(
    "--variableMetadata_out",
    required = TRUE,
    help = "Output variable metadata file"
)
parser$add_argument(
    "--graphs_out",
    required = TRUE,
    help = "Output graphs file"
)

args <- parser$parse_args()

print(args)

if (length(args) < 7) {
    stop("NOT enough argument !!!")
}

cat(
    "\nJob starting time:\n",
    format(Sys.time(), "%a %d %b %Y %X"),
    "\n\n--------------------------------------------------------------------",
    "\nIntensity Check parameters:\n\n"
)
print(args)
cat("--------------------------------------------------------------------\n\n")

class_col <- NULL
test_fold <- NULL
class1 <- NULL
fold_frac <- NULL
logarithm <- NULL
if (args$method == "each_class") {
    class_col <- args$class_col
    test_fold <- args$test_fold
    if (args$test_fold == "Yes") {
        logarithm <- args$logarithm
    }
}
if (args$method == "one_class") {
    class_col <- args$class_col
    class1 <- args$class1
    test_fold <- args$test_fold
    if (args$test_fold == "Yes") {
        fold_frac <- args$fold_frac
        logarithm <- args$logarithm
    }
}

err_no_option <- NULL

if (
    ((args$method == "no_class") && (args$chosen_stat == "None")) ||
        ((args$method != "no_class") &&
            (args$chosen_stat == "None") &&
            (test_fold == "No"))
) {
    err_no_option <- "You did not select any computational option. Program can not be executed."
    stop("\n- - - - - - - - -\n", err_no_option, "\n- - - - - - - - -\n")
}


if (is.null(err_no_option)) {
    intens_check(
        args$dataMatrix_in,
        args$sampleMetadata_in,
        args$variableMetadata_in,
        args$method,
        args$chosen_stat,
        class_col,
        test_fold,
        class1,
        fold_frac,
        logarithm,
        args$variableMetadata_out,
        args$graphs_out
    )
}


cat(
    "\n--------------------------------------------------------------------",
    "\nInformation about R (version, Operating System, attached or loaded packages):\n\n"
)
sessionInfo()
cat(
    "--------------------------------------------------------------------\n",
    "\nJob ending time:\n",
    format(Sys.time(), "%a %d %b %Y %X")
)


# delete the parameters to avoid the passage to the next tool in .RData image
rm(args)