diff waveica_wrapper.R @ 10:821062fc5782 draft default tip

planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/waveica commit 2b8b1dcb2947c6503fd4f82904df708e4f88ea1d
author recetox
date Fri, 04 Jul 2025 09:43:22 +0000
parents 6fc9f6dbcef5
children
line wrap: on
line diff
--- a/waveica_wrapper.R	Fri May 16 10:14:27 2025 +0000
+++ b/waveica_wrapper.R	Fri Jul 04 09:43:22 2025 +0000
@@ -1,65 +1,112 @@
-read_file <- function(file, metadata, ft_ext, mt_ext, transpose) {
-    data <- read_data(file, ft_ext)
+# Read data from a file in the specified format (csv, tsv/tabular, or parquet)
+read_data <- function(file, ext, transpose = FALSE) {
+    # Reads a data file based on its extension and returns a data frame.
+    if (ext == "csv") {
+        tryCatch(
+            {
+                data <- read.csv(file, header = TRUE)
+            },
+            error = function(e) {
+                stop(
+                    paste0(
+                        "Failed to read as CSV. The file may not ",
+                        "be a valid text file or may be corrupted: ",
+                        file, "\nError: ", e$message
+                    )
+                )
+            }
+        )
+    } else if (ext == "tsv" || ext == "tabular") {
+        tryCatch(
+            {
+                data <- read.csv(file, header = TRUE, sep = "\t")
+            },
+            error = function(e) {
+                stop(
+                    paste0(
+                        "Failed to read as TSV/tabular.",
+                        "The file may not be a valid text file or may be corrupted: ",
+                        file, "\nError: ", e$message
+                    )
+                )
+            }
+        )
+    } else if (ext == "parquet") {
+        data <- arrow::read_parquet(file)
+    } else {
+        stop(paste("Unsupported file extension or format for reading:", ext))
+    }
 
+    original_first_colname <- colnames(data)[1]
     if (transpose) {
         col_names <- c("sampleName", data[[1]])
         data <- tranpose_data(data, col_names)
     }
-
-    if (!is.na(metadata)) {
-        mt_data <- read_data(metadata, mt_ext)
-        data <- merge(mt_data, data, by = "sampleName")
-    }
-
-    return(data)
+    return(list(data = data, original_first_colname = original_first_colname))
 }
 
-read_data <- function(file, ext) {
-    if (ext == "csv") {
-        data <- read.csv(file, header = TRUE)
-    } else if (ext == "tsv" || ext == "tabular") {
-        data <- read.csv(file, header = TRUE, sep = "\t")
-    } else {
-        data <- arrow::read_parquet(file)
-    }
-
-    return(data)
-}
-
-waveica <- function(file,
-                    metadata = NA,
-                    ext,
-                    transpose = FALSE,
+# Main function for batchwise WaveICA normalization
+waveica <- function(data_matrix_file,
+                    sample_metadata_file,
+                    ft_ext,
+                    mt_ext,
                     wavelet_filter,
                     wavelet_length,
                     k,
                     t,
                     t2,
                     alpha,
-                    exclude_blanks) {
-    # get input from the Galaxy, preprocess data
-    ext <- strsplit(x = ext, split = "\\,")[[1]]
+                    exclude_blanks,
+                    transpose = FALSE) {
+    # Reads feature and metadata tables, merges them,
+    # verifies columns, runs WaveICA, and returns normalized data.
+    read_features_response <- read_data(
+        data_matrix_file, ft_ext,
+        transpose
+    )
+    features <- read_features_response$data
+    original_first_colname <- read_features_response$original_first_colname
 
-    ft_ext <- ext[1]
-    mt_ext <- ext[2]
-
-    data <- read_file(file, metadata, ft_ext, mt_ext, transpose)
+    read_metadata_response <- read_data(sample_metadata_file, mt_ext)
+    metadata <- read_metadata_response$data
 
     required_columns <- c(
         "sampleName", "class", "sampleType",
         "injectionOrder", "batch"
     )
+
+    metadata <- dplyr::select(metadata, required_columns)
+
+    # Ensure both tables have a sampleName column
+    if (!"sampleName" %in% colnames(features) || !"sampleName" %in% colnames(metadata)) { # nolint
+        stop("Both feature and metadata tables must contain a 'sampleName' column.")
+    }
+    data <- merge(metadata, features, by = "sampleName")
+
+
     data <- verify_input_dataframe(data, required_columns)
 
     data <- sort_by_injection_order(data)
 
-    # separate data into features, batch and group
+    # Separate features, batch, and group columns
     feature_columns <- colnames(data)[!colnames(data) %in% required_columns]
     features <- data[, feature_columns]
     group <- enumerate_groups(as.character(data$sampleType))
     batch <- data$batch
 
-    # run WaveICA
+    # Check that wavelet level is not too high for the number of samples
+    max_level <- floor(log2(nrow(features)))
+    requested_level <- as.numeric(wavelet_length)
+    if (requested_level > max_level) {
+        stop(sprintf(
+            paste0(
+                "Wavelet length/level (%d) is too high for ",
+                "the number of samples (%d). Maximum allowed is %d."
+            ),
+            requested_level, nrow(features), max_level
+        ))
+    }
+    # Run WaveICA normalization
     features <- recetox.waveica::waveica(
         data = features,
         wf = get_wf(wavelet_filter, wavelet_length),
@@ -70,47 +117,63 @@
         t2 = t2,
         alpha = alpha
     )
+    non_feature_columns <- setdiff(colnames(data), feature_columns)
 
+    # Update the data frame with normalized features
     data[, feature_columns] <- features
 
-    # remove blanks from dataset
+    # Optionally remove blank samples
     if (exclude_blanks) {
         data <- exclude_group(data, group)
     }
-
-    return(data)
+    data <- final_data_processing(
+        data, non_feature_columns,
+        transpose, original_first_colname
+    )
+    data
 }
 
-waveica_singlebatch <- function(file,
-                                metadata = NA,
-                                ext,
-                                transpose = FALSE,
+# Main function for single-batch WaveICA normalization
+waveica_singlebatch <- function(data_matrix_file,
+                                sample_metadata_file,
+                                ft_ext,
+                                mt_ext,
                                 wavelet_filter,
                                 wavelet_length,
                                 k,
                                 alpha,
                                 cutoff,
-                                exclude_blanks) {
-    # get input from the Galaxy, preprocess data
-    ext <- strsplit(x = ext, split = "\\,")[[1]]
+                                exclude_blanks,
+                                transpose = FALSE) {
+    # Reads feature and metadata tables, merges them,
+    # verifies columns, runs WaveICA (single batch), and returns normalized data.
+    read_features_response <- read_data(data_matrix_file, ft_ext, transpose)
+    features <- read_features_response$data
+    original_first_colname <- read_features_response$original_first_colname
 
-    ft_ext <- ext[1]
-    mt_ext <- ext[2]
+    read_data_response <- read_data(sample_metadata_file, mt_ext)
+    metadata <- read_data_response$data
 
-    data <- read_file(file, metadata, ft_ext, mt_ext, transpose)
+    # Ensure both tables have a sampleName column
+    if (!"sampleName" %in% colnames(features) ||
+        !"sampleName" %in% colnames(metadata)) { # nolint
+        stop("Both feature and metadata tables must contain a 'sampleName' column.")
+    }
+    data <- merge(metadata, features, by = "sampleName")
 
     required_columns <- c("sampleName", "class", "sampleType", "injectionOrder")
     optional_columns <- c("batch")
-
     data <- verify_input_dataframe(data, required_columns)
 
     data <- sort_by_injection_order(data)
 
-    feature_columns <- colnames(data)[!colnames(data) %in% c(required_columns, optional_columns)]
+    feature_columns <- colnames(data)[
+        !colnames(data) %in% c(required_columns, optional_columns)
+    ]
     features <- data[, feature_columns]
     injection_order <- data$injectionOrder
 
-    # run WaveICA
+    # Run WaveICA normalization (single batch)
     features <- recetox.waveica::waveica_nonbatchwise(
         data = features,
         wf = get_wf(wavelet_filter, wavelet_length),
@@ -119,45 +182,59 @@
         alpha = alpha,
         cutoff = cutoff
     )
+    non_feature_columns <- setdiff(colnames(data), feature_columns)
 
+    # Update the data frame with normalized features
     data[, feature_columns] <- features
     group <- enumerate_groups(as.character(data$sampleType))
-    # remove blanks from dataset
+    # Optionally remove blank samples
     if (exclude_blanks) {
         data <- exclude_group(data, group)
     }
-
-    return(data)
+    data <- final_data_processing(
+        data, non_feature_columns,
+        transpose, original_first_colname
+    )
+    data
 }
 
+# Sorts the data frame by batch and injection order (if batch exists),
+# otherwise by injection order only
 sort_by_injection_order <- function(data) {
     if ("batch" %in% colnames(data)) {
-        data <- data[order(data[, "batch"], data[, "injectionOrder"], decreasing = FALSE), ]
+        data <- data[
+            order(data[, "batch"],
+                data[, "injectionOrder"],
+                decreasing = FALSE
+            ),
+        ]
     } else {
         data <- data[order(data[, "injectionOrder"], decreasing = FALSE), ]
     }
-    return(data)
+    data
 }
 
+# Verifies that required columns exist and that there are no missing values
 verify_input_dataframe <- function(data, required_columns) {
     if (anyNA(data)) {
         stop("Error: dataframe cannot contain NULL values!
-Make sure that your dataframe does not contain empty cells")
+    \nMake sure that your dataframe does not contain empty cells")
     } else if (!all(required_columns %in% colnames(data))) {
         stop(
             "Error: missing metadata!
-Make sure that the following columns are present in your dataframe: ",
+      \nMake sure that the following columns are present in your dataframe: ",
             paste(required_columns, collapse = ", ")
         )
     }
-
     data <- verify_column_types(data, required_columns)
-
-    return(data)
+    data
 }
 
+# Checks column types for required and feature columns
+# and removes problematic feature columns
 verify_column_types <- function(data, required_columns) {
-    # Specify the column names and their expected types
+    # Checks that required columns have the correct type
+    # and removes non-numeric feature columns efficiently.
     column_types <- list(
         "sampleName" = c("character", "factor"),
         "class" = c("character", "factor", "integer"),
@@ -165,120 +242,133 @@
         "injectionOrder" = "integer",
         "batch" = "integer"
     )
-
     column_types <- column_types[required_columns]
 
-    for (col_name in names(data)) {
+    # Check required columns' types (fast, vectorized)
+    for (col_name in names(column_types)) {
+        if (!col_name %in% names(data)) next
+        expected_types <- column_types[[col_name]]
         actual_type <- class(data[[col_name]])
-        if (col_name %in% names(column_types)) {
-            expected_types <- column_types[[col_name]]
-
-            if (!actual_type %in% expected_types) {
-                stop(
-                    "Column ", col_name, " is of type ", actual_type,
-                    " but expected type is ",
-                    paste(expected_types, collapse = " or "), "\n"
-                )
-            }
-        } else {
-            if (actual_type != "numeric") {
-                data[[col_name]] <- as.numeric(as.character(data[[col_name]]))
-            }
+        if (!actual_type %in% expected_types) {
+            stop(
+                "Column ", col_name, " is of type ", actual_type,
+                " but expected type is ",
+                paste(expected_types, collapse = " or "), "\n"
+            )
         }
     }
-    return(data)
+
+    # Identify feature columns (not required columns)
+    feature_cols <- setdiff(names(data), required_columns)
+    # Try to convert all feature columns to numeric in one go
+    # as well as suppressing warnings
+    data[feature_cols] <- suppressWarnings(
+        lapply(
+            data[feature_cols],
+            function(x) as.numeric(as.character(x))
+        )
+    )
+    # Find columns that are problematic (contain any NA after conversion)
+    na_counts <- vapply(data[feature_cols], function(x) any(is.na(x)), logical(1))
+    removed_columns <- names(na_counts)[na_counts]
+    if (length(removed_columns) > 0) {
+        message(
+            "Removed problematic columns (non-numeric): ",
+            paste(removed_columns, collapse = ", ")
+        )
+    }
+
+    # Keep only good columns
+    keep_cols <- !(names(data) %in% removed_columns)
+    data <- data[, keep_cols, drop = FALSE]
+    data
 }
 
-# Match group labels with [blank/sample/qc] and enumerate them
+# Enumerates group labels: blank=0, sample=1, qc=2, standard=3
 enumerate_groups <- function(group) {
     group[grepl("blank", tolower(group))] <- 0
     group[grepl("sample", tolower(group))] <- 1
     group[grepl("qc", tolower(group))] <- 2
     group[grepl("standard", tolower(group))] <- 3
-
-    return(group)
+    group
 }
 
-# Create appropriate input for R wavelets function
+# Returns the correct wavelet filter string for the R wavelets function
 get_wf <- function(wavelet_filter, wavelet_length) {
     wf <- paste(wavelet_filter, wavelet_length, sep = "")
-
-    # exception to the wavelet function
+    # Exception for Daubechies-2
     if (wf == "d2") {
         wf <- "haar"
     }
-
-    return(wf)
+    wf
 }
 
-# Exclude blanks from a dataframe
+# Removes blank samples (group==0) from the data frame
 exclude_group <- function(data, group) {
     row_idx_to_exclude <- which(group %in% 0)
     if (length(row_idx_to_exclude) > 0) {
         data_without_blanks <- data[-c(row_idx_to_exclude), ]
         cat("Blank samples have been excluded from the dataframe.\n")
-        return(data_without_blanks)
+        data_without_blanks
     } else {
-        return(data)
+        data
     }
 }
 
-store_data <- function(data, feature_output, metadata_output, ext, split_output = FALSE) {
+# Stores the output data in the requested format (csv, tsv/tabular, parquet),
+# optionally splitting metadata and features
+store_data <- function(data, feature_output, ext) {
     if (ext == "parquet") {
-        if (split_output == TRUE) {
-            split_df <- split_output(data)
-            arrow::write_parquet(split_df$metadata, metadata_output)
-            arrow::write_parquet(split_df$feature_table, feature_output)
-        } else {
-            arrow::write_parquet(data, feature_output)
-        }
+        arrow::write_parquet(data, feature_output)
+    } else if (ext == "csv") {
+        write.csv(data, file = feature_output, row.names = FALSE, quote = FALSE)
+    } else if (ext == "tsv" || ext == "tabular") {
+        write.table(data,
+            file = feature_output, sep = "\t",
+            row.names = FALSE, quote = FALSE
+        )
     } else {
-        if (split_output == TRUE) {
-            split_df <- split_output(data)
-            write.table(split_df$metadata,
-                file = metadata_output, sep = "\t",
-                row.names = FALSE, quote = FALSE
-            )
-            write.table(split_df$feature_table,
-                file = feature_output, sep = "\t",
-                row.names = FALSE, quote = FALSE
-            )
-        } else {
-            write.table(data,
-                file = feature_output, sep = "\t",
-                row.names = FALSE, quote = FALSE
-            )
-        }
+        stop(paste("Unsupported file extension:", ext))
     }
     cat("Normalization has been completed.\n")
 }
 
-split_output <- function(df) {
-    required_columns_set1 <- c("sampleName", "class", "sampleType", "injectionOrder", "batch")
-    required_columns_set2 <- c("sampleName", "class", "sampleType", "injectionOrder")
-
-    if (all(required_columns_set1 %in% colnames(df))) {
-        metadata_df <- df[, required_columns_set1, drop = FALSE]
-        df <- df[, -c(2:5)]
-    } else if (all(required_columns_set2 %in% colnames(df))) {
-        metadata_df <- df[, required_columns_set2, drop = FALSE]
-        df <- df[, -c(2:4)]
-    } else {
-        stop("Neither set of required columns is present in the dataframe.")
-    }
-
-    # Transpose the feature table
-    col_names <- c("id", as.vector(df[[1]]))
-    feature_table <- tranpose_data(df, col_names)
-
-    return(list(metadata = metadata_df, feature_table = feature_table))
-}
-
 tranpose_data <- function(data, column_names) {
     t_data <- data[-1]
     t_data <- t(t_data)
     tranposed_data <- data.frame(rownames(t_data), t_data)
     colnames(tranposed_data) <- column_names
 
-    return(tranposed_data)
+    tranposed_data
 }
+
+
+final_data_processing <- function(
+    data, non_feature_columns,
+    transpose, original_first_colname) {
+    # Remove all columns that are in non_
+    # feature_columns, except the first column
+    cols_to_keep <- !(colnames(data) %in% non_feature_columns)
+    cols_to_keep[1] <- TRUE # Always keep the first column
+    data <- data[, cols_to_keep, drop = FALSE]
+
+
+    if (transpose) {
+        # The first column becomes the new column names
+        new_colnames <- as.character(data[[1]])
+        # Remove the first column
+        data <- data[, -1, drop = FALSE]
+        # Transpose the rest
+        data <- t(data)
+        # Convert to data frame
+        data <- as.data.frame(data, stringsAsFactors = FALSE)
+        # The first row becomes the first column
+        first_col <- rownames(data)
+        data <- cbind(first_col, data)
+        # Set column names
+        colnames(data) <- c(colnames(data)[1], new_colnames)
+        rownames(data) <- NULL
+    }
+    colnames(data)[1] <- original_first_colname
+    data
+}