Mercurial > repos > recetox > waveica
view waveica_wrapper.R @ 10:821062fc5782 draft default tip
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/waveica commit 2b8b1dcb2947c6503fd4f82904df708e4f88ea1d
author | recetox |
---|---|
date | Fri, 04 Jul 2025 09:43:22 +0000 |
parents | 6fc9f6dbcef5 |
children |
line wrap: on
line source
# Read data from a file in the specified format (csv, tsv/tabular, or parquet) read_data <- function(file, ext, transpose = FALSE) { # Reads a data file based on its extension and returns a data frame. if (ext == "csv") { tryCatch( { data <- read.csv(file, header = TRUE) }, error = function(e) { stop( paste0( "Failed to read as CSV. The file may not ", "be a valid text file or may be corrupted: ", file, "\nError: ", e$message ) ) } ) } else if (ext == "tsv" || ext == "tabular") { tryCatch( { data <- read.csv(file, header = TRUE, sep = "\t") }, error = function(e) { stop( paste0( "Failed to read as TSV/tabular.", "The file may not be a valid text file or may be corrupted: ", file, "\nError: ", e$message ) ) } ) } else if (ext == "parquet") { data <- arrow::read_parquet(file) } else { stop(paste("Unsupported file extension or format for reading:", ext)) } original_first_colname <- colnames(data)[1] if (transpose) { col_names <- c("sampleName", data[[1]]) data <- tranpose_data(data, col_names) } return(list(data = data, original_first_colname = original_first_colname)) } # Main function for batchwise WaveICA normalization waveica <- function(data_matrix_file, sample_metadata_file, ft_ext, mt_ext, wavelet_filter, wavelet_length, k, t, t2, alpha, exclude_blanks, transpose = FALSE) { # Reads feature and metadata tables, merges them, # verifies columns, runs WaveICA, and returns normalized data. read_features_response <- read_data( data_matrix_file, ft_ext, transpose ) features <- read_features_response$data original_first_colname <- read_features_response$original_first_colname read_metadata_response <- read_data(sample_metadata_file, mt_ext) metadata <- read_metadata_response$data required_columns <- c( "sampleName", "class", "sampleType", "injectionOrder", "batch" ) metadata <- dplyr::select(metadata, required_columns) # Ensure both tables have a sampleName column if (!"sampleName" %in% colnames(features) || !"sampleName" %in% colnames(metadata)) { # nolint stop("Both feature and metadata tables must contain a 'sampleName' column.") } data <- merge(metadata, features, by = "sampleName") data <- verify_input_dataframe(data, required_columns) data <- sort_by_injection_order(data) # Separate features, batch, and group columns feature_columns <- colnames(data)[!colnames(data) %in% required_columns] features <- data[, feature_columns] group <- enumerate_groups(as.character(data$sampleType)) batch <- data$batch # Check that wavelet level is not too high for the number of samples max_level <- floor(log2(nrow(features))) requested_level <- as.numeric(wavelet_length) if (requested_level > max_level) { stop(sprintf( paste0( "Wavelet length/level (%d) is too high for ", "the number of samples (%d). Maximum allowed is %d." ), requested_level, nrow(features), max_level )) } # Run WaveICA normalization features <- recetox.waveica::waveica( data = features, wf = get_wf(wavelet_filter, wavelet_length), batch = batch, group = group, K = k, t = t, t2 = t2, alpha = alpha ) non_feature_columns <- setdiff(colnames(data), feature_columns) # Update the data frame with normalized features data[, feature_columns] <- features # Optionally remove blank samples if (exclude_blanks) { data <- exclude_group(data, group) } data <- final_data_processing( data, non_feature_columns, transpose, original_first_colname ) data } # Main function for single-batch WaveICA normalization waveica_singlebatch <- function(data_matrix_file, sample_metadata_file, ft_ext, mt_ext, wavelet_filter, wavelet_length, k, alpha, cutoff, exclude_blanks, transpose = FALSE) { # Reads feature and metadata tables, merges them, # verifies columns, runs WaveICA (single batch), and returns normalized data. read_features_response <- read_data(data_matrix_file, ft_ext, transpose) features <- read_features_response$data original_first_colname <- read_features_response$original_first_colname read_data_response <- read_data(sample_metadata_file, mt_ext) metadata <- read_data_response$data # Ensure both tables have a sampleName column if (!"sampleName" %in% colnames(features) || !"sampleName" %in% colnames(metadata)) { # nolint stop("Both feature and metadata tables must contain a 'sampleName' column.") } data <- merge(metadata, features, by = "sampleName") required_columns <- c("sampleName", "class", "sampleType", "injectionOrder") optional_columns <- c("batch") data <- verify_input_dataframe(data, required_columns) data <- sort_by_injection_order(data) feature_columns <- colnames(data)[ !colnames(data) %in% c(required_columns, optional_columns) ] features <- data[, feature_columns] injection_order <- data$injectionOrder # Run WaveICA normalization (single batch) features <- recetox.waveica::waveica_nonbatchwise( data = features, wf = get_wf(wavelet_filter, wavelet_length), injection_order = injection_order, K = k, alpha = alpha, cutoff = cutoff ) non_feature_columns <- setdiff(colnames(data), feature_columns) # Update the data frame with normalized features data[, feature_columns] <- features group <- enumerate_groups(as.character(data$sampleType)) # Optionally remove blank samples if (exclude_blanks) { data <- exclude_group(data, group) } data <- final_data_processing( data, non_feature_columns, transpose, original_first_colname ) data } # Sorts the data frame by batch and injection order (if batch exists), # otherwise by injection order only sort_by_injection_order <- function(data) { if ("batch" %in% colnames(data)) { data <- data[ order(data[, "batch"], data[, "injectionOrder"], decreasing = FALSE ), ] } else { data <- data[order(data[, "injectionOrder"], decreasing = FALSE), ] } data } # Verifies that required columns exist and that there are no missing values verify_input_dataframe <- function(data, required_columns) { if (anyNA(data)) { stop("Error: dataframe cannot contain NULL values! \nMake sure that your dataframe does not contain empty cells") } else if (!all(required_columns %in% colnames(data))) { stop( "Error: missing metadata! \nMake sure that the following columns are present in your dataframe: ", paste(required_columns, collapse = ", ") ) } data <- verify_column_types(data, required_columns) data } # Checks column types for required and feature columns # and removes problematic feature columns verify_column_types <- function(data, required_columns) { # Checks that required columns have the correct type # and removes non-numeric feature columns efficiently. column_types <- list( "sampleName" = c("character", "factor"), "class" = c("character", "factor", "integer"), "sampleType" = c("character", "factor"), "injectionOrder" = "integer", "batch" = "integer" ) column_types <- column_types[required_columns] # Check required columns' types (fast, vectorized) for (col_name in names(column_types)) { if (!col_name %in% names(data)) next expected_types <- column_types[[col_name]] actual_type <- class(data[[col_name]]) if (!actual_type %in% expected_types) { stop( "Column ", col_name, " is of type ", actual_type, " but expected type is ", paste(expected_types, collapse = " or "), "\n" ) } } # Identify feature columns (not required columns) feature_cols <- setdiff(names(data), required_columns) # Try to convert all feature columns to numeric in one go # as well as suppressing warnings data[feature_cols] <- suppressWarnings( lapply( data[feature_cols], function(x) as.numeric(as.character(x)) ) ) # Find columns that are problematic (contain any NA after conversion) na_counts <- vapply(data[feature_cols], function(x) any(is.na(x)), logical(1)) removed_columns <- names(na_counts)[na_counts] if (length(removed_columns) > 0) { message( "Removed problematic columns (non-numeric): ", paste(removed_columns, collapse = ", ") ) } # Keep only good columns keep_cols <- !(names(data) %in% removed_columns) data <- data[, keep_cols, drop = FALSE] data } # Enumerates group labels: blank=0, sample=1, qc=2, standard=3 enumerate_groups <- function(group) { group[grepl("blank", tolower(group))] <- 0 group[grepl("sample", tolower(group))] <- 1 group[grepl("qc", tolower(group))] <- 2 group[grepl("standard", tolower(group))] <- 3 group } # Returns the correct wavelet filter string for the R wavelets function get_wf <- function(wavelet_filter, wavelet_length) { wf <- paste(wavelet_filter, wavelet_length, sep = "") # Exception for Daubechies-2 if (wf == "d2") { wf <- "haar" } wf } # Removes blank samples (group==0) from the data frame exclude_group <- function(data, group) { row_idx_to_exclude <- which(group %in% 0) if (length(row_idx_to_exclude) > 0) { data_without_blanks <- data[-c(row_idx_to_exclude), ] cat("Blank samples have been excluded from the dataframe.\n") data_without_blanks } else { data } } # Stores the output data in the requested format (csv, tsv/tabular, parquet), # optionally splitting metadata and features store_data <- function(data, feature_output, ext) { if (ext == "parquet") { arrow::write_parquet(data, feature_output) } else if (ext == "csv") { write.csv(data, file = feature_output, row.names = FALSE, quote = FALSE) } else if (ext == "tsv" || ext == "tabular") { write.table(data, file = feature_output, sep = "\t", row.names = FALSE, quote = FALSE ) } else { stop(paste("Unsupported file extension:", ext)) } cat("Normalization has been completed.\n") } tranpose_data <- function(data, column_names) { t_data <- data[-1] t_data <- t(t_data) tranposed_data <- data.frame(rownames(t_data), t_data) colnames(tranposed_data) <- column_names tranposed_data } final_data_processing <- function( data, non_feature_columns, transpose, original_first_colname) { # Remove all columns that are in non_ # feature_columns, except the first column cols_to_keep <- !(colnames(data) %in% non_feature_columns) cols_to_keep[1] <- TRUE # Always keep the first column data <- data[, cols_to_keep, drop = FALSE] if (transpose) { # The first column becomes the new column names new_colnames <- as.character(data[[1]]) # Remove the first column data <- data[, -1, drop = FALSE] # Transpose the rest data <- t(data) # Convert to data frame data <- as.data.frame(data, stringsAsFactors = FALSE) # The first row becomes the first column first_col <- rownames(data) data <- cbind(first_col, data) # Set column names colnames(data) <- c(colnames(data)[1], new_colnames) rownames(data) <- NULL } colnames(data)[1] <- original_first_colname data }