# HG changeset patch
# User recetox
# Date 1751622202 0
# Node ID 821062fc57829d48a30ba2b82fde84d487495459
# Parent 6fc9f6dbcef5da6d83331c89c4fe64dd75aae531
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/waveica commit 2b8b1dcb2947c6503fd4f82904df708e4f88ea1d
diff -r 6fc9f6dbcef5 -r 821062fc5782 macros.xml
--- a/macros.xml Fri May 16 10:14:27 2025 +0000
+++ b/macros.xml Fri Jul 04 09:43:22 2025 +0000
@@ -26,10 +26,6 @@
-
-
-
-
@@ -81,22 +77,15 @@
-
-
-
-
-
-
-
- keep_two_output
-
-
+
+
+
+
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/data_matrix.csv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/data_matrix.csv Fri Jul 04 09:43:22 2025 +0000
@@ -0,0 +1,6 @@
+sampleName,M85T34,M86T41,M86T518,M86T539
+VT_160120_002,228520.06430737,35646729.21543971,2386896.97966461,1026645.83653468
+VT_160120_004,90217.384387202,35735702.457215995,2456290.69621518,1089246.46040563
+VT_160120_006,235656.75288383896,37021134.452711605,8873450.40260241,837856.449608585
+VT_160120_008,16622.9351783435,44302499.262606,2466946.89667101,994979.069689685
+VT_160120_010,62385.0742465736,44639738.0735709,2389372.85729467,954938.131337246
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/data_matrix.parquet
Binary file test-data/data_matrix.parquet has changed
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/data_matrix.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/data_matrix.tsv Fri Jul 04 09:43:22 2025 +0000
@@ -0,0 +1,6 @@
+sampleName M85T34 M86T41 M86T518 M86T539
+VT_160120_002 228520.06430737 35646729.21543971 2386896.97966461 1026645.83653468
+VT_160120_004 90217.384387202 35735702.457215995 2456290.69621518 1089246.46040563
+VT_160120_006 235656.75288383896 37021134.452711605 8873450.40260241 837856.449608585
+VT_160120_008 16622.9351783435 44302499.262606 2466946.89667101 994979.069689685
+VT_160120_010 62385.0742465736 44639738.0735709 2389372.85729467 954938.131337246
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/feature_table.csv
--- a/test-data/feature_table.csv Fri May 16 10:14:27 2025 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-sampleName,M85T34,M86T41,M86T518,M86T539
-VT_160120_002,228520.06430737,35646729.21543971,2386896.97966461,1026645.83653468
-VT_160120_004,90217.384387202,35735702.457215995,2456290.69621518,1089246.46040563
-VT_160120_006,235656.75288383896,37021134.452711605,8873450.40260241,837856.449608585
-VT_160120_008,16622.9351783435,44302499.262606,2466946.89667101,994979.069689685
-VT_160120_010,62385.0742465736,44639738.0735709,2389372.85729467,954938.131337246
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/feature_table.parquet
Binary file test-data/feature_table.parquet has changed
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/feature_table.tsv
--- a/test-data/feature_table.tsv Fri May 16 10:14:27 2025 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-sampleName M85T34 M86T41 M86T518 M86T539
-VT_160120_002 228520.06430737 35646729.21543971 2386896.97966461 1026645.83653468
-VT_160120_004 90217.384387202 35735702.457215995 2456290.69621518 1089246.46040563
-VT_160120_006 235656.75288383896 37021134.452711605 8873450.40260241 837856.449608585
-VT_160120_008 16622.9351783435 44302499.262606 2466946.89667101 994979.069689685
-VT_160120_010 62385.0742465736 44639738.0735709 2389372.85729467 954938.131337246
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/metadata.csv
--- a/test-data/metadata.csv Fri May 16 10:14:27 2025 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-sampleName,class,sampleType,injectionOrder,batch
-VT_160120_002,sample,sample,1,1
-VT_160120_004,sample,sample,2,1
-VT_160120_006,sample,sample,3,1
-VT_160120_008,sample,sample,4,1
-VT_160120_010,sample,sample,5,1
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/metadata.parquet
Binary file test-data/metadata.parquet has changed
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/metadata.tsv
--- a/test-data/metadata.tsv Fri May 16 10:14:27 2025 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-sampleName class sampleType injectionOrder batch
-VT_160120_002 sample sample 1 1
-VT_160120_004 sample sample 2 1
-VT_160120_006 sample sample 3 1
-VT_160120_008 sample sample 4 1
-VT_160120_010 sample sample 5 1
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/normalized_data.csv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/normalized_data.csv Fri Jul 04 09:43:22 2025 +0000
@@ -0,0 +1,5 @@
+id,VT_160120_002,VT_160120_004,VT_160120_006,VT_160120_008,VT_160120_010
+M85T34,355200.506508035,216897.826587868,362337.195084505,143303.377379009,189065.516447239
+M86T41,75115889.9077485,75204863.1495248,76490295.1450204,83771659.9549148,84108898.7658797
+M86T518,6101488.54615418,6170882.26270475,12588041.969092,6181538.46316058,6103964.42378424
+M86T539,2007379.02604984,2069979.64992079,1818589.63912375,1975712.25920485,1935671.32085241
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/normalized_data.parquet
Binary file test-data/normalized_data.parquet has changed
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/normalized_data.tsv
--- a/test-data/normalized_data.tsv Fri May 16 10:14:27 2025 +0000
+++ b/test-data/normalized_data.tsv Fri Jul 04 09:43:22 2025 +0000
@@ -1,6 +1,6 @@
-sampleName class sampleType injectionOrder batch M85T34 M86T41 M86T518 M86T539
-VT_160120_002 sample sample 1 1 355200.506508035 75115889.9077485 6101488.54615418 2007379.02604984
-VT_160120_004 sample sample 2 1 216897.826587868 75204863.1495248 6170882.26270475 2069979.64992079
-VT_160120_006 sample sample 3 1 362337.195084504 76490295.1450204 12588041.969092 1818589.63912375
-VT_160120_008 sample sample 4 1 143303.377379009 83771659.9549148 6181538.46316058 1975712.25920485
-VT_160120_010 sample sample 5 1 189065.516447239 84108898.7658797 6103964.42378424 1935671.32085241
+sampleName M85T34 M86T41 M86T518 M86T539
+VT_160120_002 355200.506508035 75115889.9077485 6101488.54615418 2007379.02604984
+VT_160120_004 216897.826587868 75204863.1495248 6170882.26270475 2069979.64992079
+VT_160120_006 362337.195084505 76490295.1450204 12588041.969092 1818589.63912375
+VT_160120_008 143303.377379009 83771659.9549148 6181538.46316058 1975712.25920485
+VT_160120_010 189065.516447239 84108898.7658797 6103964.42378424 1935671.32085241
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/normalized_data_nobatch.tsv
--- a/test-data/normalized_data_nobatch.tsv Fri May 16 10:14:27 2025 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-sampleName class sampleType injectionOrder M85T34 M86T41 M86T518 M86T539
-VT_160120_002 sample sample 1 -9795801.68327296 29546678.5668352 -6207890.55898405 -8941748.93595845
-VT_160120_004 sample sample 2 -9798910.74239713 29543569.5077111 -6210999.61810821 -8944857.99508262
-VT_160120_006 sample sample 3 -9797307.93141959 29545172.3186886 -6209396.80713068 -8943255.18410509
-VT_160120_008 sample sample 4 -9793706.69204694 29548773.5580612 -6205795.56775803 -8939653.94473244
-VT_160120_010 sample sample 5 -9800711.45464277 29541768.7954654 -6212800.33035386 -8946658.70732827
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/sample_metadata.csv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_metadata.csv Fri Jul 04 09:43:22 2025 +0000
@@ -0,0 +1,6 @@
+sampleName,class,sampleType,injectionOrder,batch
+VT_160120_002,sample,sample,1,1
+VT_160120_004,sample,sample,2,1
+VT_160120_006,sample,sample,3,1
+VT_160120_008,sample,sample,4,1
+VT_160120_010,sample,sample,5,1
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/sample_metadata.parquet
Binary file test-data/sample_metadata.parquet has changed
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/sample_metadata.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_metadata.tsv Fri Jul 04 09:43:22 2025 +0000
@@ -0,0 +1,6 @@
+sampleName class sampleType injectionOrder batch
+VT_160120_002 sample sample 1 1
+VT_160120_004 sample sample 2 1
+VT_160120_006 sample sample 3 1
+VT_160120_008 sample sample 4 1
+VT_160120_010 sample sample 5 1
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/test10_output1.csv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test10_output1.csv Fri Jul 04 09:43:22 2025 +0000
@@ -0,0 +1,5 @@
+id,VT_160120_002,VT_160120_004,VT_160120_006,VT_160120_008,VT_160120_010
+M85T34,355200.506508035,216897.826587868,362337.195084505,143303.377379009,189065.516447239
+M86T41,75115889.9077485,75204863.1495248,76490295.1450204,83771659.9549148,84108898.7658797
+M86T518,6101488.54615418,6170882.26270475,12588041.969092,6181538.46316058,6103964.42378424
+M86T539,2007379.02604984,2069979.64992079,1818589.63912375,1975712.25920485,1935671.32085241
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/test10_output1.tsv
--- a/test-data/test10_output1.tsv Fri May 16 10:14:27 2025 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,5 +0,0 @@
-id VT_160120_002 VT_160120_004 VT_160120_006 VT_160120_008 VT_160120_010
-M85T34 355200.506508035 216897.826587868 362337.195084504 143303.377379009 189065.516447239
-M86T41 75115889.9077485 75204863.1495248 76490295.1450204 83771659.9549148 84108898.7658797
-M86T518 6101488.54615418 6170882.26270475 12588041.969092 6181538.46316058 6103964.42378424
-M86T539 2007379.02604984 2069979.64992079 1818589.63912375 1975712.25920485 1935671.32085241
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/test10_output2.tsv
--- a/test-data/test10_output2.tsv Fri May 16 10:14:27 2025 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-sampleName class sampleType injectionOrder batch
-VT_160120_002 sample sample 1 1
-VT_160120_004 sample sample 2 1
-VT_160120_006 sample sample 3 1
-VT_160120_008 sample sample 4 1
-VT_160120_010 sample sample 5 1
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/test1_output.csv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test1_output.csv Fri Jul 04 09:43:22 2025 +0000
@@ -0,0 +1,6 @@
+sampleName,M85T34,M86T41,M86T518,M86T539
+VT_160120_002,355200.506508035,75115889.9077485,6101488.54615418,2007379.02604984
+VT_160120_004,216897.826587868,75204863.1495248,6170882.26270475,2069979.64992079
+VT_160120_006,362337.195084505,76490295.1450204,12588041.969092,1818589.63912375
+VT_160120_008,143303.377379009,83771659.9549148,6181538.46316058,1975712.25920485
+VT_160120_010,189065.516447239,84108898.7658797,6103964.42378424,1935671.32085241
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/test3_output.parquet
Binary file test-data/test3_output.parquet has changed
diff -r 6fc9f6dbcef5 -r 821062fc5782 test-data/test9_output2.parquet
Binary file test-data/test9_output2.parquet has changed
diff -r 6fc9f6dbcef5 -r 821062fc5782 waveica.xml
--- a/waveica.xml Fri May 16 10:14:27 2025 +0000
+++ b/waveica.xml Fri Jul 04 09:43:22 2025 +0000
@@ -1,4 +1,4 @@
-
+
removal of batch effects for untargeted metabolomics data
macros.xml
@@ -9,67 +9,49 @@
r-recetox-waveica
r-arrow
+ r-dplyr
-
-
-
-
-
-
-
-
-
-
-
-
+
+
-
-
@@ -89,16 +71,14 @@
-
-
-
-
-
+
+
+
@@ -110,12 +90,11 @@
-
+
-
-
-
-
+
+
+
@@ -129,29 +108,9 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
@@ -163,73 +122,40 @@
-
+
-
-
-
-
-
-
-
+
+
+
-
-
-
-
-
-
-
+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
-
-
-
-
-
-
-
-
-
+
+
+
+
+
diff -r 6fc9f6dbcef5 -r 821062fc5782 waveica_wrapper.R
--- a/waveica_wrapper.R Fri May 16 10:14:27 2025 +0000
+++ b/waveica_wrapper.R Fri Jul 04 09:43:22 2025 +0000
@@ -1,65 +1,112 @@
-read_file <- function(file, metadata, ft_ext, mt_ext, transpose) {
- data <- read_data(file, ft_ext)
+# Read data from a file in the specified format (csv, tsv/tabular, or parquet)
+read_data <- function(file, ext, transpose = FALSE) {
+ # Reads a data file based on its extension and returns a data frame.
+ if (ext == "csv") {
+ tryCatch(
+ {
+ data <- read.csv(file, header = TRUE)
+ },
+ error = function(e) {
+ stop(
+ paste0(
+ "Failed to read as CSV. The file may not ",
+ "be a valid text file or may be corrupted: ",
+ file, "\nError: ", e$message
+ )
+ )
+ }
+ )
+ } else if (ext == "tsv" || ext == "tabular") {
+ tryCatch(
+ {
+ data <- read.csv(file, header = TRUE, sep = "\t")
+ },
+ error = function(e) {
+ stop(
+ paste0(
+            "Failed to read as TSV/tabular. ",
+ "The file may not be a valid text file or may be corrupted: ",
+ file, "\nError: ", e$message
+ )
+ )
+ }
+ )
+ } else if (ext == "parquet") {
+ data <- arrow::read_parquet(file)
+ } else {
+ stop(paste("Unsupported file extension or format for reading:", ext))
+ }
+ original_first_colname <- colnames(data)[1]
if (transpose) {
col_names <- c("sampleName", data[[1]])
data <- tranpose_data(data, col_names)
}
-
- if (!is.na(metadata)) {
- mt_data <- read_data(metadata, mt_ext)
- data <- merge(mt_data, data, by = "sampleName")
- }
-
- return(data)
+ return(list(data = data, original_first_colname = original_first_colname))
}
-read_data <- function(file, ext) {
- if (ext == "csv") {
- data <- read.csv(file, header = TRUE)
- } else if (ext == "tsv" || ext == "tabular") {
- data <- read.csv(file, header = TRUE, sep = "\t")
- } else {
- data <- arrow::read_parquet(file)
- }
-
- return(data)
-}
-
-waveica <- function(file,
- metadata = NA,
- ext,
- transpose = FALSE,
+# Main function for batchwise WaveICA normalization
+waveica <- function(data_matrix_file,
+ sample_metadata_file,
+ ft_ext,
+ mt_ext,
wavelet_filter,
wavelet_length,
k,
t,
t2,
alpha,
- exclude_blanks) {
- # get input from the Galaxy, preprocess data
- ext <- strsplit(x = ext, split = "\\,")[[1]]
+ exclude_blanks,
+ transpose = FALSE) {
+ # Reads feature and metadata tables, merges them,
+ # verifies columns, runs WaveICA, and returns normalized data.
+ read_features_response <- read_data(
+ data_matrix_file, ft_ext,
+ transpose
+ )
+ features <- read_features_response$data
+ original_first_colname <- read_features_response$original_first_colname
- ft_ext <- ext[1]
- mt_ext <- ext[2]
-
- data <- read_file(file, metadata, ft_ext, mt_ext, transpose)
+ read_metadata_response <- read_data(sample_metadata_file, mt_ext)
+ metadata <- read_metadata_response$data
required_columns <- c(
"sampleName", "class", "sampleType",
"injectionOrder", "batch"
)
+
+ metadata <- dplyr::select(metadata, required_columns)
+
+ # Ensure both tables have a sampleName column
+ if (!"sampleName" %in% colnames(features) || !"sampleName" %in% colnames(metadata)) { # nolint
+ stop("Both feature and metadata tables must contain a 'sampleName' column.")
+ }
+ data <- merge(metadata, features, by = "sampleName")
+
+
data <- verify_input_dataframe(data, required_columns)
data <- sort_by_injection_order(data)
- # separate data into features, batch and group
+ # Separate features, batch, and group columns
feature_columns <- colnames(data)[!colnames(data) %in% required_columns]
features <- data[, feature_columns]
group <- enumerate_groups(as.character(data$sampleType))
batch <- data$batch
- # run WaveICA
+ # Check that wavelet level is not too high for the number of samples
+ max_level <- floor(log2(nrow(features)))
+ requested_level <- as.numeric(wavelet_length)
+ if (requested_level > max_level) {
+ stop(sprintf(
+ paste0(
+ "Wavelet length/level (%d) is too high for ",
+ "the number of samples (%d). Maximum allowed is %d."
+ ),
+ requested_level, nrow(features), max_level
+ ))
+ }
+ # Run WaveICA normalization
features <- recetox.waveica::waveica(
data = features,
wf = get_wf(wavelet_filter, wavelet_length),
@@ -70,47 +117,63 @@
t2 = t2,
alpha = alpha
)
+ non_feature_columns <- setdiff(colnames(data), feature_columns)
+ # Update the data frame with normalized features
data[, feature_columns] <- features
- # remove blanks from dataset
+ # Optionally remove blank samples
if (exclude_blanks) {
data <- exclude_group(data, group)
}
-
- return(data)
+ data <- final_data_processing(
+ data, non_feature_columns,
+ transpose, original_first_colname
+ )
+ data
}
-waveica_singlebatch <- function(file,
- metadata = NA,
- ext,
- transpose = FALSE,
+# Main function for single-batch WaveICA normalization
+waveica_singlebatch <- function(data_matrix_file,
+ sample_metadata_file,
+ ft_ext,
+ mt_ext,
wavelet_filter,
wavelet_length,
k,
alpha,
cutoff,
- exclude_blanks) {
- # get input from the Galaxy, preprocess data
- ext <- strsplit(x = ext, split = "\\,")[[1]]
+ exclude_blanks,
+ transpose = FALSE) {
+ # Reads feature and metadata tables, merges them,
+ # verifies columns, runs WaveICA (single batch), and returns normalized data.
+ read_features_response <- read_data(data_matrix_file, ft_ext, transpose)
+ features <- read_features_response$data
+ original_first_colname <- read_features_response$original_first_colname
- ft_ext <- ext[1]
- mt_ext <- ext[2]
+ read_data_response <- read_data(sample_metadata_file, mt_ext)
+ metadata <- read_data_response$data
- data <- read_file(file, metadata, ft_ext, mt_ext, transpose)
+ # Ensure both tables have a sampleName column
+ if (!"sampleName" %in% colnames(features) ||
+ !"sampleName" %in% colnames(metadata)) { # nolint
+ stop("Both feature and metadata tables must contain a 'sampleName' column.")
+ }
+ data <- merge(metadata, features, by = "sampleName")
required_columns <- c("sampleName", "class", "sampleType", "injectionOrder")
optional_columns <- c("batch")
-
data <- verify_input_dataframe(data, required_columns)
data <- sort_by_injection_order(data)
- feature_columns <- colnames(data)[!colnames(data) %in% c(required_columns, optional_columns)]
+ feature_columns <- colnames(data)[
+ !colnames(data) %in% c(required_columns, optional_columns)
+ ]
features <- data[, feature_columns]
injection_order <- data$injectionOrder
- # run WaveICA
+ # Run WaveICA normalization (single batch)
features <- recetox.waveica::waveica_nonbatchwise(
data = features,
wf = get_wf(wavelet_filter, wavelet_length),
@@ -119,45 +182,59 @@
alpha = alpha,
cutoff = cutoff
)
+ non_feature_columns <- setdiff(colnames(data), feature_columns)
+ # Update the data frame with normalized features
data[, feature_columns] <- features
group <- enumerate_groups(as.character(data$sampleType))
- # remove blanks from dataset
+ # Optionally remove blank samples
if (exclude_blanks) {
data <- exclude_group(data, group)
}
-
- return(data)
+ data <- final_data_processing(
+ data, non_feature_columns,
+ transpose, original_first_colname
+ )
+ data
}
+# Sorts the data frame by batch and injection order (if batch exists),
+# otherwise by injection order only
sort_by_injection_order <- function(data) {
if ("batch" %in% colnames(data)) {
- data <- data[order(data[, "batch"], data[, "injectionOrder"], decreasing = FALSE), ]
+ data <- data[
+ order(data[, "batch"],
+ data[, "injectionOrder"],
+ decreasing = FALSE
+ ),
+ ]
} else {
data <- data[order(data[, "injectionOrder"], decreasing = FALSE), ]
}
- return(data)
+ data
}
+# Verifies that required columns exist and that there are no missing values
verify_input_dataframe <- function(data, required_columns) {
if (anyNA(data)) {
stop("Error: dataframe cannot contain NULL values!
-Make sure that your dataframe does not contain empty cells")
+ \nMake sure that your dataframe does not contain empty cells")
} else if (!all(required_columns %in% colnames(data))) {
stop(
"Error: missing metadata!
-Make sure that the following columns are present in your dataframe: ",
+ \nMake sure that the following columns are present in your dataframe: ",
paste(required_columns, collapse = ", ")
)
}
-
data <- verify_column_types(data, required_columns)
-
- return(data)
+ data
}
+# Checks column types for required and feature columns
+# and removes problematic feature columns
verify_column_types <- function(data, required_columns) {
- # Specify the column names and their expected types
+ # Checks that required columns have the correct type
+ # and removes non-numeric feature columns efficiently.
column_types <- list(
"sampleName" = c("character", "factor"),
"class" = c("character", "factor", "integer"),
@@ -165,120 +242,133 @@
"injectionOrder" = "integer",
"batch" = "integer"
)
-
column_types <- column_types[required_columns]
- for (col_name in names(data)) {
+ # Check required columns' types (fast, vectorized)
+ for (col_name in names(column_types)) {
+ if (!col_name %in% names(data)) next
+ expected_types <- column_types[[col_name]]
actual_type <- class(data[[col_name]])
- if (col_name %in% names(column_types)) {
- expected_types <- column_types[[col_name]]
-
- if (!actual_type %in% expected_types) {
- stop(
- "Column ", col_name, " is of type ", actual_type,
- " but expected type is ",
- paste(expected_types, collapse = " or "), "\n"
- )
- }
- } else {
- if (actual_type != "numeric") {
- data[[col_name]] <- as.numeric(as.character(data[[col_name]]))
- }
+ if (!actual_type %in% expected_types) {
+ stop(
+ "Column ", col_name, " is of type ", actual_type,
+ " but expected type is ",
+ paste(expected_types, collapse = " or "), "\n"
+ )
}
}
- return(data)
+
+ # Identify feature columns (not required columns)
+ feature_cols <- setdiff(names(data), required_columns)
+  # Try to convert all feature columns to numeric in one go,
+  # suppressing coercion warnings (NAs flag problem columns below)
+ data[feature_cols] <- suppressWarnings(
+ lapply(
+ data[feature_cols],
+ function(x) as.numeric(as.character(x))
+ )
+ )
+ # Find columns that are problematic (contain any NA after conversion)
+ na_counts <- vapply(data[feature_cols], function(x) any(is.na(x)), logical(1))
+ removed_columns <- names(na_counts)[na_counts]
+ if (length(removed_columns) > 0) {
+ message(
+ "Removed problematic columns (non-numeric): ",
+ paste(removed_columns, collapse = ", ")
+ )
+ }
+
+ # Keep only good columns
+ keep_cols <- !(names(data) %in% removed_columns)
+ data <- data[, keep_cols, drop = FALSE]
+ data
}
-# Match group labels with [blank/sample/qc] and enumerate them
+# Enumerates group labels: blank=0, sample=1, qc=2, standard=3
enumerate_groups <- function(group) {
group[grepl("blank", tolower(group))] <- 0
group[grepl("sample", tolower(group))] <- 1
group[grepl("qc", tolower(group))] <- 2
group[grepl("standard", tolower(group))] <- 3
-
- return(group)
+ group
}
-# Create appropriate input for R wavelets function
+# Returns the correct wavelet filter string for the R wavelets function
get_wf <- function(wavelet_filter, wavelet_length) {
wf <- paste(wavelet_filter, wavelet_length, sep = "")
-
- # exception to the wavelet function
+ # Exception for Daubechies-2
if (wf == "d2") {
wf <- "haar"
}
-
- return(wf)
+ wf
}
-# Exclude blanks from a dataframe
+# Removes blank samples (group==0) from the data frame
exclude_group <- function(data, group) {
row_idx_to_exclude <- which(group %in% 0)
if (length(row_idx_to_exclude) > 0) {
data_without_blanks <- data[-c(row_idx_to_exclude), ]
cat("Blank samples have been excluded from the dataframe.\n")
- return(data_without_blanks)
+ data_without_blanks
} else {
- return(data)
+ data
}
}
-store_data <- function(data, feature_output, metadata_output, ext, split_output = FALSE) {
+# Stores the output data in the requested format (csv, tsv/tabular, parquet),
+# optionally splitting metadata and features
+store_data <- function(data, feature_output, ext) {
if (ext == "parquet") {
- if (split_output == TRUE) {
- split_df <- split_output(data)
- arrow::write_parquet(split_df$metadata, metadata_output)
- arrow::write_parquet(split_df$feature_table, feature_output)
- } else {
- arrow::write_parquet(data, feature_output)
- }
+ arrow::write_parquet(data, feature_output)
+ } else if (ext == "csv") {
+ write.csv(data, file = feature_output, row.names = FALSE, quote = FALSE)
+ } else if (ext == "tsv" || ext == "tabular") {
+ write.table(data,
+ file = feature_output, sep = "\t",
+ row.names = FALSE, quote = FALSE
+ )
} else {
- if (split_output == TRUE) {
- split_df <- split_output(data)
- write.table(split_df$metadata,
- file = metadata_output, sep = "\t",
- row.names = FALSE, quote = FALSE
- )
- write.table(split_df$feature_table,
- file = feature_output, sep = "\t",
- row.names = FALSE, quote = FALSE
- )
- } else {
- write.table(data,
- file = feature_output, sep = "\t",
- row.names = FALSE, quote = FALSE
- )
- }
+ stop(paste("Unsupported file extension:", ext))
}
cat("Normalization has been completed.\n")
}
-split_output <- function(df) {
- required_columns_set1 <- c("sampleName", "class", "sampleType", "injectionOrder", "batch")
- required_columns_set2 <- c("sampleName", "class", "sampleType", "injectionOrder")
-
- if (all(required_columns_set1 %in% colnames(df))) {
- metadata_df <- df[, required_columns_set1, drop = FALSE]
- df <- df[, -c(2:5)]
- } else if (all(required_columns_set2 %in% colnames(df))) {
- metadata_df <- df[, required_columns_set2, drop = FALSE]
- df <- df[, -c(2:4)]
- } else {
- stop("Neither set of required columns is present in the dataframe.")
- }
-
- # Transpose the feature table
- col_names <- c("id", as.vector(df[[1]]))
- feature_table <- tranpose_data(df, col_names)
-
- return(list(metadata = metadata_df, feature_table = feature_table))
-}
-
tranpose_data <- function(data, column_names) {
t_data <- data[-1]
t_data <- t(t_data)
tranposed_data <- data.frame(rownames(t_data), t_data)
colnames(tranposed_data) <- column_names
- return(tranposed_data)
+ tranposed_data
}
+
+
+final_data_processing <- function(
+ data, non_feature_columns,
+ transpose, original_first_colname) {
+  # Remove all columns that are in non_feature_columns,
+  # except the first column
+ cols_to_keep <- !(colnames(data) %in% non_feature_columns)
+ cols_to_keep[1] <- TRUE # Always keep the first column
+ data <- data[, cols_to_keep, drop = FALSE]
+
+
+ if (transpose) {
+ # The first column becomes the new column names
+ new_colnames <- as.character(data[[1]])
+ # Remove the first column
+ data <- data[, -1, drop = FALSE]
+ # Transpose the rest
+ data <- t(data)
+ # Convert to data frame
+ data <- as.data.frame(data, stringsAsFactors = FALSE)
+ # The first row becomes the first column
+ first_col <- rownames(data)
+ data <- cbind(first_col, data)
+ # Set column names
+ colnames(data) <- c(colnames(data)[1], new_colnames)
+ rownames(data) <- NULL
+ }
+ colnames(data)[1] <- original_first_colname
+ data
+}