comparison waveica_wrapper.R @ 10:821062fc5782 draft default tip

planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/waveica commit 2b8b1dcb2947c6503fd4f82904df708e4f88ea1d
author recetox
date Fri, 04 Jul 2025 09:43:22 +0000
parents 6fc9f6dbcef5
children
comparison
equal deleted inserted replaced
9:6fc9f6dbcef5 10:821062fc5782
1 read_file <- function(file, metadata, ft_ext, mt_ext, transpose) { 1 # Read data from a file in the specified format (csv, tsv/tabular, or parquet)
2 data <- read_data(file, ft_ext) 2 read_data <- function(file, ext, transpose = FALSE) {
3 3 # Reads a data file based on its extension and returns a data frame.
4 if (ext == "csv") {
5 tryCatch(
6 {
7 data <- read.csv(file, header = TRUE)
8 },
9 error = function(e) {
10 stop(
11 paste0(
12 "Failed to read as CSV. The file may not ",
13 "be a valid text file or may be corrupted: ",
14 file, "\nError: ", e$message
15 )
16 )
17 }
18 )
19 } else if (ext == "tsv" || ext == "tabular") {
20 tryCatch(
21 {
22 data <- read.csv(file, header = TRUE, sep = "\t")
23 },
24 error = function(e) {
25 stop(
26 paste0(
27 "Failed to read as TSV/tabular.",
28 "The file may not be a valid text file or may be corrupted: ",
29 file, "\nError: ", e$message
30 )
31 )
32 }
33 )
34 } else if (ext == "parquet") {
35 data <- arrow::read_parquet(file)
36 } else {
37 stop(paste("Unsupported file extension or format for reading:", ext))
38 }
39
40 original_first_colname <- colnames(data)[1]
4 if (transpose) { 41 if (transpose) {
5 col_names <- c("sampleName", data[[1]]) 42 col_names <- c("sampleName", data[[1]])
6 data <- tranpose_data(data, col_names) 43 data <- tranpose_data(data, col_names)
7 } 44 }
8 45 return(list(data = data, original_first_colname = original_first_colname))
9 if (!is.na(metadata)) { 46 }
10 mt_data <- read_data(metadata, mt_ext) 47
11 data <- merge(mt_data, data, by = "sampleName") 48 # Main function for batchwise WaveICA normalization
12 } 49 waveica <- function(data_matrix_file,
13 50 sample_metadata_file,
14 return(data) 51 ft_ext,
15 } 52 mt_ext,
16
17 read_data <- function(file, ext) {
18 if (ext == "csv") {
19 data <- read.csv(file, header = TRUE)
20 } else if (ext == "tsv" || ext == "tabular") {
21 data <- read.csv(file, header = TRUE, sep = "\t")
22 } else {
23 data <- arrow::read_parquet(file)
24 }
25
26 return(data)
27 }
28
29 waveica <- function(file,
30 metadata = NA,
31 ext,
32 transpose = FALSE,
33 wavelet_filter, 53 wavelet_filter,
34 wavelet_length, 54 wavelet_length,
35 k, 55 k,
36 t, 56 t,
37 t2, 57 t2,
38 alpha, 58 alpha,
39 exclude_blanks) { 59 exclude_blanks,
40 # get input from the Galaxy, preprocess data 60 transpose = FALSE) {
41 ext <- strsplit(x = ext, split = "\\,")[[1]] 61 # Reads feature and metadata tables, merges them,
42 62 # verifies columns, runs WaveICA, and returns normalized data.
43 ft_ext <- ext[1] 63 read_features_response <- read_data(
44 mt_ext <- ext[2] 64 data_matrix_file, ft_ext,
45 65 transpose
46 data <- read_file(file, metadata, ft_ext, mt_ext, transpose) 66 )
67 features <- read_features_response$data
68 original_first_colname <- read_features_response$original_first_colname
69
70 read_metadata_response <- read_data(sample_metadata_file, mt_ext)
71 metadata <- read_metadata_response$data
47 72
48 required_columns <- c( 73 required_columns <- c(
49 "sampleName", "class", "sampleType", 74 "sampleName", "class", "sampleType",
50 "injectionOrder", "batch" 75 "injectionOrder", "batch"
51 ) 76 )
77
78 metadata <- dplyr::select(metadata, required_columns)
79
80 # Ensure both tables have a sampleName column
81 if (!"sampleName" %in% colnames(features) || !"sampleName" %in% colnames(metadata)) { # nolint
82 stop("Both feature and metadata tables must contain a 'sampleName' column.")
83 }
84 data <- merge(metadata, features, by = "sampleName")
85
86
52 data <- verify_input_dataframe(data, required_columns) 87 data <- verify_input_dataframe(data, required_columns)
53 88
54 data <- sort_by_injection_order(data) 89 data <- sort_by_injection_order(data)
55 90
56 # separate data into features, batch and group 91 # Separate features, batch, and group columns
57 feature_columns <- colnames(data)[!colnames(data) %in% required_columns] 92 feature_columns <- colnames(data)[!colnames(data) %in% required_columns]
58 features <- data[, feature_columns] 93 features <- data[, feature_columns]
59 group <- enumerate_groups(as.character(data$sampleType)) 94 group <- enumerate_groups(as.character(data$sampleType))
60 batch <- data$batch 95 batch <- data$batch
61 96
62 # run WaveICA 97 # Check that wavelet level is not too high for the number of samples
98 max_level <- floor(log2(nrow(features)))
99 requested_level <- as.numeric(wavelet_length)
100 if (requested_level > max_level) {
101 stop(sprintf(
102 paste0(
103 "Wavelet length/level (%d) is too high for ",
104 "the number of samples (%d). Maximum allowed is %d."
105 ),
106 requested_level, nrow(features), max_level
107 ))
108 }
109 # Run WaveICA normalization
63 features <- recetox.waveica::waveica( 110 features <- recetox.waveica::waveica(
64 data = features, 111 data = features,
65 wf = get_wf(wavelet_filter, wavelet_length), 112 wf = get_wf(wavelet_filter, wavelet_length),
66 batch = batch, 113 batch = batch,
67 group = group, 114 group = group,
68 K = k, 115 K = k,
69 t = t, 116 t = t,
70 t2 = t2, 117 t2 = t2,
71 alpha = alpha 118 alpha = alpha
72 ) 119 )
73 120 non_feature_columns <- setdiff(colnames(data), feature_columns)
121
122 # Update the data frame with normalized features
74 data[, feature_columns] <- features 123 data[, feature_columns] <- features
75 124
76 # remove blanks from dataset 125 # Optionally remove blank samples
77 if (exclude_blanks) { 126 if (exclude_blanks) {
78 data <- exclude_group(data, group) 127 data <- exclude_group(data, group)
79 } 128 }
80 129 data <- final_data_processing(
81 return(data) 130 data, non_feature_columns,
82 } 131 transpose, original_first_colname
83 132 )
84 waveica_singlebatch <- function(file, 133 data
85 metadata = NA, 134 }
86 ext, 135
87 transpose = FALSE, 136 # Main function for single-batch WaveICA normalization
137 waveica_singlebatch <- function(data_matrix_file,
138 sample_metadata_file,
139 ft_ext,
140 mt_ext,
88 wavelet_filter, 141 wavelet_filter,
89 wavelet_length, 142 wavelet_length,
90 k, 143 k,
91 alpha, 144 alpha,
92 cutoff, 145 cutoff,
93 exclude_blanks) { 146 exclude_blanks,
94 # get input from the Galaxy, preprocess data 147 transpose = FALSE) {
95 ext <- strsplit(x = ext, split = "\\,")[[1]] 148 # Reads feature and metadata tables, merges them,
96 149 # verifies columns, runs WaveICA (single batch), and returns normalized data.
97 ft_ext <- ext[1] 150 read_features_response <- read_data(data_matrix_file, ft_ext, transpose)
98 mt_ext <- ext[2] 151 features <- read_features_response$data
99 152 original_first_colname <- read_features_response$original_first_colname
100 data <- read_file(file, metadata, ft_ext, mt_ext, transpose) 153
154 read_data_response <- read_data(sample_metadata_file, mt_ext)
155 metadata <- read_data_response$data
156
157 # Ensure both tables have a sampleName column
158 if (!"sampleName" %in% colnames(features) ||
159 !"sampleName" %in% colnames(metadata)) { # nolint
160 stop("Both feature and metadata tables must contain a 'sampleName' column.")
161 }
162 data <- merge(metadata, features, by = "sampleName")
101 163
102 required_columns <- c("sampleName", "class", "sampleType", "injectionOrder") 164 required_columns <- c("sampleName", "class", "sampleType", "injectionOrder")
103 optional_columns <- c("batch") 165 optional_columns <- c("batch")
104
105 data <- verify_input_dataframe(data, required_columns) 166 data <- verify_input_dataframe(data, required_columns)
106 167
107 data <- sort_by_injection_order(data) 168 data <- sort_by_injection_order(data)
108 169
109 feature_columns <- colnames(data)[!colnames(data) %in% c(required_columns, optional_columns)] 170 feature_columns <- colnames(data)[
171 !colnames(data) %in% c(required_columns, optional_columns)
172 ]
110 features <- data[, feature_columns] 173 features <- data[, feature_columns]
111 injection_order <- data$injectionOrder 174 injection_order <- data$injectionOrder
112 175
113 # run WaveICA 176 # Run WaveICA normalization (single batch)
114 features <- recetox.waveica::waveica_nonbatchwise( 177 features <- recetox.waveica::waveica_nonbatchwise(
115 data = features, 178 data = features,
116 wf = get_wf(wavelet_filter, wavelet_length), 179 wf = get_wf(wavelet_filter, wavelet_length),
117 injection_order = injection_order, 180 injection_order = injection_order,
118 K = k, 181 K = k,
119 alpha = alpha, 182 alpha = alpha,
120 cutoff = cutoff 183 cutoff = cutoff
121 ) 184 )
122 185 non_feature_columns <- setdiff(colnames(data), feature_columns)
186
187 # Update the data frame with normalized features
123 data[, feature_columns] <- features 188 data[, feature_columns] <- features
124 group <- enumerate_groups(as.character(data$sampleType)) 189 group <- enumerate_groups(as.character(data$sampleType))
125 # remove blanks from dataset 190 # Optionally remove blank samples
126 if (exclude_blanks) { 191 if (exclude_blanks) {
127 data <- exclude_group(data, group) 192 data <- exclude_group(data, group)
128 } 193 }
129 194 data <- final_data_processing(
130 return(data) 195 data, non_feature_columns,
131 } 196 transpose, original_first_colname
132 197 )
198 data
199 }
200
201 # Sorts the data frame by batch and injection order (if batch exists),
202 # otherwise by injection order only
133 sort_by_injection_order <- function(data) { 203 sort_by_injection_order <- function(data) {
134 if ("batch" %in% colnames(data)) { 204 if ("batch" %in% colnames(data)) {
135 data <- data[order(data[, "batch"], data[, "injectionOrder"], decreasing = FALSE), ] 205 data <- data[
206 order(data[, "batch"],
207 data[, "injectionOrder"],
208 decreasing = FALSE
209 ),
210 ]
136 } else { 211 } else {
137 data <- data[order(data[, "injectionOrder"], decreasing = FALSE), ] 212 data <- data[order(data[, "injectionOrder"], decreasing = FALSE), ]
138 } 213 }
139 return(data) 214 data
140 } 215 }
141 216
217 # Verifies that required columns exist and that there are no missing values
142 verify_input_dataframe <- function(data, required_columns) { 218 verify_input_dataframe <- function(data, required_columns) {
143 if (anyNA(data)) { 219 if (anyNA(data)) {
144 stop("Error: dataframe cannot contain NULL values! 220 stop("Error: dataframe cannot contain NULL values!
145 Make sure that your dataframe does not contain empty cells") 221 \nMake sure that your dataframe does not contain empty cells")
146 } else if (!all(required_columns %in% colnames(data))) { 222 } else if (!all(required_columns %in% colnames(data))) {
147 stop( 223 stop(
148 "Error: missing metadata! 224 "Error: missing metadata!
149 Make sure that the following columns are present in your dataframe: ", 225 \nMake sure that the following columns are present in your dataframe: ",
150 paste(required_columns, collapse = ", ") 226 paste(required_columns, collapse = ", ")
151 ) 227 )
152 } 228 }
153
154 data <- verify_column_types(data, required_columns) 229 data <- verify_column_types(data, required_columns)
155 230 data
156 return(data) 231 }
157 } 232
158 233 # Checks column types for required and feature columns
234 # and removes problematic feature columns
159 verify_column_types <- function(data, required_columns) { 235 verify_column_types <- function(data, required_columns) {
160 # Specify the column names and their expected types 236 # Checks that required columns have the correct type
237 # and removes non-numeric feature columns efficiently.
161 column_types <- list( 238 column_types <- list(
162 "sampleName" = c("character", "factor"), 239 "sampleName" = c("character", "factor"),
163 "class" = c("character", "factor", "integer"), 240 "class" = c("character", "factor", "integer"),
164 "sampleType" = c("character", "factor"), 241 "sampleType" = c("character", "factor"),
165 "injectionOrder" = "integer", 242 "injectionOrder" = "integer",
166 "batch" = "integer" 243 "batch" = "integer"
167 ) 244 )
168
169 column_types <- column_types[required_columns] 245 column_types <- column_types[required_columns]
170 246
171 for (col_name in names(data)) { 247 # Check required columns' types (fast, vectorized)
248 for (col_name in names(column_types)) {
249 if (!col_name %in% names(data)) next
250 expected_types <- column_types[[col_name]]
172 actual_type <- class(data[[col_name]]) 251 actual_type <- class(data[[col_name]])
173 if (col_name %in% names(column_types)) { 252 if (!actual_type %in% expected_types) {
174 expected_types <- column_types[[col_name]] 253 stop(
175 254 "Column ", col_name, " is of type ", actual_type,
176 if (!actual_type %in% expected_types) { 255 " but expected type is ",
177 stop( 256 paste(expected_types, collapse = " or "), "\n"
178 "Column ", col_name, " is of type ", actual_type, 257 )
179 " but expected type is ",
180 paste(expected_types, collapse = " or "), "\n"
181 )
182 }
183 } else {
184 if (actual_type != "numeric") {
185 data[[col_name]] <- as.numeric(as.character(data[[col_name]]))
186 }
187 } 258 }
188 } 259 }
189 return(data) 260
190 } 261 # Identify feature columns (not required columns)
191 262 feature_cols <- setdiff(names(data), required_columns)
192 # Match group labels with [blank/sample/qc] and enumerate them 263 # Try to convert all feature columns to numeric in one go
264 # as well as suppressing warnings
265 data[feature_cols] <- suppressWarnings(
266 lapply(
267 data[feature_cols],
268 function(x) as.numeric(as.character(x))
269 )
270 )
271 # Find columns that are problematic (contain any NA after conversion)
272 na_counts <- vapply(data[feature_cols], function(x) any(is.na(x)), logical(1))
273 removed_columns <- names(na_counts)[na_counts]
274 if (length(removed_columns) > 0) {
275 message(
276 "Removed problematic columns (non-numeric): ",
277 paste(removed_columns, collapse = ", ")
278 )
279 }
280
281 # Keep only good columns
282 keep_cols <- !(names(data) %in% removed_columns)
283 data <- data[, keep_cols, drop = FALSE]
284 data
285 }
286
287 # Enumerates group labels: blank=0, sample=1, qc=2, standard=3
193 enumerate_groups <- function(group) { 288 enumerate_groups <- function(group) {
194 group[grepl("blank", tolower(group))] <- 0 289 group[grepl("blank", tolower(group))] <- 0
195 group[grepl("sample", tolower(group))] <- 1 290 group[grepl("sample", tolower(group))] <- 1
196 group[grepl("qc", tolower(group))] <- 2 291 group[grepl("qc", tolower(group))] <- 2
197 group[grepl("standard", tolower(group))] <- 3 292 group[grepl("standard", tolower(group))] <- 3
198 293 group
199 return(group) 294 }
200 } 295
201 296 # Returns the correct wavelet filter string for the R wavelets function
202 # Create appropriate input for R wavelets function
203 get_wf <- function(wavelet_filter, wavelet_length) { 297 get_wf <- function(wavelet_filter, wavelet_length) {
204 wf <- paste(wavelet_filter, wavelet_length, sep = "") 298 wf <- paste(wavelet_filter, wavelet_length, sep = "")
205 299 # Exception for Daubechies-2
206 # exception to the wavelet function
207 if (wf == "d2") { 300 if (wf == "d2") {
208 wf <- "haar" 301 wf <- "haar"
209 } 302 }
210 303 wf
211 return(wf) 304 }
212 } 305
213 306 # Removes blank samples (group==0) from the data frame
214 # Exclude blanks from a dataframe
215 exclude_group <- function(data, group) { 307 exclude_group <- function(data, group) {
216 row_idx_to_exclude <- which(group %in% 0) 308 row_idx_to_exclude <- which(group %in% 0)
217 if (length(row_idx_to_exclude) > 0) { 309 if (length(row_idx_to_exclude) > 0) {
218 data_without_blanks <- data[-c(row_idx_to_exclude), ] 310 data_without_blanks <- data[-c(row_idx_to_exclude), ]
219 cat("Blank samples have been excluded from the dataframe.\n") 311 cat("Blank samples have been excluded from the dataframe.\n")
220 return(data_without_blanks) 312 data_without_blanks
221 } else { 313 } else {
222 return(data) 314 data
223 } 315 }
224 } 316 }
225 317
226 store_data <- function(data, feature_output, metadata_output, ext, split_output = FALSE) { 318 # Stores the output data in the requested format (csv, tsv/tabular, parquet),
319 # optionally splitting metadata and features
320 store_data <- function(data, feature_output, ext) {
227 if (ext == "parquet") { 321 if (ext == "parquet") {
228 if (split_output == TRUE) { 322 arrow::write_parquet(data, feature_output)
229 split_df <- split_output(data) 323 } else if (ext == "csv") {
230 arrow::write_parquet(split_df$metadata, metadata_output) 324 write.csv(data, file = feature_output, row.names = FALSE, quote = FALSE)
231 arrow::write_parquet(split_df$feature_table, feature_output) 325 } else if (ext == "tsv" || ext == "tabular") {
232 } else { 326 write.table(data,
233 arrow::write_parquet(data, feature_output) 327 file = feature_output, sep = "\t",
234 } 328 row.names = FALSE, quote = FALSE
329 )
235 } else { 330 } else {
236 if (split_output == TRUE) { 331 stop(paste("Unsupported file extension:", ext))
237 split_df <- split_output(data)
238 write.table(split_df$metadata,
239 file = metadata_output, sep = "\t",
240 row.names = FALSE, quote = FALSE
241 )
242 write.table(split_df$feature_table,
243 file = feature_output, sep = "\t",
244 row.names = FALSE, quote = FALSE
245 )
246 } else {
247 write.table(data,
248 file = feature_output, sep = "\t",
249 row.names = FALSE, quote = FALSE
250 )
251 }
252 } 332 }
253 cat("Normalization has been completed.\n") 333 cat("Normalization has been completed.\n")
254 }
255
256 split_output <- function(df) {
257 required_columns_set1 <- c("sampleName", "class", "sampleType", "injectionOrder", "batch")
258 required_columns_set2 <- c("sampleName", "class", "sampleType", "injectionOrder")
259
260 if (all(required_columns_set1 %in% colnames(df))) {
261 metadata_df <- df[, required_columns_set1, drop = FALSE]
262 df <- df[, -c(2:5)]
263 } else if (all(required_columns_set2 %in% colnames(df))) {
264 metadata_df <- df[, required_columns_set2, drop = FALSE]
265 df <- df[, -c(2:4)]
266 } else {
267 stop("Neither set of required columns is present in the dataframe.")
268 }
269
270 # Transpose the feature table
271 col_names <- c("id", as.vector(df[[1]]))
272 feature_table <- tranpose_data(df, col_names)
273
274 return(list(metadata = metadata_df, feature_table = feature_table))
275 } 334 }
276 335
277 tranpose_data <- function(data, column_names) { 336 tranpose_data <- function(data, column_names) {
278 t_data <- data[-1] 337 t_data <- data[-1]
279 t_data <- t(t_data) 338 t_data <- t(t_data)
280 tranposed_data <- data.frame(rownames(t_data), t_data) 339 tranposed_data <- data.frame(rownames(t_data), t_data)
281 colnames(tranposed_data) <- column_names 340 colnames(tranposed_data) <- column_names
282 341
283 return(tranposed_data) 342 tranposed_data
284 } 343 }
344
345
346 final_data_processing <- function(
347 data, non_feature_columns,
348 transpose, original_first_colname) {
349 # Remove all columns that are in non_
350 # feature_columns, except the first column
351 cols_to_keep <- !(colnames(data) %in% non_feature_columns)
352 cols_to_keep[1] <- TRUE # Always keep the first column
353 data <- data[, cols_to_keep, drop = FALSE]
354
355
356 if (transpose) {
357 # The first column becomes the new column names
358 new_colnames <- as.character(data[[1]])
359 # Remove the first column
360 data <- data[, -1, drop = FALSE]
361 # Transpose the rest
362 data <- t(data)
363 # Convert to data frame
364 data <- as.data.frame(data, stringsAsFactors = FALSE)
365 # The first row becomes the first column
366 first_col <- rownames(data)
367 data <- cbind(first_col, data)
368 # Set column names
369 colnames(data) <- c(colnames(data)[1], new_colnames)
370 rownames(data) <- NULL
371 }
372 colnames(data)[1] <- original_first_colname
373 data
374 }