comparison: waveica_wrapper.R @ 10:821062fc5782 (draft, default, tip)
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/waveica commit 2b8b1dcb2947c6503fd4f82904df708e4f88ea1d
author    recetox
date      Fri, 04 Jul 2025 09:43:22 +0000
parents   6fc9f6dbcef5
children  (none)
9:6fc9f6dbcef5 | 10:821062fc5782
1 read_file <- function(file, metadata, ft_ext, mt_ext, transpose) { | 1 # Read data from a file in the specified format (csv, tsv/tabular, or parquet) |
2 data <- read_data(file, ft_ext) | 2 read_data <- function(file, ext, transpose = FALSE) { |
3 | 3 # Reads a data file based on its extension and returns a data frame. |
4 if (ext == "csv") { | |
5 tryCatch( | |
6 { | |
7 data <- read.csv(file, header = TRUE) | |
8 }, | |
9 error = function(e) { | |
10 stop( | |
11 paste0( | |
12 "Failed to read as CSV. The file may not ", | |
13 "be a valid text file or may be corrupted: ", | |
14 file, "\nError: ", e$message | |
15 ) | |
16 ) | |
17 } | |
18 ) | |
19 } else if (ext == "tsv" || ext == "tabular") { | |
20 tryCatch( | |
21 { | |
22 data <- read.csv(file, header = TRUE, sep = "\t") | |
23 }, | |
24 error = function(e) { | |
25 stop( | |
26 paste0( | |
27 "Failed to read as TSV/tabular.", | |
28 "The file may not be a valid text file or may be corrupted: ", | |
29 file, "\nError: ", e$message | |
30 ) | |
31 ) | |
32 } | |
33 ) | |
34 } else if (ext == "parquet") { | |
35 data <- arrow::read_parquet(file) | |
36 } else { | |
37 stop(paste("Unsupported file extension or format for reading:", ext)) | |
38 } | |
39 | |
40 original_first_colname <- colnames(data)[1] | |
4 if (transpose) { | 41 if (transpose) { |
5 col_names <- c("sampleName", data[[1]]) | 42 col_names <- c("sampleName", data[[1]]) |
6 data <- tranpose_data(data, col_names) | 43 data <- tranpose_data(data, col_names) |
7 } | 44 } |
8 | 45 return(list(data = data, original_first_colname = original_first_colname)) |
9 if (!is.na(metadata)) { | 46 } |
10 mt_data <- read_data(metadata, mt_ext) | 47 |
11 data <- merge(mt_data, data, by = "sampleName") | 48 # Main function for batchwise WaveICA normalization |
12 } | 49 waveica <- function(data_matrix_file, |
13 | 50 sample_metadata_file, |
14 return(data) | 51 ft_ext, |
15 } | 52 mt_ext, |
16 | |
17 read_data <- function(file, ext) { | |
18 if (ext == "csv") { | |
19 data <- read.csv(file, header = TRUE) | |
20 } else if (ext == "tsv" || ext == "tabular") { | |
21 data <- read.csv(file, header = TRUE, sep = "\t") | |
22 } else { | |
23 data <- arrow::read_parquet(file) | |
24 } | |
25 | |
26 return(data) | |
27 } | |
28 | |
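For orientation, a minimal sketch of calling the new read_data (file name and contents are hypothetical; with transpose = TRUE the values of the first column become the header and samples become rows):

res <- read_data("features.csv", ext = "csv", transpose = TRUE)
head(res$data)              # data frame whose first column is sampleName
res$original_first_colname  # first column name of the file as read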
29 waveica <- function(file, | |
30 metadata = NA, | |
31 ext, | |
32 transpose = FALSE, | |
33 wavelet_filter, | 53 wavelet_filter, |
34 wavelet_length, | 54 wavelet_length, |
35 k, | 55 k, |
36 t, | 56 t, |
37 t2, | 57 t2, |
38 alpha, | 58 alpha, |
39 exclude_blanks) { | 59 exclude_blanks, |
40 # get input from the Galaxy, preprocess data | 60 transpose = FALSE) { |
41 ext <- strsplit(x = ext, split = "\\,")[[1]] | 61 # Reads feature and metadata tables, merges them, |
42 | 62 # verifies columns, runs WaveICA, and returns normalized data. |
43 ft_ext <- ext[1] | 63 read_features_response <- read_data( |
44 mt_ext <- ext[2] | 64 data_matrix_file, ft_ext, |
45 | 65 transpose |
46 data <- read_file(file, metadata, ft_ext, mt_ext, transpose) | 66 ) |
67 features <- read_features_response$data | |
68 original_first_colname <- read_features_response$original_first_colname | |
69 | |
70 read_metadata_response <- read_data(sample_metadata_file, mt_ext) | |
71 metadata <- read_metadata_response$data | |
47 | 72 |
48 required_columns <- c( | 73 required_columns <- c( |
49 "sampleName", "class", "sampleType", | 74 "sampleName", "class", "sampleType", |
50 "injectionOrder", "batch" | 75 "injectionOrder", "batch" |
51 ) | 76 ) |
77 | |
78 metadata <- dplyr::select(metadata, required_columns) | |
79 | |
80 # Ensure both tables have a sampleName column | |
81 if (!"sampleName" %in% colnames(features) || !"sampleName" %in% colnames(metadata)) { # nolint | |
82 stop("Both feature and metadata tables must contain a 'sampleName' column.") | |
83 } | |
84 data <- merge(metadata, features, by = "sampleName") | |
85 | |
86 | |
52 data <- verify_input_dataframe(data, required_columns) | 87 data <- verify_input_dataframe(data, required_columns) |
53 | 88 |
54 data <- sort_by_injection_order(data) | 89 data <- sort_by_injection_order(data) |
55 | 90 |
56 # separate data into features, batch and group | 91 # Separate features, batch, and group columns |
57 feature_columns <- colnames(data)[!colnames(data) %in% required_columns] | 92 feature_columns <- colnames(data)[!colnames(data) %in% required_columns] |
58 features <- data[, feature_columns] | 93 features <- data[, feature_columns] |
59 group <- enumerate_groups(as.character(data$sampleType)) | 94 group <- enumerate_groups(as.character(data$sampleType)) |
60 batch <- data$batch | 95 batch <- data$batch |
61 | 96 |
62 # run WaveICA | 97 # Check that wavelet level is not too high for the number of samples |
98 max_level <- floor(log2(nrow(features))) | |
99 requested_level <- as.numeric(wavelet_length) | |
100 if (requested_level > max_level) { | |
101 stop(sprintf( | |
102 paste0( | |
103 "Wavelet length/level (%d) is too high for ", | |
104 "the number of samples (%d). Maximum allowed is %d." | |
105 ), | |
106 requested_level, nrow(features), max_level | |
107 )) | |
108 } | |
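A quick check of the guard's arithmetic: with 100 samples the deepest admissible level is floor(log2(100)) = 6, so a requested wavelet_length of 8 aborts with the error above. Illustrative numbers only:

n_samples <- 100
floor(log2(n_samples))  # 6: wavelet_length values above this are rejected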
109 # Run WaveICA normalization | |
63 features <- recetox.waveica::waveica( | 110 features <- recetox.waveica::waveica( |
64 data = features, | 111 data = features, |
65 wf = get_wf(wavelet_filter, wavelet_length), | 112 wf = get_wf(wavelet_filter, wavelet_length), |
66 batch = batch, | 113 batch = batch, |
67 group = group, | 114 group = group, |
68 K = k, | 115 K = k, |
69 t = t, | 116 t = t, |
70 t2 = t2, | 117 t2 = t2, |
71 alpha = alpha | 118 alpha = alpha |
72 ) | 119 ) |
73 | 120 non_feature_columns <- setdiff(colnames(data), feature_columns) |
121 | |
122 # Update the data frame with normalized features | |
74 data[, feature_columns] <- features | 123 data[, feature_columns] <- features |
75 | 124 |
76 # remove blanks from dataset | 125 # Optionally remove blank samples |
77 if (exclude_blanks) { | 126 if (exclude_blanks) { |
78 data <- exclude_group(data, group) | 127 data <- exclude_group(data, group) |
79 } | 128 } |
80 | 129 data <- final_data_processing( |
81 return(data) | 130 data, non_feature_columns, |
82 } | 131 transpose, original_first_colname |
83 | 132 ) |
84 waveica_singlebatch <- function(file, | 133 data |
85 metadata = NA, | 134 } |
86 ext, | 135 |
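A sketch of the batchwise entry point as it stands after this change, assuming this script has been sourced and recetox.waveica is installed; the paths and tuning values below are placeholders, not tool defaults:

normalized <- waveica(
  data_matrix_file     = "features.parquet",  # hypothetical path
  sample_metadata_file = "metadata.csv",      # hypothetical path
  ft_ext = "parquet", mt_ext = "csv",
  wavelet_filter = "d", wavelet_length = "2", # "d2" is mapped to "haar" by get_wf()
  k = 20, t = 0.05, t2 = 0.05, alpha = 0,
  exclude_blanks = TRUE, transpose = FALSE
)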
87 transpose = FALSE, | 136 # Main function for single-batch WaveICA normalization |
137 waveica_singlebatch <- function(data_matrix_file, | |
138 sample_metadata_file, | |
139 ft_ext, | |
140 mt_ext, | |
88 wavelet_filter, | 141 wavelet_filter, |
89 wavelet_length, | 142 wavelet_length, |
90 k, | 143 k, |
91 alpha, | 144 alpha, |
92 cutoff, | 145 cutoff, |
93 exclude_blanks) { | 146 exclude_blanks, |
94 # get input from the Galaxy, preprocess data | 147 transpose = FALSE) { |
95 ext <- strsplit(x = ext, split = "\\,")[[1]] | 148 # Reads feature and metadata tables, merges them, |
96 | 149 # verifies columns, runs WaveICA (single batch), and returns normalized data. |
97 ft_ext <- ext[1] | 150 read_features_response <- read_data(data_matrix_file, ft_ext, transpose) |
98 mt_ext <- ext[2] | 151 features <- read_features_response$data |
99 | 152 original_first_colname <- read_features_response$original_first_colname |
100 data <- read_file(file, metadata, ft_ext, mt_ext, transpose) | 153 |
154 read_data_response <- read_data(sample_metadata_file, mt_ext) | |
155 metadata <- read_data_response$data | |
156 | |
157 # Ensure both tables have a sampleName column | |
158 if (!"sampleName" %in% colnames(features) || | |
159 !"sampleName" %in% colnames(metadata)) { # nolint | |
160 stop("Both feature and metadata tables must contain a 'sampleName' column.") | |
161 } | |
162 data <- merge(metadata, features, by = "sampleName") | |
101 | 163 |
102 required_columns <- c("sampleName", "class", "sampleType", "injectionOrder") | 164 required_columns <- c("sampleName", "class", "sampleType", "injectionOrder") |
103 optional_columns <- c("batch") | 165 optional_columns <- c("batch") |
104 | |
105 data <- verify_input_dataframe(data, required_columns) | 166 data <- verify_input_dataframe(data, required_columns) |
106 | 167 |
107 data <- sort_by_injection_order(data) | 168 data <- sort_by_injection_order(data) |
108 | 169 |
109 feature_columns <- colnames(data)[!colnames(data) %in% c(required_columns, optional_columns)] | 170 feature_columns <- colnames(data)[ |
171 !colnames(data) %in% c(required_columns, optional_columns) | |
172 ] | |
110 features <- data[, feature_columns] | 173 features <- data[, feature_columns] |
111 injection_order <- data$injectionOrder | 174 injection_order <- data$injectionOrder |
112 | 175 |
113 # run WaveICA | 176 # Run WaveICA normalization (single batch) |
114 features <- recetox.waveica::waveica_nonbatchwise( | 177 features <- recetox.waveica::waveica_nonbatchwise( |
115 data = features, | 178 data = features, |
116 wf = get_wf(wavelet_filter, wavelet_length), | 179 wf = get_wf(wavelet_filter, wavelet_length), |
117 injection_order = injection_order, | 180 injection_order = injection_order, |
118 K = k, | 181 K = k, |
119 alpha = alpha, | 182 alpha = alpha, |
120 cutoff = cutoff | 183 cutoff = cutoff |
121 ) | 184 ) |
122 | 185 non_feature_columns <- setdiff(colnames(data), feature_columns) |
186 | |
187 # Update the data frame with normalized features | |
123 data[, feature_columns] <- features | 188 data[, feature_columns] <- features |
124 group <- enumerate_groups(as.character(data$sampleType)) | 189 group <- enumerate_groups(as.character(data$sampleType)) |
125 # remove blanks from dataset | 190 # Optionally remove blank samples |
126 if (exclude_blanks) { | 191 if (exclude_blanks) { |
127 data <- exclude_group(data, group) | 192 data <- exclude_group(data, group) |
128 } | 193 } |
129 | 194 data <- final_data_processing( |
130 return(data) | 195 data, non_feature_columns, |
131 } | 196 transpose, original_first_colname |
132 | 197 ) |
198 data | |
199 } | |
200 | |
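The single-batch variant takes the same inputs but swaps t/t2 for a cutoff and does not require a batch column; again with placeholder values:

normalized <- waveica_singlebatch(
  data_matrix_file     = "features.csv",  # hypothetical path
  sample_metadata_file = "metadata.csv",  # hypothetical path
  ft_ext = "csv", mt_ext = "csv",
  wavelet_filter = "d", wavelet_length = "4",
  k = 20, alpha = 0, cutoff = 0.1,
  exclude_blanks = FALSE
)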
201 # Sorts the data frame by batch and injection order (if batch exists), | |
202 # otherwise by injection order only | |
133 sort_by_injection_order <- function(data) { | 203 sort_by_injection_order <- function(data) { |
134 if ("batch" %in% colnames(data)) { | 204 if ("batch" %in% colnames(data)) { |
135 data <- data[order(data[, "batch"], data[, "injectionOrder"], decreasing = FALSE), ] | 205 data <- data[ |
206 order(data[, "batch"], | |
207 data[, "injectionOrder"], | |
208 decreasing = FALSE | |
209 ), | |
210 ] | |
136 } else { | 211 } else { |
137 data <- data[order(data[, "injectionOrder"], decreasing = FALSE), ] | 212 data <- data[order(data[, "injectionOrder"], decreasing = FALSE), ] |
138 } | 213 } |
139 return(data) | 214 data |
140 } | 215 } |
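The reflowed sort is behaviourally unchanged: batch first, injection order second. A toy frame makes the ordering visible:

df <- data.frame(
  sampleName = c("s3", "s1", "s2"),
  injectionOrder = c(2L, 1L, 1L),
  batch = c(1L, 2L, 1L)
)
sort_by_injection_order(df)  # rows come back as s2, s3, s1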
141 | 216 |
217 # Verifies that required columns exist and that there are no missing values | |
142 verify_input_dataframe <- function(data, required_columns) { | 218 verify_input_dataframe <- function(data, required_columns) { |
143 if (anyNA(data)) { | 219 if (anyNA(data)) { |
144 stop("Error: dataframe cannot contain NULL values! | 220 stop("Error: dataframe cannot contain NULL values! |
145 Make sure that your dataframe does not contain empty cells") | 221 \nMake sure that your dataframe does not contain empty cells") |
146 } else if (!all(required_columns %in% colnames(data))) { | 222 } else if (!all(required_columns %in% colnames(data))) { |
147 stop( | 223 stop( |
148 "Error: missing metadata! | 224 "Error: missing metadata! |
149 Make sure that the following columns are present in your dataframe: ", | 225 \nMake sure that the following columns are present in your dataframe: ", |
150 paste(required_columns, collapse = ", ") | 226 paste(required_columns, collapse = ", ") |
151 ) | 227 ) |
152 } | 228 } |
153 | |
154 data <- verify_column_types(data, required_columns) | 229 data <- verify_column_types(data, required_columns) |
155 | 230 data |
156 return(data) | 231 } |
157 } | 232 |
158 | 233 # Checks column types for required and feature columns |
234 # and removes problematic feature columns | |
159 verify_column_types <- function(data, required_columns) { | 235 verify_column_types <- function(data, required_columns) { |
160 # Specify the column names and their expected types | 236 # Checks that required columns have the correct type |
237 # and removes non-numeric feature columns efficiently. | |
161 column_types <- list( | 238 column_types <- list( |
162 "sampleName" = c("character", "factor"), | 239 "sampleName" = c("character", "factor"), |
163 "class" = c("character", "factor", "integer"), | 240 "class" = c("character", "factor", "integer"), |
164 "sampleType" = c("character", "factor"), | 241 "sampleType" = c("character", "factor"), |
165 "injectionOrder" = "integer", | 242 "injectionOrder" = "integer", |
166 "batch" = "integer" | 243 "batch" = "integer" |
167 ) | 244 ) |
168 | |
169 column_types <- column_types[required_columns] | 245 column_types <- column_types[required_columns] |
170 | 246 |
171 for (col_name in names(data)) { | 247 # Check required columns' types (fast, vectorized) |
248 for (col_name in names(column_types)) { | |
249 if (!col_name %in% names(data)) next | |
250 expected_types <- column_types[[col_name]] | |
172 actual_type <- class(data[[col_name]]) | 251 actual_type <- class(data[[col_name]]) |
173 if (col_name %in% names(column_types)) { | 252 if (!actual_type %in% expected_types) { |
174 expected_types <- column_types[[col_name]] | 253 stop( |
175 | 254 "Column ", col_name, " is of type ", actual_type, |
176 if (!actual_type %in% expected_types) { | 255 " but expected type is ", |
177 stop( | 256 paste(expected_types, collapse = " or "), "\n" |
178 "Column ", col_name, " is of type ", actual_type, | 257 ) |
179 " but expected type is ", | |
180 paste(expected_types, collapse = " or "), "\n" | |
181 ) | |
182 } | |
183 } else { | |
184 if (actual_type != "numeric") { | |
185 data[[col_name]] <- as.numeric(as.character(data[[col_name]])) | |
186 } | |
187 } | 258 } |
188 } | 259 } |
189 return(data) | 260 |
190 } | 261 # Identify feature columns (not required columns) |
191 | 262 feature_cols <- setdiff(names(data), required_columns) |
192 # Match group labels with [blank/sample/qc] and enumerate them | 263 # Try to convert all feature columns to numeric in one go |
264 # while suppressing coercion warnings |
265 data[feature_cols] <- suppressWarnings( | |
266 lapply( | |
267 data[feature_cols], | |
268 function(x) as.numeric(as.character(x)) | |
269 ) | |
270 ) | |
271 # Find columns that are problematic (contain any NA after conversion) | |
272 na_counts <- vapply(data[feature_cols], function(x) any(is.na(x)), logical(1)) | |
273 removed_columns <- names(na_counts)[na_counts] | |
274 if (length(removed_columns) > 0) { | |
275 message( | |
276 "Removed problematic columns (non-numeric): ", | |
277 paste(removed_columns, collapse = ", ") | |
278 ) | |
279 } | |
280 | |
281 # Keep only good columns | |
282 keep_cols <- !(names(data) %in% removed_columns) | |
283 data <- data[, keep_cols, drop = FALSE] | |
284 data | |
285 } | |
286 | |
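In effect, the rewritten verify_column_types keeps every feature column that survives numeric coercion and drops the rest with a message, e.g. with toy data:

df <- data.frame(
  sampleName = c("a", "b"),
  f1 = c("1.5", "2.5"),   # coercible, kept as numeric
  f2 = c("1.5", "oops"),  # coerces to NA, column removed
  stringsAsFactors = FALSE
)
verify_column_types(df, required_columns = "sampleName")
# message: Removed problematic columns (non-numeric): f2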
287 # Enumerates group labels: blank=0, sample=1, qc=2, standard=3 | |
193 enumerate_groups <- function(group) { | 288 enumerate_groups <- function(group) { |
194 group[grepl("blank", tolower(group))] <- 0 | 289 group[grepl("blank", tolower(group))] <- 0 |
195 group[grepl("sample", tolower(group))] <- 1 | 290 group[grepl("sample", tolower(group))] <- 1 |
196 group[grepl("qc", tolower(group))] <- 2 | 291 group[grepl("qc", tolower(group))] <- 2 |
197 group[grepl("standard", tolower(group))] <- 3 | 292 group[grepl("standard", tolower(group))] <- 3 |
198 | 293 group |
199 return(group) | 294 } |
200 } | 295 |
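Matching in enumerate_groups is substring-based and case-insensitive, and the result stays a character vector:

enumerate_groups(c("Blank1", "sample_A", "QC_pool"))
# [1] "0" "1" "2"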
201 | 296 # Returns the correct wavelet filter string for the R wavelets function |
202 # Create appropriate input for R wavelets function | |
203 get_wf <- function(wavelet_filter, wavelet_length) { | 297 get_wf <- function(wavelet_filter, wavelet_length) { |
204 wf <- paste(wavelet_filter, wavelet_length, sep = "") | 298 wf <- paste(wavelet_filter, wavelet_length, sep = "") |
205 | 299 # Exception for Daubechies-2 |
206 # exception to the wavelet function | |
207 if (wf == "d2") { | 300 if (wf == "d2") { |
208 wf <- "haar" | 301 wf <- "haar" |
209 } | 302 } |
210 | 303 wf |
211 return(wf) | 304 } |
212 } | 305 |
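Two illustrative calls to get_wf, including the Daubechies-2 special case noted in the comment:

get_wf("d", "4")  # "d4"
get_wf("d", "2")  # "haar"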
213 | 306 # Removes blank samples (group==0) from the data frame |
214 # Exclude blanks from a dataframe | |
215 exclude_group <- function(data, group) { | 307 exclude_group <- function(data, group) { |
216 row_idx_to_exclude <- which(group %in% 0) | 308 row_idx_to_exclude <- which(group %in% 0) |
217 if (length(row_idx_to_exclude) > 0) { | 309 if (length(row_idx_to_exclude) > 0) { |
218 data_without_blanks <- data[-c(row_idx_to_exclude), ] | 310 data_without_blanks <- data[-c(row_idx_to_exclude), ] |
219 cat("Blank samples have been excluded from the dataframe.\n") | 311 cat("Blank samples have been excluded from the dataframe.\n") |
220 return(data_without_blanks) | 312 data_without_blanks |
221 } else { | 313 } else { |
222 return(data) | 314 data |
223 } | 315 } |
224 } | 316 } |
225 | 317 |
226 store_data <- function(data, feature_output, metadata_output, ext, split_output = FALSE) { | 318 # Stores the output data in the requested format (csv, tsv/tabular, parquet), |
319 # optionally splitting metadata and features | |
320 store_data <- function(data, feature_output, ext) { | |
227 if (ext == "parquet") { | 321 if (ext == "parquet") { |
228 if (split_output == TRUE) { | 322 arrow::write_parquet(data, feature_output) |
229 split_df <- split_output(data) | 323 } else if (ext == "csv") { |
230 arrow::write_parquet(split_df$metadata, metadata_output) | 324 write.csv(data, file = feature_output, row.names = FALSE, quote = FALSE) |
231 arrow::write_parquet(split_df$feature_table, feature_output) | 325 } else if (ext == "tsv" || ext == "tabular") { |
232 } else { | 326 write.table(data, |
233 arrow::write_parquet(data, feature_output) | 327 file = feature_output, sep = "\t", |
234 } | 328 row.names = FALSE, quote = FALSE |
329 ) | |
235 } else { | 330 } else { |
236 if (split_output == TRUE) { | 331 stop(paste("Unsupported file extension:", ext)) |
237 split_df <- split_output(data) | |
238 write.table(split_df$metadata, | |
239 file = metadata_output, sep = "\t", | |
240 row.names = FALSE, quote = FALSE | |
241 ) | |
242 write.table(split_df$feature_table, | |
243 file = feature_output, sep = "\t", | |
244 row.names = FALSE, quote = FALSE | |
245 ) | |
246 } else { | |
247 write.table(data, | |
248 file = feature_output, sep = "\t", | |
249 row.names = FALSE, quote = FALSE | |
250 ) | |
251 } | |
252 } | 332 } |
253 cat("Normalization has been completed.\n") | 333 cat("Normalization has been completed.\n") |
254 } | |
255 | |
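With the split_output path removed, store_data writes exactly one table per call; a self-contained sketch with a hypothetical output path:

out <- data.frame(sampleName = c("s1", "s2"), f1 = c(0.1, 0.2))
store_data(out, "normalized.tsv", ext = "tabular")
# writes a tab-separated table, then prints
# "Normalization has been completed."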
256 split_output <- function(df) { | |
257 required_columns_set1 <- c("sampleName", "class", "sampleType", "injectionOrder", "batch") | |
258 required_columns_set2 <- c("sampleName", "class", "sampleType", "injectionOrder") | |
259 | |
260 if (all(required_columns_set1 %in% colnames(df))) { | |
261 metadata_df <- df[, required_columns_set1, drop = FALSE] | |
262 df <- df[, -c(2:5)] | |
263 } else if (all(required_columns_set2 %in% colnames(df))) { | |
264 metadata_df <- df[, required_columns_set2, drop = FALSE] | |
265 df <- df[, -c(2:4)] | |
266 } else { | |
267 stop("Neither set of required columns is present in the dataframe.") | |
268 } | |
269 | |
270 # Transpose the feature table | |
271 col_names <- c("id", as.vector(df[[1]])) | |
272 feature_table <- tranpose_data(df, col_names) | |
273 | |
274 return(list(metadata = metadata_df, feature_table = feature_table)) | |
275 } | 334 } |
276 | 335 |
277 tranpose_data <- function(data, column_names) { | 336 tranpose_data <- function(data, column_names) { |
278 t_data <- data[-1] | 337 t_data <- data[-1] |
279 t_data <- t(t_data) | 338 t_data <- t(t_data) |
280 tranposed_data <- data.frame(rownames(t_data), t_data) | 339 tranposed_data <- data.frame(rownames(t_data), t_data) |
281 colnames(tranposed_data) <- column_names | 340 colnames(tranposed_data) <- column_names |
282 | 341 |
283 return(tranposed_data) | 342 tranposed_data |
284 } | 343 } |
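A toy call to tranpose_data: the supplied column_names become the header and the original column names become the first column:

df <- data.frame(id = c("f1", "f2"), s1 = c(1, 2), s2 = c(3, 4))
tranpose_data(df, column_names = c("sampleName", "f1", "f2"))
#   sampleName f1 f2
# 1         s1  1  2
# 2         s2  3  4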
344 | |
345 | |
346 final_data_processing <- function( | |
347 data, non_feature_columns, | |
348 transpose, original_first_colname) { | |
349 # Remove all columns listed in non_feature_columns, |
350 # except the first column |
351 cols_to_keep <- !(colnames(data) %in% non_feature_columns) | |
352 cols_to_keep[1] <- TRUE # Always keep the first column | |
353 data <- data[, cols_to_keep, drop = FALSE] | |
354 | |
355 | |
356 if (transpose) { | |
357 # The first column becomes the new column names | |
358 new_colnames <- as.character(data[[1]]) | |
359 # Remove the first column | |
360 data <- data[, -1, drop = FALSE] | |
361 # Transpose the rest | |
362 data <- t(data) | |
363 # Convert to data frame | |
364 data <- as.data.frame(data, stringsAsFactors = FALSE) | |
365 # The first row becomes the first column | |
366 first_col <- rownames(data) | |
367 data <- cbind(first_col, data) | |
368 # Set column names | |
369 colnames(data) <- c(colnames(data)[1], new_colnames) | |
370 rownames(data) <- NULL | |
371 } | |
372 colnames(data)[1] <- original_first_colname | |
373 data | |
374 } |
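Finally, a toy run through final_data_processing with transpose = TRUE: metadata columns other than the first are dropped and the saved first column name is restored:

df <- data.frame(
  sampleName = c("s1", "s2"),
  class = c("A", "B"),  # non-feature column, dropped
  f1 = c(1.1, 2.2),
  f2 = c(3.3, 4.4)
)
final_data_processing(
  df, non_feature_columns = c("sampleName", "class"),
  transpose = TRUE, original_first_colname = "id"
)
#   id  s1  s2
# 1 f1 1.1 2.2
# 2 f2 3.3 4.4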