comparison: waveica_wrapper.R @ 10:821062fc5782 (draft, default, tip)
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/waveica commit 2b8b1dcb2947c6503fd4f82904df708e4f88ea1d
| author | recetox |
|---|---|
| date | Fri, 04 Jul 2025 09:43:22 +0000 |
| parents | 6fc9f6dbcef5 |
| children | |
| 9:6fc9f6dbcef5 | 10:821062fc5782 |
|---|---|
| 1 read_file <- function(file, metadata, ft_ext, mt_ext, transpose) { | 1 # Read data from a file in the specified format (csv, tsv/tabular, or parquet) |
| 2 data <- read_data(file, ft_ext) | 2 read_data <- function(file, ext, transpose = FALSE) { |
| 3 | 3 # Reads a data file based on its extension and returns a data frame. |
| 4 if (ext == "csv") { | |
| 5 tryCatch( | |
| 6 { | |
| 7 data <- read.csv(file, header = TRUE) | |
| 8 }, | |
| 9 error = function(e) { | |
| 10 stop( | |
| 11 paste0( | |
| 12 "Failed to read as CSV. The file may not ", | |
| 13 "be a valid text file or may be corrupted: ", | |
| 14 file, "\nError: ", e$message | |
| 15 ) | |
| 16 ) | |
| 17 } | |
| 18 ) | |
| 19 } else if (ext == "tsv" || ext == "tabular") { | |
| 20 tryCatch( | |
| 21 { | |
| 22 data <- read.csv(file, header = TRUE, sep = "\t") | |
| 23 }, | |
| 24 error = function(e) { | |
| 25 stop( | |
| 26 paste0( | |
| 27 "Failed to read as TSV/tabular.", | |
| 28 "The file may not be a valid text file or may be corrupted: ", | |
| 29 file, "\nError: ", e$message | |
| 30 ) | |
| 31 ) | |
| 32 } | |
| 33 ) | |
| 34 } else if (ext == "parquet") { | |
| 35 data <- arrow::read_parquet(file) | |
| 36 } else { | |
| 37 stop(paste("Unsupported file extension or format for reading:", ext)) | |
| 38 } | |
| 39 | |
| 40 original_first_colname <- colnames(data)[1] | |
| 4 if (transpose) { | 41 if (transpose) { |
| 5 col_names <- c("sampleName", data[[1]]) | 42 col_names <- c("sampleName", data[[1]]) |
| 6 data <- tranpose_data(data, col_names) | 43 data <- tranpose_data(data, col_names) |
| 7 } | 44 } |
| 8 | 45 return(list(data = data, original_first_colname = original_first_colname)) |
| 9 if (!is.na(metadata)) { | 46 } |
| 10 mt_data <- read_data(metadata, mt_ext) | 47 |
| 11 data <- merge(mt_data, data, by = "sampleName") | 48 # Main function for batchwise WaveICA normalization |
| 12 } | 49 waveica <- function(data_matrix_file, |
| 13 | 50 sample_metadata_file, |
| 14 return(data) | 51 ft_ext, |
| 15 } | 52 mt_ext, |
| 16 | |
| 17 read_data <- function(file, ext) { | |
| 18 if (ext == "csv") { | |
| 19 data <- read.csv(file, header = TRUE) | |
| 20 } else if (ext == "tsv" || ext == "tabular") { | |
| 21 data <- read.csv(file, header = TRUE, sep = "\t") | |
| 22 } else { | |
| 23 data <- arrow::read_parquet(file) | |
| 24 } | |
| 25 | |
| 26 return(data) | |
| 27 } | |
| 28 | |
| 29 waveica <- function(file, | |
| 30 metadata = NA, | |
| 31 ext, | |
| 32 transpose = FALSE, | |
| 33 wavelet_filter, | 53 wavelet_filter, |
| 34 wavelet_length, | 54 wavelet_length, |
| 35 k, | 55 k, |
| 36 t, | 56 t, |
| 37 t2, | 57 t2, |
| 38 alpha, | 58 alpha, |
| 39 exclude_blanks) { | 59 exclude_blanks, |
| 40 # get input from the Galaxy, preprocess data | 60 transpose = FALSE) { |
| 41 ext <- strsplit(x = ext, split = "\\,")[[1]] | 61 # Reads feature and metadata tables, merges them, |
| 42 | 62 # verifies columns, runs WaveICA, and returns normalized data. |
| 43 ft_ext <- ext[1] | 63 read_features_response <- read_data( |
| 44 mt_ext <- ext[2] | 64 data_matrix_file, ft_ext, |
| 45 | 65 transpose |
| 46 data <- read_file(file, metadata, ft_ext, mt_ext, transpose) | 66 ) |
| 67 features <- read_features_response$data | |
| 68 original_first_colname <- read_features_response$original_first_colname | |
| 69 | |
| 70 read_metadata_response <- read_data(sample_metadata_file, mt_ext) | |
| 71 metadata <- read_metadata_response$data | |
| 47 | 72 |
| 48 required_columns <- c( | 73 required_columns <- c( |
| 49 "sampleName", "class", "sampleType", | 74 "sampleName", "class", "sampleType", |
| 50 "injectionOrder", "batch" | 75 "injectionOrder", "batch" |
| 51 ) | 76 ) |
| 77 | |
| 78 metadata <- dplyr::select(metadata, required_columns) | |
| 79 | |
| 80 # Ensure both tables have a sampleName column | |
| 81 if (!"sampleName" %in% colnames(features) || !"sampleName" %in% colnames(metadata)) { # nolint | |
| 82 stop("Both feature and metadata tables must contain a 'sampleName' column.") | |
| 83 } | |
| 84 data <- merge(metadata, features, by = "sampleName") | |
| 85 | |
| 86 | |
| 52 data <- verify_input_dataframe(data, required_columns) | 87 data <- verify_input_dataframe(data, required_columns) |
| 53 | 88 |
| 54 data <- sort_by_injection_order(data) | 89 data <- sort_by_injection_order(data) |
| 55 | 90 |
| 56 # separate data into features, batch and group | 91 # Separate features, batch, and group columns |
| 57 feature_columns <- colnames(data)[!colnames(data) %in% required_columns] | 92 feature_columns <- colnames(data)[!colnames(data) %in% required_columns] |
| 58 features <- data[, feature_columns] | 93 features <- data[, feature_columns] |
| 59 group <- enumerate_groups(as.character(data$sampleType)) | 94 group <- enumerate_groups(as.character(data$sampleType)) |
| 60 batch <- data$batch | 95 batch <- data$batch |
| 61 | 96 |
| 62 # run WaveICA | 97 # Check that wavelet level is not too high for the number of samples |
| 98 max_level <- floor(log2(nrow(features))) | |
| 99 requested_level <- as.numeric(wavelet_length) | |
| 100 if (requested_level > max_level) { | |
| 101 stop(sprintf( | |
| 102 paste0( | |
| 103 "Wavelet length/level (%d) is too high for ", | |
| 104 "the number of samples (%d). Maximum allowed is %d." | |
| 105 ), | |
| 106 requested_level, nrow(features), max_level | |
| 107 )) | |
| 108 } | |
| 109 # Run WaveICA normalization | |
| 63 features <- recetox.waveica::waveica( | 110 features <- recetox.waveica::waveica( |
| 64 data = features, | 111 data = features, |
| 65 wf = get_wf(wavelet_filter, wavelet_length), | 112 wf = get_wf(wavelet_filter, wavelet_length), |
| 66 batch = batch, | 113 batch = batch, |
| 67 group = group, | 114 group = group, |
| 68 K = k, | 115 K = k, |
| 69 t = t, | 116 t = t, |
| 70 t2 = t2, | 117 t2 = t2, |
| 71 alpha = alpha | 118 alpha = alpha |
| 72 ) | 119 ) |
| 73 | 120 non_feature_columns <- setdiff(colnames(data), feature_columns) |
| 121 | |
| 122 # Update the data frame with normalized features | |
| 74 data[, feature_columns] <- features | 123 data[, feature_columns] <- features |
| 75 | 124 |
| 76 # remove blanks from dataset | 125 # Optionally remove blank samples |
| 77 if (exclude_blanks) { | 126 if (exclude_blanks) { |
| 78 data <- exclude_group(data, group) | 127 data <- exclude_group(data, group) |
| 79 } | 128 } |
| 80 | 129 data <- final_data_processing( |
| 81 return(data) | 130 data, non_feature_columns, |
| 82 } | 131 transpose, original_first_colname |
| 83 | 132 ) |
| 84 waveica_singlebatch <- function(file, | 133 data |
| 85 metadata = NA, | 134 } |
| 86 ext, | 135 |
| 87 transpose = FALSE, | 136 # Main function for single-batch WaveICA normalization |
| 137 waveica_singlebatch <- function(data_matrix_file, | |
| 138 sample_metadata_file, | |
| 139 ft_ext, | |
| 140 mt_ext, | |
| 88 wavelet_filter, | 141 wavelet_filter, |
| 89 wavelet_length, | 142 wavelet_length, |
| 90 k, | 143 k, |
| 91 alpha, | 144 alpha, |
| 92 cutoff, | 145 cutoff, |
| 93 exclude_blanks) { | 146 exclude_blanks, |
| 94 # get input from the Galaxy, preprocess data | 147 transpose = FALSE) { |
| 95 ext <- strsplit(x = ext, split = "\\,")[[1]] | 148 # Reads feature and metadata tables, merges them, |
| 96 | 149 # verifies columns, runs WaveICA (single batch), and returns normalized data. |
| 97 ft_ext <- ext[1] | 150 read_features_response <- read_data(data_matrix_file, ft_ext, transpose) |
| 98 mt_ext <- ext[2] | 151 features <- read_features_response$data |
| 99 | 152 original_first_colname <- read_features_response$original_first_colname |
| 100 data <- read_file(file, metadata, ft_ext, mt_ext, transpose) | 153 |
| 154 read_data_response <- read_data(sample_metadata_file, mt_ext) | |
| 155 metadata <- read_data_response$data | |
| 156 | |
| 157 # Ensure both tables have a sampleName column | |
| 158 if (!"sampleName" %in% colnames(features) || | |
| 159 !"sampleName" %in% colnames(metadata)) { # nolint | |
| 160 stop("Both feature and metadata tables must contain a 'sampleName' column.") | |
| 161 } | |
| 162 data <- merge(metadata, features, by = "sampleName") | |
| 101 | 163 |
| 102 required_columns <- c("sampleName", "class", "sampleType", "injectionOrder") | 164 required_columns <- c("sampleName", "class", "sampleType", "injectionOrder") |
| 103 optional_columns <- c("batch") | 165 optional_columns <- c("batch") |
| 104 | |
| 105 data <- verify_input_dataframe(data, required_columns) | 166 data <- verify_input_dataframe(data, required_columns) |
| 106 | 167 |
| 107 data <- sort_by_injection_order(data) | 168 data <- sort_by_injection_order(data) |
| 108 | 169 |
| 109 feature_columns <- colnames(data)[!colnames(data) %in% c(required_columns, optional_columns)] | 170 feature_columns <- colnames(data)[ |
| 171 !colnames(data) %in% c(required_columns, optional_columns) | |
| 172 ] | |
| 110 features <- data[, feature_columns] | 173 features <- data[, feature_columns] |
| 111 injection_order <- data$injectionOrder | 174 injection_order <- data$injectionOrder |
| 112 | 175 |
| 113 # run WaveICA | 176 # Run WaveICA normalization (single batch) |
| 114 features <- recetox.waveica::waveica_nonbatchwise( | 177 features <- recetox.waveica::waveica_nonbatchwise( |
| 115 data = features, | 178 data = features, |
| 116 wf = get_wf(wavelet_filter, wavelet_length), | 179 wf = get_wf(wavelet_filter, wavelet_length), |
| 117 injection_order = injection_order, | 180 injection_order = injection_order, |
| 118 K = k, | 181 K = k, |
| 119 alpha = alpha, | 182 alpha = alpha, |
| 120 cutoff = cutoff | 183 cutoff = cutoff |
| 121 ) | 184 ) |
| 122 | 185 non_feature_columns <- setdiff(colnames(data), feature_columns) |
| 186 | |
| 187 # Update the data frame with normalized features | |
| 123 data[, feature_columns] <- features | 188 data[, feature_columns] <- features |
| 124 group <- enumerate_groups(as.character(data$sampleType)) | 189 group <- enumerate_groups(as.character(data$sampleType)) |
| 125 # remove blanks from dataset | 190 # Optionally remove blank samples |
| 126 if (exclude_blanks) { | 191 if (exclude_blanks) { |
| 127 data <- exclude_group(data, group) | 192 data <- exclude_group(data, group) |
| 128 } | 193 } |
| 129 | 194 data <- final_data_processing( |
| 130 return(data) | 195 data, non_feature_columns, |
| 131 } | 196 transpose, original_first_colname |
| 132 | 197 ) |
| 198 data | |
| 199 } | |
| 200 | |
| 201 # Sorts the data frame by batch and injection order (if batch exists), | |
| 202 # otherwise by injection order only | |
| 133 sort_by_injection_order <- function(data) { | 203 sort_by_injection_order <- function(data) { |
| 134 if ("batch" %in% colnames(data)) { | 204 if ("batch" %in% colnames(data)) { |
| 135 data <- data[order(data[, "batch"], data[, "injectionOrder"], decreasing = FALSE), ] | 205 data <- data[ |
| 206 order(data[, "batch"], | |
| 207 data[, "injectionOrder"], | |
| 208 decreasing = FALSE | |
| 209 ), | |
| 210 ] | |
| 136 } else { | 211 } else { |
| 137 data <- data[order(data[, "injectionOrder"], decreasing = FALSE), ] | 212 data <- data[order(data[, "injectionOrder"], decreasing = FALSE), ] |
| 138 } | 213 } |
| 139 return(data) | 214 data |
| 140 } | 215 } |
| 141 | 216 |
| 217 # Verifies that required columns exist and that there are no missing values | |
| 142 verify_input_dataframe <- function(data, required_columns) { | 218 verify_input_dataframe <- function(data, required_columns) { |
| 143 if (anyNA(data)) { | 219 if (anyNA(data)) { |
| 144 stop("Error: dataframe cannot contain NULL values! | 220 stop("Error: dataframe cannot contain NULL values! |
| 145 Make sure that your dataframe does not contain empty cells") | 221 \nMake sure that your dataframe does not contain empty cells") |
| 146 } else if (!all(required_columns %in% colnames(data))) { | 222 } else if (!all(required_columns %in% colnames(data))) { |
| 147 stop( | 223 stop( |
| 148 "Error: missing metadata! | 224 "Error: missing metadata! |
| 149 Make sure that the following columns are present in your dataframe: ", | 225 \nMake sure that the following columns are present in your dataframe: ", |
| 150 paste(required_columns, collapse = ", ") | 226 paste(required_columns, collapse = ", ") |
| 151 ) | 227 ) |
| 152 } | 228 } |
| 153 | |
| 154 data <- verify_column_types(data, required_columns) | 229 data <- verify_column_types(data, required_columns) |
| 155 | 230 data |
| 156 return(data) | 231 } |
| 157 } | 232 |
| 158 | 233 # Checks column types for required and feature columns |
| 234 # and removes problematic feature columns | |
| 159 verify_column_types <- function(data, required_columns) { | 235 verify_column_types <- function(data, required_columns) { |
| 160 # Specify the column names and their expected types | 236 # Checks that required columns have the correct type |
| 237 # and removes non-numeric feature columns efficiently. | |
| 161 column_types <- list( | 238 column_types <- list( |
| 162 "sampleName" = c("character", "factor"), | 239 "sampleName" = c("character", "factor"), |
| 163 "class" = c("character", "factor", "integer"), | 240 "class" = c("character", "factor", "integer"), |
| 164 "sampleType" = c("character", "factor"), | 241 "sampleType" = c("character", "factor"), |
| 165 "injectionOrder" = "integer", | 242 "injectionOrder" = "integer", |
| 166 "batch" = "integer" | 243 "batch" = "integer" |
| 167 ) | 244 ) |
| 168 | |
| 169 column_types <- column_types[required_columns] | 245 column_types <- column_types[required_columns] |
| 170 | 246 |
| 171 for (col_name in names(data)) { | 247 # Check that each required column has an expected type |
| 248 for (col_name in names(column_types)) { | |
| 249 if (!col_name %in% names(data)) next | |
| 250 expected_types <- column_types[[col_name]] | |
| 172 actual_type <- class(data[[col_name]]) | 251 actual_type <- class(data[[col_name]]) |
| 173 if (col_name %in% names(column_types)) { | 252 if (!actual_type %in% expected_types) { |
| 174 expected_types <- column_types[[col_name]] | 253 stop( |
| 175 | 254 "Column ", col_name, " is of type ", actual_type, |
| 176 if (!actual_type %in% expected_types) { | 255 " but expected type is ", |
| 177 stop( | 256 paste(expected_types, collapse = " or "), "\n" |
| 178 "Column ", col_name, " is of type ", actual_type, | 257 ) |
| 179 " but expected type is ", | |
| 180 paste(expected_types, collapse = " or "), "\n" | |
| 181 ) | |
| 182 } | |
| 183 } else { | |
| 184 if (actual_type != "numeric") { | |
| 185 data[[col_name]] <- as.numeric(as.character(data[[col_name]])) | |
| 186 } | |
| 187 } | 258 } |
| 188 } | 259 } |
| 189 return(data) | 260 |
| 190 } | 261 # Identify feature columns (not required columns) |
| 191 | 262 feature_cols <- setdiff(names(data), required_columns) |
| 192 # Match group labels with [blank/sample/qc] and enumerate them | 263 # Convert all feature columns to numeric in one go, |
| 264 # suppressing coercion warnings | |
| 265 data[feature_cols] <- suppressWarnings( | |
| 266 lapply( | |
| 267 data[feature_cols], | |
| 268 function(x) as.numeric(as.character(x)) | |
| 269 ) | |
| 270 ) | |
| 271 # Find columns that are problematic (contain any NA after conversion) | |
| 272 na_counts <- vapply(data[feature_cols], function(x) any(is.na(x)), logical(1)) | |
| 273 removed_columns <- names(na_counts)[na_counts] | |
| 274 if (length(removed_columns) > 0) { | |
| 275 message( | |
| 276 "Removed problematic columns (non-numeric): ", | |
| 277 paste(removed_columns, collapse = ", ") | |
| 278 ) | |
| 279 } | |
| 280 | |
| 281 # Keep only good columns | |
| 282 keep_cols <- !(names(data) %in% removed_columns) | |
| 283 data <- data[, keep_cols, drop = FALSE] | |
| 284 data | |
| 285 } | |
| 286 | |
| 287 # Enumerates group labels: blank=0, sample=1, qc=2, standard=3 | |
| 193 enumerate_groups <- function(group) { | 288 enumerate_groups <- function(group) { |
| 194 group[grepl("blank", tolower(group))] <- 0 | 289 group[grepl("blank", tolower(group))] <- 0 |
| 195 group[grepl("sample", tolower(group))] <- 1 | 290 group[grepl("sample", tolower(group))] <- 1 |
| 196 group[grepl("qc", tolower(group))] <- 2 | 291 group[grepl("qc", tolower(group))] <- 2 |
| 197 group[grepl("standard", tolower(group))] <- 3 | 292 group[grepl("standard", tolower(group))] <- 3 |
| 198 | 293 group |
| 199 return(group) | 294 } |
| 200 } | 295 |
| 201 | 296 # Returns the correct wavelet filter string for the R wavelets function |
| 202 # Create appropriate input for R wavelets function | |
| 203 get_wf <- function(wavelet_filter, wavelet_length) { | 297 get_wf <- function(wavelet_filter, wavelet_length) { |
| 204 wf <- paste(wavelet_filter, wavelet_length, sep = "") | 298 wf <- paste(wavelet_filter, wavelet_length, sep = "") |
| 205 | 299 # Exception for Daubechies-2 |
| 206 # exception to the wavelet function | |
| 207 if (wf == "d2") { | 300 if (wf == "d2") { |
| 208 wf <- "haar" | 301 wf <- "haar" |
| 209 } | 302 } |
| 210 | 303 wf |
| 211 return(wf) | 304 } |
| 212 } | 305 |
| 213 | 306 # Removes blank samples (group==0) from the data frame |
| 214 # Exclude blanks from a dataframe | |
| 215 exclude_group <- function(data, group) { | 307 exclude_group <- function(data, group) { |
| 216 row_idx_to_exclude <- which(group %in% 0) | 308 row_idx_to_exclude <- which(group %in% 0) |
| 217 if (length(row_idx_to_exclude) > 0) { | 309 if (length(row_idx_to_exclude) > 0) { |
| 218 data_without_blanks <- data[-c(row_idx_to_exclude), ] | 310 data_without_blanks <- data[-c(row_idx_to_exclude), ] |
| 219 cat("Blank samples have been excluded from the dataframe.\n") | 311 cat("Blank samples have been excluded from the dataframe.\n") |
| 220 return(data_without_blanks) | 312 data_without_blanks |
| 221 } else { | 313 } else { |
| 222 return(data) | 314 data |
| 223 } | 315 } |
| 224 } | 316 } |
| 225 | 317 |
| 226 store_data <- function(data, feature_output, metadata_output, ext, split_output = FALSE) { | 318 # Stores the output data in the requested |
| 319 # format (csv, tsv/tabular, or parquet) |
| 320 store_data <- function(data, feature_output, ext) { | |
| 227 if (ext == "parquet") { | 321 if (ext == "parquet") { |
| 228 if (split_output == TRUE) { | 322 arrow::write_parquet(data, feature_output) |
| 229 split_df <- split_output(data) | 323 } else if (ext == "csv") { |
| 230 arrow::write_parquet(split_df$metadata, metadata_output) | 324 write.csv(data, file = feature_output, row.names = FALSE, quote = FALSE) |
| 231 arrow::write_parquet(split_df$feature_table, feature_output) | 325 } else if (ext == "tsv" || ext == "tabular") { |
| 232 } else { | 326 write.table(data, |
| 233 arrow::write_parquet(data, feature_output) | 327 file = feature_output, sep = "\t", |
| 234 } | 328 row.names = FALSE, quote = FALSE |
| 329 ) | |
| 235 } else { | 330 } else { |
| 236 if (split_output == TRUE) { | 331 stop(paste("Unsupported file extension:", ext)) |
| 237 split_df <- split_output(data) | |
| 238 write.table(split_df$metadata, | |
| 239 file = metadata_output, sep = "\t", | |
| 240 row.names = FALSE, quote = FALSE | |
| 241 ) | |
| 242 write.table(split_df$feature_table, | |
| 243 file = feature_output, sep = "\t", | |
| 244 row.names = FALSE, quote = FALSE | |
| 245 ) | |
| 246 } else { | |
| 247 write.table(data, | |
| 248 file = feature_output, sep = "\t", | |
| 249 row.names = FALSE, quote = FALSE | |
| 250 ) | |
| 251 } | |
| 252 } | 332 } |
| 253 cat("Normalization has been completed.\n") | 333 cat("Normalization has been completed.\n") |
| 254 } | |
| 255 | |
| 256 split_output <- function(df) { | |
| 257 required_columns_set1 <- c("sampleName", "class", "sampleType", "injectionOrder", "batch") | |
| 258 required_columns_set2 <- c("sampleName", "class", "sampleType", "injectionOrder") | |
| 259 | |
| 260 if (all(required_columns_set1 %in% colnames(df))) { | |
| 261 metadata_df <- df[, required_columns_set1, drop = FALSE] | |
| 262 df <- df[, -c(2:5)] | |
| 263 } else if (all(required_columns_set2 %in% colnames(df))) { | |
| 264 metadata_df <- df[, required_columns_set2, drop = FALSE] | |
| 265 df <- df[, -c(2:4)] | |
| 266 } else { | |
| 267 stop("Neither set of required columns is present in the dataframe.") | |
| 268 } | |
| 269 | |
| 270 # Transpose the feature table | |
| 271 col_names <- c("id", as.vector(df[[1]])) | |
| 272 feature_table <- tranpose_data(df, col_names) | |
| 273 | |
| 274 return(list(metadata = metadata_df, feature_table = feature_table)) | |
| 275 } | 334 } |
| 276 | 335 |
| 277 tranpose_data <- function(data, column_names) { | 336 tranpose_data <- function(data, column_names) { |
| 278 t_data <- data[-1] | 337 t_data <- data[-1] |
| 279 t_data <- t(t_data) | 338 t_data <- t(t_data) |
| 280 tranposed_data <- data.frame(rownames(t_data), t_data) | 339 tranposed_data <- data.frame(rownames(t_data), t_data) |
| 281 colnames(tranposed_data) <- column_names | 340 colnames(tranposed_data) <- column_names |
| 282 | 341 |
| 283 return(tranposed_data) | 342 tranposed_data |
| 284 } | 343 } |
| 344 | |
| 345 | |
| 346 final_data_processing <- function( | |
| 347 data, non_feature_columns, | |
| 348 transpose, original_first_colname) { | |
| 349 # Remove all columns that are in non_feature_columns, |
| 350 # except the first column |
| 351 cols_to_keep <- !(colnames(data) %in% non_feature_columns) | |
| 352 cols_to_keep[1] <- TRUE # Always keep the first column | |
| 353 data <- data[, cols_to_keep, drop = FALSE] | |
| 354 | |
| 355 | |
| 356 if (transpose) { | |
| 357 # The first column becomes the new column names | |
| 358 new_colnames <- as.character(data[[1]]) | |
| 359 # Remove the first column | |
| 360 data <- data[, -1, drop = FALSE] | |
| 361 # Transpose the rest | |
| 362 data <- t(data) | |
| 363 # Convert to data frame | |
| 364 data <- as.data.frame(data, stringsAsFactors = FALSE) | |
| 365 # The first row becomes the first column | |
| 366 first_col <- rownames(data) | |
| 367 data <- cbind(first_col, data) | |
| 368 # Set column names | |
| 369 colnames(data) <- c(colnames(data)[1], new_colnames) | |
| 370 rownames(data) <- NULL | |
| 371 } | |
| 372 colnames(data)[1] <- original_first_colname | |
| 373 data | |
| 374 } |
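
For orientation, here is a minimal usage sketch of the revised batchwise entry point. It assumes `waveica_wrapper.R` has been sourced and the `recetox.waveica` package is installed; the file names and every parameter value below are illustrative, not defaults shipped with the tool.

```r
# Illustrative only: input files and parameter values are made up.
source("waveica_wrapper.R")

normalized <- waveica(
  data_matrix_file = "features.csv",     # hypothetical feature table
  sample_metadata_file = "metadata.csv", # hypothetical metadata table
  ft_ext = "csv",
  mt_ext = "csv",
  wavelet_filter = "d",
  wavelet_length = "2",  # get_wf("d", "2") yields "d2", remapped to "haar"
  k = 20,
  t = 0.05,
  t2 = 0.05,
  alpha = 0,
  exclude_blanks = TRUE
)

store_data(normalized, "normalized.csv", "csv")
```

The single-batch variant `waveica_singlebatch()` is called the same way, with a `cutoff` argument in place of `t` and `t2`.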
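The new guard in `waveica()` caps the requested wavelet decomposition level at `floor(log2(n))` for `n` samples. A standalone arithmetic check of that rule, with numbers chosen purely for illustration:

```r
# With 100 samples, floor(log2(100)) = 6, so a requested level of 8 is rejected.
n_samples <- 100
requested_level <- 8
max_level <- floor(log2(n_samples))  # 6
requested_level > max_level          # TRUE -> waveica() would stop() here
```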
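`enumerate_groups()` maps `sampleType` labels by case-insensitive substring match, and `exclude_group()` later drops rows whose code is 0. A small sketch, assuming the wrapper file has been sourced; the labels are invented:

```r
sample_type <- c("blank", "sample", "QC", "sample", "standard")
enumerate_groups(sample_type)
#> [1] "0" "1" "2" "1" "3"
```

Note the result is a character vector, since the numeric codes are assigned into a character vector.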
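The rewritten `verify_column_types()` coerces every non-required column to numeric and drops any column that produces `NA`s after coercion. A sketch of that behavior, again assuming the wrapper has been sourced; the data frame is invented:

```r
df <- data.frame(
  sampleName = c("s1", "s2"),
  injectionOrder = c(1L, 2L),
  good = c("1.5", "2.5"),  # parses cleanly to numeric
  bad = c("1.2", "oops"),  # "oops" becomes NA, so the column is removed
  stringsAsFactors = FALSE
)
df <- verify_column_types(df, c("sampleName", "injectionOrder"))
#> Removed problematic columns (non-numeric): bad
colnames(df)
#> [1] "sampleName" "injectionOrder" "good"
```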
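Finally, `read_data(..., transpose = TRUE)` flips a feature table stored with features in rows into the samples-in-rows orientation the normalization expects, remembering the original first column name so `final_data_processing()` can restore it on output. A self-contained sketch with a throwaway file, assuming the wrapper has been sourced:

```r
tmp <- tempfile(fileext = ".csv")
write.csv(
  data.frame(id = c("f1", "f2"), s1 = c(1.1, 2.2), s2 = c(3.3, 4.4)),
  tmp,
  row.names = FALSE
)
res <- read_data(tmp, "csv", transpose = TRUE)
res$data
#>    sampleName  f1  f2
#> s1         s1 1.1 2.2
#> s2         s2 3.3 4.4
res$original_first_colname
#> [1] "id"
```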
