Mercurial > repos > recetox > waveica
comparison waveica_wrapper.R @ 6:071a424241ec draft
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/waveica commit bc3445f7c41271b0062c7674108f57708d08dd28
author | recetox |
---|---|
date | Thu, 30 May 2024 14:54:02 +0000 |
parents | e424fa636281 |
children | 1a2aeb8137bf |
comparison
equal
deleted
inserted
replaced
5:e424fa636281 | 6:071a424241ec |
---|---|
1 read_file <- function(file, metadata, ft_ext, mt_ext, transpose) { | 1 read_file <- function(file, metadata, ft_ext, mt_ext, transpose) { |
2 data <- read_data(file, ft_ext) | 2 data <- read_data(file, ft_ext) |
3 | 3 |
4 if (transpose) { | 4 if (transpose) { |
5 col_names <- c("sampleName", data[[1]]) | 5 col_names <- c("sampleName", data[[1]]) |
6 t_data <- data[-1] | 6 t_data <- data[-1] |
7 t_data <- t(t_data) | 7 t_data <- t(t_data) |
8 data <- data.frame(rownames(t_data), t_data) | 8 data <- data.frame(rownames(t_data), t_data) |
9 colnames(data) <- col_names | 9 colnames(data) <- col_names |
10 } | 10 } |
11 | 11 |
12 if (!is.na(metadata)) { | 12 if (!is.na(metadata)) { |
13 mt_data <- read_data(metadata, mt_ext) | 13 mt_data <- read_data(metadata, mt_ext) |
14 data <- merge(mt_data, data, by = "sampleName") | 14 data <- merge(mt_data, data, by = "sampleName") |
15 } | 15 } |
16 | 16 |
17 return(data) | 17 return(data) |
18 } | 18 } |
19 | 19 |
20 read_data <- function(file, ext) { | 20 read_data <- function(file, ext) { |
21 if (ext == "csv") { | 21 if (ext == "csv") { |
22 data <- read.csv(file, header = TRUE) | 22 data <- read.csv(file, header = TRUE) |
23 } else if (ext == "tsv") { | 23 } else if (ext == "tsv") { |
24 data <- read.csv(file, header = TRUE, sep = "\t") | 24 data <- read.csv(file, header = TRUE, sep = "\t") |
25 } else { | 25 } else { |
26 data <- arrow::read_parquet(file) | 26 data <- arrow::read_parquet(file) |
27 } | 27 } |
28 | 28 |
29 return(data) | 29 return(data) |
30 } | 30 } |
31 | 31 |
32 waveica <- function(file, | 32 waveica <- function(file, |
33 metadata = NA, | 33 metadata = NA, |
34 ext, | 34 ext, |
38 k, | 38 k, |
39 t, | 39 t, |
40 t2, | 40 t2, |
41 alpha, | 41 alpha, |
42 exclude_blanks) { | 42 exclude_blanks) { |
43 # get input from the Galaxy, preprocess data | 43 # get input from the Galaxy, preprocess data |
44 ext <- strsplit(x = ext, split = "\\,")[[1]] | 44 ext <- strsplit(x = ext, split = "\\,")[[1]] |
45 | 45 |
46 ft_ext <- ext[1] | 46 ft_ext <- ext[1] |
47 mt_ext <- ext[2] | 47 mt_ext <- ext[2] |
48 | 48 |
49 data <- read_file(file, metadata, ft_ext, mt_ext, transpose) | 49 data <- read_file(file, metadata, ft_ext, mt_ext, transpose) |
50 | 50 |
51 required_columns <- c( | 51 required_columns <- c( |
52 "sampleName", "class", "sampleType", | 52 "sampleName", "class", "sampleType", |
53 "injectionOrder", "batch" | 53 "injectionOrder", "batch" |
54 ) | 54 ) |
55 data <- verify_input_dataframe(data, required_columns) | 55 data <- verify_input_dataframe(data, required_columns) |
56 | 56 |
57 data <- sort_by_injection_order(data) | 57 data <- sort_by_injection_order(data) |
58 | 58 |
59 # separate data into features, batch and group | 59 # separate data into features, batch and group |
60 feature_columns <- colnames(data)[!colnames(data) %in% required_columns] | 60 feature_columns <- colnames(data)[!colnames(data) %in% required_columns] |
61 features <- data[, feature_columns] | 61 features <- data[, feature_columns] |
62 group <- enumerate_groups(as.character(data$sampleType)) | 62 group <- enumerate_groups(as.character(data$sampleType)) |
63 batch <- data$batch | 63 batch <- data$batch |
64 | 64 |
65 # run WaveICA | 65 # run WaveICA |
66 features <- recetox.waveica::waveica( | 66 features <- recetox.waveica::waveica( |
67 data = features, | 67 data = features, |
68 wf = get_wf(wavelet_filter, wavelet_length), | 68 wf = get_wf(wavelet_filter, wavelet_length), |
69 batch = batch, | 69 batch = batch, |
70 group = group, | 70 group = group, |
71 K = k, | 71 K = k, |
72 t = t, | 72 t = t, |
73 t2 = t2, | 73 t2 = t2, |
74 alpha = alpha | 74 alpha = alpha |
75 ) | 75 ) |
76 | 76 |
77 data[, feature_columns] <- features | 77 data[, feature_columns] <- features |
78 | 78 |
79 # remove blanks from dataset | 79 # remove blanks from dataset |
80 if (exclude_blanks) { | 80 if (exclude_blanks) { |
81 data <- exclude_group(data, group) | 81 data <- exclude_group(data, group) |
82 } | 82 } |
83 | 83 |
84 return(data) | 84 return(data) |
85 } | 85 } |
86 | 86 |
87 waveica_singlebatch <- function(file, | 87 waveica_singlebatch <- function(file, |
88 metadata = NA, | 88 metadata = NA, |
89 ext, | 89 ext, |
92 wavelet_length, | 92 wavelet_length, |
93 k, | 93 k, |
94 alpha, | 94 alpha, |
95 cutoff, | 95 cutoff, |
96 exclude_blanks) { | 96 exclude_blanks) { |
97 # get input from the Galaxy, preprocess data | 97 # get input from the Galaxy, preprocess data |
98 ext <- strsplit(x = ext, split = "\\,")[[1]] | 98 ext <- strsplit(x = ext, split = "\\,")[[1]] |
99 | 99 |
100 ft_ext <- ext[1] | 100 ft_ext <- ext[1] |
101 mt_ext <- ext[2] | 101 mt_ext <- ext[2] |
102 | 102 |
103 data <- read_file(file, metadata, ft_ext, mt_ext, transpose) | 103 data <- read_file(file, metadata, ft_ext, mt_ext, transpose) |
104 | 104 |
105 required_columns <- c("sampleName", "class", "sampleType", "injectionOrder") | 105 required_columns <- c("sampleName", "class", "sampleType", "injectionOrder") |
106 optional_columns <- c("batch") | 106 optional_columns <- c("batch") |
107 | 107 |
108 data <- verify_input_dataframe(data, required_columns) | 108 data <- verify_input_dataframe(data, required_columns) |
109 | 109 |
110 data <- sort_by_injection_order(data) | 110 data <- sort_by_injection_order(data) |
111 | 111 |
112 feature_columns <- colnames(data)[!colnames(data) %in% c(required_columns, optional_columns)] | 112 feature_columns <- colnames(data)[!colnames(data) %in% c(required_columns, optional_columns)] |
113 features <- data[, feature_columns] | 113 features <- data[, feature_columns] |
114 injection_order <- data$injectionOrder | 114 injection_order <- data$injectionOrder |
115 | 115 |
116 # run WaveICA | 116 # run WaveICA |
117 features <- recetox.waveica::waveica_nonbatchwise( | 117 features <- recetox.waveica::waveica_nonbatchwise( |
118 data = features, | 118 data = features, |
119 wf = get_wf(wavelet_filter, wavelet_length), | 119 wf = get_wf(wavelet_filter, wavelet_length), |
120 injection_order = injection_order, | 120 injection_order = injection_order, |
121 K = k, | 121 K = k, |
122 alpha = alpha, | 122 alpha = alpha, |
123 cutoff = cutoff | 123 cutoff = cutoff |
124 ) | 124 ) |
125 | 125 |
126 data[, feature_columns] <- features | 126 data[, feature_columns] <- features |
127 group <- enumerate_groups(as.character(data$sampleType)) | 127 group <- enumerate_groups(as.character(data$sampleType)) |
128 # remove blanks from dataset | 128 # remove blanks from dataset |
129 if (exclude_blanks) { | 129 if (exclude_blanks) { |
130 data <- exclude_group(data, group) | 130 data <- exclude_group(data, group) |
131 } | 131 } |
132 | 132 |
133 return(data) | 133 return(data) |
134 } | 134 } |
135 | 135 |
136 | 136 |
137 sort_by_injection_order <- function(data) { | 137 sort_by_injection_order <- function(data) { |
138 if ("batch" %in% colnames(data)) { | 138 if ("batch" %in% colnames(data)) { |
139 data <- data[order(data[, "batch"], data[, "injectionOrder"], decreasing = FALSE), ] | 139 data <- data[order(data[, "batch"], data[, "injectionOrder"], decreasing = FALSE), ] |
140 } else { | 140 } else { |
141 data <- data[order(data[, "injectionOrder"], decreasing = FALSE), ] | 141 data <- data[order(data[, "injectionOrder"], decreasing = FALSE), ] |
142 } | 142 } |
143 return(data) | 143 return(data) |
144 } | 144 } |
145 | 145 |
146 | 146 |
147 verify_input_dataframe <- function(data, required_columns) { | 147 verify_input_dataframe <- function(data, required_columns) { |
148 if (anyNA(data)) { | 148 if (anyNA(data)) { |
149 stop("Error: dataframe cannot contain NULL values! | 149 stop("Error: dataframe cannot contain NULL values! |
150 Make sure that your dataframe does not contain empty cells") | 150 Make sure that your dataframe does not contain empty cells") |
151 } else if (!all(required_columns %in% colnames(data))) { | 151 } else if (!all(required_columns %in% colnames(data))) { |
152 stop( | 152 stop( |
153 "Error: missing metadata! | 153 "Error: missing metadata! |
154 Make sure that the following columns are present in your dataframe: ", | 154 Make sure that the following columns are present in your dataframe: ", |
155 paste(required_columns, collapse = ", ") | 155 paste(required_columns, collapse = ", ") |
156 ) | 156 ) |
157 } | 157 } |
158 | 158 |
159 data <- verify_column_types(data, required_columns) | 159 data <- verify_column_types(data, required_columns) |
160 | 160 |
161 return(data) | 161 return(data) |
162 } | 162 } |
163 | 163 |
164 verify_column_types <- function(data, required_columns) { | 164 verify_column_types <- function(data, required_columns) { |
165 # Specify the column names and their expected types | 165 # Specify the column names and their expected types |
166 column_types <- list( | 166 column_types <- list( |
167 "sampleName" = c("character", "factor"), | 167 "sampleName" = c("character", "factor"), |
168 "class" = c("character", "factor", "integer"), | 168 "class" = c("character", "factor", "integer"), |
169 "sampleType" = c("character", "factor"), | 169 "sampleType" = c("character", "factor"), |
170 "injectionOrder" = "integer", | 170 "injectionOrder" = "integer", |
171 "batch" = "integer" | 171 "batch" = "integer" |
172 ) | 172 ) |
173 | 173 |
174 column_types <- column_types[required_columns] | 174 column_types <- column_types[required_columns] |
175 | 175 |
176 for (col_name in names(data)) { | 176 for (col_name in names(data)) { |
177 actual_type <- class(data[[col_name]]) | 177 actual_type <- class(data[[col_name]]) |
178 if (col_name %in% names(column_types)) { | 178 if (col_name %in% names(column_types)) { |
179 expected_types <- column_types[[col_name]] | 179 expected_types <- column_types[[col_name]] |
180 | 180 |
181 if (!actual_type %in% expected_types) { | 181 if (!actual_type %in% expected_types) { |
182 stop( | 182 stop( |
183 "Column ", col_name, " is of type ", actual_type, | 183 "Column ", col_name, " is of type ", actual_type, |
184 " but expected type is ", | 184 " but expected type is ", |
185 paste(expected_types, collapse = " or "), "\n" | 185 paste(expected_types, collapse = " or "), "\n" |
186 ) | 186 ) |
187 } | 187 } |
188 } else { | 188 } else { |
189 if (actual_type != "numeric") { | 189 if (actual_type != "numeric") { |
190 data[[col_name]] <- as.numeric(as.character(data[[col_name]])) | 190 data[[col_name]] <- as.numeric(as.character(data[[col_name]])) |
191 } | 191 } |
192 } | 192 } |
193 } | 193 } |
194 return(data) | 194 return(data) |
195 } | 195 } |
196 | 196 |
197 | 197 |
198 # Match group labels with [blank/sample/qc] and enumerate them | 198 # Match group labels with [blank/sample/qc] and enumerate them |
199 enumerate_groups <- function(group) { | 199 enumerate_groups <- function(group) { |
200 group[grepl("blank", tolower(group))] <- 0 | 200 group[grepl("blank", tolower(group))] <- 0 |
201 group[grepl("sample", tolower(group))] <- 1 | 201 group[grepl("sample", tolower(group))] <- 1 |
202 group[grepl("qc", tolower(group))] <- 2 | 202 group[grepl("qc", tolower(group))] <- 2 |
203 | 203 |
204 return(group) | 204 return(group) |
205 } | 205 } |
206 | 206 |
207 | 207 |
208 # Create appropriate input for R wavelets function | 208 # Create appropriate input for R wavelets function |
209 get_wf <- function(wavelet_filter, wavelet_length) { | 209 get_wf <- function(wavelet_filter, wavelet_length) { |
210 wf <- paste(wavelet_filter, wavelet_length, sep = "") | 210 wf <- paste(wavelet_filter, wavelet_length, sep = "") |
211 | 211 |
212 # exception to the wavelet function | 212 # exception to the wavelet function |
213 if (wf == "d2") { | 213 if (wf == "d2") { |
214 wf <- "haar" | 214 wf <- "haar" |
215 } | 215 } |
216 | 216 |
217 return(wf) | 217 return(wf) |
218 } | 218 } |
219 | 219 |
220 | 220 |
221 # Exclude blanks from a dataframe | 221 # Exclude blanks from a dataframe |
222 exclude_group <- function(data, group) { | 222 exclude_group <- function(data, group) { |
223 row_idx_to_exclude <- which(group %in% 0) | 223 row_idx_to_exclude <- which(group %in% 0) |
224 if (length(row_idx_to_exclude) > 0) { | 224 if (length(row_idx_to_exclude) > 0) { |
225 data_without_blanks <- data[-c(row_idx_to_exclude), ] | 225 data_without_blanks <- data[-c(row_idx_to_exclude), ] |
226 cat("Blank samples have been excluded from the dataframe.\n") | 226 cat("Blank samples have been excluded from the dataframe.\n") |
227 return(data_without_blanks) | 227 return(data_without_blanks) |
228 } else { | 228 } else { |
229 return(data) | 229 return(data) |
230 } | 230 } |
231 } | 231 } |
232 | 232 |
233 store_data <- function(data, output, ext) { | 233 store_data <- function(data, output, ext) { |
234 if (ext == "parquet") { | 234 if (ext == "parquet") { |
235 arrow::write_parquet(data, output) | 235 arrow::write_parquet(data, output) |
236 } else { | 236 } else { |
237 write.table(data, | 237 write.table(data, |
238 file = output, sep = "\t", | 238 file = output, sep = "\t", |
239 row.names = FALSE, quote = FALSE | 239 row.names = FALSE, quote = FALSE |
240 ) | 240 ) |
241 } | 241 } |
242 cat("Normalization has been completed.\n") | 242 cat("Normalization has been completed.\n") |
243 } | 243 } |