comparison waveica_wrapper.R @ 6:071a424241ec draft

planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/waveica commit bc3445f7c41271b0062c7674108f57708d08dd28
author recetox
date Thu, 30 May 2024 14:54:02 +0000
parents e424fa636281
children 1a2aeb8137bf
comparison
equal deleted inserted replaced
5:e424fa636281 6:071a424241ec
1 read_file <- function(file, metadata, ft_ext, mt_ext, transpose) { 1 read_file <- function(file, metadata, ft_ext, mt_ext, transpose) {
2 data <- read_data(file, ft_ext) 2 data <- read_data(file, ft_ext)
3 3
4 if (transpose) { 4 if (transpose) {
5 col_names <- c("sampleName", data[[1]]) 5 col_names <- c("sampleName", data[[1]])
6 t_data <- data[-1] 6 t_data <- data[-1]
7 t_data <- t(t_data) 7 t_data <- t(t_data)
8 data <- data.frame(rownames(t_data), t_data) 8 data <- data.frame(rownames(t_data), t_data)
9 colnames(data) <- col_names 9 colnames(data) <- col_names
10 } 10 }
11 11
12 if (!is.na(metadata)) { 12 if (!is.na(metadata)) {
13 mt_data <- read_data(metadata, mt_ext) 13 mt_data <- read_data(metadata, mt_ext)
14 data <- merge(mt_data, data, by = "sampleName") 14 data <- merge(mt_data, data, by = "sampleName")
15 } 15 }
16 16
17 return(data) 17 return(data)
18 } 18 }
19 19
20 read_data <- function(file, ext) { 20 read_data <- function(file, ext) {
21 if (ext == "csv") { 21 if (ext == "csv") {
22 data <- read.csv(file, header = TRUE) 22 data <- read.csv(file, header = TRUE)
23 } else if (ext == "tsv") { 23 } else if (ext == "tsv") {
24 data <- read.csv(file, header = TRUE, sep = "\t") 24 data <- read.csv(file, header = TRUE, sep = "\t")
25 } else { 25 } else {
26 data <- arrow::read_parquet(file) 26 data <- arrow::read_parquet(file)
27 } 27 }
28 28
29 return(data) 29 return(data)
30 } 30 }
31 31
32 waveica <- function(file, 32 waveica <- function(file,
33 metadata = NA, 33 metadata = NA,
34 ext, 34 ext,
38 k, 38 k,
39 t, 39 t,
40 t2, 40 t2,
41 alpha, 41 alpha,
42 exclude_blanks) { 42 exclude_blanks) {
43 # get input from the Galaxy, preprocess data 43 # get input from the Galaxy, preprocess data
44 ext <- strsplit(x = ext, split = "\\,")[[1]] 44 ext <- strsplit(x = ext, split = "\\,")[[1]]
45 45
46 ft_ext <- ext[1] 46 ft_ext <- ext[1]
47 mt_ext <- ext[2] 47 mt_ext <- ext[2]
48 48
49 data <- read_file(file, metadata, ft_ext, mt_ext, transpose) 49 data <- read_file(file, metadata, ft_ext, mt_ext, transpose)
50 50
51 required_columns <- c( 51 required_columns <- c(
52 "sampleName", "class", "sampleType", 52 "sampleName", "class", "sampleType",
53 "injectionOrder", "batch" 53 "injectionOrder", "batch"
54 ) 54 )
55 data <- verify_input_dataframe(data, required_columns) 55 data <- verify_input_dataframe(data, required_columns)
56 56
57 data <- sort_by_injection_order(data) 57 data <- sort_by_injection_order(data)
58 58
59 # separate data into features, batch and group 59 # separate data into features, batch and group
60 feature_columns <- colnames(data)[!colnames(data) %in% required_columns] 60 feature_columns <- colnames(data)[!colnames(data) %in% required_columns]
61 features <- data[, feature_columns] 61 features <- data[, feature_columns]
62 group <- enumerate_groups(as.character(data$sampleType)) 62 group <- enumerate_groups(as.character(data$sampleType))
63 batch <- data$batch 63 batch <- data$batch
64 64
65 # run WaveICA 65 # run WaveICA
66 features <- recetox.waveica::waveica( 66 features <- recetox.waveica::waveica(
67 data = features, 67 data = features,
68 wf = get_wf(wavelet_filter, wavelet_length), 68 wf = get_wf(wavelet_filter, wavelet_length),
69 batch = batch, 69 batch = batch,
70 group = group, 70 group = group,
71 K = k, 71 K = k,
72 t = t, 72 t = t,
73 t2 = t2, 73 t2 = t2,
74 alpha = alpha 74 alpha = alpha
75 ) 75 )
76 76
77 data[, feature_columns] <- features 77 data[, feature_columns] <- features
78 78
79 # remove blanks from dataset 79 # remove blanks from dataset
80 if (exclude_blanks) { 80 if (exclude_blanks) {
81 data <- exclude_group(data, group) 81 data <- exclude_group(data, group)
82 } 82 }
83 83
84 return(data) 84 return(data)
85 } 85 }
86 86
87 waveica_singlebatch <- function(file, 87 waveica_singlebatch <- function(file,
88 metadata = NA, 88 metadata = NA,
89 ext, 89 ext,
92 wavelet_length, 92 wavelet_length,
93 k, 93 k,
94 alpha, 94 alpha,
95 cutoff, 95 cutoff,
96 exclude_blanks) { 96 exclude_blanks) {
97 # get input from the Galaxy, preprocess data 97 # get input from the Galaxy, preprocess data
98 ext <- strsplit(x = ext, split = "\\,")[[1]] 98 ext <- strsplit(x = ext, split = "\\,")[[1]]
99 99
100 ft_ext <- ext[1] 100 ft_ext <- ext[1]
101 mt_ext <- ext[2] 101 mt_ext <- ext[2]
102 102
103 data <- read_file(file, metadata, ft_ext, mt_ext, transpose) 103 data <- read_file(file, metadata, ft_ext, mt_ext, transpose)
104 104
105 required_columns <- c("sampleName", "class", "sampleType", "injectionOrder") 105 required_columns <- c("sampleName", "class", "sampleType", "injectionOrder")
106 optional_columns <- c("batch") 106 optional_columns <- c("batch")
107 107
108 data <- verify_input_dataframe(data, required_columns) 108 data <- verify_input_dataframe(data, required_columns)
109 109
110 data <- sort_by_injection_order(data) 110 data <- sort_by_injection_order(data)
111 111
112 feature_columns <- colnames(data)[!colnames(data) %in% c(required_columns, optional_columns)] 112 feature_columns <- colnames(data)[!colnames(data) %in% c(required_columns, optional_columns)]
113 features <- data[, feature_columns] 113 features <- data[, feature_columns]
114 injection_order <- data$injectionOrder 114 injection_order <- data$injectionOrder
115 115
116 # run WaveICA 116 # run WaveICA
117 features <- recetox.waveica::waveica_nonbatchwise( 117 features <- recetox.waveica::waveica_nonbatchwise(
118 data = features, 118 data = features,
119 wf = get_wf(wavelet_filter, wavelet_length), 119 wf = get_wf(wavelet_filter, wavelet_length),
120 injection_order = injection_order, 120 injection_order = injection_order,
121 K = k, 121 K = k,
122 alpha = alpha, 122 alpha = alpha,
123 cutoff = cutoff 123 cutoff = cutoff
124 ) 124 )
125 125
126 data[, feature_columns] <- features 126 data[, feature_columns] <- features
127 group <- enumerate_groups(as.character(data$sampleType)) 127 group <- enumerate_groups(as.character(data$sampleType))
128 # remove blanks from dataset 128 # remove blanks from dataset
129 if (exclude_blanks) { 129 if (exclude_blanks) {
130 data <- exclude_group(data, group) 130 data <- exclude_group(data, group)
131 } 131 }
132 132
133 return(data) 133 return(data)
134 } 134 }
135 135
136 136
137 sort_by_injection_order <- function(data) { 137 sort_by_injection_order <- function(data) {
138 if ("batch" %in% colnames(data)) { 138 if ("batch" %in% colnames(data)) {
139 data <- data[order(data[, "batch"], data[, "injectionOrder"], decreasing = FALSE), ] 139 data <- data[order(data[, "batch"], data[, "injectionOrder"], decreasing = FALSE), ]
140 } else { 140 } else {
141 data <- data[order(data[, "injectionOrder"], decreasing = FALSE), ] 141 data <- data[order(data[, "injectionOrder"], decreasing = FALSE), ]
142 } 142 }
143 return(data) 143 return(data)
144 } 144 }
145 145
146 146
147 verify_input_dataframe <- function(data, required_columns) { 147 verify_input_dataframe <- function(data, required_columns) {
148 if (anyNA(data)) { 148 if (anyNA(data)) {
149 stop("Error: dataframe cannot contain NULL values! 149 stop("Error: dataframe cannot contain NULL values!
150 Make sure that your dataframe does not contain empty cells") 150 Make sure that your dataframe does not contain empty cells")
151 } else if (!all(required_columns %in% colnames(data))) { 151 } else if (!all(required_columns %in% colnames(data))) {
152 stop( 152 stop(
153 "Error: missing metadata! 153 "Error: missing metadata!
154 Make sure that the following columns are present in your dataframe: ", 154 Make sure that the following columns are present in your dataframe: ",
155 paste(required_columns, collapse = ", ") 155 paste(required_columns, collapse = ", ")
156 ) 156 )
157 } 157 }
158 158
159 data <- verify_column_types(data, required_columns) 159 data <- verify_column_types(data, required_columns)
160 160
161 return(data) 161 return(data)
162 } 162 }
163 163
164 verify_column_types <- function(data, required_columns) { 164 verify_column_types <- function(data, required_columns) {
165 # Specify the column names and their expected types 165 # Specify the column names and their expected types
166 column_types <- list( 166 column_types <- list(
167 "sampleName" = c("character", "factor"), 167 "sampleName" = c("character", "factor"),
168 "class" = c("character", "factor", "integer"), 168 "class" = c("character", "factor", "integer"),
169 "sampleType" = c("character", "factor"), 169 "sampleType" = c("character", "factor"),
170 "injectionOrder" = "integer", 170 "injectionOrder" = "integer",
171 "batch" = "integer" 171 "batch" = "integer"
172 ) 172 )
173 173
174 column_types <- column_types[required_columns] 174 column_types <- column_types[required_columns]
175 175
176 for (col_name in names(data)) { 176 for (col_name in names(data)) {
177 actual_type <- class(data[[col_name]]) 177 actual_type <- class(data[[col_name]])
178 if (col_name %in% names(column_types)) { 178 if (col_name %in% names(column_types)) {
179 expected_types <- column_types[[col_name]] 179 expected_types <- column_types[[col_name]]
180 180
181 if (!actual_type %in% expected_types) { 181 if (!actual_type %in% expected_types) {
182 stop( 182 stop(
183 "Column ", col_name, " is of type ", actual_type, 183 "Column ", col_name, " is of type ", actual_type,
184 " but expected type is ", 184 " but expected type is ",
185 paste(expected_types, collapse = " or "), "\n" 185 paste(expected_types, collapse = " or "), "\n"
186 ) 186 )
187 } 187 }
188 } else { 188 } else {
189 if (actual_type != "numeric") { 189 if (actual_type != "numeric") {
190 data[[col_name]] <- as.numeric(as.character(data[[col_name]])) 190 data[[col_name]] <- as.numeric(as.character(data[[col_name]]))
191 } 191 }
192 } 192 }
193 } 193 }
194 return(data) 194 return(data)
195 } 195 }
196 196
197 197
198 # Match group labels with [blank/sample/qc] and enumerate them 198 # Match group labels with [blank/sample/qc] and enumerate them
199 enumerate_groups <- function(group) { 199 enumerate_groups <- function(group) {
200 group[grepl("blank", tolower(group))] <- 0 200 group[grepl("blank", tolower(group))] <- 0
201 group[grepl("sample", tolower(group))] <- 1 201 group[grepl("sample", tolower(group))] <- 1
202 group[grepl("qc", tolower(group))] <- 2 202 group[grepl("qc", tolower(group))] <- 2
203 203
204 return(group) 204 return(group)
205 } 205 }
206 206
207 207
208 # Create appropriate input for R wavelets function 208 # Create appropriate input for R wavelets function
209 get_wf <- function(wavelet_filter, wavelet_length) { 209 get_wf <- function(wavelet_filter, wavelet_length) {
210 wf <- paste(wavelet_filter, wavelet_length, sep = "") 210 wf <- paste(wavelet_filter, wavelet_length, sep = "")
211 211
212 # exception to the wavelet function 212 # exception to the wavelet function
213 if (wf == "d2") { 213 if (wf == "d2") {
214 wf <- "haar" 214 wf <- "haar"
215 } 215 }
216 216
217 return(wf) 217 return(wf)
218 } 218 }
219 219
220 220
221 # Exclude blanks from a dataframe 221 # Exclude blanks from a dataframe
222 exclude_group <- function(data, group) { 222 exclude_group <- function(data, group) {
223 row_idx_to_exclude <- which(group %in% 0) 223 row_idx_to_exclude <- which(group %in% 0)
224 if (length(row_idx_to_exclude) > 0) { 224 if (length(row_idx_to_exclude) > 0) {
225 data_without_blanks <- data[-c(row_idx_to_exclude), ] 225 data_without_blanks <- data[-c(row_idx_to_exclude), ]
226 cat("Blank samples have been excluded from the dataframe.\n") 226 cat("Blank samples have been excluded from the dataframe.\n")
227 return(data_without_blanks) 227 return(data_without_blanks)
228 } else { 228 } else {
229 return(data) 229 return(data)
230 } 230 }
231 } 231 }
232 232
233 store_data <- function(data, output, ext) { 233 store_data <- function(data, output, ext) {
234 if (ext == "parquet") { 234 if (ext == "parquet") {
235 arrow::write_parquet(data, output) 235 arrow::write_parquet(data, output)
236 } else { 236 } else {
237 write.table(data, 237 write.table(data,
238 file = output, sep = "\t", 238 file = output, sep = "\t",
239 row.names = FALSE, quote = FALSE 239 row.names = FALSE, quote = FALSE
240 ) 240 )
241 } 241 }
242 cat("Normalization has been completed.\n") 242 cat("Normalization has been completed.\n")
243 } 243 }