Mercurial > repos > workflow4metabolomics > ms2snoop
comparison MS2snoop.R @ 5:78d5a12406c2 draft
planemo upload commit a5f94dac9b268629399dc22c5d6ac48c5a85adc3
author | workflow4metabolomics |
---|---|
date | Fri, 05 Aug 2022 17:25:45 +0000 |
parents | 856001213966 |
children | 77abacd33c31 |
comparison
equal
deleted
inserted
replaced
4:856001213966 | 5:78d5a12406c2 |
---|---|
13 #' | 13 #' |
14 #' @import optparse | 14 #' @import optparse |
15 #' | 15 #' |
16 | 16 |
17 | 17 |
18 assign("MS2SNOOP_VERSION", "2.0.0") | 18 get_version <- function() { |
19 lockBinding("MS2SNOOP_VERSION", globalenv()) | 19 cmd <- commandArgs(trailingOnly = FALSE) |
20 | 20 root <- dirname(gsub("--file=", "", cmd[grep("--file=", cmd)])) |
21 assign("MISSING_PARAMETER_ERROR", 1) | 21 readme <- readLines(file.path(root, "README.md")) |
22 lockBinding("MISSING_PARAMETER_ERROR", globalenv()) | 22 version_line <- readme[grepl(" * **@version**: ", readme, fixed = TRUE)] |
23 | 23 return(gsub(".*: ", "", version_line)) |
24 assign("BAD_PARAMETER_VALUE_ERROR", 2) | 24 } |
25 lockBinding("BAD_PARAMETER_VALUE_ERROR", globalenv()) | 25 |
26 | 26 defaults <- list( |
27 assign("MISSING_INPUT_FILE_ERROR", 3) | 27 MS2SNOOP_VERSION = get_version(), |
28 lockBinding("MISSING_INPUT_FILE_ERROR", globalenv()) | 28 MISSING_PARAMETER_ERROR = 1, |
29 | 29 BAD_PARAMETER_VALUE_ERROR = 2, |
30 assign("NO_ANY_RESULT_ERROR", 255) | 30 MISSING_INPUT_FILE_ERROR = 3, |
31 lockBinding("NO_ANY_RESULT_ERROR", globalenv()) | 31 NO_ANY_RESULT_ERROR = 255, |
32 | 32 DEFAULT_PRECURSOR_PATH = NULL, |
33 assign("DEFAULT_PRECURSOR_PATH", "peaklist_precursors.tsv") | 33 DEFAULT_FRAGMENTS_PATH = NULL, |
34 assign("DEFAULT_FRAGMENTS_PATH", "peaklist_fragments.tsv") | 34 DEFAULT_COMPOUNDS_PATH = NULL, |
35 assign("DEFAULT_COMPOUNDS_PATH", "compounds_pos.txt") | 35 DEFAULT_OUTPUT_PATH = "compound_fragments_result.txt", |
36 assign("DEFAULT_OUTPUT_PATH", "compound_fragments_result.txt") | 36 DEFAULT_TOLMZ = 0.01, |
37 assign("DEFAULT_TOLMZ", 0.01) | 37 DEFAULT_TOLRT = 20, |
38 assign("DEFAULT_TOLRT", 20) | 38 DEFAULT_MZDECIMAL = 3, |
39 assign("DEFAULT_MZDECIMAL", 0) | 39 DEFAULT_R_THRESHOLD = 0.85, |
40 assign("DEFAULT_R_THRESHOLD", 0.85) | 40 DEFAULT_MINNUMBERSCAN = 8, |
41 assign("DEFAULT_MINNUMBERSCAN", 8) | 41 DEFAULT_SEUIL_RA = 0.05, |
42 assign("DEFAULT_SEUIL_RA", 0.5) | 42 DEFAULT_FRAGMENTS_MATCH_DELTA = 10, |
43 lockBinding("DEFAULT_PRECURSOR_PATH", globalenv()) | 43 DEFAULT_FRAGMENTS_MATCH_DELTA_UNIT = "ppm", |
44 lockBinding("DEFAULT_FRAGMENTS_PATH", globalenv()) | 44 DEFAULT_PDF_PATH = "" |
45 lockBinding("DEFAULT_COMPOUNDS_PATH", globalenv()) | 45 ) |
46 lockBinding("DEFAULT_OUTPUT_PATH", globalenv()) | 46 env <- globalenv() |
47 lockBinding("DEFAULT_TOLMZ", globalenv()) | 47 for (default in names(defaults)) { |
48 lockBinding("DEFAULT_TOLRT", globalenv()) | 48 assign(default, defaults[[default]], envir = env) |
49 lockBinding("DEFAULT_MZDECIMAL", globalenv()) | 49 lockBinding(default, env) |
50 lockBinding("DEFAULT_R_THRESHOLD", globalenv()) | 50 } |
51 lockBinding("DEFAULT_MINNUMBERSCAN", globalenv()) | |
52 lockBinding("DEFAULT_SEUIL_RA", globalenv()) | |
53 | |
54 assign("DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD", 0.85) | |
55 assign("DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA", 0.1) | |
56 assign("DEFAULT_EXTRACT_FRAGMENTS_TOLMZ", 0.01) | |
57 assign("DEFAULT_EXTRACT_FRAGMENTS_TOLRT", 60) | |
58 lockBinding("DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD", globalenv()) | |
59 lockBinding("DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA", globalenv()) | |
60 lockBinding("DEFAULT_EXTRACT_FRAGMENTS_TOLMZ", globalenv()) | |
61 lockBinding("DEFAULT_EXTRACT_FRAGMENTS_TOLRT", globalenv()) | |
62 | |
63 | 51 |
64 ######################################################################## | 52 ######################################################################## |
53 | |
54 get_formulas <- function( | |
55 mzref, | |
56 spectra, | |
57 nominal_mz_list, | |
58 processing_parameters, | |
59 background = !TRUE | |
60 ) { | |
61 if (is.vector(mzref) && length(mzref) > 1) { | |
62 return(lapply( | |
63 mzref, | |
64 function(mz) { | |
65 return(get_formulas( | |
66 mzref = mz, | |
67 spectra = spectra, | |
68 nominal_mz_list = nominal_mz_list, | |
69 processing_parameters = processing_parameters, | |
70 background = background | |
71 )) | |
72 } | |
73 )) | |
74 } | |
75 input <- sprintf( | |
76 "%s-%s.ms", | |
77 gsub("[[:space:]]", "_", processing_parameters$c_name), | |
78 mzref | |
79 ) | |
80 create_ms_file(input, mzref, spectra, processing_parameters) | |
81 output <- sprintf( | |
82 "out/%s-%s.out", | |
83 gsub("[[:space:]]", "_", processing_parameters$c_name), | |
84 mzref | |
85 ) | |
86 command <- sprintf( | |
87 paste( | |
88 "sirius", | |
89 "--noCite", | |
90 "--noSummaries", | |
91 "--loglevel=WARNING", | |
92 "-i='%s'", | |
93 "-o='%s'", | |
94 "tree", | |
95 ## loglevel is not working taken into account during | |
96 ## sirius startup, so we filter outputs... | |
97 "2>&1 | grep '^(WARNING|SEVERE)'" | |
98 ), | |
99 input, | |
100 output | |
101 ) | |
102 verbose_catf( | |
103 ">> Sirius is running %swith the command: %s\n", | |
104 if (background) "in the background " else "", | |
105 command | |
106 ) | |
107 system( | |
108 command, | |
109 wait = !background, | |
110 ignore.stdout = background, | |
111 ignore.stderr = background | |
112 ) | |
113 return(extract_sirius_results(output, spectra$mz, processing_parameters)) | |
114 } | |
115 | |
116 create_ms_file <- function( | |
117 path, | |
118 mzref, | |
119 spectra, | |
120 processing_parameters | |
121 ) { | |
122 file_content <- paste( | |
123 sprintf(">compound %s", processing_parameters$c_name), | |
124 sprintf(">ionization %s", processing_parameters$ionization), | |
125 sprintf(">parentmass %s", mzref), | |
126 sprintf(">formula %s", processing_parameters$elemcomposition), | |
127 sep = "\n" | |
128 ) | |
129 displayed_file_content <- sprintf( | |
130 "%s\n>collision\n%s", | |
131 file_content, | |
132 paste( | |
133 sprintf( | |
134 "%s %s", | |
135 spectra[1:3, "mz"], | |
136 spectra[1:3, "intensities"] | |
137 ), | |
138 collapse = "\n" | |
139 ) | |
140 ) | |
141 if (nrow(spectra) > 3) { | |
142 displayed_file_content <- sprintf( | |
143 "%s\n[... %s more rows of mz and intensities ...]", | |
144 displayed_file_content, | |
145 nrow(spectra) - 3 | |
146 ) | |
147 } | |
148 catf( | |
149 ">> MS file created for %s with content:\n%s\n", | |
150 processing_parameters$c_name, | |
151 displayed_file_content | |
152 ) | |
153 file_content <- sprintf( | |
154 "%s\n\n>collision\n%s", | |
155 file_content, | |
156 paste( | |
157 sprintf("%s %s", spectra$mz, spectra$intensities), | |
158 collapse = "\n" | |
159 ) | |
160 ) | |
161 cat(file_content, file = path, append = FALSE) | |
162 } | |
163 | |
164 extract_sirius_results <- function( | |
165 output, | |
166 mz_list, | |
167 processing_parameters | |
168 ) { | |
169 | |
170 delta <- processing_parameters$fragment_match_delta | |
171 delta_unit <- tolower(processing_parameters$fragment_match_delta_unit) | |
172 | |
173 output <- list.dirs(output, recursive = FALSE)[[1]] | |
174 | |
175 spectra_out_dir <- sprintf("%s/spectra", output) | |
176 spectra_filename <- sprintf( | |
177 "%s/%s", | |
178 spectra_out_dir, | |
179 list.files(spectra_out_dir)[[1]] | |
180 ) | |
181 | |
182 trees_out_dir <- sprintf("%s/trees", output) | |
183 trees_filename <- sprintf( | |
184 "%s/%s", | |
185 trees_out_dir, | |
186 list.files(trees_out_dir)[[1]] | |
187 ) | |
188 | |
189 if (!is.null(spectra_filename)) { | |
190 sirius_results <- get_csv_or_tsv(spectra_filename) | |
191 } else { | |
192 return(rep(NA, length(mz_list))) | |
193 } | |
194 if (!is.null(trees_filename)) { | |
195 sirius_results <- cbind(sirius_results, extract_sirius_ppm(trees_filename)) | |
196 } else { | |
197 return(rep(NA, length(mz_list))) | |
198 } | |
199 | |
200 fragment_matchings <- data.frame( | |
201 formula = NA, | |
202 ppm = NA, | |
203 mz = mz_list, | |
204 error = NA | |
205 ) | |
206 | |
207 sirius_results <- filter_sirius_with_delta( | |
208 sirius_results = sirius_results, | |
209 original_mz = fragment_matchings$mz, | |
210 delta = delta, | |
211 delta_unit = delta_unit | |
212 ) | |
213 | |
214 for (index in seq_len(nrow(sirius_results))) { | |
215 result <- sirius_results[index, ] | |
216 filter <- which(order(abs(fragment_matchings$mz - result$mz)) == 1) | |
217 fragment_matchings[filter, "formula"] <- result$formula | |
218 fragment_matchings[filter, "ppm"] <- result$ppm | |
219 catf( | |
220 "[OK] Fragment with m/z=%s matches %s with a difference of %s ppm\n", | |
221 fragment_matchings[filter, "mz"], result$formula, result$ppm | |
222 ) | |
223 } | |
224 return(fragment_matchings) | |
225 } | |
226 | |
227 filter_sirius_with_delta <- function( | |
228 sirius_results, | |
229 original_mz, | |
230 delta, | |
231 delta_unit | |
232 ) { | |
233 if (is.numeric(delta) && !is.na(delta) && delta > 0) { | |
234 if (delta_unit == "ppm") { | |
235 filter <- abs(sirius_results$ppm) <= delta | |
236 fine <- which(filter) | |
237 not_fine <- which(!filter) | |
238 catf( | |
239 paste("[KO] fragment %s (m/z=%s) eleminated because ppm=%s is greater", | |
240 "than delta=%s\n" | |
241 ), | |
242 sirius_results[not_fine, ]$formula, | |
243 sirius_results[not_fine, ]$mz, | |
244 sirius_results[not_fine, ]$ppm, | |
245 delta | |
246 ) | |
247 sirius_results <- sirius_results[fine, ] | |
248 } else if (delta_unit == "mz") { | |
249 differences <- sapply( | |
250 sirius_results$mz, | |
251 function(mz) min(abs(original_mz - mz)) | |
252 ) | |
253 fine <- which(sapply( | |
254 sirius_results$mz, | |
255 function(mz) any(abs(original_mz - mz) <= delta) | |
256 )) | |
257 not_fine <- which(sapply( | |
258 sirius_results$mz, | |
259 function(mz) all(abs(original_mz - mz) > delta) | |
260 )) | |
261 catf( | |
262 paste( | |
263 "[KO] fragment %s eleminated because mz difference=%s is", | |
264 "greater than delta=%s\n" | |
265 ), | |
266 sirius_results[not_fine, ]$formula, | |
267 differences[not_fine], | |
268 delta | |
269 ) | |
270 sirius_results <- sirius_results[fine, ] | |
271 } | |
272 } | |
273 return(sirius_results) | |
274 } | |
275 | |
276 extract_sirius_ppm <- function(path) { | |
277 json <- file(path, "r") | |
278 suppressWarnings(json_lines <- readLines(json)) | |
279 close(json) | |
280 json_lines <- json_lines[ | |
281 grepl("\\s+\"(massDeviation|recalibratedMass)\" :", json_lines) | |
282 ] | |
283 json_lines <- gsub("^\\s+\"[^\"]+\" : \"?", "", json_lines) | |
284 ppms <- json_lines[seq(1, length(json_lines), 2)] | |
285 mz <- json_lines[seq(2, length(json_lines), 2)] | |
286 ppms <- as.numeric(gsub(" ppm .*", "", ppms)) | |
287 mz <- as.numeric(gsub(",$", "", mz)) | |
288 ordered <- order(mz) | |
289 return(list(ppm = ppms[ordered], recalibrated_mz = mz[ordered])) | |
290 } | |
65 | 291 |
66 #' @title plot_pseudo_spectra | 292 #' @title plot_pseudo_spectra |
67 #' @param x | 293 #' @param x |
68 #' @param r_threshold | |
69 #' @param fid | 294 #' @param fid |
70 #' @param sum_int | 295 #' @param sum_int |
71 #' @param vmz | 296 #' @param vmz |
72 #' @param cor_abs_int | 297 #' @param cor_abs_int |
73 #' @param refcol | 298 #' @param refcol |
79 #' x dataframe scan X fragments with scans number in the 1st column and | 304 #' x dataframe scan X fragments with scans number in the 1st column and |
80 #' ions in next with intensities | 305 #' ions in next with intensities |
81 #' fid file id when several a precursor has been detected in several files | 306 #' fid file id when several a precursor has been detected in several files |
82 plot_pseudo_spectra <- function( | 307 plot_pseudo_spectra <- function( |
83 x, | 308 x, |
84 r_threshold, | |
85 fid, | 309 fid, |
86 sum_int, | 310 sum_int, |
87 vmz, | 311 vmz, |
88 cor_abs_int, | 312 cor_abs_int, |
89 refcol, | 313 refcol, |
90 c_name, | 314 meaned_mz, |
91 inchikey, | 315 processing_parameters |
92 elemcomposition | |
93 ) { | 316 ) { |
94 ## du fait de la difference de nombre de colonne entre la dataframe qui | 317 ## du fait de la difference de nombre de colonne entre la dataframe qui |
95 ## inclue les scans en 1ere col, mzRef se decale de 1 | 318 ## inclue les scans en 1ere col, mzRef se decale de 1 |
96 refcol <- refcol - 1 | 319 refcol <- refcol - 1 |
97 ## compute relative intensities max=100% | 320 ## compute relative intensities max=100% |
98 rel_int <- sum_int[-1] | 321 rel_int <- sum_int[-1] |
99 rel_int <- rel_int / max(rel_int) | 322 rel_int <- rel_int / max(rel_int) |
100 | 323 |
101 ## define max value on vertical axis (need to increase in order to plot the | 324 if (processing_parameters$do_pdf) { |
102 ## label of fragments) | 325 ## define max value on vertical axis (need to increase in order to plot the |
103 ymax <- max(rel_int) + 0.2 * max(rel_int) | 326 ## label of fragments) |
104 | 327 ymax <- max(rel_int) + 0.2 * max(rel_int) |
105 par(mfrow = c(2, 1)) | 328 |
106 plot(vmz, rel_int, type = "h", ylim = c(0, ymax), main = c_name) | 329 par(mfrow = c(2, 1)) |
107 ## low correl coef. will be display in grey | 330 plot(vmz, rel_int, type = "h", ylim = c(0, ymax), |
108 cor_low <- which(round(cor_abs_int, 2) < r_threshold) | 331 main = processing_parameters$c_name |
109 | 332 ) |
110 lbmzcor <- sprintf("%s(r=%s)", vmz, round(cor_abs_int, 2)) | 333 ## low correl coef. will be display in grey |
111 | 334 cor_low <- which(round(cor_abs_int, 2) < processing_parameters$r_threshold) |
112 if (length(cor_low) > 0) { | 335 |
113 text( | 336 lbmzcor <- sprintf("%s(r=%s)", vmz, round(cor_abs_int, 2)) |
114 vmz[cor_low], | 337 |
115 rel_int[cor_low], | 338 if (length(cor_low) > 0) { |
116 lbmzcor[cor_low], | |
117 cex = 0.5, | |
118 col = "grey", | |
119 srt = 90, | |
120 adj = 0 | |
121 ) | |
122 if (length(vmz) - length(cor_low) > 1) { | |
123 text( | 339 text( |
124 vmz[-c(refcol, cor_low)], | 340 vmz[cor_low], |
125 rel_int[-c(refcol, cor_low)], | 341 rel_int[cor_low], |
126 lbmzcor[-c(refcol, cor_low)], | 342 lbmzcor[cor_low], |
127 cex = 0.6, | 343 cex = 0.5, |
128 col = 1, | 344 col = "grey", |
129 srt = 90, | 345 srt = 90, |
130 adj = 0 | 346 adj = 0 |
131 ) | 347 ) |
348 if (length(vmz) - length(cor_low) > 1) { | |
349 text( | |
350 vmz[-c(refcol, cor_low)], | |
351 rel_int[-c(refcol, cor_low)], | |
352 lbmzcor[-c(refcol, cor_low)], | |
353 cex = 0.6, | |
354 col = 1, | |
355 srt = 90, | |
356 adj = 0 | |
357 ) | |
358 } | |
359 } else { | |
360 if (length(vmz) > 1) { | |
361 text( | |
362 vmz[-c(refcol)], | |
363 rel_int[-c(refcol)], | |
364 lbmzcor[-c(refcol)], | |
365 cex = 0.6, | |
366 col = 1, | |
367 srt = 90, | |
368 adj = 0 | |
369 ) | |
370 } | |
371 } | |
372 | |
373 text( | |
374 vmz[refcol], | |
375 rel_int[refcol], | |
376 lbmzcor[refcol], | |
377 cex = 0.8, | |
378 col = 2, | |
379 srt = 90, | |
380 adj = 0 | |
381 ) | |
382 } | |
383 | |
384 ## prepare result file | |
385 cor_valid <- (round(cor_abs_int, 2) >= processing_parameters$r_threshold) | |
386 | |
387 do_sirius <- TRUE | |
388 verbose_catf("Checking sirius parameters...\n") | |
389 if (is.null(processing_parameters$ionization)) { | |
390 do_sirius <- FALSE | |
391 verbose_catf("[KO] No ionization passed in parameter.\n") | |
392 } else { | |
393 verbose_catf("[OK] Ionization=%s.\n", processing_parameters$ionization) | |
394 } | |
395 if (is.na(processing_parameters$elemcomposition)) { | |
396 do_sirius <- FALSE | |
397 verbose_catf("[KO] Elemental composition is NA.\n") | |
398 } else if (length(processing_parameters$elemcomposition) < 1) { | |
399 do_sirius <- FALSE | |
400 verbose_catf("[KO] No elemental composition is provided.\n") | |
401 } else if (processing_parameters$elemcomposition == "") { | |
402 do_sirius <- FALSE | |
403 verbose_catf("[KO] Elemental composition is an empty string.\n") | |
404 } else { | |
405 verbose_catf( | |
406 "[OK] Elemental composition=%s.\n", | |
407 processing_parameters$elemcomposition | |
408 ) | |
409 } | |
410 | |
411 cp_res_length <- length(vmz) | |
412 ppm <- rep(NA, cp_res_length) | |
413 formulas <- rep(NA, cp_res_length) | |
414 if (do_sirius) { | |
415 verbose_catf("Everything is ok, preparing for sirius.\n") | |
416 formulas <- get_formulas( | |
417 mzref = processing_parameters$mzref, | |
418 spectra = data.frame(mz = meaned_mz, intensities = sum_int[-1]), | |
419 nominal_mz_list = vmz, | |
420 processing_parameters = processing_parameters | |
421 ) | |
422 if (nrow(formulas) == 0) { | |
423 catf("No formula found.\n") | |
424 } else { | |
425 ppm <- formulas$ppm | |
426 formulas <- formulas$formula | |
427 catf( | |
428 "Found %s formula for %s fragments\n", | |
429 length(formulas[which(!(is.na(formulas)))]), | |
430 cp_res_length | |
431 ) | |
132 } | 432 } |
133 } else { | 433 } else { |
134 if (length(vmz) > 1) { | 434 verbose_catf("Sirius cannot be run.\n") |
135 text( | 435 } |
136 vmz[-c(refcol)], | |
137 rel_int[-c(refcol)], | |
138 lbmzcor[-c(refcol)], | |
139 cex = 0.6, | |
140 col = 1, | |
141 srt = 90, | |
142 adj = 0 | |
143 ) | |
144 } | |
145 } | |
146 | |
147 text( | |
148 vmz[refcol], | |
149 rel_int[refcol], | |
150 lbmzcor[refcol], | |
151 cex = 0.8, | |
152 col = 2, | |
153 srt = 90, | |
154 adj = 0 | |
155 ) | |
156 | |
157 ## prepare result file | |
158 corValid <- (round(cor_abs_int, 2) >= r_threshold) ##nolint object_name_linter | |
159 cp_res <- data.frame( | 436 cp_res <- data.frame( |
160 rep(c_name, length(vmz)), | 437 rep(processing_parameters$c_name, cp_res_length), |
161 rep(inchikey, length(vmz)), | 438 rep(processing_parameters$inchikey, cp_res_length), |
162 rep(elemcomposition, length(vmz)), | 439 rep(processing_parameters$elemcomposition, cp_res_length), |
163 rep(fid, length(vmz)), | 440 formulas, |
164 vmz, | 441 vmz, |
442 ppm, | |
443 rep(fid, cp_res_length), | |
165 cor_abs_int, | 444 cor_abs_int, |
166 sum_int[-1], | 445 sum_int[-1], |
167 rel_int, | 446 rel_int, |
168 corValid | 447 cor_valid |
169 ) | 448 ) |
170 | 449 |
171 colnames(cp_res) <- c( | 450 colnames(cp_res) <- c( |
172 "compoundName", | 451 "compoundName", |
173 "inchikey", | 452 "inchikey", |
174 "elemcomposition", | 453 "elemcomposition", |
454 "fragment", | |
455 "fragment_mz", | |
456 "ppm", | |
175 "fileid", | 457 "fileid", |
176 "fragments_mz", | |
177 "CorWithPrecursor", | 458 "CorWithPrecursor", |
178 "AbsoluteIntensity", | 459 "AbsoluteIntensity", |
179 "relativeIntensity", | 460 "relativeIntensity", |
180 "corValid" | 461 "corValid" |
181 ) | 462 ) |
182 return(cp_res) | 463 return(cp_res) |
183 | |
184 } | 464 } |
185 | 465 |
186 #' | 466 #' |
187 #' @title extract_fragments | 467 #' @title extract_fragments |
188 #' | 468 #' |
189 #' @param precursors the precursor list from mspurity | 469 #' @param precursors the precursor list from mspurity |
190 #' @param fragments the fragments list from ms purity | 470 #' @param fragments the fragments list from ms purity |
191 #' @param mzref | 471 # ' @param mzref |
192 #' @param rtref | 472 # ' @param rtref |
193 #' @param c_name | 473 # ' @param c_name |
194 #' @param r_threshold default = DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD | 474 # ' @param inchikey |
195 #' @param seuil_ra default = DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA | 475 # ' @param elemcomposition |
196 #' @param tolmz default = DEFAULT_EXTRACT_FRAGMENTS_TOLMZ | 476 #' @param processing_parameters |
197 #' @param tolrt default = DEFAULT_EXTRACT_FRAGMENTS_TOLRT | |
198 #' @returns | 477 #' @returns |
199 #' | 478 #' |
200 #' @description | 479 #' @description |
201 #' function for extraction of fragments corresponding to precursors | 480 #' function for extraction of fragments corresponding to precursors |
202 #' detected by MSPurity | 481 #' detected by MSPurity |
203 extract_fragments <- function( ## nolint cyclocomp_linter | 482 extract_fragments <- function( ## nolint cyclocomp_linter |
204 precursors, | 483 precursors, |
205 fragments, | 484 fragments, |
206 mzref, | 485 processing_parameters |
207 rtref, | |
208 c_name, | |
209 inchikey, | |
210 elemcomposition, | |
211 min_number_scan, | |
212 mzdecimal, | |
213 r_threshold = DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD, | |
214 seuil_ra = DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA, | |
215 tolmz = DEFAULT_EXTRACT_FRAGMENTS_TOLMZ, | |
216 tolrt = DEFAULT_EXTRACT_FRAGMENTS_TOLRT | |
217 ) { | 486 ) { |
218 ## filter precursor in the precursors file based on mz and rt in the | 487 ## filter precursor in the precursors file based on mz and rt in the |
219 ## compound list | 488 ## compound list |
220 cat("processing ", c_name, "\n") | 489 catf("processing %s\n", processing_parameters$c_name) |
490 verbose_catf("===\n") | |
491 param <- processing_parameters | |
221 selected_precursors <- which( | 492 selected_precursors <- which( |
222 (abs(precursors$precurMtchMZ - mzref) <= tolmz) | 493 (abs(precursors$precurMtchMZ - param$mzref) <= param$tolmz) |
223 & (abs(precursors$precurMtchRT - rtref) <= tolrt) | 494 & (abs(precursors$precurMtchRT - param$rtref) <= param$tolrt) |
495 ) | |
496 rm(param) | |
497 | |
498 verbose_catf( | |
499 "> %s precursors selected with mz=%s±%s and rt=%s±%s\n", | |
500 length(selected_precursors), | |
501 processing_parameters$mzref, | |
502 processing_parameters$tolmz, | |
503 processing_parameters$rtref, | |
504 processing_parameters$tolrt | |
224 ) | 505 ) |
225 | 506 |
226 ## check if there is the precursor in the file | 507 ## check if there is the precursor in the file |
227 if (length(selected_precursors) > 0) { | 508 |
228 | 509 if (length(selected_precursors) < 1) { |
229 sprecini <- precursors[selected_precursors, ] | 510 cat("> non detected in precursor file\n") |
230 | 511 show_end_processing() |
231 ## check if fragments corresponding to precursor are found in several | 512 return(NULL) |
232 ## files (collision energy) | 513 } |
233 ## this lead to a processing for each fileid | 514 |
234 mf <- levels(as.factor(sprecini$fileid)) | 515 precursors <- precursors[selected_precursors, ] |
235 if (length(mf) > 1 && global_verbose) { | 516 |
236 cat(" several files detected for this compounds :\n") | 517 ## check if fragments corresponding to precursor are found in several |
237 } | 518 ## files (collision energy) |
238 | 519 ## this lead to a processing for each fileid |
239 for (f in seq_along(mf)) { | 520 file_ids <- as.character(sort(unique(precursors$fileid))) |
240 | 521 if (length(file_ids) > 1) { |
241 sprec <- sprecini[sprecini$fileid == mf[f], ] | 522 catf("> several files detected for this compounds :\n") |
242 | 523 } else if (length(file_ids) < 1 || nrow(precursors) < 1) { |
243 ## selection of fragment in the fragments file with the grpid common in | 524 return(data.frame()) |
244 ## both fragments and precursors | 525 } |
245 selfrgt <- levels(as.factor(sprec$grpid)) | 526 |
246 sfrgt <- fragments[ | 527 res_comp <- data.frame() |
247 fragments$grpid %in% selfrgt | 528 for (curent_file_id in file_ids) { |
248 & fragments$fileid == mf[f], | 529 curent_precursors <- precursors[precursors$fileid == curent_file_id, ] |
249 ] | 530 selected_fragments <- fragments[ |
250 | 531 fragments$grpid %in% as.character(curent_precursors$grpid) |
251 ## filter fragments on relative intensity seuil_ra = user defined | 532 & fragments$fileid == curent_file_id, |
252 ## parameter (MSpurity flags could be used here) | 533 ] |
253 sfrgtfil <- sfrgt[sfrgt$ra > seuil_ra, ] | 534 filtered_fragments <- selected_fragments[ |
254 | 535 selected_fragments$ra > processing_parameters$seuil_ra, |
255 mznominal <- round(x = sfrgtfil$mz, mzdecimal) | 536 ] |
256 sfrgtfil <- data.frame(sfrgtfil, mznominal) | 537 if (nrow(filtered_fragments) != 0) { |
257 | 538 res_comp_by_file <- process_file( |
258 ## creation of cross table row=scan col=mz X=ra | 539 curent_file_id = curent_file_id, |
259 vmz <- levels(as.factor(sfrgtfil$mznominal)) | 540 precursor_mz = curent_precursors$mz, |
260 | 541 filtered_fragments = filtered_fragments, |
261 if (global_verbose) { | 542 processing_parameters = processing_parameters |
262 cat(" fragments :", vmz) | |
263 cat("\n") | |
264 } | |
265 | |
266 ## mz of precursor in data precursor to check correlation with | |
267 mz_prec <- paste0("mz", round(mean(sprec$mz), mzdecimal)) | |
268 | |
269 for (m in seq_along(vmz)) { | |
270 | |
271 ## absolute intensity | |
272 cln <- c( | |
273 which(colnames(sfrgtfil) == "acquisitionNum"), | |
274 which(colnames(sfrgtfil) == "i") | |
275 ) | |
276 int_mz <- sfrgtfil[sfrgtfil$mznominal == vmz[m], cln] | |
277 colnames(int_mz)[2] <- paste0("mz", vmz[m]) | |
278 | |
279 ## average intensities of mass in duplicate scans | |
280 comp_scans <- aggregate(x = int_mz, by = list(int_mz[[1]]), FUN = mean) | |
281 int_mz <- comp_scans[, -1] | |
282 | |
283 if (m == 1) { | |
284 ds_abs_int <- int_mz | |
285 } else { | |
286 ds_abs_int <- merge( | |
287 x = ds_abs_int, | |
288 y = int_mz, | |
289 by.x = 1, | |
290 by.y = 1, | |
291 all.x = TRUE, | |
292 all.y = TRUE | |
293 ) | |
294 } | |
295 } | |
296 if (global_debug) { | |
297 print(ds_abs_int) | |
298 write.table( | |
299 x = ds_abs_int, | |
300 file = paste0(c_name, "ds_abs_int.txt"), | |
301 row.names = FALSE, | |
302 sep = "\t" | |
303 ) | |
304 } | |
305 | |
306 ## elimination of mz with less than min_number_scan scans (user defined | |
307 ## parameter) | |
308 xmz <- rep(NA, ncol(ds_abs_int) - 1) | |
309 sum_int <- rep(NA, ncol(ds_abs_int)) | |
310 nbxmz <- 0 | |
311 nb_scan_check <- min(nrow(ds_abs_int), min_number_scan) | |
312 | |
313 for (j in 2:ncol(ds_abs_int)) { | |
314 sum_int[j] <- sum(ds_abs_int[j], na.rm = TRUE) | |
315 if (sum(!is.na(ds_abs_int[[j]])) < nb_scan_check) { | |
316 nbxmz <- nbxmz + 1 | |
317 xmz[nbxmz] <- j | |
318 } | |
319 } | |
320 | |
321 xmz <- xmz[-which(is.na(xmz))] | |
322 if (length(xmz) > 0) { | |
323 ds_abs_int <- ds_abs_int[, -c(xmz)] | |
324 sum_int <- sum_int[-c(xmz)] | |
325 ## liste des mz keeped decale de 1 avec ds_abs_int | |
326 vmz <- as.numeric(vmz[-c(xmz - 1)]) | |
327 } | |
328 | |
329 ## reference ion for correlation computing = precursor OR maximum | |
330 ## intensity ion in precursor is not present | |
331 refcol <- which(colnames(ds_abs_int) == mz_prec) | |
332 if (length(refcol) == 0) { | |
333 refcol <- which(sum_int == max(sum_int, na.rm = TRUE)) | |
334 } | |
335 pdf( | |
336 file = sprintf("%s_processing_file%s.pdf", c_name, mf[f]), | |
337 width = 8, | |
338 height = 11 | |
339 ) | 543 ) |
340 par(mfrow = c(3, 2)) | |
341 | |
342 ## Pearson correlations between absolute intensities computing | |
343 cor_abs_int <- rep(NA, length(vmz)) | |
344 | |
345 if (length(refcol) > 0) { | |
346 for (i in 2:length(ds_abs_int)) { | |
347 cor_abs_int[i - 1] <- cor( | |
348 x = ds_abs_int[[refcol]], | |
349 y = ds_abs_int[[i]], | |
350 use = "pairwise.complete.obs", | |
351 method = "pearson" | |
352 ) | |
353 plot( | |
354 ds_abs_int[[refcol]], | |
355 ds_abs_int[[i]], | |
356 xlab = colnames(ds_abs_int)[refcol], | |
357 ylab = colnames(ds_abs_int)[i], | |
358 main = sprintf( | |
359 "%s corr coeff r=%s", c_name, round(cor_abs_int[i - 1], 2) | |
360 ) | |
361 ) | |
362 } | |
363 ## plot pseudo spectra | |
364 res_comp_by_file <- plot_pseudo_spectra( | |
365 x = ds_abs_int, | |
366 r_threshold = r_threshold, | |
367 fid = mf[f], | |
368 sum_int = sum_int, | |
369 vmz = vmz, | |
370 cor_abs_int = cor_abs_int, | |
371 refcol = refcol, | |
372 c_name = c_name, | |
373 inchikey = inchikey, | |
374 elemcomposition = elemcomposition | |
375 ) | |
376 if (f == 1) { | |
377 res_comp <- res_comp_by_file | |
378 } | |
379 } else { | |
380 res_comp_by_file <- NULL | |
381 cat(" non detected in fragments file \n") | |
382 } | |
383 if (!is.null(res_comp_by_file)) { | 544 if (!is.null(res_comp_by_file)) { |
384 res_comp <- rbind(res_comp, res_comp_by_file) | 545 res_comp <- rbind(res_comp, res_comp_by_file) |
385 } | 546 } |
386 dev.off() | 547 } else { |
548 catf("No fragment found for in fragment file\n") | |
387 } | 549 } |
550 } | |
551 return(unique(res_comp)) | |
552 } | |
553 | |
554 process_file <- function( | |
555 curent_file_id, | |
556 precursor_mz, | |
557 filtered_fragments, | |
558 processing_parameters | |
559 ) { | |
560 mznominal <- round(x = filtered_fragments$mz, digits = 0) | |
561 meaned_mz <- round( | |
562 aggregate( | |
563 data.frame( | |
564 mz = filtered_fragments$mz, | |
565 mznominal = mznominal | |
566 ), | |
567 list(mznominal), | |
568 FUN = mean | |
569 )$mz, | |
570 digits = processing_parameters$mzdecimal | |
571 ) | |
572 filtered_fragments <- data.frame(filtered_fragments, mznominal) | |
573 | |
574 ## creation of cross table row=scan col=mz X=ra | |
575 | |
576 vmz <- as.character(sort(unique(filtered_fragments$mznominal))) | |
577 | |
578 ds_abs_int <- create_ds_abs_int(vmz, filtered_fragments) | |
579 | |
580 if (global_debug) { | |
581 print(ds_abs_int) | |
582 } | |
583 | |
584 ## elimination of mz with less than min_number_scan scans (user defined | |
585 ## parameter) | |
586 xmz <- rep(NA, ncol(ds_abs_int) - 1) | |
587 sum_int <- rep(NA, ncol(ds_abs_int)) | |
588 nbxmz <- 0 | |
589 nb_scan_check <- min(nrow(ds_abs_int), processing_parameters$min_number_scan) | |
590 | |
591 for (j in 2:ncol(ds_abs_int)) { | |
592 sum_int[j] <- sum(ds_abs_int[j], na.rm = TRUE) | |
593 if (sum(!is.na(ds_abs_int[[j]])) < nb_scan_check) { | |
594 nbxmz <- nbxmz + 1 | |
595 xmz[nbxmz] <- j | |
596 } | |
597 } | |
598 | |
599 xmz <- xmz[-which(is.na(xmz))] | |
600 if (length(xmz) > 0) { | |
601 ds_abs_int <- ds_abs_int[, -c(xmz)] | |
602 sum_int <- sum_int[-c(xmz)] | |
603 ## liste des mz keeped decale de 1 avec ds_abs_int | |
604 vmz <- as.numeric(vmz[-c(xmz - 1)]) | |
605 meaned_mz <- meaned_mz[-c(xmz - 1)] | |
606 } | |
607 | |
608 ## mz of precursor in data precursor to check correlation with | |
609 mz_prec <- paste0( | |
610 "mz", | |
611 round(mean(precursor_mz), processing_parameters$mzdecimal) | |
612 ) | |
613 ## reference ion for correlation computing = precursor OR maximum | |
614 ## intensity ion in precursor is not present | |
615 refcol <- which(colnames(ds_abs_int) == mz_prec) | |
616 if (length(refcol) == 0) { | |
617 refcol <- which(sum_int == max(sum_int, na.rm = TRUE)) | |
618 } | |
619 | |
620 if (processing_parameters$do_pdf) { | |
621 start_pdf(processing_parameters, curent_file_id) | |
622 } | |
623 | |
624 ## Pearson correlations between absolute intensities computing | |
625 cor_abs_int <- rep(NA, length(vmz)) | |
626 | |
627 if (length(refcol) > 0) { | |
628 for (i in 2:length(ds_abs_int)) { | |
629 cor_abs_int[i - 1] <- stats::cor( | |
630 x = ds_abs_int[[refcol]], | |
631 y = ds_abs_int[[i]], | |
632 use = "pairwise.complete.obs", | |
633 method = "pearson" | |
634 ) | |
635 debug_catf( | |
636 "Correlation between %s and %s: %s\n", | |
637 paste(ds_abs_int[[refcol]], collapse = ";"), | |
638 paste(ds_abs_int[[i]], collapse = ";"), | |
639 paste(cor_abs_int[i - 1], collapse = ";") | |
640 ) | |
641 if (processing_parameters$do_pdf) { | |
642 pdf_plot_ds_abs_int( | |
643 processing_parameters$c_name, | |
644 ds_abs_int, | |
645 refcol, | |
646 i, | |
647 round(cor_abs_int[i - 1], 2) | |
648 ) | |
649 } | |
650 } | |
651 ## plot pseudo spectra | |
652 res_comp_by_file <- plot_pseudo_spectra( | |
653 x = ds_abs_int, | |
654 fid = curent_file_id, | |
655 sum_int = sum_int, | |
656 vmz = vmz, | |
657 cor_abs_int = cor_abs_int, | |
658 refcol = refcol, | |
659 meaned_mz = meaned_mz, | |
660 processing_parameters = processing_parameters | |
661 ) | |
662 catf( | |
663 "%s has been processed and %s fragments have been found.\n", | |
664 processing_parameters$c_name, | |
665 nrow(res_comp_by_file) | |
666 ) | |
388 } else { | 667 } else { |
389 res_comp <- NULL | 668 res_comp_by_file <- NULL |
390 cat(" non detected in precursor file \n") | 669 cat(">> non detected in fragments file \n") |
391 } | 670 } |
392 return(res_comp) | 671 show_end_processing() |
672 if (processing_parameters$do_pdf) { | |
673 end_pdf() | |
674 } | |
675 return(res_comp_by_file) | |
676 } | |
677 | |
678 create_ds_abs_int <- function(vmz, filtered_fragments) { | |
679 verbose_catf( | |
680 ">> fragments: %s\n", | |
681 paste(vmz, collapse = " ") | |
682 ) | |
683 ds_abs_int <- create_int_mz(vmz[1], filtered_fragments) | |
684 for (mz in vmz[-1]) { | |
685 int_mz <- create_int_mz(mz, filtered_fragments) | |
686 ds_abs_int <- merge( | |
687 x = ds_abs_int, | |
688 y = int_mz, | |
689 by.x = 1, | |
690 by.y = 1, | |
691 all.x = TRUE, | |
692 all.y = TRUE | |
693 ) | |
694 } | |
695 return(ds_abs_int) | |
696 } | |
697 | |
698 create_int_mz <- function(mz, filtered_fragments) { | |
699 ## absolute intensity | |
700 int_mz <- filtered_fragments[ | |
701 filtered_fragments$mznominal == mz, | |
702 c("acquisitionNum", "i") | |
703 ] | |
704 colnames(int_mz)[2] <- paste0("mz", mz) | |
705 ## average intensities of mass in duplicate scans | |
706 comp_scans <- aggregate(x = int_mz, by = list(int_mz[[1]]), FUN = mean) | |
707 return(comp_scans[, -1]) | |
708 } | |
709 | |
710 show_end_processing <- function() { | |
711 verbose_catf("==========\n") | |
712 cat("\n") | |
713 } | |
714 | |
715 start_pdf <- function(processing_parameters, curent_file_id) { | |
716 if (!dir.exists(processing_parameters$pdf_path)) { | |
717 dir.create(processing_parameters$pdf_path, recursive = TRUE) | |
718 } | |
719 pdf( | |
720 file = sprintf( | |
721 "%s/%s_processing_file%s.pdf", | |
722 processing_parameters$pdf_path, | |
723 processing_parameters$c_name, | |
724 curent_file_id | |
725 ), | |
726 width = 8, | |
727 height = 11 | |
728 ) | |
729 par(mfrow = c(3, 2)) | |
730 } | |
731 | |
732 pdf_plot_ds_abs_int <- function(c_name, ds_abs_int, refcol, i, r_coef) { | |
733 plot( | |
734 ds_abs_int[[refcol]], | |
735 ds_abs_int[[i]], | |
736 xlab = colnames(ds_abs_int)[refcol], | |
737 ylab = colnames(ds_abs_int)[i], | |
738 main = sprintf( | |
739 "%s corr coeff r=%s", c_name, r_coef | |
740 ) | |
741 ) | |
742 } | |
743 end_pdf <- function() { | |
744 dev.off() | |
393 } | 745 } |
394 | 746 |
395 set_global <- function(var, value) { | 747 set_global <- function(var, value) { |
396 assign(var, value, envir = globalenv()) | 748 assign(var, value, envir = globalenv()) |
397 } | 749 } |
408 set_global("global_verbose", TRUE) | 760 set_global("global_verbose", TRUE) |
409 } | 761 } |
410 | 762 |
411 unset_verbose <- function() { | 763 unset_verbose <- function() { |
412 set_global("global_verbose", FALSE) | 764 set_global("global_verbose", FALSE) |
765 } | |
766 | |
767 verbose_catf <- function(...) { | |
768 if (global_verbose) { | |
769 cat(sprintf(...), sep = "") | |
770 } | |
771 } | |
772 | |
773 | |
774 debug_catf <- function(...) { | |
775 if (global_debug) { | |
776 cat(sprintf(...), sep = "") | |
777 } | |
778 } | |
779 | |
780 catf <- function(...) { | |
781 cat(sprintf(...), sep = "") | |
413 } | 782 } |
414 | 783 |
415 create_parser <- function() { | 784 create_parser <- function() { |
416 parser <- optparse::OptionParser() | 785 parser <- optparse::OptionParser() |
417 parser <- optparse::add_option( | 786 parser <- optparse::add_option( |
545 "Fragments are kept if there are found in a minimum number", | 914 "Fragments are kept if there are found in a minimum number", |
546 "of min_number_scan scans" | 915 "of min_number_scan scans" |
547 ), | 916 ), |
548 metavar = "number" | 917 metavar = "number" |
549 ) | 918 ) |
919 parser <- optparse::add_option( | |
920 parser, | |
921 c("--pdf_path"), | |
922 type = "character", | |
923 default = DEFAULT_PDF_PATH, | |
924 help = paste( | |
925 "[default %default]", | |
926 "PDF files output path" | |
927 ) | |
928 ) | |
929 parser <- optparse::add_option( | |
930 parser, | |
931 c("--ionization"), | |
932 type = "character", | |
933 action = "store", | |
934 default = "None", | |
935 help = paste( | |
936 "[default %default]", | |
937 "Which ionization to use for sirius" | |
938 ), | |
939 metavar = "character" | |
940 ) | |
941 parser <- optparse::add_option( | |
942 parser, | |
943 c("--fragment_match_delta"), | |
944 type = "numeric", | |
945 action = "store", | |
946 default = DEFAULT_FRAGMENTS_MATCH_DELTA, | |
947 help = paste( | |
948 "[default %default]", | |
949 "Fragment match delta" | |
950 ), | |
951 metavar = "numeric" | |
952 ) | |
953 parser <- optparse::add_option( | |
954 parser, | |
955 c("--fragment_match_delta_unit"), | |
956 type = "character", | |
957 action = "store", | |
958 default = DEFAULT_FRAGMENTS_MATCH_DELTA_UNIT, | |
959 help = paste( | |
960 "[default %default]", | |
961 "Fragment match delta" | |
962 ), | |
963 metavar = "character" | |
964 ) | |
550 return(parser) | 965 return(parser) |
551 } | 966 } |
552 | 967 |
553 stop_with_status <- function(msg, status) { | 968 stop_with_status <- function(msg, status) { |
554 sink(stderr()) | 969 sink(stderr()) |
557 sink(NULL) | 972 sink(NULL) |
558 base::quit(status = status) | 973 base::quit(status = status) |
559 } | 974 } |
560 | 975 |
561 check_args_validity <- function(args) { ## nolint cyclocomp_linter | 976 check_args_validity <- function(args) { ## nolint cyclocomp_linter |
562 sysvars <- Sys.getenv() | |
563 sysvarnames <- names(sysvars) | |
564 if (length(args$output) == 0 || nchar(args$output[1]) == 0) { | 977 if (length(args$output) == 0 || nchar(args$output[1]) == 0) { |
565 stop_with_status( | 978 stop_with_status( |
566 "Missing output parameters. Please set it with --output.", | 979 "Missing output parameters. Please set it with --output.", |
567 MISSING_PARAMETER_ERROR | 980 MISSING_PARAMETER_ERROR |
568 ) | 981 ) |
610 args$compounds | 1023 args$compounds |
611 ), | 1024 ), |
612 MISSING_INPUT_FILE_ERROR | 1025 MISSING_INPUT_FILE_ERROR |
613 ) | 1026 ) |
614 } | 1027 } |
615 if ( | 1028 if (in_galaxy_env()) { |
1029 check_galaxy_args_validity(args) | |
1030 } | |
1031 } | |
1032 | |
1033 in_galaxy_env <- function() { | |
1034 sysvars <- Sys.getenv() | |
1035 sysvarnames <- names(sysvars) | |
1036 return( | |
616 "_GALAXY_JOB_HOME_DIR" %in% sysvarnames | 1037 "_GALAXY_JOB_HOME_DIR" %in% sysvarnames |
617 || "_GALAXY_JOB_TMP_DIR" %in% sysvarnames | 1038 || "_GALAXY_JOB_TMP_DIR" %in% sysvarnames |
618 || "GALAXY_MEMORY_MB" %in% sysvarnames | 1039 || "GALAXY_MEMORY_MB" %in% sysvarnames |
619 || "GALAXY_MEMORY_MB_PER_SLOT" %in% sysvarnames | 1040 || "GALAXY_MEMORY_MB_PER_SLOT" %in% sysvarnames |
620 || "GALAXY_SLOTS" %in% sysvarnames | 1041 || "GALAXY_SLOTS" %in% sysvarnames |
621 ) { | 1042 ) |
622 check_galaxy_args_validity(args) | |
623 } | |
624 } | 1043 } |
625 | 1044 |
626 check_galaxy_args_validity <- function(args) { | 1045 check_galaxy_args_validity <- function(args) { |
627 if (!file.exists(args$output)) { | 1046 if (!file.exists(args$output)) { |
628 stop_with_status( | 1047 stop_with_status( |
636 } | 1055 } |
637 | 1056 |
638 get_csv_or_tsv <- function( | 1057 get_csv_or_tsv <- function( |
639 path, | 1058 path, |
640 sep_stack = c("\t", ",", ";"), | 1059 sep_stack = c("\t", ",", ";"), |
1060 sep_names = c("tab", "comma", "semicolon"), | |
641 header = TRUE, | 1061 header = TRUE, |
642 quote = "\"" | 1062 quote = "\"" |
643 ) { | 1063 ) { |
644 sep <- sep_stack[1] | 1064 sep <- determine_csv_or_tsv_sep( |
645 result <- tryCatch({ | 1065 path = path, |
646 read.table( | 1066 sep_stack = sep_stack, |
647 file = path, | 1067 header = header, |
648 sep = sep, | 1068 quote = quote |
649 header = header, | 1069 ) |
650 quote = quote | 1070 verbose_catf( |
651 ) | 1071 "%s separator has been determined for %s.\n", |
652 }, error = function(e) { | 1072 sep_names[sep_stack == sep], |
653 return(data.frame()) | 1073 path |
654 }) | 1074 ) |
655 if (length(sep_stack) == 1) { | 1075 return(read.table( |
656 return(result) | 1076 file = path, |
657 } | 1077 sep = sep, |
658 # if ( | 1078 header = header, |
659 # ncol(result) == 0 || ## failed | 1079 quote = quote |
660 # ncol(result) == 1 ## only one row, suspicious, possible fail # nolint | 1080 )) |
661 # ) { | 1081 } |
662 new_result <- get_csv_or_tsv( | 1082 |
663 path, | 1083 determine_csv_or_tsv_sep <- function( |
664 sep_stack = sep_stack[-1], | 1084 path, |
665 header = header, | 1085 sep_stack = c("\t", ",", ";"), |
666 quote = quote | 1086 header = TRUE, |
667 ) | 1087 quote = "\"" |
668 if (ncol(new_result) > ncol(result)) { | 1088 ) { |
669 return(new_result) | 1089 count <- -1 |
670 } | 1090 best_sep <- sep_stack[1] |
671 # } | 1091 for (sep in sep_stack) { |
672 return(result) | 1092 tryCatch({ |
1093 table <- read.table( | |
1094 file = path, | |
1095 sep = sep, | |
1096 header = header, | |
1097 quote = quote, | |
1098 nrows = 1 | |
1099 ) | |
1100 if (ncol(table) > count) { | |
1101 count <- ncol(table) | |
1102 best_sep <- sep | |
1103 } | |
1104 }) | |
1105 } | |
1106 return(best_sep) | |
673 } | 1107 } |
674 | 1108 |
675 uniformize_columns <- function(df) { | 1109 uniformize_columns <- function(df) { |
676 cols <- colnames(df) | 1110 cols <- colnames(df) |
677 for (func in c(tolower)) { | 1111 for (func in c(tolower)) { |
679 } | 1113 } |
680 colnames(df) <- cols | 1114 colnames(df) <- cols |
681 return(df) | 1115 return(df) |
682 } | 1116 } |
683 | 1117 |
1118 handle_galaxy_param <- function(args) { | |
1119 for (param in names(args)) { | |
1120 if (is.character(args[[param]])) { | |
1121 args[[param]] <- gsub("__ob__", "[", args[[param]]) | |
1122 args[[param]] <- gsub("__cb__", "]", args[[param]]) | |
1123 } | |
1124 } | |
1125 return(args) | |
1126 } | |
1127 | |
1128 zip_pdfs <- function(processing_parameters) { | |
1129 if (processing_parameters$do_pdf) { | |
1130 if (zip <- Sys.getenv("R_ZIPCMD", "zip") == "") { | |
1131 catf("R could not fin the zip executable. Trying luck: zip = \"zip\"") | |
1132 zip <- "zip" | |
1133 } else { | |
1134 catf("Found zip executable at %s .", zip) | |
1135 } | |
1136 utils::zip( | |
1137 processing_parameters$pdf_zip_path, | |
1138 processing_parameters$pdf_path, | |
1139 zip = zip | |
1140 ) | |
1141 } | |
1142 } | |
1143 | |
684 main <- function(args) { | 1144 main <- function(args) { |
685 if (args$version) { | 1145 if (args$version) { |
686 cat(sprintf("%s\n", MS2SNOOP_VERSION)) | 1146 catf("%s\n", MS2SNOOP_VERSION) |
687 base::quit(status = 0) | 1147 base::quit(status = 0) |
688 } | 1148 } |
689 sessionInfo() | 1149 if (in_galaxy_env()) { |
1150 print(sessionInfo()) | |
1151 cat("\n\n") | |
1152 } | |
690 check_args_validity(args) | 1153 check_args_validity(args) |
1154 args <- handle_galaxy_param(args) | |
1155 if (args$ionization == "None") { | |
1156 args$ionization <- NULL | |
1157 } | |
691 if (args$debug) { | 1158 if (args$debug) { |
692 set_debug() | 1159 set_debug() |
693 } | 1160 } |
694 if (args$verbose) { | 1161 if (args$verbose) { |
695 set_verbose() | 1162 set_verbose() |
696 } | 1163 } |
697 ## MSpurity precursors file | |
698 precursors <- get_csv_or_tsv(args$precursors) | 1164 precursors <- get_csv_or_tsv(args$precursors) |
699 ## MSpurity fragments file | |
700 fragments <- get_csv_or_tsv(args$fragments) | 1165 fragments <- get_csv_or_tsv(args$fragments) |
701 ## list of compounds : col1=Name of molecule, col2=m/z, col3=retention time | |
702 compounds <- get_csv_or_tsv(args$compounds) | 1166 compounds <- get_csv_or_tsv(args$compounds) |
703 | 1167 |
704 compounds <- uniformize_columns(compounds) | 1168 compounds <- uniformize_columns(compounds) |
705 mandatory_columns <- c( | 1169 mandatory_columns <- c( |
706 "compound_name", | 1170 "compound_name", |
717 ), | 1181 ), |
718 BAD_PARAMETER_VALUE_ERROR | 1182 BAD_PARAMETER_VALUE_ERROR |
719 ) | 1183 ) |
720 } | 1184 } |
721 | 1185 |
722 res_all <- NULL | 1186 res_all <- data.frame() |
1187 processing_parameters <- list( | |
1188 min_number_scan = args$min_number_scan, | |
1189 mzdecimal = args$mzdecimal, | |
1190 r_threshold = args$r_threshold, | |
1191 seuil_ra = args$seuil_ra, | |
1192 tolmz = args$tolmz, | |
1193 tolrt = args$tolrt, | |
1194 ionization = args$ionization, | |
1195 do_pdf = nchar(args$pdf_path) > 0, | |
1196 pdf_zip_path = args$pdf_path, | |
1197 pdf_path = tempdir(), | |
1198 fragment_match_delta = args$fragment_match_delta, | |
1199 fragment_match_delta_unit = args$fragment_match_delta_unit | |
1200 ) | |
723 for (i in seq_len(nrow(compounds))) { | 1201 for (i in seq_len(nrow(compounds))) { |
724 ## loop execution for all compounds in the compounds file | 1202 processing_parameters$mzref <- compounds[["mz"]][i] |
725 res_cor <- NULL | 1203 processing_parameters$rtref <- compounds[["rtsec"]][i] |
1204 processing_parameters$c_name <- compounds[["compound_name"]][i] | |
1205 processing_parameters$inchikey <- compounds[["inchikey"]][i] | |
1206 processing_parameters$elemcomposition <- compounds[["elemcomposition"]][i] | |
726 res_cor <- extract_fragments( | 1207 res_cor <- extract_fragments( |
727 precursors = precursors, | 1208 precursors = precursors, |
728 fragments = fragments, | 1209 fragments = fragments, |
729 mzref = compounds[["mz"]][i], | 1210 processing_parameters = processing_parameters |
730 rtref = compounds[["rtsec"]][i], | |
731 c_name = compounds[["compound_name"]][i], | |
732 inchikey = compounds[["inchikey"]][i], | |
733 elemcomposition = compounds[["elemcomposition"]][i], | |
734 min_number_scan = args$min_number_scan, | |
735 mzdecimal = args$mzdecimal, | |
736 r_threshold = args$r_threshold, | |
737 seuil_ra = args$seuil_ra, | |
738 tolmz = args$tolmz, | |
739 tolrt = args$tolrt | |
740 ) | 1211 ) |
741 if (!is.null(res_cor)) { | 1212 if (!is.null(res_cor)) { |
742 if (is.null(res_all)) { | 1213 res_all <- rbind(res_all, res_cor) |
743 res_all <- res_cor | |
744 } else { | |
745 res_all <- rbind(res_all, res_cor) | |
746 } | |
747 } | 1214 } |
748 } | 1215 } |
749 | 1216 |
750 if (is.null(res_all)) { | 1217 if (nrow(res_all) == 0) { |
751 stop_with_status("No result at all!", NO_ANY_RESULT_ERROR) | 1218 stop_with_status("No result at all!", NO_ANY_RESULT_ERROR) |
752 } | 1219 } |
1220 | |
753 write.table( | 1221 write.table( |
754 x = res_all, | 1222 x = res_all, |
755 file = args$output, | 1223 file = args$output, |
756 sep = "\t", | 1224 sep = "\t", |
757 row.names = FALSE | 1225 row.names = FALSE |
758 ) | 1226 ) |
1227 zip_pdfs(processing_parameters) | |
1228 unlink(processing_parameters$pdf_path, recursive = TRUE) | |
759 } | 1229 } |
760 | 1230 |
761 global_debug <- FALSE | 1231 global_debug <- FALSE |
762 global_verbose <- FALSE | 1232 global_verbose <- FALSE |
763 args <- optparse::parse_args(create_parser()) | 1233 args <- optparse::parse_args(create_parser()) |