Mercurial > repos > workflow4metabolomics > ms2snoop
comparison MS2snoop.R @ 0:91a3242fd67f draft
"planemo upload commit c7676a9c7ac542043691d735285ae19e430bf032"
author | workflow4metabolomics |
---|---|
date | Mon, 25 Apr 2022 08:23:54 +0000 |
parents | |
children | df2672c37732 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:91a3242fd67f |
---|---|
1 #' | |
2 #' read and process mspurity W4M files | |
3 #' create a summary of fragment for each precursor and a graphics of peseudo | |
4 #' spectra + correlation on which checking of fragment is based on | |
5 #' V3 try to identify and process multiple files for 1 precursor which may | |
6 #' occur if different collision energy are used | |
7 #' V4 elimination of correlation = NA. Correlation is done with precursor, if | |
8 #' precursor is not present correlation with most intense peak | |
9 #' author: Jean-Francois Martin | |
10 #' V5 is versionned, lintR-compliant, packaged, unit-tested, documented and | |
11 #' tested against data from other labs. | |
12 #' new maintainer: Lain Pavot - lain.pavot@inrae.fr | |
13 #' | |
14 #' @import optparse | |
15 #' | |
16 NULL | |
17 | |
18 | |
19 assign("DEFAULT_PRECURSOR_PATH", "peaklist_precursors.tsv") | |
20 assign("DEFAULT_FRAGMENTS_PATH", "peaklist_fragments.tsv") | |
21 assign("DEFAULT_COMPOUNDS_PATH", "compounds_pos.txt") | |
22 assign("DEFAULT_OUTPUT_PATH", "compound_fragments_result.txt") | |
23 assign("DEFAULT_TOLMZ", 0.01) | |
24 assign("DEFAULT_TOLRT", 20) | |
25 assign("DEFAULT_MZDECIMAL", 0) | |
26 assign("DEFAULT_R_THRESHOLD", 0.85) | |
27 assign("DEFAULT_MINNUMBERSCAN", 8) | |
28 assign("DEFAULT_SEUIL_RA", 0.5) | |
29 lockBinding("DEFAULT_PRECURSOR_PATH", globalenv()) | |
30 lockBinding("DEFAULT_FRAGMENTS_PATH", globalenv()) | |
31 lockBinding("DEFAULT_COMPOUNDS_PATH", globalenv()) | |
32 lockBinding("DEFAULT_OUTPUT_PATH", globalenv()) | |
33 lockBinding("DEFAULT_TOLMZ", globalenv()) | |
34 lockBinding("DEFAULT_TOLRT", globalenv()) | |
35 lockBinding("DEFAULT_MZDECIMAL", globalenv()) | |
36 lockBinding("DEFAULT_R_THRESHOLD", globalenv()) | |
37 lockBinding("DEFAULT_MINNUMBERSCAN", globalenv()) | |
38 lockBinding("DEFAULT_SEUIL_RA", globalenv()) | |
39 | |
40 assign("DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD", 0.85) | |
41 assign("DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA", 0.1) | |
42 assign("DEFAULT_EXTRACT_FRAGMENTS_TOLMZ", 0.01) | |
43 assign("DEFAULT_EXTRACT_FRAGMENTS_TOLRT", 60) | |
44 lockBinding("DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD", globalenv()) | |
45 lockBinding("DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA", globalenv()) | |
46 lockBinding("DEFAULT_EXTRACT_FRAGMENTS_TOLMZ", globalenv()) | |
47 lockBinding("DEFAULT_EXTRACT_FRAGMENTS_TOLRT", globalenv()) | |
48 | |
49 | |
50 debug <- FALSE | |
51 | |
52 | |
53 ######################################################################## | |
54 | |
55 #' @title plot_pseudo_spectra | |
56 #' @param x | |
57 #' @param r_threshold | |
58 #' @param fid | |
59 #' @param sum_int | |
60 #' @param vmz | |
61 #' @param cor_abs_int | |
62 #' @param refcol | |
63 #' @param c_name | |
64 #' @description plot_pseudo_spectra | |
65 #' function to compute sum of intensities among scans for all | |
66 #' m/z kept (cor > r_threshold & minimum number of scans) | |
67 #' and plot pseudo spectra | |
68 #' x dataframe scan X fragments with scans number in the 1st column and | |
69 #' ions in next with intensities | |
70 #' fid file id when several a precursor has been detected in several files | |
71 plot_pseudo_spectra <- function( | |
72 x, | |
73 r_threshold, | |
74 fid, | |
75 sum_int, | |
76 vmz, | |
77 cor_abs_int, | |
78 refcol, | |
79 c_name | |
80 ) { | |
81 ## du fait de la difference de nombre de colonne entre la dataframe qui | |
82 ## inclue les scans en 1ere col, mzRef se decale de 1 | |
83 refcol <- refcol - 1 | |
84 ## compute relative intensities max=100% | |
85 rel_int <- sum_int[-1] | |
86 rel_int <- rel_int / max(rel_int) | |
87 | |
88 ## define max value on vertical axis (need to increase in order to plot the | |
89 ## label of fragments) | |
90 ymax <- max(rel_int) + 0.2 * max(rel_int) | |
91 | |
92 par(mfrow = c(2, 1)) | |
93 plot(vmz, rel_int, type = "h", ylim = c(0, ymax), main = c_name) | |
94 ## low correl coef. will be display in grey | |
95 cor_low <- which(round(cor_abs_int, 2) < r_threshold) | |
96 | |
97 lbmzcor <- sprintf("%s(r=%s)", vmz, round(cor_abs_int, 2)) | |
98 | |
99 if (length(cor_low) > 0) { | |
100 text( | |
101 vmz[cor_low], | |
102 rel_int[cor_low], | |
103 lbmzcor[cor_low], | |
104 cex = 0.5, | |
105 col = "grey", | |
106 srt = 90, | |
107 adj = 0 | |
108 ) | |
109 if (length(vmz) - length(cor_low) > 1) { | |
110 text( | |
111 vmz[-c(refcol, cor_low)], | |
112 rel_int[-c(refcol, cor_low)], | |
113 lbmzcor[-c(refcol, cor_low)], | |
114 cex = 0.6, | |
115 col = 1, | |
116 srt = 90, | |
117 adj = 0 | |
118 ) | |
119 } | |
120 } else { | |
121 if (length(vmz) > 1) { | |
122 text( | |
123 vmz[-c(refcol)], | |
124 rel_int[-c(refcol)], | |
125 lbmzcor[-c(refcol)], | |
126 cex = 0.6, | |
127 col = 1, | |
128 srt = 90, | |
129 adj = 0 | |
130 ) | |
131 } | |
132 } | |
133 | |
134 text( | |
135 vmz[refcol], | |
136 rel_int[refcol], | |
137 lbmzcor[refcol], | |
138 cex = 0.8, | |
139 col = 2, | |
140 srt = 90, | |
141 adj = 0 | |
142 ) | |
143 | |
144 ## prepare result file | |
145 corValid <- (round(cor_abs_int, 2) >= r_threshold) ##nolint object_name_linter | |
146 cp_res <- data.frame( | |
147 rep(c_name, length(vmz)), | |
148 rep(fid, length(vmz)), | |
149 vmz, | |
150 cor_abs_int, | |
151 sum_int[-1], | |
152 rel_int, | |
153 corValid | |
154 ) | |
155 | |
156 colnames(cp_res) <- c( | |
157 "compoundName", | |
158 "fileid", | |
159 "fragments_mz", | |
160 "CorWithPrecursor", | |
161 "AbsoluteIntensity", | |
162 "relativeIntensity", | |
163 "corValid" | |
164 ) | |
165 return(cp_res) | |
166 | |
167 } | |
168 | |
169 #' | |
170 #' @title extract_fragments | |
171 #' | |
172 #' @param precursors the precursor list from mspurity | |
173 #' @param fragments the fragments list from ms purity | |
174 #' @param mzref | |
175 #' @param rtref | |
176 #' @param c_name | |
177 #' @param r_threshold default = DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD | |
178 #' @param seuil_ra default = DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA | |
179 #' @param tolmz default = DEFAULT_EXTRACT_FRAGMENTS_TOLMZ | |
180 #' @param tolrt default = DEFAULT_EXTRACT_FRAGMENTS_TOLRT | |
181 #' @returns | |
182 #' | |
183 #' @description | |
184 #' function for extraction of fragments corresponding to precursors | |
185 #' detected by MSPurity | |
186 extract_fragments <- function( ## nolint cyclocomp_linter | |
187 precursors, | |
188 fragments, | |
189 mzref, | |
190 rtref, | |
191 c_name, | |
192 min_number_scan, | |
193 mzdecimal, | |
194 r_threshold=DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD, | |
195 seuil_ra=DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA, | |
196 tolmz=DEFAULT_EXTRACT_FRAGMENTS_TOLMZ, | |
197 tolrt=DEFAULT_EXTRACT_FRAGMENTS_TOLRT | |
198 ) { | |
199 ## filter precursor in the precursors file based on mz and rt in the | |
200 ## compound list | |
201 cat("processing ", c_name, "\n") | |
202 selected_precursors <- which( | |
203 (abs(precursors$precurMtchMZ - mzref) <= tolmz) | |
204 & (abs(precursors$precurMtchRT - rtref) <= tolrt) | |
205 ) | |
206 | |
207 ## check if there is the precursor in the file | |
208 if (length(selected_precursors) > 0) { | |
209 | |
210 sprecini <- precursors[selected_precursors, ] | |
211 | |
212 ## check if fragments corresponding to precursor are found in several | |
213 ## files (collision energy) | |
214 ## this lead to a processing for each fileid | |
215 mf <- levels(as.factor(sprecini$fileid)) | |
216 if (length(mf) > 1) { | |
217 cat(" several files detected for this compounds :\n") | |
218 } | |
219 | |
220 for (f in seq_along(mf)) { | |
221 | |
222 sprec <- sprecini[sprecini$fileid == mf[f], ] | |
223 | |
224 ## selection of fragment in the fragments file with the grpid common in | |
225 ## both fragments and precursors | |
226 selfrgt <- levels(as.factor(sprec$grpid)) | |
227 sfrgt <- fragments[ | |
228 fragments$grpid %in% selfrgt | |
229 & fragments$fileid == mf[f], | |
230 ] | |
231 | |
232 ## filter fragments on relative intensity seuil_ra = user defined | |
233 ## parameter (MSpurity flags could be used here) | |
234 sfrgtfil <- sfrgt[sfrgt$ra > seuil_ra, ] | |
235 | |
236 mznominal <- round(x = sfrgtfil$mz, mzdecimal) | |
237 sfrgtfil <- data.frame(sfrgtfil, mznominal) | |
238 | |
239 ## creation of cross table row=scan col=mz X=ra | |
240 vmz <- levels(as.factor(sfrgtfil$mznominal)) | |
241 | |
242 cat(" fragments :", vmz) | |
243 | |
244 ## mz of precursor in data precursor to check correlation with | |
245 mz_prec <- paste0("mz", round(mean(sprec$mz), mzdecimal)) | |
246 | |
247 for (m in seq_along(vmz)) { | |
248 | |
249 ## absolute intensity | |
250 cln <- c( | |
251 which(colnames(sfrgtfil) == "acquisitionNum"), | |
252 which(colnames(sfrgtfil) == "i") | |
253 ) | |
254 int_mz <- sfrgtfil[sfrgtfil$mznominal == vmz[m], cln] | |
255 colnames(int_mz)[2] <- paste0("mz", vmz[m]) | |
256 | |
257 ## average intensities of mass in duplicate scans | |
258 comp_scans <- aggregate(x = int_mz, by = list(int_mz[[1]]), FUN = mean) | |
259 int_mz <- comp_scans[, -1] | |
260 | |
261 if (m == 1) { | |
262 ds_abs_int <- int_mz | |
263 } else { | |
264 ds_abs_int <- merge( | |
265 x = ds_abs_int, | |
266 y = int_mz, | |
267 by.x = 1, | |
268 by.y = 1, | |
269 all.x = TRUE, | |
270 all.y = TRUE | |
271 ) | |
272 } | |
273 } | |
274 if (debug) { | |
275 write.table( | |
276 x = ds_abs_int, | |
277 file = paste0(c_name, "ds_abs_int.txt"), | |
278 row.names = FALSE, | |
279 sep = "\t" | |
280 ) | |
281 } | |
282 | |
283 ## elimination of mz with less than min_number_scan scans (user defined | |
284 ## parameter) | |
285 xmz <- rep(NA, ncol(ds_abs_int) - 1) | |
286 sum_int <- rep(NA, ncol(ds_abs_int)) | |
287 nbxmz <- 0 | |
288 nb_scan_check <- min(nrow(ds_abs_int), min_number_scan) | |
289 | |
290 for (j in 2:ncol(ds_abs_int)) { | |
291 sum_int[j] <- sum(ds_abs_int[j], na.rm = TRUE) | |
292 if (sum(!is.na(ds_abs_int[[j]])) < nb_scan_check) { | |
293 nbxmz <- nbxmz + 1 | |
294 xmz[nbxmz] <- j | |
295 } | |
296 } | |
297 | |
298 xmz <- xmz[-which(is.na(xmz))] | |
299 if (length(xmz) > 0) { | |
300 ds_abs_int <- ds_abs_int[, -c(xmz)] | |
301 sum_int <- sum_int[-c(xmz)] | |
302 ## liste des mz keeped decale de 1 avec ds_abs_int | |
303 vmz <- as.numeric(vmz[-c(xmz - 1)]) | |
304 } | |
305 | |
306 ## reference ion for correlation computing = precursor OR maximum | |
307 ## intensity ion in precursor is not present | |
308 refcol <- which(colnames(ds_abs_int) == mz_prec) | |
309 if (length(refcol) == 0) { | |
310 refcol <- which(sum_int == max(sum_int, na.rm = TRUE)) | |
311 } | |
312 pdf( | |
313 file = sprintf("%s_processing_file%s.pdf", c_name, mf[f]), | |
314 width = 8, | |
315 height = 11 | |
316 ) | |
317 par(mfrow = c(3, 2)) | |
318 | |
319 ## Pearson correlations between absolute intensities computing | |
320 cor_abs_int <- rep(NA, length(vmz)) | |
321 | |
322 if (length(refcol) > 0) { | |
323 for (i in 2:length(ds_abs_int)) { | |
324 cor_abs_int[i - 1] <- cor( | |
325 x = ds_abs_int[[refcol]], | |
326 y = ds_abs_int[[i]], | |
327 use = "pairwise.complete.obs", | |
328 method = "pearson" | |
329 ) | |
330 plot( | |
331 ds_abs_int[[refcol]], | |
332 ds_abs_int[[i]], | |
333 xlab = colnames(ds_abs_int)[refcol], | |
334 ylab = colnames(ds_abs_int)[i], | |
335 main = sprintf( | |
336 "%s corr coeff r=%s", c_name, round(cor_abs_int[i - 1], 2) | |
337 ) | |
338 ) | |
339 } | |
340 ## plot pseudo spectra | |
341 res_comp_by_file <- plot_pseudo_spectra( | |
342 x = ds_abs_int, | |
343 r_threshold = r_threshold, | |
344 fid = mf[f], | |
345 sum_int = sum_int, | |
346 vmz = vmz, | |
347 cor_abs_int = cor_abs_int, | |
348 refcol = refcol, | |
349 c_name = c_name | |
350 ) | |
351 if (f == 1) { | |
352 res_comp <- res_comp_by_file | |
353 } | |
354 } else { | |
355 res_comp_by_file <- NULL | |
356 cat(" non detected in fragments file \n") | |
357 } | |
358 if (!is.null(res_comp_by_file)) { | |
359 res_comp <- rbind(res_comp, res_comp_by_file) | |
360 } | |
361 cat("\n") | |
362 dev.off() | |
363 } | |
364 } else { | |
365 res_comp <- NULL | |
366 cat(" non detected in precursor file \n") | |
367 } | |
368 return(res_comp) | |
369 } | |
370 | |
371 | |
372 create_parser <- function() { | |
373 parser <- optparse::OptionParser() | |
374 parser <- optparse::add_option( | |
375 parser, | |
376 c("-v", "--verbose"), | |
377 action = "store_true", | |
378 default = FALSE, | |
379 help = "Print extra output [default %default]" | |
380 ) | |
381 parser <- optparse::add_option( | |
382 parser, | |
383 c("-o", "--output"), | |
384 type = "character", | |
385 default = DEFAULT_OUTPUT_PATH, | |
386 action = "store", | |
387 help = "Path to the output file [default %default]" | |
388 ) | |
389 parser <- optparse::add_option( | |
390 parser, | |
391 c("-p", "--precursors"), | |
392 type = "character", | |
393 default = DEFAULT_PRECURSOR_PATH, | |
394 action = "store", | |
395 help = "Path to the precursors file [default %default]" | |
396 ) | |
397 parser <- optparse::add_option( | |
398 parser, | |
399 c("-f", "--fragments"), | |
400 type = "character", | |
401 default = DEFAULT_FRAGMENTS_PATH, | |
402 action = "store", | |
403 help = "Path to the fragments file [default %default]" | |
404 ) | |
405 parser <- optparse::add_option( | |
406 parser, | |
407 c("-c", "--compounds"), | |
408 type = "character", | |
409 default = DEFAULT_COMPOUNDS_PATH, | |
410 action = "store", | |
411 help = "Path to the compounds file [default %default]" | |
412 ) | |
413 parser <- optparse::add_option( | |
414 parser, | |
415 c("--tolmz"), | |
416 type = "numeric", | |
417 action = "store", | |
418 default = DEFAULT_TOLMZ, | |
419 metavar = "number" | |
420 ) | |
421 parser <- optparse::add_option( | |
422 parser, | |
423 c("--tolrt"), | |
424 type = "integer", | |
425 action = "store", | |
426 default = DEFAULT_TOLRT, | |
427 metavar = "number" | |
428 ) | |
429 parser <- optparse::add_option( | |
430 parser, | |
431 c("--seuil_ra"), | |
432 type = "numeric", | |
433 action = "store", | |
434 help = "relative intensity threshold", | |
435 default = DEFAULT_SEUIL_RA, | |
436 metavar = "number" | |
437 ) | |
438 parser <- optparse::add_option( | |
439 parser, | |
440 c("--mzdecimal"), | |
441 type = "integer", | |
442 default = DEFAULT_MZDECIMAL, | |
443 action = "store", | |
444 help = "nb decimal for mz", | |
445 metavar = "number" | |
446 ) | |
447 parser <- optparse::add_option( | |
448 parser, | |
449 c("--r_threshold"), | |
450 type = "integer", | |
451 default = DEFAULT_R_THRESHOLD, | |
452 action = "store", | |
453 help = paste0( | |
454 "r pearson correlation threshold between precursor and fragment ", | |
455 "absolute intensity" | |
456 ), | |
457 metavar = "number" | |
458 ) | |
459 parser <- optparse::add_option( | |
460 parser, | |
461 c("--min_number_scan"), | |
462 type = "numeric", | |
463 action = "store", | |
464 default = DEFAULT_MINNUMBERSCAN, | |
465 help = paste0( | |
466 "fragments are kept if there are found in a minimum number ", | |
467 "of scans" | |
468 ), | |
469 metavar = "number" | |
470 ) | |
471 return(parser) | |
472 } | |
473 | |
474 main <- function(args) { | |
475 ## FOLDER AND FILES | |
476 ## MSpurity precursors file | |
477 precursors <- read.table( | |
478 file = args$precursors, | |
479 header = TRUE, | |
480 sep = "\t", | |
481 quote = "\"" | |
482 ) | |
483 ## MSpurity fragments file | |
484 fragments <- read.table( | |
485 file = args$fragments, | |
486 header = TRUE, | |
487 sep = "\t", | |
488 quote = "\"" | |
489 ) | |
490 ## list of compounds : col1=Name of molecule, col2=m/z, col3=retention time | |
491 compounds <- read.table( | |
492 file = args$compounds, | |
493 sep = "\t", | |
494 quote = "\"", | |
495 header = TRUE | |
496 ) | |
497 ## PARAMETERS | |
498 ## tolerance for mz(dalton) rt(seconds) to match the standard in the compounds | |
499 ## list with the precursor MSpurity file | |
500 tolmz <- args$tolmz | |
501 tolrt <- args$tolrt | |
502 | |
503 ## relative intensity threshold | |
504 seuil_ra <- args$seuil_ra | |
505 ## nb decimal for mz | |
506 mzdecimal <- args$mzdecimal | |
507 ## r pearson correlation threshold between precursor and | |
508 # #fragment absolute intensity | |
509 r_threshold <- args$r_threshold | |
510 ## fragments are kept if there are found in a minimum number of scans | |
511 min_number_scan <- args$min_number_scan | |
512 | |
513 for (i in seq_len(nrow(compounds))) { | |
514 ## loop execution for all compounds in the compounds file | |
515 res_cor <- NULL | |
516 res_cor <- extract_fragments( | |
517 precursors = precursors, | |
518 fragments = fragments, | |
519 mzref = compounds[[2]][i], | |
520 rtref = compounds[[3]][i], | |
521 c_name = compounds[[1]][i], | |
522 min_number_scan = min_number_scan, | |
523 mzdecimal = mzdecimal, | |
524 r_threshold = r_threshold, | |
525 seuil_ra = seuil_ra, | |
526 tolmz = tolmz, | |
527 tolrt = tolrt | |
528 ) | |
529 if (i == 1 & !is.null(res_cor)) { | |
530 res_all <- res_cor | |
531 } else if (!is.null(res_cor)) { | |
532 res_all <- rbind(res_all, res_cor) | |
533 } | |
534 } | |
535 | |
536 if (is.null(res_all)) { | |
537 stop("No result at all!") | |
538 } | |
539 write.table( | |
540 x = res_all, | |
541 file = args$output, | |
542 sep = "\t", | |
543 row.names = FALSE | |
544 ) | |
545 } | |
546 | |
547 args <- optparse::parse_args(create_parser()) | |
548 sessionInfo() | |
549 main(args) | |
550 | |
551 warnings() |