ms2snoop: MS2snoop.R comparison

comparison MS2snoop.R @ 5:78d5a12406c2 draft

planemo upload commit a5f94dac9b268629399dc22c5d6ac48c5a85adc3

author	workflow4metabolomics
date	Fri, 05 Aug 2022 17:25:45 +0000
parents	856001213966
children	77abacd33c31

comparison

equal deleted inserted replaced

-:856001213966
+:78d5a12406c2
 #'
 #' @import optparse
 #'
-assign("MS2SNOOP_VERSION", "2.0.0")
+get_version <- function() {
-lockBinding("MS2SNOOP_VERSION", globalenv())
+cmd <- commandArgs(trailingOnly = FALSE)
+root <- dirname(gsub("--file=", "", cmd[grep("--file=", cmd)]))
-assign("MISSING_PARAMETER_ERROR", 1)
+readme <- readLines(file.path(root, "README.md"))
-lockBinding("MISSING_PARAMETER_ERROR", globalenv())
+version_line <- readme[grepl(" * **@version**: ", readme, fixed = TRUE)]
+return(gsub(".*: ", "", version_line))
-assign("BAD_PARAMETER_VALUE_ERROR", 2)
+}
-lockBinding("BAD_PARAMETER_VALUE_ERROR", globalenv())
+defaults <- list(
-assign("MISSING_INPUT_FILE_ERROR", 3)
+MS2SNOOP_VERSION = get_version(),
-lockBinding("MISSING_INPUT_FILE_ERROR", globalenv())
+MISSING_PARAMETER_ERROR = 1,
+BAD_PARAMETER_VALUE_ERROR = 2,
-assign("NO_ANY_RESULT_ERROR", 255)
+MISSING_INPUT_FILE_ERROR = 3,
-lockBinding("NO_ANY_RESULT_ERROR", globalenv())
+NO_ANY_RESULT_ERROR = 255,
+DEFAULT_PRECURSOR_PATH = NULL,
-assign("DEFAULT_PRECURSOR_PATH", "peaklist_precursors.tsv")
+DEFAULT_FRAGMENTS_PATH = NULL,
-assign("DEFAULT_FRAGMENTS_PATH", "peaklist_fragments.tsv")
+DEFAULT_COMPOUNDS_PATH = NULL,
-assign("DEFAULT_COMPOUNDS_PATH", "compounds_pos.txt")
+DEFAULT_OUTPUT_PATH = "compound_fragments_result.txt",
-assign("DEFAULT_OUTPUT_PATH", "compound_fragments_result.txt")
+DEFAULT_TOLMZ = 0.01,
-assign("DEFAULT_TOLMZ", 0.01)
+DEFAULT_TOLRT = 20,
-assign("DEFAULT_TOLRT", 20)
+DEFAULT_MZDECIMAL = 3,
-assign("DEFAULT_MZDECIMAL", 0)
+DEFAULT_R_THRESHOLD = 0.85,
-assign("DEFAULT_R_THRESHOLD", 0.85)
+DEFAULT_MINNUMBERSCAN = 8,
-assign("DEFAULT_MINNUMBERSCAN", 8)
+DEFAULT_SEUIL_RA = 0.05,
-assign("DEFAULT_SEUIL_RA", 0.5)
+DEFAULT_FRAGMENTS_MATCH_DELTA = 10,
-lockBinding("DEFAULT_PRECURSOR_PATH", globalenv())
+DEFAULT_FRAGMENTS_MATCH_DELTA_UNIT = "ppm",
-lockBinding("DEFAULT_FRAGMENTS_PATH", globalenv())
+DEFAULT_PDF_PATH = ""
-lockBinding("DEFAULT_COMPOUNDS_PATH", globalenv())
+)
-lockBinding("DEFAULT_OUTPUT_PATH", globalenv())
+env <- globalenv()
-lockBinding("DEFAULT_TOLMZ", globalenv())
+for (default in names(defaults)) {
-lockBinding("DEFAULT_TOLRT", globalenv())
+assign(default, defaults[[default]], envir = env)
-lockBinding("DEFAULT_MZDECIMAL", globalenv())
+lockBinding(default, env)
-lockBinding("DEFAULT_R_THRESHOLD", globalenv())
+}
-lockBinding("DEFAULT_MINNUMBERSCAN", globalenv())
-lockBinding("DEFAULT_SEUIL_RA", globalenv())
-assign("DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD", 0.85)
-assign("DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA", 0.1)
-assign("DEFAULT_EXTRACT_FRAGMENTS_TOLMZ", 0.01)
-assign("DEFAULT_EXTRACT_FRAGMENTS_TOLRT", 60)
-lockBinding("DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD", globalenv())
-lockBinding("DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA", globalenv())
-lockBinding("DEFAULT_EXTRACT_FRAGMENTS_TOLMZ", globalenv())
-lockBinding("DEFAULT_EXTRACT_FRAGMENTS_TOLRT", globalenv())
 ########################################################################
+#' @title get_formulas
+#'
+#' @param mzref precursor m/z; when a vector of length > 1 is given, the
+#'   function is applied to each value and a list of results is returned
+#' @param spectra data frame with `mz` and `intensities` columns
+#' @param nominal_mz_list kept for interface compatibility; not read here
+#' @param processing_parameters list; `c_name` is read here, other fields
+#'   are read by create_ms_file() and extract_sirius_results()
+#' @param background when TRUE, sirius is started without waiting and its
+#'   output is discarded
+#' @returns the value of extract_sirius_results() for the sirius output
+#'
+#' @description
+#' Writes a `.ms` file for the compound, runs the `sirius` command line on
+#' it and extracts the fragment formulas from the produced output.
+get_formulas <- function(
+  mzref,
+  spectra,
+  nominal_mz_list,
+  processing_parameters,
+  background = FALSE
+) {
+  if (is.vector(mzref) && length(mzref) > 1) {
+    return(lapply(
+      mzref,
+      function(mz) {
+        get_formulas(
+          mzref = mz,
+          spectra = spectra,
+          nominal_mz_list = nominal_mz_list,
+          processing_parameters = processing_parameters,
+          background = background
+        )
+      }
+    ))
+  }
+  safe_name <- gsub("[[:space:]]", "_", processing_parameters$c_name)
+  input <- sprintf("%s-%s.ms", safe_name, mzref)
+  create_ms_file(input, mzref, spectra, processing_parameters)
+  output <- sprintf("out/%s-%s.out", safe_name, mzref)
+  command <- paste(
+    "sirius",
+    "--noCite",
+    "--noSummaries",
+    "--loglevel=WARNING",
+    ## shQuote() escapes quotes that a compound name may contain
+    sprintf("-i=%s", shQuote(input)),
+    sprintf("-o=%s", shQuote(output)),
+    "tree",
+    ## loglevel is not taken into account during sirius startup, so we
+    ## filter outputs. -E is required: without it grep reads the
+    ## parentheses and the pipe of the pattern as literal characters.
+    "2>&1 | grep -E '^(WARNING|SEVERE)'"
+  )
+  verbose_catf(
+    ">> Sirius is running %swith the command: %s\n",
+    if (background) "in the background " else "",
+    command
+  )
+  ## NOTE(review): with background = TRUE the results are read right after
+  ## launching sirius, so they may not exist yet -- confirm intended use.
+  system(
+    command,
+    wait = !background,
+    ignore.stdout = background,
+    ignore.stderr = background
+  )
+  return(extract_sirius_results(output, spectra$mz, processing_parameters))
+}
+#' @title create_ms_file
+#'
+#' @param path where the `.ms` file is written (overwritten if present)
+#' @param mzref value written on the `>parentmass` line
+#' @param spectra data frame with `mz` and `intensities` columns
+#' @param processing_parameters list; `c_name`, `ionization` and
+#'   `elemcomposition` are written in the file header
+#' @returns NULL (the value of cat())
+#'
+#' @description
+#' Writes the header lines and the peak list to `path` and prints a
+#' preview limited to the first three peaks.
+create_ms_file <- function(
+  path,
+  mzref,
+  spectra,
+  processing_parameters
+) {
+  file_content <- paste(
+    sprintf(">compound %s", processing_parameters$c_name),
+    sprintf(">ionization %s", processing_parameters$ionization),
+    sprintf(">parentmass %s", mzref),
+    sprintf(">formula %s", processing_parameters$elemcomposition),
+    sep = "\n"
+  )
+  ## preview at most three peaks; indexing 1:3 directly would print
+  ## "NA NA" rows when the spectrum has fewer than three peaks
+  preview_rows <- seq_len(min(3, nrow(spectra)))
+  displayed_file_content <- sprintf(
+    "%s\n>collision\n%s",
+    file_content,
+    paste(
+      sprintf(
+        "%s %s",
+        spectra[preview_rows, "mz"],
+        spectra[preview_rows, "intensities"]
+      ),
+      collapse = "\n"
+    )
+  )
+  if (nrow(spectra) > 3) {
+    displayed_file_content <- sprintf(
+      "%s\n[... %s more rows of mz and intensities ...]",
+      displayed_file_content,
+      nrow(spectra) - 3
+    )
+  }
+  catf(
+    ">> MS file created for %s with content:\n%s\n",
+    processing_parameters$c_name,
+    displayed_file_content
+  )
+  file_content <- sprintf(
+    "%s\n\n>collision\n%s",
+    file_content,
+    paste(
+      sprintf("%s %s", spectra$mz, spectra$intensities),
+      collapse = "\n"
+    )
+  )
+  cat(file_content, file = path, append = FALSE)
+}
+#' @title extract_sirius_results
+#'
+#' @param output directory given to sirius as output path
+#' @param mz_list m/z values of the fragments to annotate
+#' @param processing_parameters list; `fragment_match_delta` and
+#'   `fragment_match_delta_unit` are read here
+#' @returns a data frame with columns formula, ppm, mz and error, one row
+#'   per value of `mz_list`; formula and ppm stay NA for fragments without
+#'   a sirius match, or for all of them when no sirius output is found
+#'
+#' @description
+#' Reads the first file of the `spectra` and `trees` folders found in the
+#' first sub-directory of `output`, filters the sirius fragments with the
+#' requested delta and reports each of them on the closest input m/z.
+extract_sirius_results <- function(
+  output,
+  mz_list,
+  processing_parameters
+) {
+  delta <- processing_parameters$fragment_match_delta
+  delta_unit <- tolower(processing_parameters$fragment_match_delta_unit)
+  fragment_matchings <- data.frame(
+    formula = rep(NA, length(mz_list)),
+    ppm = rep(NA, length(mz_list)),
+    mz = mz_list,
+    error = rep(NA, length(mz_list))
+  )
+  ## first entry of a directory listing, or NULL when there is none
+  first_entry <- function(entries) {
+    if (length(entries) == 0) NULL else entries[[1]]
+  }
+  result_dir <- first_entry(list.dirs(output, recursive = FALSE))
+  if (is.null(result_dir) || nrow(fragment_matchings) == 0) {
+    return(fragment_matchings)
+  }
+  spectra_filename <- first_entry(
+    list.files(sprintf("%s/spectra", result_dir), full.names = TRUE)
+  )
+  trees_filename <- first_entry(
+    list.files(sprintf("%s/trees", result_dir), full.names = TRUE)
+  )
+  if (is.null(spectra_filename) || is.null(trees_filename)) {
+    ## the caller reads nrow() and $ppm / $formula on the result, so an
+    ## all-NA data frame is returned rather than a bare vector
+    return(fragment_matchings)
+  }
+  sirius_results <- cbind(
+    get_csv_or_tsv(spectra_filename),
+    extract_sirius_ppm(trees_filename)
+  )
+  sirius_results <- filter_sirius_with_delta(
+    sirius_results = sirius_results,
+    original_mz = fragment_matchings$mz,
+    delta = delta,
+    delta_unit = delta_unit
+  )
+  for (index in seq_len(nrow(sirius_results))) {
+    result <- sirius_results[index, ]
+    ## row of the input fragment whose m/z is the closest to this result
+    closest <- which.min(abs(fragment_matchings$mz - result$mz))
+    fragment_matchings[closest, "formula"] <- result$formula
+    fragment_matchings[closest, "ppm"] <- result$ppm
+    catf(
+      "[OK] Fragment with m/z=%s matches %s with a difference of %s ppm\n",
+      fragment_matchings[closest, "mz"], result$formula, result$ppm
+    )
+  }
+  return(fragment_matchings)
+}
+#' @title filter_sirius_with_delta
+#'
+#' @param sirius_results data frame with `formula`, `mz` and `ppm` columns
+#' @param original_mz m/z values the sirius fragments are compared to
+#' @param delta maximum accepted deviation; no filtering is done unless it
+#'   is a positive number
+#' @param delta_unit "ppm" to filter on the `ppm` column, "mz" to filter
+#'   on the distance to the closest value of `original_mz`; any other
+#'   value disables the filter
+#' @returns `sirius_results` restricted to the accepted rows
+filter_sirius_with_delta <- function(
+  sirius_results,
+  original_mz,
+  delta,
+  delta_unit
+) {
+  if (!(is.numeric(delta) && !is.na(delta) && delta > 0)) {
+    return(sirius_results)
+  }
+  ## identical() also copes with a missing (zero-length) unit
+  if (identical(delta_unit, "ppm")) {
+    filter <- abs(sirius_results$ppm) <= delta
+    fine <- which(filter)
+    not_fine <- which(!filter)
+    catf(
+      paste("[KO] fragment %s (m/z=%s) eleminated because ppm=%s is greater",
+        "than delta=%s\n"
+      ),
+      sirius_results[not_fine, ]$formula,
+      sirius_results[not_fine, ]$mz,
+      sirius_results[not_fine, ]$ppm,
+      delta
+    )
+    sirius_results <- sirius_results[fine, ]
+  } else if (identical(delta_unit, "mz")) {
+    ## vapply() keeps a typed, possibly empty, vector where sapply() would
+    ## return a list that which() rejects
+    differences <- vapply(
+      sirius_results$mz,
+      function(mz) min(abs(original_mz - mz)),
+      numeric(1)
+    )
+    fine <- which(vapply(
+      sirius_results$mz,
+      function(mz) any(abs(original_mz - mz) <= delta),
+      logical(1)
+    ))
+    not_fine <- which(vapply(
+      sirius_results$mz,
+      function(mz) all(abs(original_mz - mz) > delta),
+      logical(1)
+    ))
+    catf(
+      paste(
+        "[KO] fragment %s eleminated because mz difference=%s is",
+        "greater than delta=%s\n"
+      ),
+      sirius_results[not_fine, ]$formula,
+      differences[not_fine],
+      delta
+    )
+    sirius_results <- sirius_results[fine, ]
+  }
+  return(sirius_results)
+}
+#' @title extract_sirius_ppm
+#'
+#' @param path path of a sirius tree file (JSON text)
+#' @returns a list with `ppm` and `recalibrated_mz`, both ordered by
+#'   increasing recalibrated m/z; both are empty when the file holds no
+#'   "massDeviation" / "recalibratedMass" line
+#'
+#' @description
+#' Scans the file line by line instead of parsing the JSON.
+#' Assumes each "massDeviation" line comes right before its matching
+#' "recalibratedMass" line -- TODO confirm against the sirius output format.
+extract_sirius_ppm <- function(path) {
+  ## readLines() on a path opens and closes the file itself, even on error
+  json_lines <- readLines(path, warn = FALSE)
+  json_lines <- json_lines[
+    grepl("\\s+\"(massDeviation|recalibratedMass)\" :", json_lines)
+  ]
+  json_lines <- gsub("^\\s+\"[^\"]+\" : \"?", "", json_lines)
+  ## odd positions hold deviations, even ones hold masses; a logical mask
+  ## stays valid when nothing matched, unlike seq(1, 0, 2)
+  is_deviation <- seq_along(json_lines) %% 2 == 1
+  ppms <- as.numeric(gsub(" ppm .*", "", json_lines[is_deviation]))
+  mz <- as.numeric(gsub(",$", "", json_lines[!is_deviation]))
+  ordered <- order(mz)
+  return(list(ppm = ppms[ordered], recalibrated_mz = mz[ordered]))
+}
 #' @title plot_pseudo_spectra
 #' @param x
-#' @param r_threshold
 #' @param fid
 #' @param sum_int
 #' @param vmz
 #' @param cor_abs_int
 #' @param refcol
 #' x dataframe scan X fragments with scans number in the 1st column and
 #' ions in next with intensities
 #' fid file id when several a precursor has been detected in several files
 plot_pseudo_spectra <- function(
 x,
-r_threshold,
 fid,
 sum_int,
 vmz,
 cor_abs_int,
 refcol,
-c_name,
+meaned_mz,
-inchikey,
+processing_parameters
-elemcomposition
 ) {
 ## du fait de la difference de nombre de colonne entre la dataframe qui
 ## inclue les scans en 1ere col, mzRef se decale de 1
 refcol <- refcol - 1
 ## compute relative intensities max=100%
 rel_int <- sum_int[-1]
 rel_int <- rel_int / max(rel_int)
-## define max value on vertical axis (need to increase in order to plot the
+if (processing_parameters$do_pdf) {
-## label of fragments)
+## define max value on vertical axis (need to increase in order to plot the
-ymax <- max(rel_int) + 0.2 * max(rel_int)
+## label of fragments)
+ymax <- max(rel_int) + 0.2 * max(rel_int)
-par(mfrow = c(2, 1))
-plot(vmz, rel_int, type = "h", ylim = c(0, ymax), main = c_name)
+par(mfrow = c(2, 1))
-## low correl coef. will be display in grey
+plot(vmz, rel_int, type = "h", ylim = c(0, ymax),
-cor_low <- which(round(cor_abs_int, 2) < r_threshold)
+main = processing_parameters$c_name
+)
-lbmzcor <- sprintf("%s(r=%s)", vmz, round(cor_abs_int, 2))
+## low correl coef. will be display in grey
+cor_low <- which(round(cor_abs_int, 2) < processing_parameters$r_threshold)
-if (length(cor_low) > 0) {
-text(
+lbmzcor <- sprintf("%s(r=%s)", vmz, round(cor_abs_int, 2))
-vmz[cor_low],
-rel_int[cor_low],
+if (length(cor_low) > 0) {
-lbmzcor[cor_low],
-cex = 0.5,
-col = "grey",
-srt = 90,
-adj = 0
-)
-if (length(vmz) - length(cor_low) > 1) {
 text(
-vmz[-c(refcol, cor_low)],
+vmz[cor_low],
-rel_int[-c(refcol, cor_low)],
+rel_int[cor_low],
-lbmzcor[-c(refcol, cor_low)],
+lbmzcor[cor_low],
-cex = 0.6,
+cex = 0.5,
-col = 1,
+col = "grey",
 srt = 90,
 adj = 0
 )
+if (length(vmz) - length(cor_low) > 1) {
+text(
+vmz[-c(refcol, cor_low)],
+rel_int[-c(refcol, cor_low)],
+lbmzcor[-c(refcol, cor_low)],
+cex = 0.6,
+col = 1,
+srt = 90,
+adj = 0
+)
+}
+} else {
+if (length(vmz) > 1) {
+text(
+vmz[-c(refcol)],
+rel_int[-c(refcol)],
+lbmzcor[-c(refcol)],
+cex = 0.6,
+col = 1,
+srt = 90,
+adj = 0
+)
+}
+}
+text(
+vmz[refcol],
+rel_int[refcol],
+lbmzcor[refcol],
+cex = 0.8,
+col = 2,
+srt = 90,
+adj = 0
+)
+}
+## prepare result file
+cor_valid <- (round(cor_abs_int, 2) >= processing_parameters$r_threshold)
+do_sirius <- TRUE
+verbose_catf("Checking sirius parameters...\n")
+if (is.null(processing_parameters$ionization)) {
+do_sirius <- FALSE
+verbose_catf("[KO] No ionization passed in parameter.\n")
+} else {
+verbose_catf("[OK] Ionization=%s.\n", processing_parameters$ionization)
+}
+if (is.na(processing_parameters$elemcomposition)) {
+do_sirius <- FALSE
+verbose_catf("[KO] Elemental composition is NA.\n")
+} else if (length(processing_parameters$elemcomposition) < 1) {
+do_sirius <- FALSE
+verbose_catf("[KO] No elemental composition is provided.\n")
+} else if (processing_parameters$elemcomposition == "") {
+do_sirius <- FALSE
+verbose_catf("[KO] Elemental composition is an empty string.\n")
+} else {
+verbose_catf(
+"[OK] Elemental composition=%s.\n",
+processing_parameters$elemcomposition
+)
+}
+cp_res_length <- length(vmz)
+ppm <- rep(NA, cp_res_length)
+formulas <- rep(NA, cp_res_length)
+if (do_sirius) {
+verbose_catf("Everything is ok, preparing for sirius.\n")
+formulas <- get_formulas(
+mzref = processing_parameters$mzref,
+spectra = data.frame(mz = meaned_mz, intensities = sum_int[-1]),
+nominal_mz_list = vmz,
+processing_parameters = processing_parameters
+)
+if (nrow(formulas) == 0) {
+catf("No formula found.\n")
+} else {
+ppm <- formulas$ppm
+formulas <- formulas$formula
+catf(
+"Found %s formula for %s fragments\n",
+length(formulas[which(!(is.na(formulas)))]),
+cp_res_length
+)
 }
 } else {
-if (length(vmz) > 1) {
+verbose_catf("Sirius cannot be run.\n")
-text(
+}
-vmz[-c(refcol)],
-rel_int[-c(refcol)],
-lbmzcor[-c(refcol)],
-cex = 0.6,
-col = 1,
-srt = 90,
-adj = 0
-)
-}
-}
-text(
-vmz[refcol],
-rel_int[refcol],
-lbmzcor[refcol],
-cex = 0.8,
-col = 2,
-srt = 90,
-adj = 0
-)
-## prepare result file
-corValid <- (round(cor_abs_int, 2) >= r_threshold) ##nolint object_name_linter
 cp_res <- data.frame(
-rep(c_name, length(vmz)),
+rep(processing_parameters$c_name, cp_res_length),
-rep(inchikey, length(vmz)),
+rep(processing_parameters$inchikey, cp_res_length),
-rep(elemcomposition, length(vmz)),
+rep(processing_parameters$elemcomposition, cp_res_length),
-rep(fid, length(vmz)),
+formulas,
 vmz,
+ppm,
+rep(fid, cp_res_length),
 cor_abs_int,
 sum_int[-1],
 rel_int,
-corValid
+cor_valid
 )
 colnames(cp_res) <- c(
 "compoundName",
 "inchikey",
 "elemcomposition",
+"fragment",
+"fragment_mz",
+"ppm",
 "fileid",
-"fragments_mz",
 "CorWithPrecursor",
 "AbsoluteIntensity",
 "relativeIntensity",
 "corValid"
 )
 return(cp_res)
 }
 #'
 #' @title extract_fragments
 #'
 #' @param precursors the precursor list from mspurity
 #' @param fragments the fragments list from ms purity
-#' @param mzref
+# ' @param mzref
-#' @param rtref
+# ' @param rtref
-#' @param c_name
+# ' @param c_name
-#' @param r_threshold default = DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD
+# ' @param inchikey
-#' @param seuil_ra default = DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA
+# ' @param elemcomposition
-#' @param tolmz default = DEFAULT_EXTRACT_FRAGMENTS_TOLMZ
+#' @param processing_parameters
-#' @param tolrt default = DEFAULT_EXTRACT_FRAGMENTS_TOLRT
 #' @returns
 #'
 #' @description
 #' function for extraction of fragments corresponding to precursors
 #' detected by MSPurity
 extract_fragments <- function( ## nolint cyclocomp_linter
 precursors,
 fragments,
-mzref,
+processing_parameters
-rtref,
-c_name,
-inchikey,
-elemcomposition,
-min_number_scan,
-mzdecimal,
-r_threshold = DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD,
-seuil_ra = DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA,
-tolmz = DEFAULT_EXTRACT_FRAGMENTS_TOLMZ,
-tolrt = DEFAULT_EXTRACT_FRAGMENTS_TOLRT
 ) {
 ## filter precursor in the precursors file based on mz and rt in the
 ## compound list
-cat("processing ", c_name, "\n")
+catf("processing %s\n", processing_parameters$c_name)
+verbose_catf("===\n")
+param <- processing_parameters
 selected_precursors <- which(
-(abs(precursors$precurMtchMZ - mzref) <= tolmz)
+(abs(precursors$precurMtchMZ - param$mzref) <= param$tolmz)
-& (abs(precursors$precurMtchRT - rtref) <= tolrt)
+& (abs(precursors$precurMtchRT - param$rtref) <= param$tolrt)
+)
+rm(param)
+verbose_catf(
+"> %s precursors selected with mz=%s±%s and rt=%s±%s\n",
+length(selected_precursors),
+processing_parameters$mzref,
+processing_parameters$tolmz,
+processing_parameters$rtref,
+processing_parameters$tolrt
 )
 ## check if there is the precursor in the file
-if (length(selected_precursors) > 0) {
+if (length(selected_precursors) < 1) {
-sprecini <- precursors[selected_precursors, ]
+cat("> non detected in precursor file\n")
+show_end_processing()
-## check if fragments corresponding to precursor are found in several
+return(NULL)
-## files (collision energy)
+}
-## this lead to a processing for each fileid
-mf <- levels(as.factor(sprecini$fileid))
+precursors <- precursors[selected_precursors, ]
-if (length(mf) > 1 && global_verbose) {
-cat(" several files detected for this compounds :\n")
+## check if fragments corresponding to precursor are found in several
-}
+## files (collision energy)
+## this lead to a processing for each fileid
-for (f in seq_along(mf)) {
+file_ids <- as.character(sort(unique(precursors$fileid)))
+if (length(file_ids) > 1) {
-sprec <- sprecini[sprecini$fileid == mf[f], ]
+catf("> several files detected for this compounds :\n")
+} else if (length(file_ids) < 1 || nrow(precursors) < 1) {
-## selection of fragment in the fragments file with the grpid common in
+return(data.frame())
-## both fragments and precursors
+}
-selfrgt <- levels(as.factor(sprec$grpid))
-sfrgt <- fragments[
+res_comp <- data.frame()
-fragments$grpid %in% selfrgt
+for (curent_file_id in file_ids) {
-& fragments$fileid == mf[f],
+curent_precursors <- precursors[precursors$fileid == curent_file_id, ]
-]
+selected_fragments <- fragments[
+fragments$grpid %in% as.character(curent_precursors$grpid)
-## filter fragments on relative intensity seuil_ra = user defined
+& fragments$fileid == curent_file_id,
-## parameter (MSpurity flags could be used here)
+]
-sfrgtfil <- sfrgt[sfrgt$ra > seuil_ra, ]
+filtered_fragments <- selected_fragments[
+selected_fragments$ra > processing_parameters$seuil_ra,
-mznominal <- round(x = sfrgtfil$mz, mzdecimal)
+]
-sfrgtfil <- data.frame(sfrgtfil, mznominal)
+if (nrow(filtered_fragments) != 0) {
+res_comp_by_file <- process_file(
-## creation of cross table row=scan col=mz X=ra
+curent_file_id = curent_file_id,
-vmz <- levels(as.factor(sfrgtfil$mznominal))
+precursor_mz = curent_precursors$mz,
+filtered_fragments = filtered_fragments,
-if (global_verbose) {
+processing_parameters = processing_parameters
-cat(" fragments :", vmz)
-cat("\n")
-}
-## mz of precursor in data precursor to check correlation with
-mz_prec <- paste0("mz", round(mean(sprec$mz), mzdecimal))
-for (m in seq_along(vmz)) {
-## absolute intensity
-cln <- c(
-which(colnames(sfrgtfil) == "acquisitionNum"),
-which(colnames(sfrgtfil) == "i")
-)
-int_mz <- sfrgtfil[sfrgtfil$mznominal == vmz[m], cln]
-colnames(int_mz)[2] <- paste0("mz", vmz[m])
-## average intensities of mass in duplicate scans
-comp_scans <- aggregate(x = int_mz, by = list(int_mz[[1]]), FUN = mean)
-int_mz <- comp_scans[, -1]
-if (m == 1) {
-ds_abs_int <- int_mz
-} else {
-ds_abs_int <- merge(
-x = ds_abs_int,
-y = int_mz,
-by.x = 1,
-by.y = 1,
-all.x = TRUE,
-all.y = TRUE
-)
-}
-}
-if (global_debug) {
-print(ds_abs_int)
-write.table(
-x = ds_abs_int,
-file = paste0(c_name, "ds_abs_int.txt"),
-row.names = FALSE,
-sep = "\t"
-)
-}
-## elimination of mz with less than min_number_scan scans (user defined
-## parameter)
-xmz <- rep(NA, ncol(ds_abs_int) - 1)
-sum_int <- rep(NA, ncol(ds_abs_int))
-nbxmz <- 0
-nb_scan_check <- min(nrow(ds_abs_int), min_number_scan)
-for (j in 2:ncol(ds_abs_int)) {
-sum_int[j] <- sum(ds_abs_int[j], na.rm = TRUE)
-if (sum(!is.na(ds_abs_int[[j]])) < nb_scan_check) {
-nbxmz <- nbxmz + 1
-xmz[nbxmz] <- j
-}
-}
-xmz <- xmz[-which(is.na(xmz))]
-if (length(xmz) > 0) {
-ds_abs_int <- ds_abs_int[, -c(xmz)]
-sum_int <- sum_int[-c(xmz)]
-## liste des mz keeped decale de 1 avec ds_abs_int
-vmz <- as.numeric(vmz[-c(xmz - 1)])
-}
-## reference ion for correlation computing = precursor OR maximum
-## intensity ion in precursor is not present
-refcol <- which(colnames(ds_abs_int) == mz_prec)
-if (length(refcol) == 0) {
-refcol <- which(sum_int == max(sum_int, na.rm = TRUE))
-}
-pdf(
-file = sprintf("%s_processing_file%s.pdf", c_name, mf[f]),
-width = 8,
-height = 11
 )
-par(mfrow = c(3, 2))
-## Pearson correlations between absolute intensities computing
-cor_abs_int <- rep(NA, length(vmz))
-if (length(refcol) > 0) {
-for (i in 2:length(ds_abs_int)) {
-cor_abs_int[i - 1] <- cor(
-x = ds_abs_int[[refcol]],
-y = ds_abs_int[[i]],
-use = "pairwise.complete.obs",
-method = "pearson"
-)
-plot(
-ds_abs_int[[refcol]],
-ds_abs_int[[i]],
-xlab = colnames(ds_abs_int)[refcol],
-ylab = colnames(ds_abs_int)[i],
-main = sprintf(
-"%s corr coeff r=%s", c_name, round(cor_abs_int[i - 1], 2)
-)
-)
-}
-## plot pseudo spectra
-res_comp_by_file <- plot_pseudo_spectra(
-x = ds_abs_int,
-r_threshold = r_threshold,
-fid = mf[f],
-sum_int = sum_int,
-vmz = vmz,
-cor_abs_int = cor_abs_int,
-refcol = refcol,
-c_name = c_name,
-inchikey = inchikey,
-elemcomposition = elemcomposition
-)
-if (f == 1) {
-res_comp <- res_comp_by_file
-}
-} else {
-res_comp_by_file <- NULL
-cat(" non detected in fragments file \n")
-}
 if (!is.null(res_comp_by_file)) {
 res_comp <- rbind(res_comp, res_comp_by_file)
 }
-dev.off()
+} else {
+catf("No fragment found for in fragment file\n")
 }
+}
+return(unique(res_comp))
+}
+#' @title process_file
+#'
+#' @param curent_file_id id of the processed file; forwarded to
+#'   start_pdf() and, as `fid`, to plot_pseudo_spectra()
+#' @param precursor_mz m/z of the selected precursors; their rounded mean
+#'   names the preferred reference column
+#' @param filtered_fragments data frame of fragments; the `mz` column is
+#'   read here, `acquisitionNum` and `i` are read by create_int_mz()
+#' @param processing_parameters list; `mzdecimal`, `min_number_scan`,
+#'   `do_pdf` and `c_name` are read here
+#' @returns the data frame built by plot_pseudo_spectra(), or NULL when no
+#'   reference column could be determined
+process_file <- function(
+  curent_file_id,
+  precursor_mz,
+  filtered_fragments,
+  processing_parameters
+) {
+  mznominal <- round(x = filtered_fragments$mz, digits = 0)
+  meaned_mz <- round(
+    aggregate(
+      data.frame(
+        mz = filtered_fragments$mz,
+        mznominal = mznominal
+      ),
+      list(mznominal),
+      FUN = mean
+    )$mz,
+    digits = processing_parameters$mzdecimal
+  )
+  filtered_fragments <- data.frame(filtered_fragments, mznominal)
+  ## creation of cross table row=scan col=mz X=ra
+  vmz <- as.character(sort(unique(filtered_fragments$mznominal)))
+  ds_abs_int <- create_ds_abs_int(vmz, filtered_fragments)
+  if (global_debug) {
+    print(ds_abs_int)
+  }
+  ## elimination of mz with less than min_number_scan scans (user defined
+  ## parameter). Column 1 holds the scan numbers and is never a candidate.
+  nb_scan_check <- min(nrow(ds_abs_int), processing_parameters$min_number_scan)
+  sum_int <- rep(NA, ncol(ds_abs_int))
+  too_few_scans <- rep(FALSE, ncol(ds_abs_int))
+  for (j in seq_len(ncol(ds_abs_int))[-1]) {
+    sum_int[j] <- sum(ds_abs_int[j], na.rm = TRUE)
+    too_few_scans[j] <- sum(!is.na(ds_abs_int[[j]])) < nb_scan_check
+  }
+  ## a logical mask avoids x[-which(is.na(x))], which empties x when it
+  ## holds no NA, i.e. exactly when every fragment has to be removed
+  xmz <- which(too_few_scans)
+  if (length(xmz) > 0) {
+    ## drop = FALSE keeps a data frame even if only the scan column is left
+    ds_abs_int <- ds_abs_int[, -xmz, drop = FALSE]
+    sum_int <- sum_int[-xmz]
+    ## the kept mz list is shifted by 1 compared to ds_abs_int
+    vmz <- as.numeric(vmz[-(xmz - 1)])
+    meaned_mz <- meaned_mz[-(xmz - 1)]
+  }
+  ## mz of precursor in data precursor to check correlation with
+  ## NOTE(review): columns are named from the nominal (0 decimal) mass while
+  ## this name is rounded to mzdecimal, so it can only match when both
+  ## roundings agree -- confirm this is intended.
+  mz_prec <- paste0(
+    "mz",
+    round(mean(precursor_mz), processing_parameters$mzdecimal)
+  )
+  ## reference ion for correlation computing = precursor OR maximum
+  ## intensity ion if precursor is not present
+  refcol <- which(colnames(ds_abs_int) == mz_prec)
+  if (length(refcol) == 0 && any(!is.na(sum_int))) {
+    refcol <- which(sum_int == max(sum_int, na.rm = TRUE))
+  }
+  if (length(refcol) > 1) {
+    ## tied maxima: `[[` needs a single column index
+    refcol <- refcol[1]
+  }
+  if (processing_parameters$do_pdf) {
+    start_pdf(processing_parameters, curent_file_id)
+  }
+  ## Pearson correlations between absolute intensities computing
+  cor_abs_int <- rep(NA, length(vmz))
+  if (length(refcol) > 0) {
+    for (i in seq_len(ncol(ds_abs_int))[-1]) {
+      cor_abs_int[i - 1] <- stats::cor(
+        x = ds_abs_int[[refcol]],
+        y = ds_abs_int[[i]],
+        use = "pairwise.complete.obs",
+        method = "pearson"
+      )
+      debug_catf(
+        "Correlation between %s and %s: %s\n",
+        paste(ds_abs_int[[refcol]], collapse = ";"),
+        paste(ds_abs_int[[i]], collapse = ";"),
+        paste(cor_abs_int[i - 1], collapse = ";")
+      )
+      if (processing_parameters$do_pdf) {
+        pdf_plot_ds_abs_int(
+          processing_parameters$c_name,
+          ds_abs_int,
+          refcol,
+          i,
+          round(cor_abs_int[i - 1], 2)
+        )
+      }
+    }
+    ## plot pseudo spectra
+    res_comp_by_file <- plot_pseudo_spectra(
+      x = ds_abs_int,
+      fid = curent_file_id,
+      sum_int = sum_int,
+      vmz = vmz,
+      cor_abs_int = cor_abs_int,
+      refcol = refcol,
+      meaned_mz = meaned_mz,
+      processing_parameters = processing_parameters
+    )
+    catf(
+      "%s has been processed and %s fragments have been found.\n",
+      processing_parameters$c_name,
+      nrow(res_comp_by_file)
+    )
 } else {
-res_comp <- NULL
+res_comp_by_file <- NULL
-cat(" non detected in precursor file \n")
+cat(">> non detected in fragments file \n")
 }
-return(res_comp)
+  show_end_processing()
+  if (processing_parameters$do_pdf) {
+    end_pdf()
+  }
+  return(res_comp_by_file)
+}
+#' @title create_ds_abs_int
+#'
+#' @param vmz nominal m/z values, one per column to build
+#' @param filtered_fragments fragments data frame given to create_int_mz()
+#' @returns the per-mz tables of create_int_mz() merged on their first
+#'   column, keeping the rows of both sides at each merge
+create_ds_abs_int <- function(vmz, filtered_fragments) {
+  verbose_catf(
+    ">> fragments: %s\n",
+    paste(vmz, collapse = " ")
+  )
+  ## fold the remaining mz tables, one by one, into the first one
+  add_mz_column <- function(merged, mz) {
+    merge(
+      x = merged,
+      y = create_int_mz(mz, filtered_fragments),
+      by.x = 1,
+      by.y = 1,
+      all.x = TRUE,
+      all.y = TRUE
+    )
+  }
+  first_table <- create_int_mz(vmz[1], filtered_fragments)
+  return(Reduce(add_mz_column, vmz[-1], first_table))
+}
+#' @title create_int_mz
+#'
+#' @param mz nominal m/z to select in the `mznominal` column
+#' @param filtered_fragments data frame with `mznominal`,
+#'   `acquisitionNum` and `i` columns
+#' @returns a data frame with the `acquisitionNum` column and a column
+#'   named "mz<mz>" holding the mean of `i` for each acquisition number
+create_int_mz <- function(mz, filtered_fragments) {
+  ## absolute intensity of the rows matching this nominal mass
+  is_selected <- filtered_fragments$mznominal == mz
+  intensities <- filtered_fragments[is_selected, c("acquisitionNum", "i")]
+  names(intensities)[2] <- paste0("mz", mz)
+  ## average intensities of mass in duplicate scans
+  averaged <- aggregate(
+    x = intensities,
+    by = list(intensities[[1]]),
+    FUN = mean
+  )
+  ## the first column is the grouping key added by aggregate()
+  return(averaged[, -1])
+}
+#' @title show_end_processing
+#'
+#' @description
+#' Prints a separator line through verbose_catf(), then an empty line
+#' that is always shown.
+show_end_processing <- function() {
+  separator <- "==========\n"
+  verbose_catf(separator)
+  cat("\n")
+}
+#' @title start_pdf
+#'
+#' @param processing_parameters list; `pdf_path` (output directory) and
+#'   `c_name` are read here
+#' @param curent_file_id file id appended to the PDF file name
+#'
+#' @description
+#' Creates the output directory when it does not exist, opens a PDF
+#' device of 8 x 11 on "<pdf_path>/<c_name>_processing_file<id>.pdf" and
+#' sets a 3 x 2 plot grid.
+start_pdf <- function(processing_parameters, curent_file_id) {
+  target_dir <- processing_parameters$pdf_path
+  if (!dir.exists(target_dir)) {
+    dir.create(target_dir, recursive = TRUE)
+  }
+  pdf_file <- sprintf(
+    "%s/%s_processing_file%s.pdf",
+    target_dir,
+    processing_parameters$c_name,
+    curent_file_id
+  )
+  pdf(file = pdf_file, width = 8, height = 11)
+  par(mfrow = c(3, 2))
+}
+#' @title pdf_plot_ds_abs_int
+#'
+#' @param c_name compound name shown in the plot title
+#' @param ds_abs_int data frame whose columns are plotted
+#' @param refcol index of the column plotted on the x axis
+#' @param i index of the column plotted on the y axis
+#' @param r_coef correlation coefficient shown in the plot title
+#'
+#' @description
+#' Plots column `i` against column `refcol` on the current device, using
+#' the column names as axis labels.
+pdf_plot_ds_abs_int <- function(c_name, ds_abs_int, refcol, i, r_coef) {
+  axis_names <- colnames(ds_abs_int)
+  plot_title <- sprintf("%s corr coeff r=%s", c_name, r_coef)
+  plot(
+    ds_abs_int[[refcol]],
+    ds_abs_int[[i]],
+    xlab = axis_names[refcol],
+    ylab = axis_names[i],
+    main = plot_title
+  )
+}
+## Close the current graphics device with dev.off(). process_file() calls
+## this after start_pdf() when PDF output is enabled.
+end_pdf <- function() {
+dev.off()
 }
 set_global <- function(var, value) {
 assign(var, value, envir = globalenv())
 }
 set_global("global_verbose", TRUE)
 }
 unset_verbose <- function() {
 set_global("global_verbose", FALSE)
+}
+#' Print a sprintf()-formatted message, only when `global_verbose` is TRUE.
+#'
+#' @param ... format string and values, forwarded to sprintf()
+verbose_catf <- function(...) {
+  if (!global_verbose) {
+    return(invisible(NULL))
+  }
+  message_text <- sprintf(...)
+  cat(message_text, sep = "")
+}
+#' Print a sprintf()-formatted message, only when `global_debug` is TRUE.
+#'
+#' @param ... format string and values, forwarded to sprintf()
+debug_catf <- function(...) {
+  if (!global_debug) {
+    return(invisible(NULL))
+  }
+  message_text <- sprintf(...)
+  cat(message_text, sep = "")
+}
+## Print a sprintf()-formatted message unconditionally: all arguments go
+## to sprintf() and the result is written by cat() without separator.
+catf <- function(...) {
+cat(sprintf(...), sep = "")
 }
 create_parser <- function() {
 parser <- optparse::OptionParser()
 parser <- optparse::add_option(
 "Fragments are kept if there are found in a minimum number",
 "of min_number_scan scans"
 ),
 metavar = "number"
 )
+parser <- optparse::add_option(
+parser,
+c("--pdf_path"),
+type = "character",
+default = DEFAULT_PDF_PATH,
+help = paste(
+"[default %default]",
+"PDF files output path"
+)
+)
+parser <- optparse::add_option(
+parser,
+c("--ionization"),
+type = "character",
+action = "store",
+default = "None",
+help = paste(
+"[default %default]",
+"Which ionization to use for sirius"
+),
+metavar = "character"
+)
+parser <- optparse::add_option(
+parser,
+c("--fragment_match_delta"),
+type = "numeric",
+action = "store",
+default = DEFAULT_FRAGMENTS_MATCH_DELTA,
+help = paste(
+"[default %default]",
+"Fragment match delta"
+),
+metavar = "numeric"
+)
+parser <- optparse::add_option(
+parser,
+c("--fragment_match_delta_unit"),
+type = "character",
+action = "store",
+default = DEFAULT_FRAGMENTS_MATCH_DELTA_UNIT,
+help = paste(
+"[default %default]",
+"Fragment match delta"
+),
+metavar = "character"
+)
 return(parser)
 }
 stop_with_status <- function(msg, status) {
 sink(stderr())
 sink(NULL)
 base::quit(status = status)
 }
 check_args_validity <- function(args) { ## nolint cyclocomp_linter
-sysvars <- Sys.getenv()
-sysvarnames <- names(sysvars)
 if (length(args$output) == 0 || nchar(args$output[1]) == 0) {
 stop_with_status(
 "Missing output parameters. Please set it with --output.",
 MISSING_PARAMETER_ERROR
 )
 args$compounds
 ),
 MISSING_INPUT_FILE_ERROR
 )
 }
-if (
+if (in_galaxy_env()) {
+check_galaxy_args_validity(args)
+}
+}
+in_galaxy_env <- function() {
+sysvars <- Sys.getenv()
+sysvarnames <- names(sysvars)
+return(
 "_GALAXY_JOB_HOME_DIR" %in% sysvarnames
 || "_GALAXY_JOB_TMP_DIR" %in% sysvarnames
 || "GALAXY_MEMORY_MB" %in% sysvarnames
 || "GALAXY_MEMORY_MB_PER_SLOT" %in% sysvarnames
 || "GALAXY_SLOTS" %in% sysvarnames
-) {
+)
-check_galaxy_args_validity(args)
-}
 }
 check_galaxy_args_validity <- function(args) {
 if (!file.exists(args$output)) {
 stop_with_status(
 }
 get_csv_or_tsv <- function(
 path,
 sep_stack = c("\t", ",", ";"),
+sep_names = c("tab", "comma", "semicolon"),
 header = TRUE,
 quote = "\""
 ) {
-sep <- sep_stack[1]
+sep <- determine_csv_or_tsv_sep(
-result <- tryCatch({
+path = path,
-read.table(
+sep_stack = sep_stack,
-file = path,
+header = header,
-sep = sep,
+quote = quote
-header = header,
+)
-quote = quote
+verbose_catf(
-)
+"%s separator has been determined for %s.\n",
-}, error = function(e) {
+sep_names[sep_stack == sep],
-return(data.frame())
+path
-})
+)
-if (length(sep_stack) == 1) {
+return(read.table(
-return(result)
+file = path,
-}
+sep = sep,
-# if (
+header = header,
-#   ncol(result) == 0 || ## failed
+quote = quote
-#   ncol(result) == 1    ## only one row, suspicious, possible fail # nolint
+))
-# ) {
+}
-new_result <- get_csv_or_tsv(
-path,
+determine_csv_or_tsv_sep <- function(
-sep_stack = sep_stack[-1],
+path,
-header = header,
+sep_stack = c("\t", ",", ";"),
-quote = quote
+header = TRUE,
-)
+quote = "\""
-if (ncol(new_result) > ncol(result)) {
+) {
-return(new_result)
+count <- -1
-}
+best_sep <- sep_stack[1]
-# }
+for (sep in sep_stack) {
-return(result)
+tryCatch({
+table <- read.table(
+file = path,
+sep = sep,
+header = header,
+quote = quote,
+nrows = 1
+)
+if (ncol(table) > count) {
+count <- ncol(table)
+best_sep <- sep
+}
+})
+}
+return(best_sep)
 }
 uniformize_columns <- function(df) {
 cols <- colnames(df)
 for (func in c(tolower)) {
 }
 colnames(df) <- cols
 return(df)
 }
+#' @title handle_galaxy_param
+#'
+#' @param args named list of parameters
+#' @returns `args` where, in every character element, "__ob__" is
+#'   replaced by "[" and "__cb__" by "]"; other elements are untouched
+handle_galaxy_param <- function(args) {
+  text_params <- Filter(
+    function(param) is.character(args[[param]]),
+    names(args)
+  )
+  for (param in text_params) {
+    with_open_brackets <- gsub("__ob__", "[", args[[param]])
+    args[[param]] <- gsub("__cb__", "]", with_open_brackets)
+  }
+  return(args)
+}
+#' @title zip_pdfs
+#'
+#' @param processing_parameters list; `do_pdf`, `pdf_zip_path` (archive to
+#'   create) and `pdf_path` (content to archive) are read here
+#'
+#' @description
+#' When `do_pdf` is TRUE, archives `pdf_path` into `pdf_zip_path` with
+#' utils::zip(), using the executable named by the R_ZIPCMD environment
+#' variable, or "zip" when the variable is unset or empty.
+zip_pdfs <- function(processing_parameters) {
+  if (processing_parameters$do_pdf) {
+    ## read the command first, then test it: `zip <- Sys.getenv(...) == ""`
+    ## stores the result of the comparison, not the command
+    zip <- Sys.getenv("R_ZIPCMD", "zip")
+    if (zip == "") {
+      catf("R could not fin the zip executable. Trying luck: zip = \"zip\"")
+      zip <- "zip"
+    } else {
+      catf("Found zip executable at %s .", zip)
+    }
+    utils::zip(
+      processing_parameters$pdf_zip_path,
+      processing_parameters$pdf_path,
+      zip = zip
+    )
+  }
+}
 main <- function(args) {
 if (args$version) {
-cat(sprintf("%s\n", MS2SNOOP_VERSION))
+catf("%s\n", MS2SNOOP_VERSION)
 base::quit(status = 0)
 }
-sessionInfo()
+if (in_galaxy_env()) {
+print(sessionInfo())
+cat("\n\n")
+}
 check_args_validity(args)
+args <- handle_galaxy_param(args)
+if (args$ionization == "None") {
+args$ionization <- NULL
+}
 if (args$debug) {
 set_debug()
 }
 if (args$verbose) {
 set_verbose()
 }
-## MSpurity precursors file
 precursors <- get_csv_or_tsv(args$precursors)
-## MSpurity fragments file
 fragments <- get_csv_or_tsv(args$fragments)
-## list of compounds : col1=Name of molecule, col2=m/z, col3=retention time
 compounds <- get_csv_or_tsv(args$compounds)
 compounds <- uniformize_columns(compounds)
 mandatory_columns <- c(
 "compound_name",
 ),
 BAD_PARAMETER_VALUE_ERROR
 )
 }
-res_all <- NULL
+res_all <- data.frame()
+processing_parameters <- list(
+min_number_scan = args$min_number_scan,
+mzdecimal = args$mzdecimal,
+r_threshold = args$r_threshold,
+seuil_ra = args$seuil_ra,
+tolmz = args$tolmz,
+tolrt = args$tolrt,
+ionization = args$ionization,
+do_pdf = nchar(args$pdf_path) > 0,
+pdf_zip_path = args$pdf_path,
+pdf_path = tempdir(),
+fragment_match_delta = args$fragment_match_delta,
+fragment_match_delta_unit = args$fragment_match_delta_unit
+)
 for (i in seq_len(nrow(compounds))) {
-## loop execution for all compounds in the compounds file
+processing_parameters$mzref <- compounds[["mz"]][i]
-res_cor <- NULL
+processing_parameters$rtref <- compounds[["rtsec"]][i]
+processing_parameters$c_name <- compounds[["compound_name"]][i]
+processing_parameters$inchikey <- compounds[["inchikey"]][i]
+processing_parameters$elemcomposition <- compounds[["elemcomposition"]][i]
 res_cor <- extract_fragments(
 precursors = precursors,
 fragments = fragments,
-mzref = compounds[["mz"]][i],
+processing_parameters = processing_parameters
-rtref = compounds[["rtsec"]][i],
-c_name = compounds[["compound_name"]][i],
-inchikey = compounds[["inchikey"]][i],
-elemcomposition = compounds[["elemcomposition"]][i],
-min_number_scan = args$min_number_scan,
-mzdecimal = args$mzdecimal,
-r_threshold = args$r_threshold,
-seuil_ra = args$seuil_ra,
-tolmz = args$tolmz,
-tolrt = args$tolrt
 )
 if (!is.null(res_cor)) {
-if (is.null(res_all)) {
+res_all <- rbind(res_all, res_cor)
-res_all <- res_cor
-} else {
-res_all <- rbind(res_all, res_cor)
-}
 }
 }
-if (is.null(res_all)) {
+if (nrow(res_all) == 0) {
 stop_with_status("No result at all!", NO_ANY_RESULT_ERROR)
 }
 write.table(
 x = res_all,
 file = args$output,
 sep = "\t",
 row.names = FALSE
 )
+zip_pdfs(processing_parameters)
+unlink(processing_parameters$pdf_path, recursive = TRUE)
 }
 global_debug <- FALSE
 global_verbose <- FALSE
 args <- optparse::parse_args(create_parser())

Mercurial > repos > workflow4metabolomics > ms2snoop

comparison MS2snoop.R @ 5:78d5a12406c2 draft