Mercurial > repos > workflow4metabolomics > ms2snoop
diff MS2snoop.R @ 5:78d5a12406c2 draft
planemo upload commit a5f94dac9b268629399dc22c5d6ac48c5a85adc3
author | workflow4metabolomics |
---|---|
date | Fri, 05 Aug 2022 17:25:45 +0000 |
parents | 856001213966 |
children | 77abacd33c31 |
line wrap: on
line diff
--- a/MS2snoop.R Wed Jul 06 10:38:39 2022 +0000 +++ b/MS2snoop.R Fri Aug 05 17:25:45 2022 +0000 @@ -15,57 +15,282 @@ #' -assign("MS2SNOOP_VERSION", "2.0.0") -lockBinding("MS2SNOOP_VERSION", globalenv()) - -assign("MISSING_PARAMETER_ERROR", 1) -lockBinding("MISSING_PARAMETER_ERROR", globalenv()) - -assign("BAD_PARAMETER_VALUE_ERROR", 2) -lockBinding("BAD_PARAMETER_VALUE_ERROR", globalenv()) - -assign("MISSING_INPUT_FILE_ERROR", 3) -lockBinding("MISSING_INPUT_FILE_ERROR", globalenv()) - -assign("NO_ANY_RESULT_ERROR", 255) -lockBinding("NO_ANY_RESULT_ERROR", globalenv()) +get_version <- function() { + cmd <- commandArgs(trailingOnly = FALSE) + root <- dirname(gsub("--file=", "", cmd[grep("--file=", cmd)])) + readme <- readLines(file.path(root, "README.md")) + version_line <- readme[grepl(" * **@version**: ", readme, fixed = TRUE)] + return(gsub(".*: ", "", version_line)) +} -assign("DEFAULT_PRECURSOR_PATH", "peaklist_precursors.tsv") -assign("DEFAULT_FRAGMENTS_PATH", "peaklist_fragments.tsv") -assign("DEFAULT_COMPOUNDS_PATH", "compounds_pos.txt") -assign("DEFAULT_OUTPUT_PATH", "compound_fragments_result.txt") -assign("DEFAULT_TOLMZ", 0.01) -assign("DEFAULT_TOLRT", 20) -assign("DEFAULT_MZDECIMAL", 0) -assign("DEFAULT_R_THRESHOLD", 0.85) -assign("DEFAULT_MINNUMBERSCAN", 8) -assign("DEFAULT_SEUIL_RA", 0.5) -lockBinding("DEFAULT_PRECURSOR_PATH", globalenv()) -lockBinding("DEFAULT_FRAGMENTS_PATH", globalenv()) -lockBinding("DEFAULT_COMPOUNDS_PATH", globalenv()) -lockBinding("DEFAULT_OUTPUT_PATH", globalenv()) -lockBinding("DEFAULT_TOLMZ", globalenv()) -lockBinding("DEFAULT_TOLRT", globalenv()) -lockBinding("DEFAULT_MZDECIMAL", globalenv()) -lockBinding("DEFAULT_R_THRESHOLD", globalenv()) -lockBinding("DEFAULT_MINNUMBERSCAN", globalenv()) -lockBinding("DEFAULT_SEUIL_RA", globalenv()) - -assign("DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD", 0.85) -assign("DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA", 0.1) -assign("DEFAULT_EXTRACT_FRAGMENTS_TOLMZ", 0.01) -assign("DEFAULT_EXTRACT_FRAGMENTS_TOLRT", 60) -lockBinding("DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD", globalenv()) -lockBinding("DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA", globalenv()) -lockBinding("DEFAULT_EXTRACT_FRAGMENTS_TOLMZ", globalenv()) -lockBinding("DEFAULT_EXTRACT_FRAGMENTS_TOLRT", globalenv()) - +defaults <- list( + MS2SNOOP_VERSION = get_version(), + MISSING_PARAMETER_ERROR = 1, + BAD_PARAMETER_VALUE_ERROR = 2, + MISSING_INPUT_FILE_ERROR = 3, + NO_ANY_RESULT_ERROR = 255, + DEFAULT_PRECURSOR_PATH = NULL, + DEFAULT_FRAGMENTS_PATH = NULL, + DEFAULT_COMPOUNDS_PATH = NULL, + DEFAULT_OUTPUT_PATH = "compound_fragments_result.txt", + DEFAULT_TOLMZ = 0.01, + DEFAULT_TOLRT = 20, + DEFAULT_MZDECIMAL = 3, + DEFAULT_R_THRESHOLD = 0.85, + DEFAULT_MINNUMBERSCAN = 8, + DEFAULT_SEUIL_RA = 0.05, + DEFAULT_FRAGMENTS_MATCH_DELTA = 10, + DEFAULT_FRAGMENTS_MATCH_DELTA_UNIT = "ppm", + DEFAULT_PDF_PATH = "" +) +env <- globalenv() +for (default in names(defaults)) { + assign(default, defaults[[default]], envir = env) + lockBinding(default, env) +} ######################################################################## +get_formulas <- function( + mzref, + spectra, + nominal_mz_list, + processing_parameters, + background = !TRUE +) { + if (is.vector(mzref) && length(mzref) > 1) { + return(lapply( + mzref, + function(mz) { + return(get_formulas( + mzref = mz, + spectra = spectra, + nominal_mz_list = nominal_mz_list, + processing_parameters = processing_parameters, + background = background + )) + } + )) + } + input <- sprintf( + "%s-%s.ms", + gsub("[[:space:]]", "_", processing_parameters$c_name), + mzref + ) + create_ms_file(input, mzref, spectra, processing_parameters) + output <- sprintf( + "out/%s-%s.out", + gsub("[[:space:]]", "_", processing_parameters$c_name), + mzref + ) + command <- sprintf( + paste( + "sirius", + "--noCite", + "--noSummaries", + "--loglevel=WARNING", + "-i='%s'", + "-o='%s'", + "tree", + ## loglevel is not working taken into account during + ## sirius startup, so we filter outputs... + "2>&1 | grep '^(WARNING|SEVERE)'" + ), + input, + output + ) + verbose_catf( + ">> Sirius is running %swith the command: %s\n", + if (background) "in the background " else "", + command + ) + system( + command, + wait = !background, + ignore.stdout = background, + ignore.stderr = background + ) + return(extract_sirius_results(output, spectra$mz, processing_parameters)) +} + +create_ms_file <- function( + path, + mzref, + spectra, + processing_parameters +) { + file_content <- paste( + sprintf(">compound %s", processing_parameters$c_name), + sprintf(">ionization %s", processing_parameters$ionization), + sprintf(">parentmass %s", mzref), + sprintf(">formula %s", processing_parameters$elemcomposition), + sep = "\n" + ) + displayed_file_content <- sprintf( + "%s\n>collision\n%s", + file_content, + paste( + sprintf( + "%s %s", + spectra[1:3, "mz"], + spectra[1:3, "intensities"] + ), + collapse = "\n" + ) + ) + if (nrow(spectra) > 3) { + displayed_file_content <- sprintf( + "%s\n[... %s more rows of mz and intensities ...]", + displayed_file_content, + nrow(spectra) - 3 + ) + } + catf( + ">> MS file created for %s with content:\n%s\n", + processing_parameters$c_name, + displayed_file_content + ) + file_content <- sprintf( + "%s\n\n>collision\n%s", + file_content, + paste( + sprintf("%s %s", spectra$mz, spectra$intensities), + collapse = "\n" + ) + ) + cat(file_content, file = path, append = FALSE) +} + +extract_sirius_results <- function( + output, + mz_list, + processing_parameters +) { + + delta <- processing_parameters$fragment_match_delta + delta_unit <- tolower(processing_parameters$fragment_match_delta_unit) + + output <- list.dirs(output, recursive = FALSE)[[1]] + + spectra_out_dir <- sprintf("%s/spectra", output) + spectra_filename <- sprintf( + "%s/%s", + spectra_out_dir, + list.files(spectra_out_dir)[[1]] + ) + + trees_out_dir <- sprintf("%s/trees", output) + trees_filename <- sprintf( + "%s/%s", + trees_out_dir, + list.files(trees_out_dir)[[1]] + ) + + if (!is.null(spectra_filename)) { + sirius_results <- get_csv_or_tsv(spectra_filename) + } else { + return(rep(NA, length(mz_list))) + } + if (!is.null(trees_filename)) { + sirius_results <- cbind(sirius_results, extract_sirius_ppm(trees_filename)) + } else { + return(rep(NA, length(mz_list))) + } + + fragment_matchings <- data.frame( + formula = NA, + ppm = NA, + mz = mz_list, + error = NA + ) + + sirius_results <- filter_sirius_with_delta( + sirius_results = sirius_results, + original_mz = fragment_matchings$mz, + delta = delta, + delta_unit = delta_unit + ) + + for (index in seq_len(nrow(sirius_results))) { + result <- sirius_results[index, ] + filter <- which(order(abs(fragment_matchings$mz - result$mz)) == 1) + fragment_matchings[filter, "formula"] <- result$formula + fragment_matchings[filter, "ppm"] <- result$ppm + catf( + "[OK] Fragment with m/z=%s matches %s with a difference of %s ppm\n", + fragment_matchings[filter, "mz"], result$formula, result$ppm + ) + } + return(fragment_matchings) +} + +filter_sirius_with_delta <- function( + sirius_results, + original_mz, + delta, + delta_unit +) { + if (is.numeric(delta) && !is.na(delta) && delta > 0) { + if (delta_unit == "ppm") { + filter <- abs(sirius_results$ppm) <= delta + fine <- which(filter) + not_fine <- which(!filter) + catf( + paste("[KO] fragment %s (m/z=%s) eleminated because ppm=%s is greater", + "than delta=%s\n" + ), + sirius_results[not_fine, ]$formula, + sirius_results[not_fine, ]$mz, + sirius_results[not_fine, ]$ppm, + delta + ) + sirius_results <- sirius_results[fine, ] + } else if (delta_unit == "mz") { + differences <- sapply( + sirius_results$mz, + function(mz) min(abs(original_mz - mz)) + ) + fine <- which(sapply( + sirius_results$mz, + function(mz) any(abs(original_mz - mz) <= delta) + )) + not_fine <- which(sapply( + sirius_results$mz, + function(mz) all(abs(original_mz - mz) > delta) + )) + catf( + paste( + "[KO] fragment %s eleminated because mz difference=%s is", + "greater than delta=%s\n" + ), + sirius_results[not_fine, ]$formula, + differences[not_fine], + delta + ) + sirius_results <- sirius_results[fine, ] + } + } + return(sirius_results) +} + +extract_sirius_ppm <- function(path) { + json <- file(path, "r") + suppressWarnings(json_lines <- readLines(json)) + close(json) + json_lines <- json_lines[ + grepl("\\s+\"(massDeviation|recalibratedMass)\" :", json_lines) + ] + json_lines <- gsub("^\\s+\"[^\"]+\" : \"?", "", json_lines) + ppms <- json_lines[seq(1, length(json_lines), 2)] + mz <- json_lines[seq(2, length(json_lines), 2)] + ppms <- as.numeric(gsub(" ppm .*", "", ppms)) + mz <- as.numeric(gsub(",$", "", mz)) + ordered <- order(mz) + return(list(ppm = ppms[ordered], recalibrated_mz = mz[ordered])) +} + #' @title plot_pseudo_spectra #' @param x -#' @param r_threshold #' @param fid #' @param sum_int #' @param vmz @@ -81,15 +306,13 @@ #' fid file id when several a precursor has been detected in several files plot_pseudo_spectra <- function( x, - r_threshold, fid, sum_int, vmz, cor_abs_int, refcol, - c_name, - inchikey, - elemcomposition + meaned_mz, + processing_parameters ) { ## du fait de la difference de nombre de colonne entre la dataframe qui ## inclue les scans en 1ere col, mzRef se decale de 1 @@ -98,89 +321,146 @@ rel_int <- sum_int[-1] rel_int <- rel_int / max(rel_int) - ## define max value on vertical axis (need to increase in order to plot the - ## label of fragments) - ymax <- max(rel_int) + 0.2 * max(rel_int) + if (processing_parameters$do_pdf) { + ## define max value on vertical axis (need to increase in order to plot the + ## label of fragments) + ymax <- max(rel_int) + 0.2 * max(rel_int) - par(mfrow = c(2, 1)) - plot(vmz, rel_int, type = "h", ylim = c(0, ymax), main = c_name) - ## low correl coef. will be display in grey - cor_low <- which(round(cor_abs_int, 2) < r_threshold) + par(mfrow = c(2, 1)) + plot(vmz, rel_int, type = "h", ylim = c(0, ymax), + main = processing_parameters$c_name + ) + ## low correl coef. will be display in grey + cor_low <- which(round(cor_abs_int, 2) < processing_parameters$r_threshold) + + lbmzcor <- sprintf("%s(r=%s)", vmz, round(cor_abs_int, 2)) - lbmzcor <- sprintf("%s(r=%s)", vmz, round(cor_abs_int, 2)) + if (length(cor_low) > 0) { + text( + vmz[cor_low], + rel_int[cor_low], + lbmzcor[cor_low], + cex = 0.5, + col = "grey", + srt = 90, + adj = 0 + ) + if (length(vmz) - length(cor_low) > 1) { + text( + vmz[-c(refcol, cor_low)], + rel_int[-c(refcol, cor_low)], + lbmzcor[-c(refcol, cor_low)], + cex = 0.6, + col = 1, + srt = 90, + adj = 0 + ) + } + } else { + if (length(vmz) > 1) { + text( + vmz[-c(refcol)], + rel_int[-c(refcol)], + lbmzcor[-c(refcol)], + cex = 0.6, + col = 1, + srt = 90, + adj = 0 + ) + } + } - if (length(cor_low) > 0) { text( - vmz[cor_low], - rel_int[cor_low], - lbmzcor[cor_low], - cex = 0.5, - col = "grey", + vmz[refcol], + rel_int[refcol], + lbmzcor[refcol], + cex = 0.8, + col = 2, srt = 90, adj = 0 ) - if (length(vmz) - length(cor_low) > 1) { - text( - vmz[-c(refcol, cor_low)], - rel_int[-c(refcol, cor_low)], - lbmzcor[-c(refcol, cor_low)], - cex = 0.6, - col = 1, - srt = 90, - adj = 0 + } + + ## prepare result file + cor_valid <- (round(cor_abs_int, 2) >= processing_parameters$r_threshold) + + do_sirius <- TRUE + verbose_catf("Checking sirius parameters...\n") + if (is.null(processing_parameters$ionization)) { + do_sirius <- FALSE + verbose_catf("[KO] No ionization passed in parameter.\n") + } else { + verbose_catf("[OK] Ionization=%s.\n", processing_parameters$ionization) + } + if (is.na(processing_parameters$elemcomposition)) { + do_sirius <- FALSE + verbose_catf("[KO] Elemental composition is NA.\n") + } else if (length(processing_parameters$elemcomposition) < 1) { + do_sirius <- FALSE + verbose_catf("[KO] No elemental composition is provided.\n") + } else if (processing_parameters$elemcomposition == "") { + do_sirius <- FALSE + verbose_catf("[KO] Elemental composition is an empty string.\n") + } else { + verbose_catf( + "[OK] Elemental composition=%s.\n", + processing_parameters$elemcomposition + ) + } + + cp_res_length <- length(vmz) + ppm <- rep(NA, cp_res_length) + formulas <- rep(NA, cp_res_length) + if (do_sirius) { + verbose_catf("Everything is ok, preparing for sirius.\n") + formulas <- get_formulas( + mzref = processing_parameters$mzref, + spectra = data.frame(mz = meaned_mz, intensities = sum_int[-1]), + nominal_mz_list = vmz, + processing_parameters = processing_parameters + ) + if (nrow(formulas) == 0) { + catf("No formula found.\n") + } else { + ppm <- formulas$ppm + formulas <- formulas$formula + catf( + "Found %s formula for %s fragments\n", + length(formulas[which(!(is.na(formulas)))]), + cp_res_length ) } } else { - if (length(vmz) > 1) { - text( - vmz[-c(refcol)], - rel_int[-c(refcol)], - lbmzcor[-c(refcol)], - cex = 0.6, - col = 1, - srt = 90, - adj = 0 - ) - } + verbose_catf("Sirius cannot be run.\n") } - - text( - vmz[refcol], - rel_int[refcol], - lbmzcor[refcol], - cex = 0.8, - col = 2, - srt = 90, - adj = 0 - ) - - ## prepare result file - corValid <- (round(cor_abs_int, 2) >= r_threshold) ##nolint object_name_linter cp_res <- data.frame( - rep(c_name, length(vmz)), - rep(inchikey, length(vmz)), - rep(elemcomposition, length(vmz)), - rep(fid, length(vmz)), + rep(processing_parameters$c_name, cp_res_length), + rep(processing_parameters$inchikey, cp_res_length), + rep(processing_parameters$elemcomposition, cp_res_length), + formulas, vmz, + ppm, + rep(fid, cp_res_length), cor_abs_int, sum_int[-1], rel_int, - corValid + cor_valid ) colnames(cp_res) <- c( "compoundName", "inchikey", "elemcomposition", + "fragment", + "fragment_mz", + "ppm", "fileid", - "fragments_mz", "CorWithPrecursor", "AbsoluteIntensity", "relativeIntensity", "corValid" ) return(cp_res) - } #' @@ -188,13 +468,12 @@ #' #' @param precursors the precursor list from mspurity #' @param fragments the fragments list from ms purity -#' @param mzref -#' @param rtref -#' @param c_name -#' @param r_threshold default = DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD -#' @param seuil_ra default = DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA -#' @param tolmz default = DEFAULT_EXTRACT_FRAGMENTS_TOLMZ -#' @param tolrt default = DEFAULT_EXTRACT_FRAGMENTS_TOLRT +# ' @param mzref +# ' @param rtref +# ' @param c_name +# ' @param inchikey +# ' @param elemcomposition +#' @param processing_parameters #' @returns #' #' @description @@ -203,193 +482,266 @@ extract_fragments <- function( ## nolint cyclocomp_linter precursors, fragments, - mzref, - rtref, - c_name, - inchikey, - elemcomposition, - min_number_scan, - mzdecimal, - r_threshold = DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD, - seuil_ra = DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA, - tolmz = DEFAULT_EXTRACT_FRAGMENTS_TOLMZ, - tolrt = DEFAULT_EXTRACT_FRAGMENTS_TOLRT + processing_parameters ) { ## filter precursor in the precursors file based on mz and rt in the ## compound list - cat("processing ", c_name, "\n") + catf("processing %s\n", processing_parameters$c_name) + verbose_catf("===\n") + param <- processing_parameters selected_precursors <- which( - (abs(precursors$precurMtchMZ - mzref) <= tolmz) - & (abs(precursors$precurMtchRT - rtref) <= tolrt) + (abs(precursors$precurMtchMZ - param$mzref) <= param$tolmz) + & (abs(precursors$precurMtchRT - param$rtref) <= param$tolrt) + ) + rm(param) + + verbose_catf( + "> %s precursors selected with mz=%s±%s and rt=%s±%s\n", + length(selected_precursors), + processing_parameters$mzref, + processing_parameters$tolmz, + processing_parameters$rtref, + processing_parameters$tolrt ) ## check if there is the precursor in the file - if (length(selected_precursors) > 0) { - - sprecini <- precursors[selected_precursors, ] - - ## check if fragments corresponding to precursor are found in several - ## files (collision energy) - ## this lead to a processing for each fileid - mf <- levels(as.factor(sprecini$fileid)) - if (length(mf) > 1 && global_verbose) { - cat(" several files detected for this compounds :\n") - } - - for (f in seq_along(mf)) { - - sprec <- sprecini[sprecini$fileid == mf[f], ] - - ## selection of fragment in the fragments file with the grpid common in - ## both fragments and precursors - selfrgt <- levels(as.factor(sprec$grpid)) - sfrgt <- fragments[ - fragments$grpid %in% selfrgt - & fragments$fileid == mf[f], - ] - - ## filter fragments on relative intensity seuil_ra = user defined - ## parameter (MSpurity flags could be used here) - sfrgtfil <- sfrgt[sfrgt$ra > seuil_ra, ] - mznominal <- round(x = sfrgtfil$mz, mzdecimal) - sfrgtfil <- data.frame(sfrgtfil, mznominal) - - ## creation of cross table row=scan col=mz X=ra - vmz <- levels(as.factor(sfrgtfil$mznominal)) - - if (global_verbose) { - cat(" fragments :", vmz) - cat("\n") - } - - ## mz of precursor in data precursor to check correlation with - mz_prec <- paste0("mz", round(mean(sprec$mz), mzdecimal)) + if (length(selected_precursors) < 1) { + cat("> non detected in precursor file\n") + show_end_processing() + return(NULL) + } - for (m in seq_along(vmz)) { - - ## absolute intensity - cln <- c( - which(colnames(sfrgtfil) == "acquisitionNum"), - which(colnames(sfrgtfil) == "i") - ) - int_mz <- sfrgtfil[sfrgtfil$mznominal == vmz[m], cln] - colnames(int_mz)[2] <- paste0("mz", vmz[m]) - - ## average intensities of mass in duplicate scans - comp_scans <- aggregate(x = int_mz, by = list(int_mz[[1]]), FUN = mean) - int_mz <- comp_scans[, -1] + precursors <- precursors[selected_precursors, ] - if (m == 1) { - ds_abs_int <- int_mz - } else { - ds_abs_int <- merge( - x = ds_abs_int, - y = int_mz, - by.x = 1, - by.y = 1, - all.x = TRUE, - all.y = TRUE - ) - } - } - if (global_debug) { - print(ds_abs_int) - write.table( - x = ds_abs_int, - file = paste0(c_name, "ds_abs_int.txt"), - row.names = FALSE, - sep = "\t" - ) - } - - ## elimination of mz with less than min_number_scan scans (user defined - ## parameter) - xmz <- rep(NA, ncol(ds_abs_int) - 1) - sum_int <- rep(NA, ncol(ds_abs_int)) - nbxmz <- 0 - nb_scan_check <- min(nrow(ds_abs_int), min_number_scan) - - for (j in 2:ncol(ds_abs_int)) { - sum_int[j] <- sum(ds_abs_int[j], na.rm = TRUE) - if (sum(!is.na(ds_abs_int[[j]])) < nb_scan_check) { - nbxmz <- nbxmz + 1 - xmz[nbxmz] <- j - } - } - - xmz <- xmz[-which(is.na(xmz))] - if (length(xmz) > 0) { - ds_abs_int <- ds_abs_int[, -c(xmz)] - sum_int <- sum_int[-c(xmz)] - ## liste des mz keeped decale de 1 avec ds_abs_int - vmz <- as.numeric(vmz[-c(xmz - 1)]) - } + ## check if fragments corresponding to precursor are found in several + ## files (collision energy) + ## this lead to a processing for each fileid + file_ids <- as.character(sort(unique(precursors$fileid))) + if (length(file_ids) > 1) { + catf("> several files detected for this compounds :\n") + } else if (length(file_ids) < 1 || nrow(precursors) < 1) { + return(data.frame()) + } - ## reference ion for correlation computing = precursor OR maximum - ## intensity ion in precursor is not present - refcol <- which(colnames(ds_abs_int) == mz_prec) - if (length(refcol) == 0) { - refcol <- which(sum_int == max(sum_int, na.rm = TRUE)) - } - pdf( - file = sprintf("%s_processing_file%s.pdf", c_name, mf[f]), - width = 8, - height = 11 + res_comp <- data.frame() + for (curent_file_id in file_ids) { + curent_precursors <- precursors[precursors$fileid == curent_file_id, ] + selected_fragments <- fragments[ + fragments$grpid %in% as.character(curent_precursors$grpid) + & fragments$fileid == curent_file_id, + ] + filtered_fragments <- selected_fragments[ + selected_fragments$ra > processing_parameters$seuil_ra, + ] + if (nrow(filtered_fragments) != 0) { + res_comp_by_file <- process_file( + curent_file_id = curent_file_id, + precursor_mz = curent_precursors$mz, + filtered_fragments = filtered_fragments, + processing_parameters = processing_parameters ) - par(mfrow = c(3, 2)) - - ## Pearson correlations between absolute intensities computing - cor_abs_int <- rep(NA, length(vmz)) - - if (length(refcol) > 0) { - for (i in 2:length(ds_abs_int)) { - cor_abs_int[i - 1] <- cor( - x = ds_abs_int[[refcol]], - y = ds_abs_int[[i]], - use = "pairwise.complete.obs", - method = "pearson" - ) - plot( - ds_abs_int[[refcol]], - ds_abs_int[[i]], - xlab = colnames(ds_abs_int)[refcol], - ylab = colnames(ds_abs_int)[i], - main = sprintf( - "%s corr coeff r=%s", c_name, round(cor_abs_int[i - 1], 2) - ) - ) - } - ## plot pseudo spectra - res_comp_by_file <- plot_pseudo_spectra( - x = ds_abs_int, - r_threshold = r_threshold, - fid = mf[f], - sum_int = sum_int, - vmz = vmz, - cor_abs_int = cor_abs_int, - refcol = refcol, - c_name = c_name, - inchikey = inchikey, - elemcomposition = elemcomposition - ) - if (f == 1) { - res_comp <- res_comp_by_file - } - } else { - res_comp_by_file <- NULL - cat(" non detected in fragments file \n") - } if (!is.null(res_comp_by_file)) { res_comp <- rbind(res_comp, res_comp_by_file) } - dev.off() + } else { + catf("No fragment found for in fragment file\n") + } + } + return(unique(res_comp)) +} + +process_file <- function( + curent_file_id, + precursor_mz, + filtered_fragments, + processing_parameters +) { + mznominal <- round(x = filtered_fragments$mz, digits = 0) + meaned_mz <- round( + aggregate( + data.frame( + mz = filtered_fragments$mz, + mznominal = mznominal + ), + list(mznominal), + FUN = mean + )$mz, + digits = processing_parameters$mzdecimal + ) + filtered_fragments <- data.frame(filtered_fragments, mznominal) + + ## creation of cross table row=scan col=mz X=ra + + vmz <- as.character(sort(unique(filtered_fragments$mznominal))) + + ds_abs_int <- create_ds_abs_int(vmz, filtered_fragments) + + if (global_debug) { + print(ds_abs_int) + } + + ## elimination of mz with less than min_number_scan scans (user defined + ## parameter) + xmz <- rep(NA, ncol(ds_abs_int) - 1) + sum_int <- rep(NA, ncol(ds_abs_int)) + nbxmz <- 0 + nb_scan_check <- min(nrow(ds_abs_int), processing_parameters$min_number_scan) + + for (j in 2:ncol(ds_abs_int)) { + sum_int[j] <- sum(ds_abs_int[j], na.rm = TRUE) + if (sum(!is.na(ds_abs_int[[j]])) < nb_scan_check) { + nbxmz <- nbxmz + 1 + xmz[nbxmz] <- j } + } + + xmz <- xmz[-which(is.na(xmz))] + if (length(xmz) > 0) { + ds_abs_int <- ds_abs_int[, -c(xmz)] + sum_int <- sum_int[-c(xmz)] + ## liste des mz keeped decale de 1 avec ds_abs_int + vmz <- as.numeric(vmz[-c(xmz - 1)]) + meaned_mz <- meaned_mz[-c(xmz - 1)] + } + + ## mz of precursor in data precursor to check correlation with + mz_prec <- paste0( + "mz", + round(mean(precursor_mz), processing_parameters$mzdecimal) + ) + ## reference ion for correlation computing = precursor OR maximum + ## intensity ion in precursor is not present + refcol <- which(colnames(ds_abs_int) == mz_prec) + if (length(refcol) == 0) { + refcol <- which(sum_int == max(sum_int, na.rm = TRUE)) + } + + if (processing_parameters$do_pdf) { + start_pdf(processing_parameters, curent_file_id) + } + + ## Pearson correlations between absolute intensities computing + cor_abs_int <- rep(NA, length(vmz)) + + if (length(refcol) > 0) { + for (i in 2:length(ds_abs_int)) { + cor_abs_int[i - 1] <- stats::cor( + x = ds_abs_int[[refcol]], + y = ds_abs_int[[i]], + use = "pairwise.complete.obs", + method = "pearson" + ) + debug_catf( + "Correlation between %s and %s: %s\n", + paste(ds_abs_int[[refcol]], collapse = ";"), + paste(ds_abs_int[[i]], collapse = ";"), + paste(cor_abs_int[i - 1], collapse = ";") + ) + if (processing_parameters$do_pdf) { + pdf_plot_ds_abs_int( + processing_parameters$c_name, + ds_abs_int, + refcol, + i, + round(cor_abs_int[i - 1], 2) + ) + } + } + ## plot pseudo spectra + res_comp_by_file <- plot_pseudo_spectra( + x = ds_abs_int, + fid = curent_file_id, + sum_int = sum_int, + vmz = vmz, + cor_abs_int = cor_abs_int, + refcol = refcol, + meaned_mz = meaned_mz, + processing_parameters = processing_parameters + ) + catf( + "%s has been processed and %s fragments have been found.\n", + processing_parameters$c_name, + nrow(res_comp_by_file) + ) } else { - res_comp <- NULL - cat(" non detected in precursor file \n") + res_comp_by_file <- NULL + cat(">> non detected in fragments file \n") + } + show_end_processing() + if (processing_parameters$do_pdf) { + end_pdf() + } + return(res_comp_by_file) +} + +create_ds_abs_int <- function(vmz, filtered_fragments) { + verbose_catf( + ">> fragments: %s\n", + paste(vmz, collapse = " ") + ) + ds_abs_int <- create_int_mz(vmz[1], filtered_fragments) + for (mz in vmz[-1]) { + int_mz <- create_int_mz(mz, filtered_fragments) + ds_abs_int <- merge( + x = ds_abs_int, + y = int_mz, + by.x = 1, + by.y = 1, + all.x = TRUE, + all.y = TRUE + ) } - return(res_comp) + return(ds_abs_int) +} + +create_int_mz <- function(mz, filtered_fragments) { + ## absolute intensity + int_mz <- filtered_fragments[ + filtered_fragments$mznominal == mz, + c("acquisitionNum", "i") + ] + colnames(int_mz)[2] <- paste0("mz", mz) + ## average intensities of mass in duplicate scans + comp_scans <- aggregate(x = int_mz, by = list(int_mz[[1]]), FUN = mean) + return(comp_scans[, -1]) +} + +show_end_processing <- function() { + verbose_catf("==========\n") + cat("\n") +} + +start_pdf <- function(processing_parameters, curent_file_id) { + if (!dir.exists(processing_parameters$pdf_path)) { + dir.create(processing_parameters$pdf_path, recursive = TRUE) + } + pdf( + file = sprintf( + "%s/%s_processing_file%s.pdf", + processing_parameters$pdf_path, + processing_parameters$c_name, + curent_file_id + ), + width = 8, + height = 11 + ) + par(mfrow = c(3, 2)) +} + +pdf_plot_ds_abs_int <- function(c_name, ds_abs_int, refcol, i, r_coef) { + plot( + ds_abs_int[[refcol]], + ds_abs_int[[i]], + xlab = colnames(ds_abs_int)[refcol], + ylab = colnames(ds_abs_int)[i], + main = sprintf( + "%s corr coeff r=%s", c_name, r_coef + ) + ) +} +end_pdf <- function() { + dev.off() } set_global <- function(var, value) { @@ -412,6 +764,23 @@ set_global("global_verbose", FALSE) } +verbose_catf <- function(...) { + if (global_verbose) { + cat(sprintf(...), sep = "") + } +} + + +debug_catf <- function(...) { + if (global_debug) { + cat(sprintf(...), sep = "") + } +} + +catf <- function(...) { + cat(sprintf(...), sep = "") +} + create_parser <- function() { parser <- optparse::OptionParser() parser <- optparse::add_option( @@ -547,6 +916,52 @@ ), metavar = "number" ) + parser <- optparse::add_option( + parser, + c("--pdf_path"), + type = "character", + default = DEFAULT_PDF_PATH, + help = paste( + "[default %default]", + "PDF files output path" + ) + ) + parser <- optparse::add_option( + parser, + c("--ionization"), + type = "character", + action = "store", + default = "None", + help = paste( + "[default %default]", + "Which ionization to use for sirius" + ), + metavar = "character" + ) + parser <- optparse::add_option( + parser, + c("--fragment_match_delta"), + type = "numeric", + action = "store", + default = DEFAULT_FRAGMENTS_MATCH_DELTA, + help = paste( + "[default %default]", + "Fragment match delta" + ), + metavar = "numeric" + ) + parser <- optparse::add_option( + parser, + c("--fragment_match_delta_unit"), + type = "character", + action = "store", + default = DEFAULT_FRAGMENTS_MATCH_DELTA_UNIT, + help = paste( + "[default %default]", + "Fragment match delta" + ), + metavar = "character" + ) return(parser) } @@ -559,8 +974,6 @@ } check_args_validity <- function(args) { ## nolint cyclocomp_linter - sysvars <- Sys.getenv() - sysvarnames <- names(sysvars) if (length(args$output) == 0 || nchar(args$output[1]) == 0) { stop_with_status( "Missing output parameters. Please set it with --output.", @@ -612,15 +1025,21 @@ MISSING_INPUT_FILE_ERROR ) } - if ( + if (in_galaxy_env()) { + check_galaxy_args_validity(args) + } +} + +in_galaxy_env <- function() { + sysvars <- Sys.getenv() + sysvarnames <- names(sysvars) + return( "_GALAXY_JOB_HOME_DIR" %in% sysvarnames || "_GALAXY_JOB_TMP_DIR" %in% sysvarnames || "GALAXY_MEMORY_MB" %in% sysvarnames || "GALAXY_MEMORY_MB_PER_SLOT" %in% sysvarnames || "GALAXY_SLOTS" %in% sysvarnames - ) { - check_galaxy_args_validity(args) - } + ) } check_galaxy_args_validity <- function(args) { @@ -638,38 +1057,53 @@ get_csv_or_tsv <- function( path, sep_stack = c("\t", ",", ";"), + sep_names = c("tab", "comma", "semicolon"), header = TRUE, quote = "\"" ) { - sep <- sep_stack[1] - result <- tryCatch({ - read.table( - file = path, - sep = sep, - header = header, - quote = quote - ) - }, error = function(e) { - return(data.frame()) - }) - if (length(sep_stack) == 1) { - return(result) + sep <- determine_csv_or_tsv_sep( + path = path, + sep_stack = sep_stack, + header = header, + quote = quote + ) + verbose_catf( + "%s separator has been determined for %s.\n", + sep_names[sep_stack == sep], + path + ) + return(read.table( + file = path, + sep = sep, + header = header, + quote = quote + )) +} + +determine_csv_or_tsv_sep <- function( + path, + sep_stack = c("\t", ",", ";"), + header = TRUE, + quote = "\"" +) { + count <- -1 + best_sep <- sep_stack[1] + for (sep in sep_stack) { + tryCatch({ + table <- read.table( + file = path, + sep = sep, + header = header, + quote = quote, + nrows = 1 + ) + if (ncol(table) > count) { + count <- ncol(table) + best_sep <- sep + } + }) } - # if ( - # ncol(result) == 0 || ## failed - # ncol(result) == 1 ## only one row, suspicious, possible fail # nolint - # ) { - new_result <- get_csv_or_tsv( - path, - sep_stack = sep_stack[-1], - header = header, - quote = quote - ) - if (ncol(new_result) > ncol(result)) { - return(new_result) - } - # } - return(result) + return(best_sep) } uniformize_columns <- function(df) { @@ -681,24 +1115,54 @@ return(df) } +handle_galaxy_param <- function(args) { + for (param in names(args)) { + if (is.character(args[[param]])) { + args[[param]] <- gsub("__ob__", "[", args[[param]]) + args[[param]] <- gsub("__cb__", "]", args[[param]]) + } + } + return(args) +} + +zip_pdfs <- function(processing_parameters) { + if (processing_parameters$do_pdf) { + if (zip <- Sys.getenv("R_ZIPCMD", "zip") == "") { + catf("R could not fin the zip executable. Trying luck: zip = \"zip\"") + zip <- "zip" + } else { + catf("Found zip executable at %s .", zip) + } + utils::zip( + processing_parameters$pdf_zip_path, + processing_parameters$pdf_path, + zip = zip + ) + } +} + main <- function(args) { if (args$version) { - cat(sprintf("%s\n", MS2SNOOP_VERSION)) + catf("%s\n", MS2SNOOP_VERSION) base::quit(status = 0) } - sessionInfo() + if (in_galaxy_env()) { + print(sessionInfo()) + cat("\n\n") + } check_args_validity(args) + args <- handle_galaxy_param(args) + if (args$ionization == "None") { + args$ionization <- NULL + } if (args$debug) { set_debug() } if (args$verbose) { set_verbose() } - ## MSpurity precursors file precursors <- get_csv_or_tsv(args$precursors) - ## MSpurity fragments file fragments <- get_csv_or_tsv(args$fragments) - ## list of compounds : col1=Name of molecule, col2=m/z, col3=retention time compounds <- get_csv_or_tsv(args$compounds) compounds <- uniformize_columns(compounds) @@ -719,43 +1183,49 @@ ) } - res_all <- NULL + res_all <- data.frame() + processing_parameters <- list( + min_number_scan = args$min_number_scan, + mzdecimal = args$mzdecimal, + r_threshold = args$r_threshold, + seuil_ra = args$seuil_ra, + tolmz = args$tolmz, + tolrt = args$tolrt, + ionization = args$ionization, + do_pdf = nchar(args$pdf_path) > 0, + pdf_zip_path = args$pdf_path, + pdf_path = tempdir(), + fragment_match_delta = args$fragment_match_delta, + fragment_match_delta_unit = args$fragment_match_delta_unit + ) for (i in seq_len(nrow(compounds))) { - ## loop execution for all compounds in the compounds file - res_cor <- NULL + processing_parameters$mzref <- compounds[["mz"]][i] + processing_parameters$rtref <- compounds[["rtsec"]][i] + processing_parameters$c_name <- compounds[["compound_name"]][i] + processing_parameters$inchikey <- compounds[["inchikey"]][i] + processing_parameters$elemcomposition <- compounds[["elemcomposition"]][i] res_cor <- extract_fragments( precursors = precursors, fragments = fragments, - mzref = compounds[["mz"]][i], - rtref = compounds[["rtsec"]][i], - c_name = compounds[["compound_name"]][i], - inchikey = compounds[["inchikey"]][i], - elemcomposition = compounds[["elemcomposition"]][i], - min_number_scan = args$min_number_scan, - mzdecimal = args$mzdecimal, - r_threshold = args$r_threshold, - seuil_ra = args$seuil_ra, - tolmz = args$tolmz, - tolrt = args$tolrt + processing_parameters = processing_parameters ) if (!is.null(res_cor)) { - if (is.null(res_all)) { - res_all <- res_cor - } else { - res_all <- rbind(res_all, res_cor) - } + res_all <- rbind(res_all, res_cor) } } - if (is.null(res_all)) { + if (nrow(res_all) == 0) { stop_with_status("No result at all!", NO_ANY_RESULT_ERROR) } + write.table( x = res_all, file = args$output, sep = "\t", row.names = FALSE ) + zip_pdfs(processing_parameters) + unlink(processing_parameters$pdf_path, recursive = TRUE) } global_debug <- FALSE