view MS2snoop.R @ 2:e1c0f63872c4 draft

" master branch Updating"
author lain
date Tue, 19 Apr 2022 10:21:42 +0000
parents 67733206be53
children
line wrap: on
line source

#'
#' read and process mspurity W4M files
#' create a summary of fragments for each precursor and a graphic of pseudo
#' spectra + the correlations on which the checking of fragments is based
#' V3 tries to identify and process multiple files for 1 precursor, which may
#' occur if different collision energies are used
#' V4 elimination of correlation = NA. Correlation is done with the precursor;
#' if the precursor is not present, correlation with the most intense peak
#' author: Jean-Francois Martin
#' V5 is versioned, lintR-compliant, packaged, unit-tested, documented and
#' tested against data from other labs.
#' new maintainer: Lain Pavot - lain.pavot@inrae.fr
#'
#' @import optparse
#'
NULL


## Default values of the command line options declared in create_parser().
## Each constant is created with assign() and its binding is then locked in
## the global environment with lockBinding(), so a later re-assignment of
## the same name fails.
assign("DEFAULT_PRECURSOR_PATH", "peaklist_precursors.tsv")
assign("DEFAULT_FRAGMENTS_PATH", "peaklist_fragments.tsv")
assign("DEFAULT_COMPOUNDS_PATH", "compounds_pos.txt")
assign("DEFAULT_OUTPUT_PATH", "compound_fragments_result.txt")
assign("DEFAULT_TOLMZ", 0.01)
assign("DEFAULT_TOLRT", 20)
assign("DEFAULT_MZDECIMAL", 0)
assign("DEFAULT_R_THRESHOLD", 0.85)
assign("DEFAULT_MINNUMBERSCAN", 8)
assign("DEFAULT_SEUIL_RA", 0.5)
lockBinding("DEFAULT_PRECURSOR_PATH", globalenv())
lockBinding("DEFAULT_FRAGMENTS_PATH", globalenv())
lockBinding("DEFAULT_COMPOUNDS_PATH", globalenv())
lockBinding("DEFAULT_OUTPUT_PATH", globalenv())
lockBinding("DEFAULT_TOLMZ", globalenv())
lockBinding("DEFAULT_TOLRT", globalenv())
lockBinding("DEFAULT_MZDECIMAL", globalenv())
lockBinding("DEFAULT_R_THRESHOLD", globalenv())
lockBinding("DEFAULT_MINNUMBERSCAN", globalenv())
lockBinding("DEFAULT_SEUIL_RA", globalenv())

## Default values of the optional arguments of extract_fragments()
## (r_threshold, seuil_ra, tolmz, tolrt). Note that the relative intensity
## threshold and the retention time tolerance differ from the command line
## defaults above.
assign("DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD", 0.85)
assign("DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA", 0.1)
assign("DEFAULT_EXTRACT_FRAGMENTS_TOLMZ", 0.01)
assign("DEFAULT_EXTRACT_FRAGMENTS_TOLRT", 60)
lockBinding("DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD", globalenv())
lockBinding("DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA", globalenv())
lockBinding("DEFAULT_EXTRACT_FRAGMENTS_TOLMZ", globalenv())
lockBinding("DEFAULT_EXTRACT_FRAGMENTS_TOLRT", globalenv())


## When TRUE, extract_fragments() writes its intermediate scan x m/z
## intensity table to "<compound name>ds_abs_int.txt".
debug <- FALSE


########################################################################

#' @title plot_pseudo_spectra
#' @param x not used by the function body; kept in the signature
#' @param r_threshold correlation value (compared after rounding to two
#'   decimals) under which a fragment is labelled in grey and flagged as
#'   not valid
#' @param fid file id, repeated in the `fileid` column of the result
#' @param sum_int summed intensities; the first element is dropped and the
#'   remaining ones are matched position by position with `vmz`
#' @param vmz m/z values, used as x positions of the pseudo spectrum
#' @param cor_abs_int correlation coefficients, one per element of `vmz`
#' @param refcol position of the reference ion, shifted by one relative to
#'   `vmz` (1 is subtracted before it is used as an index)
#' @param c_name compound name, used as plot title and in the result
#' @returns a data.frame with one row per element of `vmz` and the columns
#'   compoundName, fileid, fragments_mz, CorWithPrecursor,
#'   AbsoluteIntensity, relativeIntensity and corValid
#' @description plot_pseudo_spectra
#' Scales the summed intensities by their maximum, draws them as a pseudo
#' spectrum on the current graphics device (after switching it to a 2 x 1
#' layout) and labels every peak with its m/z and correlation:
#' grey for correlations under `r_threshold`, red for the reference ion,
#' black for the others. The same information is returned as a data.frame.
plot_pseudo_spectra <- function(
  x,
  r_threshold,
  fid,
  sum_int,
  vmz,
  cor_abs_int,
  refcol,
  c_name
) {
  ## refcol counts the leading scan column of the intensity table, which
  ## vmz does not have: shift it by one to index vmz
  ref_idx <- refcol - 1

  ## relative intensities, the largest one being 1
  abs_int <- sum_int[-1]
  rel_int <- abs_int / max(abs_int)

  ## leave 20% of head room above the tallest peak for the labels
  tallest <- max(rel_int)
  ymax <- tallest + 0.2 * tallest

  par(mfrow = c(2, 1))
  plot(vmz, rel_int, type = "h", ylim = c(0, ymax), main = c_name)

  rounded_cor <- round(cor_abs_int, 2)
  peak_labels <- sprintf("%s(r=%s)", vmz, rounded_cor)
  weak <- which(rounded_cor < r_threshold)

  ## vertical label drawn at the top of each selected peak
  label_peaks <- function(idx, cex, col) {
    text(
      vmz[idx],
      rel_int[idx],
      peak_labels[idx],
      cex = cex,
      col = col,
      srt = 90,
      adj = 0
    )
  }

  ## weakly correlated fragments are displayed in grey
  if (length(weak) > 0) {
    label_peaks(weak, cex = 0.5, col = "grey")
  }
  ## remaining fragments (neither weak nor the reference ion) in black,
  ## only when there is at least one peak besides the reference ion
  if (length(vmz) - length(weak) > 1) {
    label_peaks(-c(ref_idx, weak), cex = 0.6, col = 1)
  }
  ## reference ion in red
  label_peaks(ref_idx, cex = 0.8, col = 2)

  ## result table, one row per m/z
  n_mz <- length(vmz)
  data.frame(
    compoundName = rep(c_name, n_mz),
    fileid = rep(fid, n_mz),
    fragments_mz = vmz,
    CorWithPrecursor = cor_abs_int,
    AbsoluteIntensity = abs_int,
    relativeIntensity = rel_int,
    corValid = rounded_cor >= r_threshold
  )
}

#'
#' @title extract_fragments
#'
#' @param precursors the precursor list from mspurity; the columns
#'   precurMtchMZ, precurMtchRT, fileid, grpid and mz are read
#' @param fragments the fragments list from mspurity; the columns grpid,
#'   fileid, ra, mz, acquisitionNum and i are read
#' @param mzref m/z of the compound, compared with precurMtchMZ
#' @param rtref retention time of the compound, compared with precurMtchRT
#' @param c_name compound name, used in the messages, in the name of the
#'   pdf file and in the result
#' @param min_number_scan a m/z is kept only if it has an intensity in at
#'   least this number of scans (or in all scans when there are fewer scans
#'   than this number)
#' @param mzdecimal number of decimals used to round the m/z values
#' @param r_threshold default = DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD
#' @param seuil_ra default = DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA
#' @param tolmz default = DEFAULT_EXTRACT_FRAGMENTS_TOLMZ
#' @param tolrt default = DEFAULT_EXTRACT_FRAGMENTS_TOLRT
#' @returns NULL when the compound is found neither in the precursors nor
#'   in the fragments, otherwise the rows returned by plot_pseudo_spectra()
#'   for each file id, bound together (each file contributes once)
#'
#' @description
#' function for extraction of fragments corresponding to precursors
#' detected by MSPurity. For each file id in which the precursor is found,
#' a scan x m/z table of absolute intensities is built, m/z seen in too few
#' scans are removed, the correlation of each m/z with a reference ion is
#' computed and plotted in "<c_name>_processing_file<fileid>.pdf".
extract_fragments <- function( ## nolint cyclocomp_linter
  precursors,
  fragments,
  mzref,
  rtref,
  c_name,
  min_number_scan,
  mzdecimal,
  r_threshold = DEFAULT_EXTRACT_FRAGMENTS_R_THRESHOLD,
  seuil_ra = DEFAULT_EXTRACT_FRAGMENTS_SEUIL_RA,
  tolmz = DEFAULT_EXTRACT_FRAGMENTS_TOLMZ,
  tolrt = DEFAULT_EXTRACT_FRAGMENTS_TOLRT
) {
  ## filter precursor in the precursors file based on mz and rt in the
  ## compound list
  cat("processing ", c_name, "\n")
  selected_precursors <- which(
    (abs(precursors$precurMtchMZ - mzref) <= tolmz)
    & (abs(precursors$precurMtchRT - rtref) <= tolrt)
  )

  ## nothing to do when the precursor is not in the file
  if (length(selected_precursors) == 0) {
    cat(" non detected in precursor file \n")
    return(NULL)
  }

  sprecini <- precursors[selected_precursors, ]

  ## fragments corresponding to the precursor may be found in several
  ## files (collision energy): each fileid is processed separately
  mf <- levels(as.factor(sprecini$fileid))
  if (length(mf) > 1) {
    cat(" several files detected for this compounds :\n")
  }

  ## results are accumulated from NULL so that every file contributes
  ## exactly once, whichever file is the first to give a result
  res_comp <- NULL

  for (f in seq_along(mf)) {

    sprec <- sprecini[sprecini$fileid == mf[f], ]

    ## selection of fragment in the fragments file with the grpid common in
    ## both fragments and precursors
    selfrgt <- levels(as.factor(sprec$grpid))
    sfrgt <- fragments[
      fragments$grpid %in% selfrgt
      & fragments$fileid == mf[f],
    ]

    ## filter fragments on relative intensity seuil_ra = user defined
    ## parameter (MSpurity flags could be used here)
    sfrgtfil <- sfrgt[sfrgt$ra > seuil_ra, ]

    mznominal <- round(x = sfrgtfil$mz, mzdecimal)
    sfrgtfil <- data.frame(sfrgtfil, mznominal)

    ## creation of cross table row=scan col=mz X=ra
    vmz <- levels(as.factor(sfrgtfil$mznominal))

    cat(" fragments :", vmz)

    ## no fragment left after filtering: there is no table to build (and
    ## the table of a previous file must not be reused)
    if (length(vmz) == 0) {
      cat(" non detected in fragments file \n")
      cat("\n")
      next
    }

    ## mz of precursor in data precursor to check correlation with
    mz_prec <- paste0("mz", round(mean(sprec$mz), mzdecimal))

    ## columns holding the scan number and the absolute intensity
    cln <- c(
      which(colnames(sfrgtfil) == "acquisitionNum"),
      which(colnames(sfrgtfil) == "i")
    )

    for (m in seq_along(vmz)) {
      ## absolute intensity of this m/z, one row per scan
      int_mz <- sfrgtfil[sfrgtfil$mznominal == vmz[m], cln]
      colnames(int_mz)[2] <- paste0("mz", vmz[m])

      ## average intensities of mass in duplicate scans
      comp_scans <- aggregate(x = int_mz, by = list(int_mz[[1]]), FUN = mean)
      int_mz <- comp_scans[, -1]

      if (m == 1) {
        ds_abs_int <- int_mz
      } else {
        ds_abs_int <- merge(
          x = ds_abs_int,
          y = int_mz,
          by.x = 1,
          by.y = 1,
          all.x = TRUE,
          all.y = TRUE
        )
      }
    }
    if (debug) {
      write.table(
        x = ds_abs_int,
        file = paste0(c_name, "ds_abs_int.txt"),
        row.names = FALSE,
        sep = "\t"
      )
    }

    ## sum of intensities per m/z; the first element stands for the scan
    ## column and stays NA
    frag_cols <- seq_len(ncol(ds_abs_int))[-1]
    sum_int <- rep(NA, ncol(ds_abs_int))
    sum_int[frag_cols] <- colSums(ds_abs_int[frag_cols], na.rm = TRUE)

    ## elimination of mz with less than min_number_scan scans (user defined
    ## parameter)
    nb_scan_check <- min(nrow(ds_abs_int), min_number_scan)
    nb_scan_by_mz <- colSums(!is.na(ds_abs_int[frag_cols]))
    xmz <- frag_cols[nb_scan_by_mz < nb_scan_check]

    if (length(xmz) > 0) {
      ## drop = FALSE keeps a data.frame even if a single column is left
      ds_abs_int <- ds_abs_int[, -xmz, drop = FALSE]
      sum_int <- sum_int[-xmz]
      ## the list of kept mz is shifted by 1 relative to ds_abs_int
      vmz <- vmz[-(xmz - 1)]
    }
    ## always numeric, whether or not some m/z have been removed
    vmz <- as.numeric(vmz)

    ## reference ion for correlation computing = precursor OR maximum
    ## intensity ion if the precursor is not present (first one on a tie)
    refcol <- which(colnames(ds_abs_int) == mz_prec)
    if (length(refcol) == 0) {
      refcol <- which.max(sum_int)
    }
    if (length(refcol) == 0) {
      cat(" non detected in fragments file \n")
      cat("\n")
      next
    }

    pdf(
      file = sprintf("%s_processing_file%s.pdf", c_name, mf[f]),
      width = 8,
      height = 11
    )
    ## the device is closed even if the plotting code fails
    res_comp_by_file <- tryCatch(
      {
        par(mfrow = c(3, 2))

        ## Pearson correlations between absolute intensities computing
        cor_abs_int <- rep(NA, length(vmz))
        for (i in seq_len(ncol(ds_abs_int))[-1]) {
          cor_abs_int[i - 1] <- cor(
            x = ds_abs_int[[refcol]],
            y = ds_abs_int[[i]],
            use = "pairwise.complete.obs",
            method = "pearson"
          )
          plot(
            ds_abs_int[[refcol]],
            ds_abs_int[[i]],
            xlab = colnames(ds_abs_int)[refcol],
            ylab = colnames(ds_abs_int)[i],
            main = sprintf(
              "%s corr coeff r=%s", c_name, round(cor_abs_int[i - 1], 2)
            )
          )
        }

        ## plot pseudo spectra
        plot_pseudo_spectra(
          x = ds_abs_int,
          r_threshold = r_threshold,
          fid = mf[f],
          sum_int = sum_int,
          vmz = vmz,
          cor_abs_int = cor_abs_int,
          refcol = refcol,
          c_name = c_name
        )
      },
      finally = dev.off()
    )

    res_comp <- rbind(res_comp, res_comp_by_file)
    cat("\n")
  }

  res_comp
}


#' @title create_parser
#' @description
#' Builds the optparse parser of the script. Paths and numeric parameters
#' default to the DEFAULT_* constants defined at the top of the file.
#' @returns the optparse parser object, to be given to
#'   optparse::parse_args()
create_parser <- function() {
  parser <- optparse::OptionParser()
  parser <- optparse::add_option(
    parser,
    c("-v", "--verbose"),
    action = "store_true",
    default = FALSE,
    help = "Print extra output [default %default]"
  )
  parser <- optparse::add_option(
    parser,
    c("-o", "--output"),
    type = "character",
    default = DEFAULT_OUTPUT_PATH,
    action = "store",
    help = "Path to the output file [default %default]"
  )
  parser <- optparse::add_option(
    parser,
    c("-p", "--precursors"),
    type = "character",
    default = DEFAULT_PRECURSOR_PATH,
    action = "store",
    help = "Path to the precursors file [default %default]"
  )
  parser <- optparse::add_option(
    parser,
    c("-f", "--fragments"),
    type = "character",
    default = DEFAULT_FRAGMENTS_PATH,
    action = "store",
    help = "Path to the fragments file [default %default]"
  )
  parser <- optparse::add_option(
    parser,
    c("-c", "--compounds"),
    type = "character",
    default = DEFAULT_COMPOUNDS_PATH,
    action = "store",
    help = "Path to the compounds file [default %default]"
  )
  parser <- optparse::add_option(
    parser,
    c("--tolmz"),
    type = "numeric",
    action = "store",
    default = DEFAULT_TOLMZ,
    help = paste0(
      "tolerance on mz (dalton) to match a compound with a precursor ",
      "[default %default]"
    ),
    metavar = "number"
  )
  parser <- optparse::add_option(
    parser,
    c("--tolrt"),
    type = "integer",
    action = "store",
    default = DEFAULT_TOLRT,
    help = paste0(
      "tolerance on rt (seconds) to match a compound with a precursor ",
      "[default %default]"
    ),
    metavar = "number"
  )
  parser <- optparse::add_option(
    parser,
    c("--seuil_ra"),
    type = "numeric",
    action = "store",
    help = "relative intensity threshold",
    default = DEFAULT_SEUIL_RA,
    metavar = "number"
  )
  parser <- optparse::add_option(
    parser,
    c("--mzdecimal"),
    type = "integer",
    default = DEFAULT_MZDECIMAL,
    action = "store",
    help = "nb decimal for mz",
    metavar = "number"
  )
  ## a correlation threshold is a real number: declared as an integer, a
  ## value such as 0.9 given on the command line would lose its decimals
  parser <- optparse::add_option(
    parser,
    c("--r_threshold"),
    type = "numeric",
    default = DEFAULT_R_THRESHOLD,
    action = "store",
    help = paste0(
      "r pearson correlation threshold between precursor and fragment ",
      "absolute intensity"
    ),
    metavar = "number"
  )
  parser <- optparse::add_option(
    parser,
    c("--min_number_scan"),
    type = "numeric",
    action = "store",
    default = DEFAULT_MINNUMBERSCAN,
    help = paste0(
      "fragments are kept if there are found in a minimum number ",
      "of scans"
    ),
    metavar = "number"
  )
  parser
}

#' @title main
#' @description
#' Reads the precursors, fragments and compounds files, runs
#' extract_fragments() on every compound and writes the bound results in a
#' tab separated file. Stops with "No result at all!" when no compound
#' gives any result.
#' @param args the parsed command line options; the elements precursors,
#'   fragments, compounds, output, tolmz, tolrt, seuil_ra, mzdecimal,
#'   r_threshold and min_number_scan are read
main <- function(args) {
  ## FOLDER AND FILES
  ## MSpurity precursors file
  precursors <- read.table(
    file = args$precursors,
    header = TRUE,
    sep = "\t",
    quote = "\""
  )
  ## MSpurity fragments file
  fragments <- read.table(
    file = args$fragments,
    header = TRUE,
    sep = "\t",
    quote = "\""
  )
  ## list of compounds : col1=Name of molecule, col2=m/z, col3=retention time
  compounds <- read.table(
    file = args$compounds,
    sep = "\t",
    quote = "\"",
    header = TRUE
  )

  ## loop execution for all compounds in the compounds file; a compound
  ## without result gives NULL
  res_by_compound <- lapply(
    seq_len(nrow(compounds)),
    function(i) {
      extract_fragments(
        precursors = precursors,
        fragments = fragments,
        mzref = compounds[[2]][i],
        ## the retention time of the compounds file is multiplied by 60
        ## before being compared with the precursors
        rtref = compounds[[3]][i] * 60,
        c_name = compounds[[1]][i],
        ## fragments are kept if they are found in a minimum number of
        ## scans
        min_number_scan = args$min_number_scan,
        ## nb decimal for mz
        mzdecimal = args$mzdecimal,
        ## r pearson correlation threshold between precursor and fragment
        ## absolute intensity
        r_threshold = args$r_threshold,
        ## relative intensity threshold
        seuil_ra = args$seuil_ra,
        ## tolerance for mz(dalton) rt(seconds) to match the standard in
        ## the compounds list with the precursor MSpurity file
        tolmz = args$tolmz,
        tolrt = args$tolrt
      )
    }
  )

  ## NULL elements are ignored by rbind; the result is NULL when no
  ## compound gave a result, whatever the position of the compounds
  res_all <- do.call(rbind, res_by_compound)

  if (is.null(res_all)) {
    stop("No result at all!")
  }
  write.table(
    x = res_all,
    file = args$output,
    sep = "\t",
    row.names = FALSE
  )
}

## Script entry point: parse the command line with the parser built by
## create_parser(), call sessionInfo() (its value is auto-printed when the
## script is run at top level), then run the processing.
args <- optparse::parse_args(create_parser())
sessionInfo()
main(args)

## report the warnings raised during the run, if any
warnings()