mykrobe_parser: mykrobe_parser.R comparison

comparison mykrobe_parser.R @ 6:8e7e5a660942 draft

planemo upload for repository https://github.com/phac-nml/mykrobe-parser commit 077a66175f3666924b42f216bbcb671ee0d026f2

author	nml
date	Fri, 22 Sep 2023 17:24:02 +0000
parents	deebc6410d13
children

comparison

equal deleted inserted replaced

-:deebc6410d13
+:8e7e5a660942
 # Copyright Government of Canada 2018
 #
 # Written by: National Microbiology Laboratory, Public Health Agency of Canada
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 # this work except in compliance with the License. You may obtain a copy of the
 # License at:
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software distributed
 # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 # CONDITIONS OF ANY KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations under the License.
 # Parsing JSONs from Mykrobe Predict into CSV reports
 # Take the JSON output from Mykrobe, rearrange, output for LIMS
 # Adrian Zetner
 # August 2018
+# Updated August 2023
 # Libraries ####
 sink(stdout(), type = "message")
 library(optparse)
 })
 # Define custom functions, variables, and paths. Collect and use CL arguments ####
-# Here's a function to recreate that output table from the input JSON files
+# Here's a function to recreate that output table from the input JSON files for 2019
-getResults <- function(listelement){
+getResults2019 <- function(listelement) {
 # Define list levels for various elements of the json
-species <- names(listelement[[1]][["phylogenetics"]][["species"]])
+phylo_group <-
-lineage <- names(listelement[[1]][["phylogenetics"]][["lineage"]])
+names(listelement[[1]][["phylogenetics"]][["phylo_group"]])
-phylo_group <- names(listelement[[1]][["phylogenetics"]][["phylo_group"]])
+if ("Non_tuberculosis_mycobacterium_complex" %in% phylo_group) {
-if("Non_tuberculosis_mycobacterium_complex" %in% phylo_group){
+warning(
-warning(paste("Non-tuberculosis mycobacteria detected in file ", names(listelement), ". Skipping.", sep = ""))
+paste(
-return()}
+"Non-tuberculosis mycobacteria detected in file ",
+names(listelement),
+". Skipping.",
+sep = ""
+)
+)
+return()
+}
+species <-
+names(listelement[[1]][["phylogenetics"]][["species"]])
+lineage <-
+if (length(listelement[[1]][["phylogenetics"]][["lineage"]]) == 1) {
+names(listelement[[1]][["phylogenetics"]][["lineage"]])
+} else {
+listelement[[1]][["phylogenetics"]][["lineage"]][["lineage"]]
+}
 # Start building a list of all your various elements
-temp <- list(mykrobe_version = listelement[[1]][["version"]][["mykrobe-predictor"]],
+temp <-
-file = names(listelement), # One element
+list(
-plate_name = "test", # This probably needs changing
+mykrobe_version = listelement[[1]][["version"]][["mykrobe-predictor"]],
-sample = "sequence_calls", # Likewise change this
+file = names(listelement),
-phylo_group = phylo_group, # As above
+# One element
-species = species, # As above
+plate_name = "test",
-lineage = lineage, # As above
+# This probably needs changing
-# The following expressions drill down into the list elements and pull out what is needed.
+sample = "sequence_calls",
-# It's inelegant and vulnerable to changes in the input formats but if they're consistent it'll work
+# Likewise change this
-phylo_group_per_covg = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["percent_coverage"]],
+phylo_group = phylo_group,
-species_per_covg = listelement[[1]][["phylogenetics"]][["species"]][[species]][["percent_coverage"]],
+# As above
-lineage_per_covg = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["percent_coverage"]],
+species = species,
-phylo_group_depth = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["median_depth"]],
+# As above
-species_depth = listelement[[1]][["phylogenetics"]][["species"]][[species]][["median_depth"]],
+lineage = lineage,
-lineage_depth = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["median_depth"]],
+# As above
-Mykrobe_Resistance_probe_set = basename(listelement[[1]][["probe_sets"]][2]) # Is it always the second?
+# The following expressions drill down into the list elements and pull out what is needed.
-)
+# It's inelegant and vulnerable to changes in the input formats but if they're consistent it'll work
+phylo_group_per_covg = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["percent_coverage"]],
+species_per_covg = listelement[[1]][["phylogenetics"]][["species"]][[species]][["percent_coverage"]],
+lineage_per_covg = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["percent_coverage"]],
+phylo_group_depth = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["median_depth"]],
+species_depth = listelement[[1]][["phylogenetics"]][["species"]][[species]][["median_depth"]],
+lineage_depth = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["median_depth"]],
+Mykrobe_Resistance_probe_set = basename(listelement[[1]][["probe_sets"]][2]) # Is it always the second?
+)
 # Super cool nested and vectorized (for SPEED!) functions to grab the predictions for drug sensitivity and gene variants
 # Both produce character vectors of the same length as the number of drugs tested in the same order
 # All of these also check if there are missing values in drug/susceptibility/variant elements and adds the column anyhow
-if(length(map_chr(listelement[[1]][["susceptibility"]],  "predict")) != 0){
+if (length(map_chr(listelement[[1]][["susceptibility"]], "predict")) != 0) {
-temp$susceptibility <- map_chr(listelement[[1]][["susceptibility"]],  "predict")
+temp$susceptibility <-
-}else{
+map_chr(listelement[[1]][["susceptibility"]], "predict")
+} else {
 temp$susceptibility <- NA
 }
-if(length(names(listelement[[1]][["susceptibility"]])) != 0){
+if (length(names(listelement[[1]][["susceptibility"]])) != 0) {
 temp$drug <- names(listelement[[1]][["susceptibility"]])
-}else{
+} else {
 temp$drug <- NA
 }
-mapped.variants <- map(listelement[[1]][["susceptibility"]], # Dig into the lists, pull out variants and collapse into chr vector
+mapped.variants <-
-~ imap(.x[["called_by"]],  # imap is shorthand for map2(x, names(x), ...), calling .y gets you the name / index of the current element
+map(
-~ paste(.y,
+listelement[[1]][["susceptibility"]], # Dig into the lists, pull out variants and collapse into chr vector
-.x[["info"]][["coverage"]][["alternate"]][["median_depth"]],
+~ imap(
-.x[["info"]][["coverage"]][["reference"]][["median_depth"]],
+.x[["called_by"]], # imap is shorthand for map2(x, names(x), ...), calling .y gets you the name / index of the current element
-.x[["info"]][["conf"]],
+~ paste(.y,
-sep = ":"))) %>%
+.x[["info"]][["coverage"]][["alternate"]][["median_depth"]],
+.x[["info"]][["coverage"]][["reference"]][["median_depth"]],
+.x[["info"]][["conf"]],
+sep = ":"
+)
+)
+) %>%
 map_chr(~ paste(.x, collapse = "__"))
-if(length(mapped.variants) != 0){
+if (length(mapped.variants) != 0) {
 temp$`variants (gene:alt_depth:wt_depth:conf)` <- mapped.variants
-}else{
+} else {
 temp$`variants (gene:alt_depth:wt_depth:conf)` <- NA
 }
 temp$`genes (prot_mut-ref_mut:percent_covg:depth)` <- NA
 # Take that list and mash all the elements together as columns in a tibble, recycling as needed to fill in space
 # eg. phylo_group is repeated/recycled as many times as there are drugs tested
 as_tibble(temp)
 }
+# Here's a function to recreate that output table from the input JSON files for panel 2020
+getResults2020 <- function(listelement) {
+# Define list levels for various elements of the json
+phylo_group <-
+names(listelement[[1]][["phylogenetics"]][["phylo_group"]])
+if ("Non_tuberculosis_mycobacterium_complex" %in% phylo_group) {
+warning(
+paste(
+"Non-tuberculosis mycobacteria detected in file ",
+names(listelement),
+". Skipping.",
+sep = ""
+)
+)
+return()
+}
+species <-
+names(listelement[[1]][["phylogenetics"]][["species"]])
+# Start building a list of all your various elements
+temp <-
+list(
+mykrobe_version = listelement[[1]][["version"]][["mykrobe-predictor"]],
+file = names(listelement),
+# One element
+plate_name = "test",
+# This probably needs changing
+sample = "sequence_calls",
+# Likewise change this
+phylo_group = phylo_group,
+# As above
+species = species,
+# As above
+# The following expressions drill down into the list elements and pull out what is needed.
+# It's inelegant and vulnerable to changes in the input formats but if they're consistent it'll work
+phylo_group_per_covg = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["percent_coverage"]],
+species_per_covg = listelement[[1]][["phylogenetics"]][["species"]][[species]][["percent_coverage"]],
+phylo_group_depth = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["median_depth"]],
+species_depth = listelement[[1]][["phylogenetics"]][["species"]][[species]][["median_depth"]],
+Mykrobe_Resistance_probe_set = basename(listelement[[1]][["probe_sets"]][2]) # Is it always the second?
+)
+# Super cool nested and vectorized (for SPEED!) functions to grab the predictions for drug sensitivity and gene variants
+# Both produce character vectors of the same length as the number of drugs tested in the same order
+# All of these also check if there are missing values in drug/susceptibility/variant elements and adds the column anyhow
+if (length(map_chr(listelement[[1]][["susceptibility"]], "predict")) != 0) {
+temp$susceptibility <-
+map_chr(listelement[[1]][["susceptibility"]], "predict")
+} else {
+temp$susceptibility <- NA
+}
+if (length(names(listelement[[1]][["susceptibility"]])) != 0) {
+temp$drug <- names(listelement[[1]][["susceptibility"]])
+} else {
+temp$drug <- NA
+}
+mapped.variants <-
+map(
+listelement[[1]][["susceptibility"]], # Dig into the lists, pull out variants and collapse into chr vector
+~ imap(
+.x[["called_by"]], # imap is shorthand for map2(x, names(x), ...), calling .y gets you the name / index of the current element
+~ paste(.y,
+.x[["info"]][["coverage"]][["alternate"]][["median_depth"]],
+.x[["info"]][["coverage"]][["reference"]][["median_depth"]],
+.x[["info"]][["conf"]],
+sep = ":"
+)
+)
+) %>%
+map_chr(~ paste(.x, collapse = "__"))
+if (length(mapped.variants) != 0) {
+temp$`variants (gene:alt_depth:wt_depth:conf)` <- mapped.variants
+} else {
+temp$`variants (gene:alt_depth:wt_depth:conf)` <- NA
+}
+temp$`genes (prot_mut-ref_mut:percent_covg:depth)` <- NA
+# Take that list and mash all the elements together as columns in a tibble, recycling as needed to fill in space
+# eg. phylo_group is repeated/recycled as many times as there are drugs tested
+as_tibble(temp)
+}
 # Get command line arguments with optparse
-option_list = list(
+option_list <- list(
-make_option(c("-f", "--file"),
+make_option(
-type="character",
+c("-f", "--file"),
-default=NULL,
+type = "character",
-help='dataset file name or quoted comma separated names: eg. "file1,file2,file3"',
+default = NULL,
-metavar="character"),
+help = 'dataset file name or quoted comma separated names: eg. "file1,file2,file3"',
-make_option(c("-d", "--dir"),
+metavar = "character"
-type="character",
+),
-default=NULL,
+make_option(
-help="directory location of json files",
+c("-d", "--dir"),
-metavar="character"),
+type = "character",
-make_option(c("-v", "--version"),
+default = NULL,
-type="character",
+help = "directory location of json files",
-default="",
+metavar = "character"
-help="Mykrobe Workflow Version",
+),
-metavar="character"),
+make_option(
-make_option(c("-D", "--depth"),
+c("-v", "--version"),
-type="integer",
+type = "character",
-default=5,
+default = "",
-help="Minimum depth of coverage [default= %default]",
+help = "Mykrobe Workflow Version",
-metavar="integer"),
+metavar = "character"
-make_option(c("-c", "--conf"),
+),
-type="integer",
+make_option(
-default=10,
+c("-p", "--panel"),
-help="Minimum genotype confidence for variant genotyping [default= %default]",
+type = "character",
-metavar="integer"),
+default = "2019",
-make_option(c("-n", "--name"),
+help = "Mykrobe Panel Version: 2019 or 2020. [default= %default]",
-type="character",
+metavar = "character"
-default="",
+),
-help="Name of the run",
+make_option(
-metavar="character")
+c("-D", "--depth"),
+type = "integer",
+default = 5,
+help = "Minimum depth of coverage [default= %default]",
+metavar = "integer"
+),
+make_option(
+c("-c", "--conf"),
+type = "integer",
+default = 10,
+help = "Minimum genotype confidence for variant genotyping [default= %default]",
+metavar = "integer"
+),
+make_option(
+c("-n", "--name"),
+type = "character",
+default = "",
+help = "Name of the run",
+metavar = "character"
+),
+make_option(
+c("-r", "--reportfile"),
+type = "character",
+default = "report",
+help = "File name for susceptibility report data",
+metavar = "character"
+),
+make_option(
+c("-s", "--speciationfile"),
+type = "character",
+default = "jsondata",
+help = "File name for speciation data",
+metavar = "character"
+)
 )
-opt_parser = OptionParser(option_list=option_list)
+opt_parser <- OptionParser(option_list = option_list)
-opt = parse_args(opt_parser)
+opt <- parse_args(opt_parser)
-if (is.null(opt$file) & is.null(opt$dir)){
+if (is.null(opt$file) && is.null(opt$dir)) {
 print_help(opt_parser)
-stop("At least one argument must be supplied to input file or directory", call.=FALSE)
+stop("At least one argument must be supplied to input file or directory",
+call. = FALSE
+)
+}
+if (opt$panel != "2019" && opt$panel != "2020") {
+print_help(opt_parser)
+stop("Panel must be one of 2019 or 2020", call. = FALSE)
 }
 # Parameters to take from Galaxy/CL as args or however works best
-params <- c("",  # Lims_Comment
+params <- c(
-"",  # Lims_INTComment
+"", # Lims_Comment
-opt$version,  # Mykrobe_Workflow_Version
+"", # Lims_INTComment
-opt$depth,  # Mykrobe_min_depth_default_5
+opt$version, # Mykrobe_Workflow_Version
-opt$conf,  # Mykrobe_min_conf_default_10
+opt$panel, # Mykrobe Panel Version
-"",                             # LIMS_file - empty as it's an upload field in LIMS
+opt$depth, # Mykrobe_min_depth_default_5
-opt$name)  # Mutation_set_version
+opt$conf, # Mykrobe_min_conf_default_10
+"", # LIMS_file - empty as it's an upload field in LIMS
-names(params) <- c("Lims_Comment",
+opt$name
-"Lims_INTComment",
+) # Mutation_set_version
-"Mykrobe_Workflow_Version",
-"Mykrobe_min_depth_default_5",
+names(params) <- c(
-"Mykrobe_min_conf_default_10",
+"Lims_Comment",
-"LIMS_file",
+"Lims_INTComment",
-"Mutation_set_version")
+"Mykrobe_Workflow_Version",
+"Mykrobe_Panel_Version",
+"Mykrobe_min_depth_default_5",
+"Mykrobe_min_conf_default_10",
+"LIMS_file",
+"Mutation_set_version"
+)
 # A default report in the order our LIMS requires
 # Make a default dataframe to combine the rest into and enforce column order / fill missing ones with NAs
-columns <- c("file",
+columns <- c(
-"Mykrobe_fabG1",
+"file",
-"Mykrobe_katG",
+"Mykrobe_fabG1",
-"Mykrobe_ahpC",
+"Mykrobe_katG",
-"Mykrobe_inhA",
+"Mykrobe_ahpC",
-"Mykrobe_ndh",
+"Mykrobe_inhA",
-"Isoniazid_R_mutations",
+"Mykrobe_ndh",
-"Isoniazid_Prediction",
+"Isoniazid_R_mutations",
-"Mykrobe_rpoB",
+"Isoniazid_Prediction",
-"Rifampicin_R_mutations",
+"Mykrobe_rpoB",
-"Rifampicin_Prediction",
+"Rifampicin_R_mutations",
-"Mykrobe_embB",
+"Rifampicin_Prediction",
-"Mykrobe_embA",
+"Mykrobe_embB",
-"Ethambutol_R_mutations",
+"Mykrobe_embA",
-"Ethambutol_Prediction",
+"Ethambutol_R_mutations",
-"Mykrobe_pncA",
+"Ethambutol_Prediction",
-"Mykrobe_rpsA",
+"Mykrobe_pncA",
-"Pyrazinamide_R_mutations",
+"Mykrobe_rpsA",
-"Pyrazinamide_Prediction",
+"Pyrazinamide_R_mutations",
-"Mykrobe_Ofloxacin_gyrA",
+"Pyrazinamide_Prediction",
-"Ofloxacin_R_mutations",
+"Mykrobe_Ofloxacin_gyrA",
-"Ofloxacin_Prediction",
+"Ofloxacin_R_mutations",
-"Mykrobe_Moxifloxacin_gyrA",
+"Ofloxacin_Prediction",
-"Moxifloxacin_R_mutations",
+"Mykrobe_Moxifloxacin_gyrA",
-"Moxifloxacin_Prediction",
+"Moxifloxacin_R_mutations",
-"Mykrobe_rpsL",
+"Moxifloxacin_Prediction",
-"Mykrobe_Streptomycin_rrs",
+"Mykrobe_Ciprofloxacin_gyrA",
-"Mykrobe_Streptomycin_gid",
+"Ciprofloxacin_R_mutations",
-"Streptomycin_R_mutations",
+"Ciprofloxacin_Prediction",
-"Streptomycin_Prediction",
+"Mykrobe_rpsL",
-"Mykrobe_Amikacin_rrs",
+"Mykrobe_Streptomycin_rrs",
-"Amikacin_R_mutations",
+"Mykrobe_Streptomycin_gid",
-"Amikacin_Prediction",
+"Streptomycin_R_mutations",
-"Mykrobe_Capreomycin_rrs",
+"Streptomycin_Prediction",
-"Mykrobe_Capreomycin_tlyA",
+"Mykrobe_Amikacin_rrs",
-"Capreomycin_R_mutations",
+"Amikacin_R_mutations",
-"Capreomycin_Prediction",
+"Amikacin_Prediction",
-"Mykrobe_Kanamycin_rrs",
+"Mykrobe_Capreomycin_rrs",
-"Mykrobe_Kanamycin_eis",
+"Mykrobe_Capreomycin_tlyA",
-"Kanamycin_R_mutations",
+"Capreomycin_R_mutations",
-"Kanamycin_Prediction",
+"Capreomycin_Prediction",
-"Lims_Comment",
+"Mykrobe_Kanamycin_rrs",
-"Lims_INTComment",
+"Mykrobe_Kanamycin_eis",
-"Mykrobe_Workflow_Version",
+"Kanamycin_R_mutations",
-"mykrobe_version",
+"Kanamycin_Prediction",
-"Mykrobe_Resistance_probe_set",
+"Lims_Comment",
-"Mykrobe_min_depth_default_5",
+"Lims_INTComment",
-"Mykrobe_min_conf_default_10",
+"Mykrobe_Workflow_Version",
-"LIMS_file",
+"mykrobe_version",
-"Mutation_set_version")
+"Mykrobe_Resistance_probe_set",
+"Mykrobe_min_depth_default_5",
-report <- setNames(data.frame(matrix("", ncol = length(columns), nrow = 1), stringsAsFactors = F), columns)
+"Mykrobe_min_conf_default_10",
+"LIMS_file",
+"Mutation_set_version"
+)
+report <-
+setNames(data.frame(matrix(
+"",
+ncol = length(columns), nrow = 1
+), stringsAsFactors = FALSE), columns)
+report_cols <- c(
+"file",
+"phylo_group",
+"species",
+"lineage",
+"phylo_group_per_covg",
+"species_per_covg",
+"lineage_per_covg",
+"phylo_group_depth",
+"species_depth",
+"lineage_depth"
+)
 # List of drugs that are tested
-all_drugs <- c("Isoniazid",
+all_drugs <- c(
-"Rifampicin",
+"Isoniazid",
-"Ethambutol",
+"Rifampicin",
-"Pyrazinamide",
+"Ethambutol",
-"Moxifloxacin",
+"Pyrazinamide",
-"Ofloxacin",
+"Moxifloxacin",
-"Streptomycin",
+"Ofloxacin",
-"Amikacin",
+"Streptomycin",
-"Capreomycin",
+"Amikacin",
-"Kanamycin")
+"Capreomycin",
+"Kanamycin"
+)
 # Do Stuff ####
 # Import all the JSON files into a list of lists format ####
-if (is.null(opt$file)){
+if (is.null(opt$file)) {
 # opt$dir is used to get the list of files, a vector of non-duplicated files is then passed to map
-files <- list.files(path = opt$dir,
+files <- list.files(
-pattern = "*.json",
+path = opt$dir,
-full.names = T)
+pattern = "*.json",
-}else{
+full.names = TRUE
+)
+} else {
 files <- unlist(strsplit(opt$file, ","))
 }
 files <- files[!duplicated(basename(files))]
-list.of.json.files <- map(files,
+list.of.json.files <- map(
-~ fromJSON(.x, simplifyDataFrame = F)
+files,
+~ fromJSON(.x, simplifyDataFrame = FALSE)
 )
-# Apply that getResults function to each element in your list then bash it together into a final report
+# Apply the correct getResults function to each element in your list then bash it together into a final report
-temp <- map(list.of.json.files, getResults) %>%
+if (opt$panel == "2019") {
-bind_rows()
+temp <- map(list.of.json.files, getResults2019) %>%
+bind_rows()
+} else if (opt$panel == "2020") {
+temp <- map(list.of.json.files, getResults2020) %>%
+bind_rows()
+columns <-
+setdiff(
+columns,
+c(
+"Mykrobe_Ciprofloxacin_gyrA",
+"Ciprofloxacin_R_mutations",
+"Ciprofloxacin_Prediction"
+)
+)
+report_cols <- setdiff(
+report_cols,
+c(
+"lineage",
+"lineage_per_covg",
+"lineage_depth"
+)
+)
+} else {
+stop("Panel must be one of 2019 or 2020", call. = FALSE)
+}
 # Predictions of resistance or susceptibility
-predictions.table <-
+predictions.table <-
 temp %>%
 select(file, drug, susceptibility) %>%
 mutate(drug = paste(drug, "_Prediction", sep = "")) %>%
 spread(drug, susceptibility, fill = "failed") %>%
 select(-starts_with("NA"))
-if (length(predictions.table) == 1){
+if (length(predictions.table) == 1) {
 print(predictions.table)
-stop("No susceptibility results in files specified. Did the testing fail?", call.=FALSE)
+stop("No susceptibility results in files specified. Did the testing fail?",
+call. = FALSE
+)
 }
 # Variants, if present
-if (0 < predictions.table %>%
+num.variants <-
-select(ends_with("_Prediction")) %>%
+predictions.table %>%
-unlist(use.names = F) %>%
+select(ends_with("_Prediction")) %>%
-str_count("[R,r]") %>%
+unlist(use.names = FALSE) %>%
-sum()){
+str_count("[R,r]") %>%
+sum()
-# Multiple resistance mutations and confidence per drug in the X_R_mutations column
-# Actual protein changes in Mykrobe_X columns
+if (num.variants > 0) {
+# Multiple resistance mutations and confidence per drug in the X_R_mutations column
-variants.temp <-
+# Actual protein changes in Mykrobe_X columns
-temp %>%
-select(file, drug, variants = `variants (gene:alt_depth:wt_depth:conf)`) %>%
+variants.temp <-
-mutate(variants = replace(variants, variants == "", NA)) %>% # Make missing data consistent...
+temp %>%
-filter(!is.na(variants)) %>% # ...Then get rid of it
+select(file, drug, variants = `variants (gene:alt_depth:wt_depth:conf)`) %>%
-mutate(tempcols = paste(drug, "R_mutations", sep = "_")) %>%
+mutate(variants = replace(variants, variants == "", NA)) %>% # Make missing data consistent...
-mutate(R_mutations = variants) %>%
+filter(!is.na(variants)) %>% # ...Then get rid of it
-mutate(variants = strsplit(variants, "__")) %>% # Split the mutations across rows (list first then split across rows)
+mutate(tempcols = paste(drug, "R_mutations", sep = "_")) %>%
-unnest(variants) %>%
+mutate(R_mutations = variants) %>%
-separate(variants, c("gene", "mutation"), "_") %>%
+mutate(variants = strsplit(variants, "__")) %>% # Split the mutations across rows (list first then split across rows)
-mutate(columnname = ifelse(gene %in% c("gyrA", "tlyA", "rrs", "eis", "gid"), # Check for columns that include the drug name or not and paste accordingly
+unnest(variants) %>%
-paste("Mykrobe", drug, gene, sep = "_"),
+separate(variants, c("gene", "mutation"), "_") %>%
-paste("Mykrobe", gene, sep = "_"))) %>%
+mutate(columnname = ifelse(
-# Extract out the mutation information with a regex that covers all potential genes
+gene %in% c("gyrA", "rrs", "eis", "gid"),
-# This regex looks for whatever is ahead of the first colon and after the last hyphen
+# Check for columns that include the drug name or not and paste accordingly
-mutate(mutation = str_match(mutation, "(.*)-.*:")[,2]) %>%
+paste("Mykrobe", drug, gene, sep = "_"),
-select(file, tempcols, R_mutations, columnname, mutation)
+paste("Mykrobe", gene, sep = "_")
+)) %>%
-# Split each kind of variants into its own temp table then merge
+# Extract out the mutation information with a regex that covers all potential genes
-variants.1 <-
+# This regex looks for whatever is ahead of the first colon and after the last hyphen
-variants.temp %>%
+mutate(mutation = str_match(mutation, "(.*)-.*:")[, 2]) %>%
-select(file, tempcols, R_mutations) %>%
+select(file, tempcols, R_mutations, columnname, mutation)
-distinct() %>%
-spread(tempcols, R_mutations)
+# Split each kind of variants into its own temp table then merge
+variants.1 <-
-variants.2 <-
+variants.temp %>%
-variants.temp %>%
+select(file, tempcols, R_mutations) %>%
-select(file, columnname, mutation) %>%
+distinct() %>%
-group_by(file, columnname) %>%
+spread(tempcols, R_mutations)
-summarise(mutation = paste(mutation, collapse = ";")) %>%
-spread(columnname, mutation)
+variants.2 <-
+variants.temp %>%
-variants.table <- full_join(variants.1, variants.2, by = "file")
+select(file, columnname, mutation) %>%
-}else{
+group_by(file, columnname) %>%
-variants.table <- data.frame(file=predictions.table$file, stringsAsFactors = F)
+summarise(mutation = paste(mutation, collapse = ";")) %>%
+spread(columnname, mutation)
+variants.table <-
+full_join(variants.1, variants.2, by = "file")
+} else {
+variants.table <-
+data.frame(file = predictions.table$file, stringsAsFactors = FALSE)
 }
 # Make a report ####
 report <-
 temp %>%
 select(file, mykrobe_version, Mykrobe_Resistance_probe_set) %>% # Get important info from initial table
 distinct() %>% # Drop duped rows and combine all the tables together
 full_join(variants.table) %>%
 full_join(predictions.table) %>%
 bind_rows(report) %>% # Use bind_rows to add columns (eg. unteseted drugs) to the final output
 filter(file != "")
 # Only add the 'no mutation' replacement to the columns that actually have a result
 report <-
 report %>%
 filter_at(vars(ends_with("_Prediction")), any_vars(. != "failed")) %>%
 mutate_at(vars(starts_with("Mykrobe_")), funs(replace(., is.na(.), "No Mutation"))) %>%
 full_join(anti_join(report, ., by = "file")) %>%
 select(columns)
 # Add in the parameters fed from Galaxy using named character vector
 report <-
 report %>%
 mutate(
 Lims_Comment = params["Lims_Comment"],
 Lims_INTComment = params["Lims_INTComment"],
 Mykrobe_Workflow_Version = params["Mykrobe_Workflow_Version"],
 Mykrobe_min_depth_default_5 = params["Mykrobe_min_depth_default_5"],
 Mykrobe_min_conf_default_10 = params["Mykrobe_min_conf_default_10"],
 LIMS_file = params["LIMS_file"],
 Mutation_set_version = params["Mutation_set_version"]
 )
-#View(report)
 # Write some output
 # Report as is
-write.csv(report, "output-report.csv", row.names = F)
+write.csv(report, "output-report.csv", row.names = FALSE)
 print("Writing Susceptibility report to CSV as output-report.csv")
 # Select specific columns from temp and output them
-temp %>%
+# Addition of any_of accounts for both 2019 and 2020 panels
-select(file,
-phylo_group,
+temp %>%
-species,
+select_at( # This is a dplyr 0.8.3 function, superceded in newer versions but this tool is built around a number of specific deps
-lineage,
+report_cols
-phylo_group_per_covg,
+) %>%
-species_per_covg,
-lineage_per_covg,
-phylo_group_depth,
-species_depth,
-lineage_depth) %>%
 distinct() %>%
-write.csv("output-jsondata.csv", row.names = F)
+write.csv(file = "output-jsondata.csv", row.names = FALSE)
 print("Writing JSON data to CSV as output-jsondata.csv")
-sink(NULL, type="message") # close the sink
+sink(NULL, type = "message") # close the sink
 quit()

Mercurial > repos > nml > mykrobe_parser

comparison mykrobe_parser.R @ 6:8e7e5a660942 draft