Mercurial > repos > nml > mykrobe_parser
view mykrobe_parser.R @ 6:8e7e5a660942 draft
planemo upload for repository https://github.com/phac-nml/mykrobe-parser commit 077a66175f3666924b42f216bbcb671ee0d026f2
author | nml |
---|---|
date | Fri, 22 Sep 2023 17:24:02 +0000 |
parents | deebc6410d13 |
children |
line wrap: on
line source
# Copyright Government of Canada 2018 # # Written by: National Microbiology Laboratory, Public Health Agency of Canada # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use # this work except in compliance with the License. You may obtain a copy of the # License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR # CONDITIONS OF ANY KIND, either express or implied. See the License for the # specific language governing permissions and limitations under the License. # Parsing JSONs from Mykrobe Predict into CSV reports # Take the JSON output from Mykrobe, rearrange, output for LIMS # Adrian Zetner # August 2018 # Updated August 2023 # Libraries #### sink(stdout(), type = "message") suppressPackageStartupMessages({ library(jsonlite) library(here) library(dplyr) library(purrr) library(tidyr) library(stringr) library(optparse) }) # Define custom functions, variables, and paths. Collect and use CL arguments #### # Here's a function to recreate that output table from the input JSON files for 2019 getResults2019 <- function(listelement) { # Define list levels for various elements of the json phylo_group <- names(listelement[[1]][["phylogenetics"]][["phylo_group"]]) if ("Non_tuberculosis_mycobacterium_complex" %in% phylo_group) { warning( paste( "Non-tuberculosis mycobacteria detected in file ", names(listelement), ". Skipping.", sep = "" ) ) return() } species <- names(listelement[[1]][["phylogenetics"]][["species"]]) lineage <- if (length(listelement[[1]][["phylogenetics"]][["lineage"]]) == 1) { names(listelement[[1]][["phylogenetics"]][["lineage"]]) } else { listelement[[1]][["phylogenetics"]][["lineage"]][["lineage"]] } # Start building a list of all your various elements temp <- list( mykrobe_version = listelement[[1]][["version"]][["mykrobe-predictor"]], file = names(listelement), # One element plate_name = "test", # This probably needs changing sample = "sequence_calls", # Likewise change this phylo_group = phylo_group, # As above species = species, # As above lineage = lineage, # As above # The following expressions drill down into the list elements and pull out what is needed. # It's inelegant and vulnerable to changes in the input formats but if they're consistent it'll work phylo_group_per_covg = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["percent_coverage"]], species_per_covg = listelement[[1]][["phylogenetics"]][["species"]][[species]][["percent_coverage"]], lineage_per_covg = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["percent_coverage"]], phylo_group_depth = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["median_depth"]], species_depth = listelement[[1]][["phylogenetics"]][["species"]][[species]][["median_depth"]], lineage_depth = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["median_depth"]], Mykrobe_Resistance_probe_set = basename(listelement[[1]][["probe_sets"]][2]) # Is it always the second? ) # Super cool nested and vectorized (for SPEED!) functions to grab the predictions for drug sensitivity and gene variants # Both produce character vectors of the same length as the number of drugs tested in the same order # All of these also check if there are missing values in drug/susceptibility/variant elements and adds the column anyhow if (length(map_chr(listelement[[1]][["susceptibility"]], "predict")) != 0) { temp$susceptibility <- map_chr(listelement[[1]][["susceptibility"]], "predict") } else { temp$susceptibility <- NA } if (length(names(listelement[[1]][["susceptibility"]])) != 0) { temp$drug <- names(listelement[[1]][["susceptibility"]]) } else { temp$drug <- NA } mapped.variants <- map( listelement[[1]][["susceptibility"]], # Dig into the lists, pull out variants and collapse into chr vector ~ imap( .x[["called_by"]], # imap is shorthand for map2(x, names(x), ...), calling .y gets you the name / index of the current element ~ paste(.y, .x[["info"]][["coverage"]][["alternate"]][["median_depth"]], .x[["info"]][["coverage"]][["reference"]][["median_depth"]], .x[["info"]][["conf"]], sep = ":" ) ) ) %>% map_chr(~ paste(.x, collapse = "__")) if (length(mapped.variants) != 0) { temp$`variants (gene:alt_depth:wt_depth:conf)` <- mapped.variants } else { temp$`variants (gene:alt_depth:wt_depth:conf)` <- NA } temp$`genes (prot_mut-ref_mut:percent_covg:depth)` <- NA # Take that list and mash all the elements together as columns in a tibble, recycling as needed to fill in space # eg. phylo_group is repeated/recycled as many times as there are drugs tested as_tibble(temp) } # Here's a function to recreate that output table from the input JSON files for panel 2020 getResults2020 <- function(listelement) { # Define list levels for various elements of the json phylo_group <- names(listelement[[1]][["phylogenetics"]][["phylo_group"]]) if ("Non_tuberculosis_mycobacterium_complex" %in% phylo_group) { warning( paste( "Non-tuberculosis mycobacteria detected in file ", names(listelement), ". Skipping.", sep = "" ) ) return() } species <- names(listelement[[1]][["phylogenetics"]][["species"]]) # Start building a list of all your various elements temp <- list( mykrobe_version = listelement[[1]][["version"]][["mykrobe-predictor"]], file = names(listelement), # One element plate_name = "test", # This probably needs changing sample = "sequence_calls", # Likewise change this phylo_group = phylo_group, # As above species = species, # As above # The following expressions drill down into the list elements and pull out what is needed. # It's inelegant and vulnerable to changes in the input formats but if they're consistent it'll work phylo_group_per_covg = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["percent_coverage"]], species_per_covg = listelement[[1]][["phylogenetics"]][["species"]][[species]][["percent_coverage"]], phylo_group_depth = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["median_depth"]], species_depth = listelement[[1]][["phylogenetics"]][["species"]][[species]][["median_depth"]], Mykrobe_Resistance_probe_set = basename(listelement[[1]][["probe_sets"]][2]) # Is it always the second? ) # Super cool nested and vectorized (for SPEED!) functions to grab the predictions for drug sensitivity and gene variants # Both produce character vectors of the same length as the number of drugs tested in the same order # All of these also check if there are missing values in drug/susceptibility/variant elements and adds the column anyhow if (length(map_chr(listelement[[1]][["susceptibility"]], "predict")) != 0) { temp$susceptibility <- map_chr(listelement[[1]][["susceptibility"]], "predict") } else { temp$susceptibility <- NA } if (length(names(listelement[[1]][["susceptibility"]])) != 0) { temp$drug <- names(listelement[[1]][["susceptibility"]]) } else { temp$drug <- NA } mapped.variants <- map( listelement[[1]][["susceptibility"]], # Dig into the lists, pull out variants and collapse into chr vector ~ imap( .x[["called_by"]], # imap is shorthand for map2(x, names(x), ...), calling .y gets you the name / index of the current element ~ paste(.y, .x[["info"]][["coverage"]][["alternate"]][["median_depth"]], .x[["info"]][["coverage"]][["reference"]][["median_depth"]], .x[["info"]][["conf"]], sep = ":" ) ) ) %>% map_chr(~ paste(.x, collapse = "__")) if (length(mapped.variants) != 0) { temp$`variants (gene:alt_depth:wt_depth:conf)` <- mapped.variants } else { temp$`variants (gene:alt_depth:wt_depth:conf)` <- NA } temp$`genes (prot_mut-ref_mut:percent_covg:depth)` <- NA # Take that list and mash all the elements together as columns in a tibble, recycling as needed to fill in space # eg. phylo_group is repeated/recycled as many times as there are drugs tested as_tibble(temp) } # Get command line arguments with optparse option_list <- list( make_option( c("-f", "--file"), type = "character", default = NULL, help = 'dataset file name or quoted comma separated names: eg. "file1,file2,file3"', metavar = "character" ), make_option( c("-d", "--dir"), type = "character", default = NULL, help = "directory location of json files", metavar = "character" ), make_option( c("-v", "--version"), type = "character", default = "", help = "Mykrobe Workflow Version", metavar = "character" ), make_option( c("-p", "--panel"), type = "character", default = "2019", help = "Mykrobe Panel Version: 2019 or 2020. [default= %default]", metavar = "character" ), make_option( c("-D", "--depth"), type = "integer", default = 5, help = "Minimum depth of coverage [default= %default]", metavar = "integer" ), make_option( c("-c", "--conf"), type = "integer", default = 10, help = "Minimum genotype confidence for variant genotyping [default= %default]", metavar = "integer" ), make_option( c("-n", "--name"), type = "character", default = "", help = "Name of the run", metavar = "character" ), make_option( c("-r", "--reportfile"), type = "character", default = "report", help = "File name for susceptibility report data", metavar = "character" ), make_option( c("-s", "--speciationfile"), type = "character", default = "jsondata", help = "File name for speciation data", metavar = "character" ) ) opt_parser <- OptionParser(option_list = option_list) opt <- parse_args(opt_parser) if (is.null(opt$file) && is.null(opt$dir)) { print_help(opt_parser) stop("At least one argument must be supplied to input file or directory", call. = FALSE ) } if (opt$panel != "2019" && opt$panel != "2020") { print_help(opt_parser) stop("Panel must be one of 2019 or 2020", call. = FALSE) } # Parameters to take from Galaxy/CL as args or however works best params <- c( "", # Lims_Comment "", # Lims_INTComment opt$version, # Mykrobe_Workflow_Version opt$panel, # Mykrobe Panel Version opt$depth, # Mykrobe_min_depth_default_5 opt$conf, # Mykrobe_min_conf_default_10 "", # LIMS_file - empty as it's an upload field in LIMS opt$name ) # Mutation_set_version names(params) <- c( "Lims_Comment", "Lims_INTComment", "Mykrobe_Workflow_Version", "Mykrobe_Panel_Version", "Mykrobe_min_depth_default_5", "Mykrobe_min_conf_default_10", "LIMS_file", "Mutation_set_version" ) # A default report in the order our LIMS requires # Make a default dataframe to combine the rest into and enforce column order / fill missing ones with NAs columns <- c( "file", "Mykrobe_fabG1", "Mykrobe_katG", "Mykrobe_ahpC", "Mykrobe_inhA", "Mykrobe_ndh", "Isoniazid_R_mutations", "Isoniazid_Prediction", "Mykrobe_rpoB", "Rifampicin_R_mutations", "Rifampicin_Prediction", "Mykrobe_embB", "Mykrobe_embA", "Ethambutol_R_mutations", "Ethambutol_Prediction", "Mykrobe_pncA", "Mykrobe_rpsA", "Pyrazinamide_R_mutations", "Pyrazinamide_Prediction", "Mykrobe_Ofloxacin_gyrA", "Ofloxacin_R_mutations", "Ofloxacin_Prediction", "Mykrobe_Moxifloxacin_gyrA", "Moxifloxacin_R_mutations", "Moxifloxacin_Prediction", "Mykrobe_Ciprofloxacin_gyrA", "Ciprofloxacin_R_mutations", "Ciprofloxacin_Prediction", "Mykrobe_rpsL", "Mykrobe_Streptomycin_rrs", "Mykrobe_Streptomycin_gid", "Streptomycin_R_mutations", "Streptomycin_Prediction", "Mykrobe_Amikacin_rrs", "Amikacin_R_mutations", "Amikacin_Prediction", "Mykrobe_Capreomycin_rrs", "Mykrobe_Capreomycin_tlyA", "Capreomycin_R_mutations", "Capreomycin_Prediction", "Mykrobe_Kanamycin_rrs", "Mykrobe_Kanamycin_eis", "Kanamycin_R_mutations", "Kanamycin_Prediction", "Lims_Comment", "Lims_INTComment", "Mykrobe_Workflow_Version", "mykrobe_version", "Mykrobe_Resistance_probe_set", "Mykrobe_min_depth_default_5", "Mykrobe_min_conf_default_10", "LIMS_file", "Mutation_set_version" ) report <- setNames(data.frame(matrix( "", ncol = length(columns), nrow = 1 ), stringsAsFactors = FALSE), columns) report_cols <- c( "file", "phylo_group", "species", "lineage", "phylo_group_per_covg", "species_per_covg", "lineage_per_covg", "phylo_group_depth", "species_depth", "lineage_depth" ) # List of drugs that are tested all_drugs <- c( "Isoniazid", "Rifampicin", "Ethambutol", "Pyrazinamide", "Moxifloxacin", "Ofloxacin", "Streptomycin", "Amikacin", "Capreomycin", "Kanamycin" ) # Do Stuff #### # Import all the JSON files into a list of lists format #### if (is.null(opt$file)) { # opt$dir is used to get the list of files, a vector of non-duplicated files is then passed to map files <- list.files( path = opt$dir, pattern = "*.json", full.names = TRUE ) } else { files <- unlist(strsplit(opt$file, ",")) } files <- files[!duplicated(basename(files))] list.of.json.files <- map( files, ~ fromJSON(.x, simplifyDataFrame = FALSE) ) # Apply the correct getResults function to each element in your list then bash it together into a final report if (opt$panel == "2019") { temp <- map(list.of.json.files, getResults2019) %>% bind_rows() } else if (opt$panel == "2020") { temp <- map(list.of.json.files, getResults2020) %>% bind_rows() columns <- setdiff( columns, c( "Mykrobe_Ciprofloxacin_gyrA", "Ciprofloxacin_R_mutations", "Ciprofloxacin_Prediction" ) ) report_cols <- setdiff( report_cols, c( "lineage", "lineage_per_covg", "lineage_depth" ) ) } else { stop("Panel must be one of 2019 or 2020", call. = FALSE) } # Predictions of resistance or susceptibility predictions.table <- temp %>% select(file, drug, susceptibility) %>% mutate(drug = paste(drug, "_Prediction", sep = "")) %>% spread(drug, susceptibility, fill = "failed") %>% select(-starts_with("NA")) if (length(predictions.table) == 1) { print(predictions.table) stop("No susceptibility results in files specified. Did the testing fail?", call. = FALSE ) } # Variants, if present num.variants <- predictions.table %>% select(ends_with("_Prediction")) %>% unlist(use.names = FALSE) %>% str_count("[R,r]") %>% sum() if (num.variants > 0) { # Multiple resistance mutations and confidence per drug in the X_R_mutations column # Actual protein changes in Mykrobe_X columns variants.temp <- temp %>% select(file, drug, variants = `variants (gene:alt_depth:wt_depth:conf)`) %>% mutate(variants = replace(variants, variants == "", NA)) %>% # Make missing data consistent... filter(!is.na(variants)) %>% # ...Then get rid of it mutate(tempcols = paste(drug, "R_mutations", sep = "_")) %>% mutate(R_mutations = variants) %>% mutate(variants = strsplit(variants, "__")) %>% # Split the mutations across rows (list first then split across rows) unnest(variants) %>% separate(variants, c("gene", "mutation"), "_") %>% mutate(columnname = ifelse( gene %in% c("gyrA", "rrs", "eis", "gid"), # Check for columns that include the drug name or not and paste accordingly paste("Mykrobe", drug, gene, sep = "_"), paste("Mykrobe", gene, sep = "_") )) %>% # Extract out the mutation information with a regex that covers all potential genes # This regex looks for whatever is ahead of the first colon and after the last hyphen mutate(mutation = str_match(mutation, "(.*)-.*:")[, 2]) %>% select(file, tempcols, R_mutations, columnname, mutation) # Split each kind of variants into its own temp table then merge variants.1 <- variants.temp %>% select(file, tempcols, R_mutations) %>% distinct() %>% spread(tempcols, R_mutations) variants.2 <- variants.temp %>% select(file, columnname, mutation) %>% group_by(file, columnname) %>% summarise(mutation = paste(mutation, collapse = ";")) %>% spread(columnname, mutation) variants.table <- full_join(variants.1, variants.2, by = "file") } else { variants.table <- data.frame(file = predictions.table$file, stringsAsFactors = FALSE) } # Make a report #### report <- temp %>% select(file, mykrobe_version, Mykrobe_Resistance_probe_set) %>% # Get important info from initial table distinct() %>% # Drop duped rows and combine all the tables together full_join(variants.table) %>% full_join(predictions.table) %>% bind_rows(report) %>% # Use bind_rows to add columns (eg. unteseted drugs) to the final output filter(file != "") # Only add the 'no mutation' replacement to the columns that actually have a result report <- report %>% filter_at(vars(ends_with("_Prediction")), any_vars(. != "failed")) %>% mutate_at(vars(starts_with("Mykrobe_")), funs(replace(., is.na(.), "No Mutation"))) %>% full_join(anti_join(report, ., by = "file")) %>% select(columns) # Add in the parameters fed from Galaxy using named character vector report <- report %>% mutate( Lims_Comment = params["Lims_Comment"], Lims_INTComment = params["Lims_INTComment"], Mykrobe_Workflow_Version = params["Mykrobe_Workflow_Version"], Mykrobe_min_depth_default_5 = params["Mykrobe_min_depth_default_5"], Mykrobe_min_conf_default_10 = params["Mykrobe_min_conf_default_10"], LIMS_file = params["LIMS_file"], Mutation_set_version = params["Mutation_set_version"] ) # Write some output # Report as is write.csv(report, "output-report.csv", row.names = FALSE) print("Writing Susceptibility report to CSV as output-report.csv") # Select specific columns from temp and output them # Addition of any_of accounts for both 2019 and 2020 panels temp %>% select_at( # This is a dplyr 0.8.3 function, superceded in newer versions but this tool is built around a number of specific deps report_cols ) %>% distinct() %>% write.csv(file = "output-jsondata.csv", row.names = FALSE) print("Writing JSON data to CSV as output-jsondata.csv") sink(NULL, type = "message") # close the sink quit()