Mercurial > repos > nml > mykrobe_parser
diff mykrobe_parser.R @ 0:6eae14751768 draft
planemo upload for repository https://github.com/phac-nml/mykrobe-parser commit 0b64469b5d819a9c5b9ea85d07ccbc3b5fb6b25e-dirty
author | nml |
---|---|
date | Fri, 28 Sep 2018 16:01:47 -0400 |
parents | |
children | f2608dccd3e0 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mykrobe_parser.R Fri Sep 28 16:01:47 2018 -0400 @@ -0,0 +1,375 @@ +# Copyright Government of Canada 2018 +# +# Written by: National Microbiology Laboratory, Public Health Agency of Canada +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use +# this work except in compliance with the License. You may obtain a copy of the +# License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + + +# Parsing JSONs from Mykrobe Predict into CSV reports +# Take the JSON output from Mykrobe, rearrange, output for LIMS +# Adrian Zetner +# August 2018 + +# Libraries #### +library(jsonlite, quietly = T) +library(here, quietly = T) +suppressMessages(library(dplyr, quietly = T)) +suppressMessages(library(purrr, quietly = T)) +library(tidyr, quietly = T) +library(stringr, quietly = T) +library(optparse, quietly = T) + +# Define custom functions, variables, and paths. Collect and use CL arguments #### + +# Here's a function to recreate that output table from the input JSON files + +getResults <- function(listelement){ + # Define list levels for various elements of the json + species <- names(listelement[[1]][["phylogenetics"]][["species"]]) + lineage <- names(listelement[[1]][["phylogenetics"]][["lineage"]]) + phylo_group <- names(listelement[[1]][["phylogenetics"]][["phylo_group"]]) + if("Non_tuberculosis_mycobacterium_complex" %in% phylo_group){ + warning(paste("Non-tuberculosis mycobacteria detected in file ", names(listelement), ". Skipping.", sep = "")) + return()} + + # Start building a list of all your various elements + temp <- list(mykrobe_version = listelement[[1]][["version"]][["mykrobe-predictor"]], + file = names(listelement), # One element + plate_name = "test", # This probably needs changing + sample = "sequence_calls", # Likewise change this + phylo_group = phylo_group, # As above + species = species, # As above + lineage = lineage, # As above + # The following expressions drill down into the list elements and pull out what is needed. + # It's inelegant and vulnerable to changes in the input formats but if they're consistent it'll work + phylo_group_per_covg = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["percent_coverage"]], + species_per_covg = listelement[[1]][["phylogenetics"]][["species"]][[species]][["percent_coverage"]], + lineage_per_covg = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["percent_coverage"]], + phylo_group_depth = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["median_depth"]], + species_depth = listelement[[1]][["phylogenetics"]][["species"]][[species]][["median_depth"]], + lineage_depth = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["median_depth"]], + Mykrobe_Resistance_probe_set = basename(listelement[[1]][["probe_sets"]][2]) # Is it always the second? + ) + + # Super cool nested and vectorized (for SPEED!) functions to grab the predictions for drug sensitivity and gene variants + # Both produce character vectors of the same length as the number of drugs tested in the same order + # All of these also check if there are missing values in drug/susceptibility/variant elements and adds the column anyhow + + if(length(map_chr(listelement[[1]][["susceptibility"]], "predict")) != 0){ + temp$susceptibility <- map_chr(listelement[[1]][["susceptibility"]], "predict") + }else{ + temp$susceptibility <- NA + } + + if(length(names(listelement[[1]][["susceptibility"]])) != 0){ + temp$drug <- names(listelement[[1]][["susceptibility"]]) + }else{ + temp$drug <- NA + } + + mapped.variants <- map(listelement[[1]][["susceptibility"]], # Dig into the lists, pull out variants and collapse into chr vector + ~ imap(.x[["called_by"]], # imap is shorthand for map2(x, names(x), ...), calling .y gets you the name / index of the current element + ~ paste(.y, + .x[["info"]][["coverage"]][["alternate"]][["median_depth"]], + .x[["info"]][["coverage"]][["reference"]][["median_depth"]], + .x[["info"]][["conf"]], + sep = ":"))) %>% + map_chr(~ paste(.x, collapse = "__")) + + if(length(mapped.variants) != 0){ + temp$`variants (gene:alt_depth:wt_depth:conf)` <- mapped.variants + }else{ + temp$`variants (gene:alt_depth:wt_depth:conf)` <- NA + } + + temp$`genes (prot_mut-ref_mut:percent_covg:depth)` <- NA + + # Take that list and mash all the elements together as columns in a tibble, recycling as needed to fill in space + # eg. phylo_group is repeated/recycled as many times as there are drugs tested + as_tibble(temp) +} + +sink(stdout(), type = "message") + +suppressPackageStartupMessages({ + library(jsonlite) + library(here) + library(dplyr) + library(purrr) + library(tidyr) + library(stringr) + library(optparse) +}) + +# Get command line arguments with optparse +option_list = list( + make_option(c("-f", "--file"), + type="character", + default=NULL, + help='dataset file name or quoted comma separated names: eg. "file1,file2,file3"', + metavar="character"), + make_option(c("-d", "--dir"), + type="character", + default=NULL, + help="directory location of json files", + metavar="character"), + make_option(c("-v", "--version"), + type="character", + default="", + help="Mykrobe Workflow Version", + metavar="character"), + make_option(c("-D", "--depth"), + type="integer", + default=5, + help="Minimum depth of coverage [default= %default]", + metavar="integer"), + make_option(c("-c", "--conf"), + type="integer", + default=10, + help="Minimum genotype confidence for variant genotyping [default= %default]", + metavar="integer"), + make_option(c("-n", "--name"), + type="character", + default="", + help="Name of the run", + metavar="character") +) + +opt_parser = OptionParser(option_list=option_list) +opt = parse_args(opt_parser) + +if (is.null(opt$file) & is.null(opt$dir)){ + print_help(opt_parser) + stop("At least one argument must be supplied to input file or directory", call.=FALSE) +} + +# Parameters to take from Galaxy/CL as args or however works best +params <- c("", # Lims_Comment + "", # Lims_INTComment + opt$version, # Mykrobe_Workflow_Version + opt$depth, # Mykrobe_min_depth_default_5 + opt$conf, # Mykrobe_min_conf_default_10 + "", # LIMS_file - empty as it's an upload field in LIMS + opt$name) # LIMS_filename + +names(params) <- c("Lims_Comment", + "Lims_INTComment", + "Mykrobe_Workflow_Version", + "Mykrobe_min_depth_default_5", + "Mykrobe_min_conf_default_10", + "LIMS_file", + "LIMS_filename") + + +# A default report in the order our LIMS requires + +# Make a default dataframe to combine the rest into and enforce column order / fill missing ones with NAs +columns <- c("file", + "Mykrobe_fabG1", + "Mykrobe_katG", + "Mykrobe_ahpC", + "Mykrobe_inhA", + "Mykrobe_ndh", + "Isoniazid_R_mutations", + "Isoniazid_Prediction", + "Mykrobe_rpoB", + "Rifampicin_R_mutations", + "Rifampicin_Prediction", + "Mykrobe_embB", + "Mykrobe_embA", + "Ethambutol_R_mutations", + "Ethambutol_Prediction", + "Mykrobe_pncA", + "Mykrobe_rpsA", + "Pyrazinamide_R_mutations", + "Pyrazinamide_Prediction", + "Mykrobe_gyrA", + "Quinolones_R_mutations", + "Quinolones_Prediction", + "Mykrobe_rpsL", + "Mykrobe_Streptomycin_rrs", + "Mykrobe_Streptomycin_gid", + "Streptomycin_R_mutations", + "Streptomycin_Prediction", + "Mykrobe_Amikacin_rrs", + "Amikacin_R_mutations", + "Amikacin_Prediction", + "Mykrobe_Capreomycin_rrs", + "Mykrobe_Capreomycin_tlyA", + "Capreomycin_R_mutations", + "Capreomycin_Prediction", + "Mykrobe_Kanamycin_rrs", + "Mykrobe_Kanamycin_eis", + "Kanamycin_R_mutations", + "Kanamycin_Prediction", + "Lims_Comment", + "Lims_INTComment", + "Mykrobe_Workflow_Version", + "mykrobe_version", + "Mykrobe_Resistance_probe_set", + "Mykrobe_min_depth_default_5", + "Mykrobe_min_conf_default_10", + "LIMS_file", + "LIMS_filename") + +report <- setNames(data.frame(matrix("", ncol = length(columns), nrow = 1), stringsAsFactors = F), columns) + + +# List of drugs that are tested +all_drugs <- c("Isoniazid", + "Rifampicin", + "Ethambutol", + "Pyrazinamide", + "Moxifloxacin_Ofloxacin", + "Streptomycin", + "Amikacin", + "Capreomycin", + "Kanamycin") + +# Do Stuff #### + +# Import all the JSON files into a list of lists format #### + +if (is.null(opt$file)){ + # opt$dir is used to get the list of files, a vector of non-duplicated files is then passed to map + files <- list.files(path = opt$dir, + pattern = "*.json", + full.names = T) +}else{ + files <- unlist(strsplit(opt$file, ",")) +} + +files <- files[!duplicated(basename(files))] + +list.of.json.files <- map(files, + ~ fromJSON(.x, simplifyDataFrame = F) +) + + +# Apply that getResults function to each element in your list then bash it together into a final report + +temp <- map(list.of.json.files, getResults) %>% + bind_rows() + + +# Predictions of resistance or susceptibility +predictions.table <- + temp %>% + select(file, drug, susceptibility) %>% + mutate(drug = paste(drug, "_Prediction", sep = "")) %>% + spread(drug, susceptibility, fill = "failed") %>% + select(-starts_with("NA")) + +if (length(predictions.table) == 1){ + print(predictions.table) + stop("No susceptibility results in files specified. Did the testing fail?", call.=FALSE) +} + +# Variants +# Multiple resistance mutations and confidence per drug in the X_R_mutations column +# Actual protein changes in Mykrobe_X columns + +variants.temp <- + temp %>% + select(file, drug, variants = `variants (gene:alt_depth:wt_depth:conf)`) %>% + mutate(variants = replace(variants, variants == "", NA)) %>% # Make missing data consistent... + filter(!is.na(variants)) %>% # ...Then get rid of it + mutate(tempcols = paste(drug, "R_mutations", sep = "_")) %>% + mutate(R_mutations = variants) %>% + mutate(variants = strsplit(variants, "__")) %>% # Split the mutations across rows (list first then split across rows) + unnest(variants) %>% + separate(variants, c("gene", "mutation"), "_") %>% + mutate(columnname = ifelse(gene %in% c("tlyA", "rrs", "gid"), # Check for columns that include the drug name or not and paste accordingly + paste("Mykrobe", drug, gene, sep = "_"), + paste("Mykrobe", gene, sep = "_"))) %>% + # Extract out the mutation information with a regex that covers all potential genes + # This regex looks for whatever is ahead of the first colon and after the last hyphen + mutate(mutation = str_match(mutation, "(.*)-.*:")[,2]) %>% + select(file, tempcols, R_mutations, columnname, mutation) + +# Split each kind of variants into its own temp table then merge +variants.1 <- + variants.temp %>% + select(file, tempcols, R_mutations) %>% + distinct() %>% + spread(tempcols, R_mutations) + +variants.2 <- + variants.temp %>% + select(file, columnname, mutation) %>% + group_by(file, columnname) %>% + summarise(mutation = paste(mutation, collapse = ";")) %>% + spread(columnname, mutation) + +variants.table <- full_join(variants.1, variants.2, by = "file") + +# Make a report #### + +report <- + temp %>% + select(file, mykrobe_version, Mykrobe_Resistance_probe_set) %>% # Get important info from initial table + distinct() %>% # Drop duped rows and combine all the tables together + full_join(variants.table) %>% + full_join(predictions.table) %>% + bind_rows(report) %>% # Use bind_rows to add columns (eg. unteseted drugs) to the final output + filter(file != "") + +# Only add the 'no mutation' replacement to the columns that actually have a result +report <- + report %>% + filter_at(vars(ends_with("_Prediction")), any_vars(. != "failed")) %>% + mutate_at(vars(starts_with("Mykrobe_")), funs(replace(., is.na(.), "No Mutation"))) %>% + full_join(anti_join(report, ., by = "file")) %>% + select(columns) %>% + rename(Moxifloxacin_Ofloxacin_R_mutations = Quinolones_R_mutations, + Moxifloxacin_Ofloxacin_Prediction = Quinolones_Prediction) + + +# Add in the parameters fed from Galaxy using named character vector +report <- + report %>% + mutate( + Lims_Comment = params["Lims_Comment"], + Lims_INTComment = params["Lims_INTComment"], + Mykrobe_Workflow_Version = params["Mykrobe_Workflow_Version"], + Mykrobe_min_depth_default_5 = params["Mykrobe_min_depth_default_5"], + Mykrobe_min_conf_default_10 = params["Mykrobe_min_conf_default_10"], + LIMS_file = params["LIMS_file"], + LIMS_filename = params["LIMS_filename"] + ) + + +#View(report) + +# Write some output +# Report as is +write.csv(report, "output-report.csv", row.names = F) +print("Writing Susceptibility report to CSV as output-report.csv") + +# Select specific columns from temp and output them +temp %>% + select(file, + phylo_group, + species, + lineage, + phylo_group_per_covg, + species_per_covg, + lineage_per_covg, + phylo_group_depth, + species_depth, + lineage_depth) %>% + distinct() %>% + write.csv("output-jsondata.csv", row.names = F) +print("Writing JSON data to CSV as output-jsondata.txt") +sink(NULL, type="message") # close the sink + +quit() \ No newline at end of file