comparison mykrobe_parser.R @ 0:6eae14751768 draft

planemo upload for repository https://github.com/phac-nml/mykrobe-parser commit 0b64469b5d819a9c5b9ea85d07ccbc3b5fb6b25e-dirty
author nml
date Fri, 28 Sep 2018 16:01:47 -0400
parents
children f2608dccd3e0
comparison
equal deleted inserted replaced
-1:000000000000 0:6eae14751768
1 # Copyright Government of Canada 2018
2 #
3 # Written by: National Microbiology Laboratory, Public Health Agency of Canada
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use
6 # this work except in compliance with the License. You may obtain a copy of the
7 # License at:
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software distributed
12 # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
13 # CONDITIONS OF ANY KIND, either express or implied. See the License for the
14 # specific language governing permissions and limitations under the License.
15
16
17 # Parsing JSONs from Mykrobe Predict into CSV reports
18 # Take the JSON output from Mykrobe, rearrange, output for LIMS
19 # Adrian Zetner
20 # August 2018
21
22 # Libraries ####
23 library(jsonlite, quietly = T)
24 library(here, quietly = T)
25 suppressMessages(library(dplyr, quietly = T))
26 suppressMessages(library(purrr, quietly = T))
27 library(tidyr, quietly = T)
28 library(stringr, quietly = T)
29 library(optparse, quietly = T)
30
31 # Define custom functions, variables, and paths. Collect and use CL arguments ####
32
33 # Here's a function to recreate that output table from the input JSON files
34
35 getResults <- function(listelement){
36 # Define list levels for various elements of the json
37 species <- names(listelement[[1]][["phylogenetics"]][["species"]])
38 lineage <- names(listelement[[1]][["phylogenetics"]][["lineage"]])
39 phylo_group <- names(listelement[[1]][["phylogenetics"]][["phylo_group"]])
40 if("Non_tuberculosis_mycobacterium_complex" %in% phylo_group){
41 warning(paste("Non-tuberculosis mycobacteria detected in file ", names(listelement), ". Skipping.", sep = ""))
42 return()}
43
44 # Start building a list of all your various elements
45 temp <- list(mykrobe_version = listelement[[1]][["version"]][["mykrobe-predictor"]],
46 file = names(listelement), # One element
47 plate_name = "test", # This probably needs changing
48 sample = "sequence_calls", # Likewise change this
49 phylo_group = phylo_group, # As above
50 species = species, # As above
51 lineage = lineage, # As above
52 # The following expressions drill down into the list elements and pull out what is needed.
53 # It's inelegant and vulnerable to changes in the input formats but if they're consistent it'll work
54 phylo_group_per_covg = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["percent_coverage"]],
55 species_per_covg = listelement[[1]][["phylogenetics"]][["species"]][[species]][["percent_coverage"]],
56 lineage_per_covg = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["percent_coverage"]],
57 phylo_group_depth = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["median_depth"]],
58 species_depth = listelement[[1]][["phylogenetics"]][["species"]][[species]][["median_depth"]],
59 lineage_depth = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["median_depth"]],
60 Mykrobe_Resistance_probe_set = basename(listelement[[1]][["probe_sets"]][2]) # Is it always the second?
61 )
62
63 # Super cool nested and vectorized (for SPEED!) functions to grab the predictions for drug sensitivity and gene variants
64 # Both produce character vectors of the same length as the number of drugs tested in the same order
65 # All of these also check if there are missing values in drug/susceptibility/variant elements and adds the column anyhow
66
67 if(length(map_chr(listelement[[1]][["susceptibility"]], "predict")) != 0){
68 temp$susceptibility <- map_chr(listelement[[1]][["susceptibility"]], "predict")
69 }else{
70 temp$susceptibility <- NA
71 }
72
73 if(length(names(listelement[[1]][["susceptibility"]])) != 0){
74 temp$drug <- names(listelement[[1]][["susceptibility"]])
75 }else{
76 temp$drug <- NA
77 }
78
79 mapped.variants <- map(listelement[[1]][["susceptibility"]], # Dig into the lists, pull out variants and collapse into chr vector
80 ~ imap(.x[["called_by"]], # imap is shorthand for map2(x, names(x), ...), calling .y gets you the name / index of the current element
81 ~ paste(.y,
82 .x[["info"]][["coverage"]][["alternate"]][["median_depth"]],
83 .x[["info"]][["coverage"]][["reference"]][["median_depth"]],
84 .x[["info"]][["conf"]],
85 sep = ":"))) %>%
86 map_chr(~ paste(.x, collapse = "__"))
87
88 if(length(mapped.variants) != 0){
89 temp$`variants (gene:alt_depth:wt_depth:conf)` <- mapped.variants
90 }else{
91 temp$`variants (gene:alt_depth:wt_depth:conf)` <- NA
92 }
93
94 temp$`genes (prot_mut-ref_mut:percent_covg:depth)` <- NA
95
96 # Take that list and mash all the elements together as columns in a tibble, recycling as needed to fill in space
97 # eg. phylo_group is repeated/recycled as many times as there are drugs tested
98 as_tibble(temp)
99 }
100
101 sink(stdout(), type = "message")
102
103 suppressPackageStartupMessages({
104 library(jsonlite)
105 library(here)
106 library(dplyr)
107 library(purrr)
108 library(tidyr)
109 library(stringr)
110 library(optparse)
111 })
112
113 # Get command line arguments with optparse
114 option_list = list(
115 make_option(c("-f", "--file"),
116 type="character",
117 default=NULL,
118 help='dataset file name or quoted comma separated names: eg. "file1,file2,file3"',
119 metavar="character"),
120 make_option(c("-d", "--dir"),
121 type="character",
122 default=NULL,
123 help="directory location of json files",
124 metavar="character"),
125 make_option(c("-v", "--version"),
126 type="character",
127 default="",
128 help="Mykrobe Workflow Version",
129 metavar="character"),
130 make_option(c("-D", "--depth"),
131 type="integer",
132 default=5,
133 help="Minimum depth of coverage [default= %default]",
134 metavar="integer"),
135 make_option(c("-c", "--conf"),
136 type="integer",
137 default=10,
138 help="Minimum genotype confidence for variant genotyping [default= %default]",
139 metavar="integer"),
140 make_option(c("-n", "--name"),
141 type="character",
142 default="",
143 help="Name of the run",
144 metavar="character")
145 )
146
147 opt_parser = OptionParser(option_list=option_list)
148 opt = parse_args(opt_parser)
149
150 if (is.null(opt$file) & is.null(opt$dir)){
151 print_help(opt_parser)
152 stop("At least one argument must be supplied to input file or directory", call.=FALSE)
153 }
154
155 # Parameters to take from Galaxy/CL as args or however works best
156 params <- c("", # Lims_Comment
157 "", # Lims_INTComment
158 opt$version, # Mykrobe_Workflow_Version
159 opt$depth, # Mykrobe_min_depth_default_5
160 opt$conf, # Mykrobe_min_conf_default_10
161 "", # LIMS_file - empty as it's an upload field in LIMS
162 opt$name) # LIMS_filename
163
164 names(params) <- c("Lims_Comment",
165 "Lims_INTComment",
166 "Mykrobe_Workflow_Version",
167 "Mykrobe_min_depth_default_5",
168 "Mykrobe_min_conf_default_10",
169 "LIMS_file",
170 "LIMS_filename")
171
172
173 # A default report in the order our LIMS requires
174
175 # Make a default dataframe to combine the rest into and enforce column order / fill missing ones with NAs
176 columns <- c("file",
177 "Mykrobe_fabG1",
178 "Mykrobe_katG",
179 "Mykrobe_ahpC",
180 "Mykrobe_inhA",
181 "Mykrobe_ndh",
182 "Isoniazid_R_mutations",
183 "Isoniazid_Prediction",
184 "Mykrobe_rpoB",
185 "Rifampicin_R_mutations",
186 "Rifampicin_Prediction",
187 "Mykrobe_embB",
188 "Mykrobe_embA",
189 "Ethambutol_R_mutations",
190 "Ethambutol_Prediction",
191 "Mykrobe_pncA",
192 "Mykrobe_rpsA",
193 "Pyrazinamide_R_mutations",
194 "Pyrazinamide_Prediction",
195 "Mykrobe_gyrA",
196 "Quinolones_R_mutations",
197 "Quinolones_Prediction",
198 "Mykrobe_rpsL",
199 "Mykrobe_Streptomycin_rrs",
200 "Mykrobe_Streptomycin_gid",
201 "Streptomycin_R_mutations",
202 "Streptomycin_Prediction",
203 "Mykrobe_Amikacin_rrs",
204 "Amikacin_R_mutations",
205 "Amikacin_Prediction",
206 "Mykrobe_Capreomycin_rrs",
207 "Mykrobe_Capreomycin_tlyA",
208 "Capreomycin_R_mutations",
209 "Capreomycin_Prediction",
210 "Mykrobe_Kanamycin_rrs",
211 "Mykrobe_Kanamycin_eis",
212 "Kanamycin_R_mutations",
213 "Kanamycin_Prediction",
214 "Lims_Comment",
215 "Lims_INTComment",
216 "Mykrobe_Workflow_Version",
217 "mykrobe_version",
218 "Mykrobe_Resistance_probe_set",
219 "Mykrobe_min_depth_default_5",
220 "Mykrobe_min_conf_default_10",
221 "LIMS_file",
222 "LIMS_filename")
223
224 report <- setNames(data.frame(matrix("", ncol = length(columns), nrow = 1), stringsAsFactors = F), columns)
225
226
227 # List of drugs that are tested
228 all_drugs <- c("Isoniazid",
229 "Rifampicin",
230 "Ethambutol",
231 "Pyrazinamide",
232 "Moxifloxacin_Ofloxacin",
233 "Streptomycin",
234 "Amikacin",
235 "Capreomycin",
236 "Kanamycin")
237
238 # Do Stuff ####
239
240 # Import all the JSON files into a list of lists format ####
241
242 if (is.null(opt$file)){
243 # opt$dir is used to get the list of files, a vector of non-duplicated files is then passed to map
244 files <- list.files(path = opt$dir,
245 pattern = "*.json",
246 full.names = T)
247 }else{
248 files <- unlist(strsplit(opt$file, ","))
249 }
250
251 files <- files[!duplicated(basename(files))]
252
253 list.of.json.files <- map(files,
254 ~ fromJSON(.x, simplifyDataFrame = F)
255 )
256
257
258 # Apply that getResults function to each element in your list then bash it together into a final report
259
260 temp <- map(list.of.json.files, getResults) %>%
261 bind_rows()
262
263
264 # Predictions of resistance or susceptibility
265 predictions.table <-
266 temp %>%
267 select(file, drug, susceptibility) %>%
268 mutate(drug = paste(drug, "_Prediction", sep = "")) %>%
269 spread(drug, susceptibility, fill = "failed") %>%
270 select(-starts_with("NA"))
271
272 if (length(predictions.table) == 1){
273 print(predictions.table)
274 stop("No susceptibility results in files specified. Did the testing fail?", call.=FALSE)
275 }
276
277 # Variants
278 # Multiple resistance mutations and confidence per drug in the X_R_mutations column
279 # Actual protein changes in Mykrobe_X columns
280
281 variants.temp <-
282 temp %>%
283 select(file, drug, variants = `variants (gene:alt_depth:wt_depth:conf)`) %>%
284 mutate(variants = replace(variants, variants == "", NA)) %>% # Make missing data consistent...
285 filter(!is.na(variants)) %>% # ...Then get rid of it
286 mutate(tempcols = paste(drug, "R_mutations", sep = "_")) %>%
287 mutate(R_mutations = variants) %>%
288 mutate(variants = strsplit(variants, "__")) %>% # Split the mutations across rows (list first then split across rows)
289 unnest(variants) %>%
290 separate(variants, c("gene", "mutation"), "_") %>%
291 mutate(columnname = ifelse(gene %in% c("tlyA", "rrs", "gid"), # Check for columns that include the drug name or not and paste accordingly
292 paste("Mykrobe", drug, gene, sep = "_"),
293 paste("Mykrobe", gene, sep = "_"))) %>%
294 # Extract out the mutation information with a regex that covers all potential genes
295 # This regex looks for whatever is ahead of the first colon and after the last hyphen
296 mutate(mutation = str_match(mutation, "(.*)-.*:")[,2]) %>%
297 select(file, tempcols, R_mutations, columnname, mutation)
298
299 # Split each kind of variants into its own temp table then merge
300 variants.1 <-
301 variants.temp %>%
302 select(file, tempcols, R_mutations) %>%
303 distinct() %>%
304 spread(tempcols, R_mutations)
305
306 variants.2 <-
307 variants.temp %>%
308 select(file, columnname, mutation) %>%
309 group_by(file, columnname) %>%
310 summarise(mutation = paste(mutation, collapse = ";")) %>%
311 spread(columnname, mutation)
312
313 variants.table <- full_join(variants.1, variants.2, by = "file")
314
315 # Make a report ####
316
317 report <-
318 temp %>%
319 select(file, mykrobe_version, Mykrobe_Resistance_probe_set) %>% # Get important info from initial table
320 distinct() %>% # Drop duped rows and combine all the tables together
321 full_join(variants.table) %>%
322 full_join(predictions.table) %>%
323 bind_rows(report) %>% # Use bind_rows to add columns (eg. unteseted drugs) to the final output
324 filter(file != "")
325
326 # Only add the 'no mutation' replacement to the columns that actually have a result
327 report <-
328 report %>%
329 filter_at(vars(ends_with("_Prediction")), any_vars(. != "failed")) %>%
330 mutate_at(vars(starts_with("Mykrobe_")), funs(replace(., is.na(.), "No Mutation"))) %>%
331 full_join(anti_join(report, ., by = "file")) %>%
332 select(columns) %>%
333 rename(Moxifloxacin_Ofloxacin_R_mutations = Quinolones_R_mutations,
334 Moxifloxacin_Ofloxacin_Prediction = Quinolones_Prediction)
335
336
337 # Add in the parameters fed from Galaxy using named character vector
338 report <-
339 report %>%
340 mutate(
341 Lims_Comment = params["Lims_Comment"],
342 Lims_INTComment = params["Lims_INTComment"],
343 Mykrobe_Workflow_Version = params["Mykrobe_Workflow_Version"],
344 Mykrobe_min_depth_default_5 = params["Mykrobe_min_depth_default_5"],
345 Mykrobe_min_conf_default_10 = params["Mykrobe_min_conf_default_10"],
346 LIMS_file = params["LIMS_file"],
347 LIMS_filename = params["LIMS_filename"]
348 )
349
350
351 #View(report)
352
353 # Write some output
354 # Report as is
355 write.csv(report, "output-report.csv", row.names = F)
356 print("Writing Susceptibility report to CSV as output-report.csv")
357
358 # Select specific columns from temp and output them
359 temp %>%
360 select(file,
361 phylo_group,
362 species,
363 lineage,
364 phylo_group_per_covg,
365 species_per_covg,
366 lineage_per_covg,
367 phylo_group_depth,
368 species_depth,
369 lineage_depth) %>%
370 distinct() %>%
371 write.csv("output-jsondata.csv", row.names = F)
372 print("Writing JSON data to CSV as output-jsondata.txt")
373 sink(NULL, type="message") # close the sink
374
375 quit()