Mercurial > repos > nml > mykrobe_parser
comparison mykrobe_parser.R @ 0:6eae14751768 draft
planemo upload for repository https://github.com/phac-nml/mykrobe-parser commit 0b64469b5d819a9c5b9ea85d07ccbc3b5fb6b25e-dirty
author | nml |
---|---|
date | Fri, 28 Sep 2018 16:01:47 -0400 |
parents | |
children | f2608dccd3e0 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6eae14751768 |
---|---|
1 # Copyright Government of Canada 2018 | |
2 # | |
3 # Written by: National Microbiology Laboratory, Public Health Agency of Canada | |
4 # | |
5 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use | |
6 # this work except in compliance with the License. You may obtain a copy of the | |
7 # License at: | |
8 # | |
9 # http://www.apache.org/licenses/LICENSE-2.0 | |
10 # | |
11 # Unless required by applicable law or agreed to in writing, software distributed | |
12 # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |
13 # CONDITIONS OF ANY KIND, either express or implied. See the License for the | |
14 # specific language governing permissions and limitations under the License. | |
15 | |
16 | |
17 # Parsing JSONs from Mykrobe Predict into CSV reports | |
18 # Take the JSON output from Mykrobe, rearrange, output for LIMS | |
19 # Adrian Zetner | |
20 # August 2018 | |
21 | |
22 # Libraries #### | |
23 library(jsonlite, quietly = T) | |
24 library(here, quietly = T) | |
25 suppressMessages(library(dplyr, quietly = T)) | |
26 suppressMessages(library(purrr, quietly = T)) | |
27 library(tidyr, quietly = T) | |
28 library(stringr, quietly = T) | |
29 library(optparse, quietly = T) | |
30 | |
31 # Define custom functions, variables, and paths. Collect and use CL arguments #### | |
32 | |
33 # Here's a function to recreate that output table from the input JSON files | |
34 | |
35 getResults <- function(listelement){ | |
36 # Define list levels for various elements of the json | |
37 species <- names(listelement[[1]][["phylogenetics"]][["species"]]) | |
38 lineage <- names(listelement[[1]][["phylogenetics"]][["lineage"]]) | |
39 phylo_group <- names(listelement[[1]][["phylogenetics"]][["phylo_group"]]) | |
40 if("Non_tuberculosis_mycobacterium_complex" %in% phylo_group){ | |
41 warning(paste("Non-tuberculosis mycobacteria detected in file ", names(listelement), ". Skipping.", sep = "")) | |
42 return()} | |
43 | |
44 # Start building a list of all your various elements | |
45 temp <- list(mykrobe_version = listelement[[1]][["version"]][["mykrobe-predictor"]], | |
46 file = names(listelement), # One element | |
47 plate_name = "test", # This probably needs changing | |
48 sample = "sequence_calls", # Likewise change this | |
49 phylo_group = phylo_group, # As above | |
50 species = species, # As above | |
51 lineage = lineage, # As above | |
52 # The following expressions drill down into the list elements and pull out what is needed. | |
53 # It's inelegant and vulnerable to changes in the input formats but if they're consistent it'll work | |
54 phylo_group_per_covg = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["percent_coverage"]], | |
55 species_per_covg = listelement[[1]][["phylogenetics"]][["species"]][[species]][["percent_coverage"]], | |
56 lineage_per_covg = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["percent_coverage"]], | |
57 phylo_group_depth = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["median_depth"]], | |
58 species_depth = listelement[[1]][["phylogenetics"]][["species"]][[species]][["median_depth"]], | |
59 lineage_depth = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["median_depth"]], | |
60 Mykrobe_Resistance_probe_set = basename(listelement[[1]][["probe_sets"]][2]) # Is it always the second? | |
61 ) | |
62 | |
63 # Super cool nested and vectorized (for SPEED!) functions to grab the predictions for drug sensitivity and gene variants | |
64 # Both produce character vectors of the same length as the number of drugs tested in the same order | |
65 # All of these also check if there are missing values in drug/susceptibility/variant elements and adds the column anyhow | |
66 | |
67 if(length(map_chr(listelement[[1]][["susceptibility"]], "predict")) != 0){ | |
68 temp$susceptibility <- map_chr(listelement[[1]][["susceptibility"]], "predict") | |
69 }else{ | |
70 temp$susceptibility <- NA | |
71 } | |
72 | |
73 if(length(names(listelement[[1]][["susceptibility"]])) != 0){ | |
74 temp$drug <- names(listelement[[1]][["susceptibility"]]) | |
75 }else{ | |
76 temp$drug <- NA | |
77 } | |
78 | |
79 mapped.variants <- map(listelement[[1]][["susceptibility"]], # Dig into the lists, pull out variants and collapse into chr vector | |
80 ~ imap(.x[["called_by"]], # imap is shorthand for map2(x, names(x), ...), calling .y gets you the name / index of the current element | |
81 ~ paste(.y, | |
82 .x[["info"]][["coverage"]][["alternate"]][["median_depth"]], | |
83 .x[["info"]][["coverage"]][["reference"]][["median_depth"]], | |
84 .x[["info"]][["conf"]], | |
85 sep = ":"))) %>% | |
86 map_chr(~ paste(.x, collapse = "__")) | |
87 | |
88 if(length(mapped.variants) != 0){ | |
89 temp$`variants (gene:alt_depth:wt_depth:conf)` <- mapped.variants | |
90 }else{ | |
91 temp$`variants (gene:alt_depth:wt_depth:conf)` <- NA | |
92 } | |
93 | |
94 temp$`genes (prot_mut-ref_mut:percent_covg:depth)` <- NA | |
95 | |
96 # Take that list and mash all the elements together as columns in a tibble, recycling as needed to fill in space | |
97 # eg. phylo_group is repeated/recycled as many times as there are drugs tested | |
98 as_tibble(temp) | |
99 } | |
100 | |
101 sink(stdout(), type = "message") | |
102 | |
103 suppressPackageStartupMessages({ | |
104 library(jsonlite) | |
105 library(here) | |
106 library(dplyr) | |
107 library(purrr) | |
108 library(tidyr) | |
109 library(stringr) | |
110 library(optparse) | |
111 }) | |
112 | |
113 # Get command line arguments with optparse | |
114 option_list = list( | |
115 make_option(c("-f", "--file"), | |
116 type="character", | |
117 default=NULL, | |
118 help='dataset file name or quoted comma separated names: eg. "file1,file2,file3"', | |
119 metavar="character"), | |
120 make_option(c("-d", "--dir"), | |
121 type="character", | |
122 default=NULL, | |
123 help="directory location of json files", | |
124 metavar="character"), | |
125 make_option(c("-v", "--version"), | |
126 type="character", | |
127 default="", | |
128 help="Mykrobe Workflow Version", | |
129 metavar="character"), | |
130 make_option(c("-D", "--depth"), | |
131 type="integer", | |
132 default=5, | |
133 help="Minimum depth of coverage [default= %default]", | |
134 metavar="integer"), | |
135 make_option(c("-c", "--conf"), | |
136 type="integer", | |
137 default=10, | |
138 help="Minimum genotype confidence for variant genotyping [default= %default]", | |
139 metavar="integer"), | |
140 make_option(c("-n", "--name"), | |
141 type="character", | |
142 default="", | |
143 help="Name of the run", | |
144 metavar="character") | |
145 ) | |
146 | |
147 opt_parser = OptionParser(option_list=option_list) | |
148 opt = parse_args(opt_parser) | |
149 | |
150 if (is.null(opt$file) & is.null(opt$dir)){ | |
151 print_help(opt_parser) | |
152 stop("At least one argument must be supplied to input file or directory", call.=FALSE) | |
153 } | |
154 | |
155 # Parameters to take from Galaxy/CL as args or however works best | |
156 params <- c("", # Lims_Comment | |
157 "", # Lims_INTComment | |
158 opt$version, # Mykrobe_Workflow_Version | |
159 opt$depth, # Mykrobe_min_depth_default_5 | |
160 opt$conf, # Mykrobe_min_conf_default_10 | |
161 "", # LIMS_file - empty as it's an upload field in LIMS | |
162 opt$name) # LIMS_filename | |
163 | |
164 names(params) <- c("Lims_Comment", | |
165 "Lims_INTComment", | |
166 "Mykrobe_Workflow_Version", | |
167 "Mykrobe_min_depth_default_5", | |
168 "Mykrobe_min_conf_default_10", | |
169 "LIMS_file", | |
170 "LIMS_filename") | |
171 | |
172 | |
173 # A default report in the order our LIMS requires | |
174 | |
175 # Make a default dataframe to combine the rest into and enforce column order / fill missing ones with NAs | |
176 columns <- c("file", | |
177 "Mykrobe_fabG1", | |
178 "Mykrobe_katG", | |
179 "Mykrobe_ahpC", | |
180 "Mykrobe_inhA", | |
181 "Mykrobe_ndh", | |
182 "Isoniazid_R_mutations", | |
183 "Isoniazid_Prediction", | |
184 "Mykrobe_rpoB", | |
185 "Rifampicin_R_mutations", | |
186 "Rifampicin_Prediction", | |
187 "Mykrobe_embB", | |
188 "Mykrobe_embA", | |
189 "Ethambutol_R_mutations", | |
190 "Ethambutol_Prediction", | |
191 "Mykrobe_pncA", | |
192 "Mykrobe_rpsA", | |
193 "Pyrazinamide_R_mutations", | |
194 "Pyrazinamide_Prediction", | |
195 "Mykrobe_gyrA", | |
196 "Quinolones_R_mutations", | |
197 "Quinolones_Prediction", | |
198 "Mykrobe_rpsL", | |
199 "Mykrobe_Streptomycin_rrs", | |
200 "Mykrobe_Streptomycin_gid", | |
201 "Streptomycin_R_mutations", | |
202 "Streptomycin_Prediction", | |
203 "Mykrobe_Amikacin_rrs", | |
204 "Amikacin_R_mutations", | |
205 "Amikacin_Prediction", | |
206 "Mykrobe_Capreomycin_rrs", | |
207 "Mykrobe_Capreomycin_tlyA", | |
208 "Capreomycin_R_mutations", | |
209 "Capreomycin_Prediction", | |
210 "Mykrobe_Kanamycin_rrs", | |
211 "Mykrobe_Kanamycin_eis", | |
212 "Kanamycin_R_mutations", | |
213 "Kanamycin_Prediction", | |
214 "Lims_Comment", | |
215 "Lims_INTComment", | |
216 "Mykrobe_Workflow_Version", | |
217 "mykrobe_version", | |
218 "Mykrobe_Resistance_probe_set", | |
219 "Mykrobe_min_depth_default_5", | |
220 "Mykrobe_min_conf_default_10", | |
221 "LIMS_file", | |
222 "LIMS_filename") | |
223 | |
224 report <- setNames(data.frame(matrix("", ncol = length(columns), nrow = 1), stringsAsFactors = F), columns) | |
225 | |
226 | |
227 # List of drugs that are tested | |
228 all_drugs <- c("Isoniazid", | |
229 "Rifampicin", | |
230 "Ethambutol", | |
231 "Pyrazinamide", | |
232 "Moxifloxacin_Ofloxacin", | |
233 "Streptomycin", | |
234 "Amikacin", | |
235 "Capreomycin", | |
236 "Kanamycin") | |
237 | |
238 # Do Stuff #### | |
239 | |
240 # Import all the JSON files into a list of lists format #### | |
241 | |
242 if (is.null(opt$file)){ | |
243 # opt$dir is used to get the list of files, a vector of non-duplicated files is then passed to map | |
244 files <- list.files(path = opt$dir, | |
245 pattern = "*.json", | |
246 full.names = T) | |
247 }else{ | |
248 files <- unlist(strsplit(opt$file, ",")) | |
249 } | |
250 | |
251 files <- files[!duplicated(basename(files))] | |
252 | |
253 list.of.json.files <- map(files, | |
254 ~ fromJSON(.x, simplifyDataFrame = F) | |
255 ) | |
256 | |
257 | |
258 # Apply that getResults function to each element in your list then bash it together into a final report | |
259 | |
260 temp <- map(list.of.json.files, getResults) %>% | |
261 bind_rows() | |
262 | |
263 | |
264 # Predictions of resistance or susceptibility | |
265 predictions.table <- | |
266 temp %>% | |
267 select(file, drug, susceptibility) %>% | |
268 mutate(drug = paste(drug, "_Prediction", sep = "")) %>% | |
269 spread(drug, susceptibility, fill = "failed") %>% | |
270 select(-starts_with("NA")) | |
271 | |
272 if (length(predictions.table) == 1){ | |
273 print(predictions.table) | |
274 stop("No susceptibility results in files specified. Did the testing fail?", call.=FALSE) | |
275 } | |
276 | |
277 # Variants | |
278 # Multiple resistance mutations and confidence per drug in the X_R_mutations column | |
279 # Actual protein changes in Mykrobe_X columns | |
280 | |
281 variants.temp <- | |
282 temp %>% | |
283 select(file, drug, variants = `variants (gene:alt_depth:wt_depth:conf)`) %>% | |
284 mutate(variants = replace(variants, variants == "", NA)) %>% # Make missing data consistent... | |
285 filter(!is.na(variants)) %>% # ...Then get rid of it | |
286 mutate(tempcols = paste(drug, "R_mutations", sep = "_")) %>% | |
287 mutate(R_mutations = variants) %>% | |
288 mutate(variants = strsplit(variants, "__")) %>% # Split the mutations across rows (list first then split across rows) | |
289 unnest(variants) %>% | |
290 separate(variants, c("gene", "mutation"), "_") %>% | |
291 mutate(columnname = ifelse(gene %in% c("tlyA", "rrs", "gid"), # Check for columns that include the drug name or not and paste accordingly | |
292 paste("Mykrobe", drug, gene, sep = "_"), | |
293 paste("Mykrobe", gene, sep = "_"))) %>% | |
294 # Extract out the mutation information with a regex that covers all potential genes | |
295 # This regex looks for whatever is ahead of the first colon and after the last hyphen | |
296 mutate(mutation = str_match(mutation, "(.*)-.*:")[,2]) %>% | |
297 select(file, tempcols, R_mutations, columnname, mutation) | |
298 | |
299 # Split each kind of variants into its own temp table then merge | |
300 variants.1 <- | |
301 variants.temp %>% | |
302 select(file, tempcols, R_mutations) %>% | |
303 distinct() %>% | |
304 spread(tempcols, R_mutations) | |
305 | |
306 variants.2 <- | |
307 variants.temp %>% | |
308 select(file, columnname, mutation) %>% | |
309 group_by(file, columnname) %>% | |
310 summarise(mutation = paste(mutation, collapse = ";")) %>% | |
311 spread(columnname, mutation) | |
312 | |
313 variants.table <- full_join(variants.1, variants.2, by = "file") | |
314 | |
315 # Make a report #### | |
316 | |
317 report <- | |
318 temp %>% | |
319 select(file, mykrobe_version, Mykrobe_Resistance_probe_set) %>% # Get important info from initial table | |
320 distinct() %>% # Drop duped rows and combine all the tables together | |
321 full_join(variants.table) %>% | |
322 full_join(predictions.table) %>% | |
323 bind_rows(report) %>% # Use bind_rows to add columns (eg. unteseted drugs) to the final output | |
324 filter(file != "") | |
325 | |
326 # Only add the 'no mutation' replacement to the columns that actually have a result | |
327 report <- | |
328 report %>% | |
329 filter_at(vars(ends_with("_Prediction")), any_vars(. != "failed")) %>% | |
330 mutate_at(vars(starts_with("Mykrobe_")), funs(replace(., is.na(.), "No Mutation"))) %>% | |
331 full_join(anti_join(report, ., by = "file")) %>% | |
332 select(columns) %>% | |
333 rename(Moxifloxacin_Ofloxacin_R_mutations = Quinolones_R_mutations, | |
334 Moxifloxacin_Ofloxacin_Prediction = Quinolones_Prediction) | |
335 | |
336 | |
337 # Add in the parameters fed from Galaxy using named character vector | |
338 report <- | |
339 report %>% | |
340 mutate( | |
341 Lims_Comment = params["Lims_Comment"], | |
342 Lims_INTComment = params["Lims_INTComment"], | |
343 Mykrobe_Workflow_Version = params["Mykrobe_Workflow_Version"], | |
344 Mykrobe_min_depth_default_5 = params["Mykrobe_min_depth_default_5"], | |
345 Mykrobe_min_conf_default_10 = params["Mykrobe_min_conf_default_10"], | |
346 LIMS_file = params["LIMS_file"], | |
347 LIMS_filename = params["LIMS_filename"] | |
348 ) | |
349 | |
350 | |
351 #View(report) | |
352 | |
353 # Write some output | |
354 # Report as is | |
355 write.csv(report, "output-report.csv", row.names = F) | |
356 print("Writing Susceptibility report to CSV as output-report.csv") | |
357 | |
358 # Select specific columns from temp and output them | |
359 temp %>% | |
360 select(file, | |
361 phylo_group, | |
362 species, | |
363 lineage, | |
364 phylo_group_per_covg, | |
365 species_per_covg, | |
366 lineage_per_covg, | |
367 phylo_group_depth, | |
368 species_depth, | |
369 lineage_depth) %>% | |
370 distinct() %>% | |
371 write.csv("output-jsondata.csv", row.names = F) | |
372 print("Writing JSON data to CSV as output-jsondata.txt") | |
373 sink(NULL, type="message") # close the sink | |
374 | |
375 quit() |