comparison mykrobe_parser.R @ 6:8e7e5a660942 draft

planemo upload for repository https://github.com/phac-nml/mykrobe-parser commit 077a66175f3666924b42f216bbcb671ee0d026f2
author nml
date Fri, 22 Sep 2023 17:24:02 +0000
parents deebc6410d13
children
comparison
equal deleted inserted replaced
5:deebc6410d13 6:8e7e5a660942
1 # Copyright Government of Canada 2018 1 # Copyright Government of Canada 2018
2 # 2 #
3 # Written by: National Microbiology Laboratory, Public Health Agency of Canada 3 # Written by: National Microbiology Laboratory, Public Health Agency of Canada
4 # 4 #
5 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use 5 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use
6 # this work except in compliance with the License. You may obtain a copy of the 6 # this work except in compliance with the License. You may obtain a copy of the
7 # License at: 7 # License at:
8 # 8 #
9 # http://www.apache.org/licenses/LICENSE-2.0 9 # http://www.apache.org/licenses/LICENSE-2.0
10 # 10 #
11 # Unless required by applicable law or agreed to in writing, software distributed 11 # Unless required by applicable law or agreed to in writing, software distributed
12 # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
13 # CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 # CONDITIONS OF ANY KIND, either express or implied. See the License for the
14 # specific language governing permissions and limitations under the License. 14 # specific language governing permissions and limitations under the License.
15 15
16 16
17 # Parsing JSONs from Mykrobe Predict into CSV reports 17 # Parsing JSONs from Mykrobe Predict into CSV reports
18 # Take the JSON output from Mykrobe, rearrange, output for LIMS 18 # Take the JSON output from Mykrobe, rearrange, output for LIMS
19 # Adrian Zetner 19 # Adrian Zetner
20 # August 2018 20 # August 2018
21 # Updated August 2023
21 22
22 # Libraries #### 23 # Libraries ####
23 24
24 sink(stdout(), type = "message") 25 sink(stdout(), type = "message")
25 26
33 library(optparse) 34 library(optparse)
34 }) 35 })
35 36
36 # Define custom functions, variables, and paths. Collect and use CL arguments #### 37 # Define custom functions, variables, and paths. Collect and use CL arguments ####
37 38
38 # Here's a function to recreate that output table from the input JSON files 39 # Here's a function to recreate that output table from the input JSON files for 2019
39 40
40 getResults <- function(listelement){ 41 getResults2019 <- function(listelement) {
41 # Define list levels for various elements of the json 42 # Define list levels for various elements of the json
42 species <- names(listelement[[1]][["phylogenetics"]][["species"]]) 43 phylo_group <-
43 lineage <- names(listelement[[1]][["phylogenetics"]][["lineage"]]) 44 names(listelement[[1]][["phylogenetics"]][["phylo_group"]])
44 phylo_group <- names(listelement[[1]][["phylogenetics"]][["phylo_group"]]) 45 if ("Non_tuberculosis_mycobacterium_complex" %in% phylo_group) {
45 if("Non_tuberculosis_mycobacterium_complex" %in% phylo_group){ 46 warning(
46 warning(paste("Non-tuberculosis mycobacteria detected in file ", names(listelement), ". Skipping.", sep = "")) 47 paste(
47 return()} 48 "Non-tuberculosis mycobacteria detected in file ",
48 49 names(listelement),
50 ". Skipping.",
51 sep = ""
52 )
53 )
54 return()
55 }
56
57 species <-
58 names(listelement[[1]][["phylogenetics"]][["species"]])
59 lineage <-
60 if (length(listelement[[1]][["phylogenetics"]][["lineage"]]) == 1) {
61 names(listelement[[1]][["phylogenetics"]][["lineage"]])
62 } else {
63 listelement[[1]][["phylogenetics"]][["lineage"]][["lineage"]]
64 }
65
49 # Start building a list of all your various elements 66 # Start building a list of all your various elements
50 temp <- list(mykrobe_version = listelement[[1]][["version"]][["mykrobe-predictor"]], 67 temp <-
51 file = names(listelement), # One element 68 list(
52 plate_name = "test", # This probably needs changing 69 mykrobe_version = listelement[[1]][["version"]][["mykrobe-predictor"]],
53 sample = "sequence_calls", # Likewise change this 70 file = names(listelement),
54 phylo_group = phylo_group, # As above 71 # One element
55 species = species, # As above 72 plate_name = "test",
56 lineage = lineage, # As above 73 # This probably needs changing
57 # The following expressions drill down into the list elements and pull out what is needed. 74 sample = "sequence_calls",
58 # It's inelegant and vulnerable to changes in the input formats but if they're consistent it'll work 75 # Likewise change this
59 phylo_group_per_covg = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["percent_coverage"]], 76 phylo_group = phylo_group,
60 species_per_covg = listelement[[1]][["phylogenetics"]][["species"]][[species]][["percent_coverage"]], 77 # As above
61 lineage_per_covg = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["percent_coverage"]], 78 species = species,
62 phylo_group_depth = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["median_depth"]], 79 # As above
63 species_depth = listelement[[1]][["phylogenetics"]][["species"]][[species]][["median_depth"]], 80 lineage = lineage,
64 lineage_depth = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["median_depth"]], 81 # As above
65 Mykrobe_Resistance_probe_set = basename(listelement[[1]][["probe_sets"]][2]) # Is it always the second? 82 # The following expressions drill down into the list elements and pull out what is needed.
66 ) 83 # It's inelegant and vulnerable to changes in the input formats but if they're consistent it'll work
67 84 phylo_group_per_covg = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["percent_coverage"]],
85 species_per_covg = listelement[[1]][["phylogenetics"]][["species"]][[species]][["percent_coverage"]],
86 lineage_per_covg = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["percent_coverage"]],
87 phylo_group_depth = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["median_depth"]],
88 species_depth = listelement[[1]][["phylogenetics"]][["species"]][[species]][["median_depth"]],
89 lineage_depth = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["median_depth"]],
90 Mykrobe_Resistance_probe_set = basename(listelement[[1]][["probe_sets"]][2]) # Is it always the second?
91 )
92
68 # Super cool nested and vectorized (for SPEED!) functions to grab the predictions for drug sensitivity and gene variants 93 # Super cool nested and vectorized (for SPEED!) functions to grab the predictions for drug sensitivity and gene variants
69 # Both produce character vectors of the same length as the number of drugs tested in the same order 94 # Both produce character vectors of the same length as the number of drugs tested in the same order
70 # All of these also check if there are missing values in drug/susceptibility/variant elements and adds the column anyhow 95 # All of these also check if there are missing values in drug/susceptibility/variant elements and adds the column anyhow
71 96
72 if(length(map_chr(listelement[[1]][["susceptibility"]], "predict")) != 0){ 97 if (length(map_chr(listelement[[1]][["susceptibility"]], "predict")) != 0) {
73 temp$susceptibility <- map_chr(listelement[[1]][["susceptibility"]], "predict") 98 temp$susceptibility <-
74 }else{ 99 map_chr(listelement[[1]][["susceptibility"]], "predict")
100 } else {
75 temp$susceptibility <- NA 101 temp$susceptibility <- NA
76 } 102 }
77 103
78 if(length(names(listelement[[1]][["susceptibility"]])) != 0){ 104 if (length(names(listelement[[1]][["susceptibility"]])) != 0) {
79 temp$drug <- names(listelement[[1]][["susceptibility"]]) 105 temp$drug <- names(listelement[[1]][["susceptibility"]])
80 }else{ 106 } else {
81 temp$drug <- NA 107 temp$drug <- NA
82 } 108 }
83 109
84 mapped.variants <- map(listelement[[1]][["susceptibility"]], # Dig into the lists, pull out variants and collapse into chr vector 110 mapped.variants <-
85 ~ imap(.x[["called_by"]], # imap is shorthand for map2(x, names(x), ...), calling .y gets you the name / index of the current element 111 map(
86 ~ paste(.y, 112 listelement[[1]][["susceptibility"]], # Dig into the lists, pull out variants and collapse into chr vector
87 .x[["info"]][["coverage"]][["alternate"]][["median_depth"]], 113 ~ imap(
88 .x[["info"]][["coverage"]][["reference"]][["median_depth"]], 114 .x[["called_by"]], # imap is shorthand for map2(x, names(x), ...), calling .y gets you the name / index of the current element
89 .x[["info"]][["conf"]], 115 ~ paste(.y,
90 sep = ":"))) %>% 116 .x[["info"]][["coverage"]][["alternate"]][["median_depth"]],
117 .x[["info"]][["coverage"]][["reference"]][["median_depth"]],
118 .x[["info"]][["conf"]],
119 sep = ":"
120 )
121 )
122 ) %>%
91 map_chr(~ paste(.x, collapse = "__")) 123 map_chr(~ paste(.x, collapse = "__"))
92 124
93 if(length(mapped.variants) != 0){ 125 if (length(mapped.variants) != 0) {
94 temp$`variants (gene:alt_depth:wt_depth:conf)` <- mapped.variants 126 temp$`variants (gene:alt_depth:wt_depth:conf)` <- mapped.variants
95 }else{ 127 } else {
96 temp$`variants (gene:alt_depth:wt_depth:conf)` <- NA 128 temp$`variants (gene:alt_depth:wt_depth:conf)` <- NA
97 } 129 }
98 130
99 temp$`genes (prot_mut-ref_mut:percent_covg:depth)` <- NA 131 temp$`genes (prot_mut-ref_mut:percent_covg:depth)` <- NA
100 132
101 # Take that list and mash all the elements together as columns in a tibble, recycling as needed to fill in space 133 # Take that list and mash all the elements together as columns in a tibble, recycling as needed to fill in space
102 # eg. phylo_group is repeated/recycled as many times as there are drugs tested 134 # eg. phylo_group is repeated/recycled as many times as there are drugs tested
103 as_tibble(temp) 135 as_tibble(temp)
104 } 136 }
105 137
138 # Here's a function to recreate that output table from the input JSON files for panel 2020
139
140 getResults2020 <- function(listelement) {
141 # Define list levels for various elements of the json
142 phylo_group <-
143 names(listelement[[1]][["phylogenetics"]][["phylo_group"]])
144 if ("Non_tuberculosis_mycobacterium_complex" %in% phylo_group) {
145 warning(
146 paste(
147 "Non-tuberculosis mycobacteria detected in file ",
148 names(listelement),
149 ". Skipping.",
150 sep = ""
151 )
152 )
153 return()
154 }
155
156 species <-
157 names(listelement[[1]][["phylogenetics"]][["species"]])
158
159 # Start building a list of all your various elements
160 temp <-
161 list(
162 mykrobe_version = listelement[[1]][["version"]][["mykrobe-predictor"]],
163 file = names(listelement),
164 # One element
165 plate_name = "test",
166 # This probably needs changing
167 sample = "sequence_calls",
168 # Likewise change this
169 phylo_group = phylo_group,
170 # As above
171 species = species,
172 # As above
173 # The following expressions drill down into the list elements and pull out what is needed.
174 # It's inelegant and vulnerable to changes in the input formats but if they're consistent it'll work
175 phylo_group_per_covg = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["percent_coverage"]],
176 species_per_covg = listelement[[1]][["phylogenetics"]][["species"]][[species]][["percent_coverage"]],
177 phylo_group_depth = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["median_depth"]],
178 species_depth = listelement[[1]][["phylogenetics"]][["species"]][[species]][["median_depth"]],
179 Mykrobe_Resistance_probe_set = basename(listelement[[1]][["probe_sets"]][2]) # Is it always the second?
180 )
181
182 # Super cool nested and vectorized (for SPEED!) functions to grab the predictions for drug sensitivity and gene variants
183 # Both produce character vectors of the same length as the number of drugs tested in the same order
184 # All of these also check if there are missing values in drug/susceptibility/variant elements and adds the column anyhow
185
186 if (length(map_chr(listelement[[1]][["susceptibility"]], "predict")) != 0) {
187 temp$susceptibility <-
188 map_chr(listelement[[1]][["susceptibility"]], "predict")
189 } else {
190 temp$susceptibility <- NA
191 }
192
193 if (length(names(listelement[[1]][["susceptibility"]])) != 0) {
194 temp$drug <- names(listelement[[1]][["susceptibility"]])
195 } else {
196 temp$drug <- NA
197 }
198
199 mapped.variants <-
200 map(
201 listelement[[1]][["susceptibility"]], # Dig into the lists, pull out variants and collapse into chr vector
202 ~ imap(
203 .x[["called_by"]], # imap is shorthand for map2(x, names(x), ...), calling .y gets you the name / index of the current element
204 ~ paste(.y,
205 .x[["info"]][["coverage"]][["alternate"]][["median_depth"]],
206 .x[["info"]][["coverage"]][["reference"]][["median_depth"]],
207 .x[["info"]][["conf"]],
208 sep = ":"
209 )
210 )
211 ) %>%
212 map_chr(~ paste(.x, collapse = "__"))
213
214 if (length(mapped.variants) != 0) {
215 temp$`variants (gene:alt_depth:wt_depth:conf)` <- mapped.variants
216 } else {
217 temp$`variants (gene:alt_depth:wt_depth:conf)` <- NA
218 }
219
220 temp$`genes (prot_mut-ref_mut:percent_covg:depth)` <- NA
221
222 # Take that list and mash all the elements together as columns in a tibble, recycling as needed to fill in space
223 # eg. phylo_group is repeated/recycled as many times as there are drugs tested
224 as_tibble(temp)
225 }
226
106 # Get command line arguments with optparse 227 # Get command line arguments with optparse
107 option_list = list( 228 option_list <- list(
108 make_option(c("-f", "--file"), 229 make_option(
109 type="character", 230 c("-f", "--file"),
110 default=NULL, 231 type = "character",
111 help='dataset file name or quoted comma separated names: eg. "file1,file2,file3"', 232 default = NULL,
112 metavar="character"), 233 help = 'dataset file name or quoted comma separated names: eg. "file1,file2,file3"',
113 make_option(c("-d", "--dir"), 234 metavar = "character"
114 type="character", 235 ),
115 default=NULL, 236 make_option(
116 help="directory location of json files", 237 c("-d", "--dir"),
117 metavar="character"), 238 type = "character",
118 make_option(c("-v", "--version"), 239 default = NULL,
119 type="character", 240 help = "directory location of json files",
120 default="", 241 metavar = "character"
121 help="Mykrobe Workflow Version", 242 ),
122 metavar="character"), 243 make_option(
123 make_option(c("-D", "--depth"), 244 c("-v", "--version"),
124 type="integer", 245 type = "character",
125 default=5, 246 default = "",
126 help="Minimum depth of coverage [default= %default]", 247 help = "Mykrobe Workflow Version",
127 metavar="integer"), 248 metavar = "character"
128 make_option(c("-c", "--conf"), 249 ),
129 type="integer", 250 make_option(
130 default=10, 251 c("-p", "--panel"),
131 help="Minimum genotype confidence for variant genotyping [default= %default]", 252 type = "character",
132 metavar="integer"), 253 default = "2019",
133 make_option(c("-n", "--name"), 254 help = "Mykrobe Panel Version: 2019 or 2020. [default= %default]",
134 type="character", 255 metavar = "character"
135 default="", 256 ),
136 help="Name of the run", 257 make_option(
137 metavar="character") 258 c("-D", "--depth"),
259 type = "integer",
260 default = 5,
261 help = "Minimum depth of coverage [default= %default]",
262 metavar = "integer"
263 ),
264 make_option(
265 c("-c", "--conf"),
266 type = "integer",
267 default = 10,
268 help = "Minimum genotype confidence for variant genotyping [default= %default]",
269 metavar = "integer"
270 ),
271 make_option(
272 c("-n", "--name"),
273 type = "character",
274 default = "",
275 help = "Name of the run",
276 metavar = "character"
277 ),
278 make_option(
279 c("-r", "--reportfile"),
280 type = "character",
281 default = "report",
282 help = "File name for susceptibility report data",
283 metavar = "character"
284 ),
285 make_option(
286 c("-s", "--speciationfile"),
287 type = "character",
288 default = "jsondata",
289 help = "File name for speciation data",
290 metavar = "character"
291 )
138 ) 292 )
139 293
140 opt_parser = OptionParser(option_list=option_list) 294 opt_parser <- OptionParser(option_list = option_list)
141 opt = parse_args(opt_parser) 295 opt <- parse_args(opt_parser)
142 296
143 if (is.null(opt$file) & is.null(opt$dir)){ 297 if (is.null(opt$file) && is.null(opt$dir)) {
144 print_help(opt_parser) 298 print_help(opt_parser)
145 stop("At least one argument must be supplied to input file or directory", call.=FALSE) 299 stop("At least one argument must be supplied to input file or directory",
300 call. = FALSE
301 )
302 }
303
304 if (opt$panel != "2019" && opt$panel != "2020") {
305 print_help(opt_parser)
306 stop("Panel must be one of 2019 or 2020", call. = FALSE)
146 } 307 }
147 308
148 # Parameters to take from Galaxy/CL as args or however works best 309 # Parameters to take from Galaxy/CL as args or however works best
149 params <- c("", # Lims_Comment 310 params <- c(
150 "", # Lims_INTComment 311 "", # Lims_Comment
151 opt$version, # Mykrobe_Workflow_Version 312 "", # Lims_INTComment
152 opt$depth, # Mykrobe_min_depth_default_5 313 opt$version, # Mykrobe_Workflow_Version
153 opt$conf, # Mykrobe_min_conf_default_10 314 opt$panel, # Mykrobe Panel Version
154 "", # LIMS_file - empty as it's an upload field in LIMS 315 opt$depth, # Mykrobe_min_depth_default_5
155 opt$name) # Mutation_set_version 316 opt$conf, # Mykrobe_min_conf_default_10
156 317 "", # LIMS_file - empty as it's an upload field in LIMS
157 names(params) <- c("Lims_Comment", 318 opt$name
158 "Lims_INTComment", 319 ) # Mutation_set_version
159 "Mykrobe_Workflow_Version", 320
160 "Mykrobe_min_depth_default_5", 321 names(params) <- c(
161 "Mykrobe_min_conf_default_10", 322 "Lims_Comment",
162 "LIMS_file", 323 "Lims_INTComment",
163 "Mutation_set_version") 324 "Mykrobe_Workflow_Version",
325 "Mykrobe_Panel_Version",
326 "Mykrobe_min_depth_default_5",
327 "Mykrobe_min_conf_default_10",
328 "LIMS_file",
329 "Mutation_set_version"
330 )
164 331
165 332
166 # A default report in the order our LIMS requires 333 # A default report in the order our LIMS requires
167 334
168 # Make a default dataframe to combine the rest into and enforce column order / fill missing ones with NAs 335 # Make a default dataframe to combine the rest into and enforce column order / fill missing ones with NAs
169 columns <- c("file", 336 columns <- c(
170 "Mykrobe_fabG1", 337 "file",
171 "Mykrobe_katG", 338 "Mykrobe_fabG1",
172 "Mykrobe_ahpC", 339 "Mykrobe_katG",
173 "Mykrobe_inhA", 340 "Mykrobe_ahpC",
174 "Mykrobe_ndh", 341 "Mykrobe_inhA",
175 "Isoniazid_R_mutations", 342 "Mykrobe_ndh",
176 "Isoniazid_Prediction", 343 "Isoniazid_R_mutations",
177 "Mykrobe_rpoB", 344 "Isoniazid_Prediction",
178 "Rifampicin_R_mutations", 345 "Mykrobe_rpoB",
179 "Rifampicin_Prediction", 346 "Rifampicin_R_mutations",
180 "Mykrobe_embB", 347 "Rifampicin_Prediction",
181 "Mykrobe_embA", 348 "Mykrobe_embB",
182 "Ethambutol_R_mutations", 349 "Mykrobe_embA",
183 "Ethambutol_Prediction", 350 "Ethambutol_R_mutations",
184 "Mykrobe_pncA", 351 "Ethambutol_Prediction",
185 "Mykrobe_rpsA", 352 "Mykrobe_pncA",
186 "Pyrazinamide_R_mutations", 353 "Mykrobe_rpsA",
187 "Pyrazinamide_Prediction", 354 "Pyrazinamide_R_mutations",
188 "Mykrobe_Ofloxacin_gyrA", 355 "Pyrazinamide_Prediction",
189 "Ofloxacin_R_mutations", 356 "Mykrobe_Ofloxacin_gyrA",
190 "Ofloxacin_Prediction", 357 "Ofloxacin_R_mutations",
191 "Mykrobe_Moxifloxacin_gyrA", 358 "Ofloxacin_Prediction",
192 "Moxifloxacin_R_mutations", 359 "Mykrobe_Moxifloxacin_gyrA",
193 "Moxifloxacin_Prediction", 360 "Moxifloxacin_R_mutations",
194 "Mykrobe_rpsL", 361 "Moxifloxacin_Prediction",
195 "Mykrobe_Streptomycin_rrs", 362 "Mykrobe_Ciprofloxacin_gyrA",
196 "Mykrobe_Streptomycin_gid", 363 "Ciprofloxacin_R_mutations",
197 "Streptomycin_R_mutations", 364 "Ciprofloxacin_Prediction",
198 "Streptomycin_Prediction", 365 "Mykrobe_rpsL",
199 "Mykrobe_Amikacin_rrs", 366 "Mykrobe_Streptomycin_rrs",
200 "Amikacin_R_mutations", 367 "Mykrobe_Streptomycin_gid",
201 "Amikacin_Prediction", 368 "Streptomycin_R_mutations",
202 "Mykrobe_Capreomycin_rrs", 369 "Streptomycin_Prediction",
203 "Mykrobe_Capreomycin_tlyA", 370 "Mykrobe_Amikacin_rrs",
204 "Capreomycin_R_mutations", 371 "Amikacin_R_mutations",
205 "Capreomycin_Prediction", 372 "Amikacin_Prediction",
206 "Mykrobe_Kanamycin_rrs", 373 "Mykrobe_Capreomycin_rrs",
207 "Mykrobe_Kanamycin_eis", 374 "Mykrobe_Capreomycin_tlyA",
208 "Kanamycin_R_mutations", 375 "Capreomycin_R_mutations",
209 "Kanamycin_Prediction", 376 "Capreomycin_Prediction",
210 "Lims_Comment", 377 "Mykrobe_Kanamycin_rrs",
211 "Lims_INTComment", 378 "Mykrobe_Kanamycin_eis",
212 "Mykrobe_Workflow_Version", 379 "Kanamycin_R_mutations",
213 "mykrobe_version", 380 "Kanamycin_Prediction",
214 "Mykrobe_Resistance_probe_set", 381 "Lims_Comment",
215 "Mykrobe_min_depth_default_5", 382 "Lims_INTComment",
216 "Mykrobe_min_conf_default_10", 383 "Mykrobe_Workflow_Version",
217 "LIMS_file", 384 "mykrobe_version",
218 "Mutation_set_version") 385 "Mykrobe_Resistance_probe_set",
219 386 "Mykrobe_min_depth_default_5",
220 report <- setNames(data.frame(matrix("", ncol = length(columns), nrow = 1), stringsAsFactors = F), columns) 387 "Mykrobe_min_conf_default_10",
221 388 "LIMS_file",
389 "Mutation_set_version"
390 )
391
392 report <-
393 setNames(data.frame(matrix(
394 "",
395 ncol = length(columns), nrow = 1
396 ), stringsAsFactors = FALSE), columns)
397
398 report_cols <- c(
399 "file",
400 "phylo_group",
401 "species",
402 "lineage",
403 "phylo_group_per_covg",
404 "species_per_covg",
405 "lineage_per_covg",
406 "phylo_group_depth",
407 "species_depth",
408 "lineage_depth"
409 )
222 410
223 # List of drugs that are tested 411 # List of drugs that are tested
224 all_drugs <- c("Isoniazid", 412 all_drugs <- c(
225 "Rifampicin", 413 "Isoniazid",
226 "Ethambutol", 414 "Rifampicin",
227 "Pyrazinamide", 415 "Ethambutol",
228 "Moxifloxacin", 416 "Pyrazinamide",
229 "Ofloxacin", 417 "Moxifloxacin",
230 "Streptomycin", 418 "Ofloxacin",
231 "Amikacin", 419 "Streptomycin",
232 "Capreomycin", 420 "Amikacin",
233 "Kanamycin") 421 "Capreomycin",
422 "Kanamycin"
423 )
234 424
235 # Do Stuff #### 425 # Do Stuff ####
236 426
237 # Import all the JSON files into a list of lists format #### 427 # Import all the JSON files into a list of lists format ####
238 428
239 if (is.null(opt$file)){ 429 if (is.null(opt$file)) {
240 # opt$dir is used to get the list of files, a vector of non-duplicated files is then passed to map 430 # opt$dir is used to get the list of files, a vector of non-duplicated files is then passed to map
241 files <- list.files(path = opt$dir, 431 files <- list.files(
242 pattern = "*.json", 432 path = opt$dir,
243 full.names = T) 433 pattern = "*.json",
244 }else{ 434 full.names = TRUE
435 )
436 } else {
245 files <- unlist(strsplit(opt$file, ",")) 437 files <- unlist(strsplit(opt$file, ","))
246 } 438 }
247 439
248 files <- files[!duplicated(basename(files))] 440 files <- files[!duplicated(basename(files))]
249 441
250 list.of.json.files <- map(files, 442 list.of.json.files <- map(
251 ~ fromJSON(.x, simplifyDataFrame = F) 443 files,
444 ~ fromJSON(.x, simplifyDataFrame = FALSE)
252 ) 445 )
253 446
254 447
255 # Apply that getResults function to each element in your list then bash it together into a final report 448 # Apply the correct getResults function to each element in your list then bash it together into a final report
256 449
257 temp <- map(list.of.json.files, getResults) %>% 450 if (opt$panel == "2019") {
258 bind_rows() 451 temp <- map(list.of.json.files, getResults2019) %>%
452 bind_rows()
453 } else if (opt$panel == "2020") {
454 temp <- map(list.of.json.files, getResults2020) %>%
455 bind_rows()
456 columns <-
457 setdiff(
458 columns,
459 c(
460 "Mykrobe_Ciprofloxacin_gyrA",
461 "Ciprofloxacin_R_mutations",
462 "Ciprofloxacin_Prediction"
463 )
464 )
465 report_cols <- setdiff(
466 report_cols,
467 c(
468 "lineage",
469 "lineage_per_covg",
470 "lineage_depth"
471 )
472 )
473 } else {
474 stop("Panel must be one of 2019 or 2020", call. = FALSE)
475 }
259 476
260 477
261 # Predictions of resistance or susceptibility 478 # Predictions of resistance or susceptibility
262 predictions.table <- 479
480 predictions.table <-
263 temp %>% 481 temp %>%
264 select(file, drug, susceptibility) %>% 482 select(file, drug, susceptibility) %>%
265 mutate(drug = paste(drug, "_Prediction", sep = "")) %>% 483 mutate(drug = paste(drug, "_Prediction", sep = "")) %>%
266 spread(drug, susceptibility, fill = "failed") %>% 484 spread(drug, susceptibility, fill = "failed") %>%
267 select(-starts_with("NA")) 485 select(-starts_with("NA"))
268 486
269 if (length(predictions.table) == 1){ 487 if (length(predictions.table) == 1) {
270 print(predictions.table) 488 print(predictions.table)
271 stop("No susceptibility results in files specified. Did the testing fail?", call.=FALSE) 489 stop("No susceptibility results in files specified. Did the testing fail?",
490 call. = FALSE
491 )
272 } 492 }
273 493
274 # Variants, if present 494 # Variants, if present
275 if (0 < predictions.table %>% 495 num.variants <-
276 select(ends_with("_Prediction")) %>% 496 predictions.table %>%
277 unlist(use.names = F) %>% 497 select(ends_with("_Prediction")) %>%
278 str_count("[R,r]") %>% 498 unlist(use.names = FALSE) %>%
279 sum()){ 499 str_count("[R,r]") %>%
280 500 sum()
281 # Multiple resistance mutations and confidence per drug in the X_R_mutations column 501
282 # Actual protein changes in Mykrobe_X columns 502 if (num.variants > 0) {
283 503 # Multiple resistance mutations and confidence per drug in the X_R_mutations column
284 variants.temp <- 504 # Actual protein changes in Mykrobe_X columns
285 temp %>% 505
286 select(file, drug, variants = `variants (gene:alt_depth:wt_depth:conf)`) %>% 506 variants.temp <-
287 mutate(variants = replace(variants, variants == "", NA)) %>% # Make missing data consistent... 507 temp %>%
288 filter(!is.na(variants)) %>% # ...Then get rid of it 508 select(file, drug, variants = `variants (gene:alt_depth:wt_depth:conf)`) %>%
289 mutate(tempcols = paste(drug, "R_mutations", sep = "_")) %>% 509 mutate(variants = replace(variants, variants == "", NA)) %>% # Make missing data consistent...
290 mutate(R_mutations = variants) %>% 510 filter(!is.na(variants)) %>% # ...Then get rid of it
291 mutate(variants = strsplit(variants, "__")) %>% # Split the mutations across rows (list first then split across rows) 511 mutate(tempcols = paste(drug, "R_mutations", sep = "_")) %>%
292 unnest(variants) %>% 512 mutate(R_mutations = variants) %>%
293 separate(variants, c("gene", "mutation"), "_") %>% 513 mutate(variants = strsplit(variants, "__")) %>% # Split the mutations across rows (list first then split across rows)
294 mutate(columnname = ifelse(gene %in% c("gyrA", "tlyA", "rrs", "eis", "gid"), # Check for columns that include the drug name or not and paste accordingly 514 unnest(variants) %>%
295 paste("Mykrobe", drug, gene, sep = "_"), 515 separate(variants, c("gene", "mutation"), "_") %>%
296 paste("Mykrobe", gene, sep = "_"))) %>% 516 mutate(columnname = ifelse(
297 # Extract out the mutation information with a regex that covers all potential genes 517 gene %in% c("gyrA", "rrs", "eis", "gid"),
298 # This regex looks for whatever is ahead of the first colon and after the last hyphen 518 # Check for columns that include the drug name or not and paste accordingly
299 mutate(mutation = str_match(mutation, "(.*)-.*:")[,2]) %>% 519 paste("Mykrobe", drug, gene, sep = "_"),
300 select(file, tempcols, R_mutations, columnname, mutation) 520 paste("Mykrobe", gene, sep = "_")
301 521 )) %>%
302 # Split each kind of variants into its own temp table then merge 522 # Extract out the mutation information with a regex that covers all potential genes
303 variants.1 <- 523 # This regex looks for whatever is ahead of the first colon and after the last hyphen
304 variants.temp %>% 524 mutate(mutation = str_match(mutation, "(.*)-.*:")[, 2]) %>%
305 select(file, tempcols, R_mutations) %>% 525 select(file, tempcols, R_mutations, columnname, mutation)
306 distinct() %>% 526
307 spread(tempcols, R_mutations) 527 # Split each kind of variants into its own temp table then merge
308 528 variants.1 <-
309 variants.2 <- 529 variants.temp %>%
310 variants.temp %>% 530 select(file, tempcols, R_mutations) %>%
311 select(file, columnname, mutation) %>% 531 distinct() %>%
312 group_by(file, columnname) %>% 532 spread(tempcols, R_mutations)
313 summarise(mutation = paste(mutation, collapse = ";")) %>% 533
314 spread(columnname, mutation) 534 variants.2 <-
315 535 variants.temp %>%
316 variants.table <- full_join(variants.1, variants.2, by = "file") 536 select(file, columnname, mutation) %>%
317 }else{ 537 group_by(file, columnname) %>%
318 variants.table <- data.frame(file=predictions.table$file, stringsAsFactors = F) 538 summarise(mutation = paste(mutation, collapse = ";")) %>%
539 spread(columnname, mutation)
540
541 variants.table <-
542 full_join(variants.1, variants.2, by = "file")
543 } else {
544 variants.table <-
545 data.frame(file = predictions.table$file, stringsAsFactors = FALSE)
319 } 546 }
320 547
321 548
322 # Make a report #### 549 # Make a report ####
323 550
324 report <- 551 report <-
325 temp %>% 552 temp %>%
326 select(file, mykrobe_version, Mykrobe_Resistance_probe_set) %>% # Get important info from initial table 553 select(file, mykrobe_version, Mykrobe_Resistance_probe_set) %>% # Get important info from initial table
327 distinct() %>% # Drop duped rows and combine all the tables together 554 distinct() %>% # Drop duped rows and combine all the tables together
328 full_join(variants.table) %>% 555 full_join(variants.table) %>%
329 full_join(predictions.table) %>% 556 full_join(predictions.table) %>%
330 bind_rows(report) %>% # Use bind_rows to add columns (eg. unteseted drugs) to the final output 557 bind_rows(report) %>% # Use bind_rows to add columns (eg. unteseted drugs) to the final output
331 filter(file != "") 558 filter(file != "")
332 559
333 # Only add the 'no mutation' replacement to the columns that actually have a result 560 # Only add the 'no mutation' replacement to the columns that actually have a result
334 report <- 561 report <-
335 report %>% 562 report %>%
336 filter_at(vars(ends_with("_Prediction")), any_vars(. != "failed")) %>% 563 filter_at(vars(ends_with("_Prediction")), any_vars(. != "failed")) %>%
337 mutate_at(vars(starts_with("Mykrobe_")), funs(replace(., is.na(.), "No Mutation"))) %>% 564 mutate_at(vars(starts_with("Mykrobe_")), funs(replace(., is.na(.), "No Mutation"))) %>%
338 full_join(anti_join(report, ., by = "file")) %>% 565 full_join(anti_join(report, ., by = "file")) %>%
339 select(columns) 566 select(columns)
340 567
341 568
342 # Add in the parameters fed from Galaxy using named character vector 569 # Add in the parameters fed from Galaxy using named character vector
343 report <- 570 report <-
344 report %>% 571 report %>%
345 mutate( 572 mutate(
346 Lims_Comment = params["Lims_Comment"], 573 Lims_Comment = params["Lims_Comment"],
347 Lims_INTComment = params["Lims_INTComment"], 574 Lims_INTComment = params["Lims_INTComment"],
348 Mykrobe_Workflow_Version = params["Mykrobe_Workflow_Version"], 575 Mykrobe_Workflow_Version = params["Mykrobe_Workflow_Version"],
349 Mykrobe_min_depth_default_5 = params["Mykrobe_min_depth_default_5"], 576 Mykrobe_min_depth_default_5 = params["Mykrobe_min_depth_default_5"],
350 Mykrobe_min_conf_default_10 = params["Mykrobe_min_conf_default_10"], 577 Mykrobe_min_conf_default_10 = params["Mykrobe_min_conf_default_10"],
351 LIMS_file = params["LIMS_file"], 578 LIMS_file = params["LIMS_file"],
352 Mutation_set_version = params["Mutation_set_version"] 579 Mutation_set_version = params["Mutation_set_version"]
353 ) 580 )
354
355
356 #View(report)
357 581
358 # Write some output 582 # Write some output
359 # Report as is 583 # Report as is
360 write.csv(report, "output-report.csv", row.names = F) 584 write.csv(report, "output-report.csv", row.names = FALSE)
361 print("Writing Susceptibility report to CSV as output-report.csv") 585 print("Writing Susceptibility report to CSV as output-report.csv")
362 586
363 # Select specific columns from temp and output them 587 # Select specific columns from temp and output them
364 temp %>% 588 # Addition of any_of accounts for both 2019 and 2020 panels
365 select(file, 589
366 phylo_group, 590 temp %>%
367 species, 591 select_at( # This is a dplyr 0.8.3 function, superceded in newer versions but this tool is built around a number of specific deps
368 lineage, 592 report_cols
369 phylo_group_per_covg, 593 ) %>%
370 species_per_covg,
371 lineage_per_covg,
372 phylo_group_depth,
373 species_depth,
374 lineage_depth) %>%
375 distinct() %>% 594 distinct() %>%
376 write.csv("output-jsondata.csv", row.names = F) 595 write.csv(file = "output-jsondata.csv", row.names = FALSE)
377 print("Writing JSON data to CSV as output-jsondata.csv") 596 print("Writing JSON data to CSV as output-jsondata.csv")
378 sink(NULL, type="message") # close the sink 597 sink(NULL, type = "message") # close the sink
379 598
380 quit() 599 quit()