Mercurial > repos > nml > mykrobe_parser
comparison mykrobe_parser.R @ 6:8e7e5a660942 draft
planemo upload for repository https://github.com/phac-nml/mykrobe-parser commit 077a66175f3666924b42f216bbcb671ee0d026f2
author | nml |
---|---|
date | Fri, 22 Sep 2023 17:24:02 +0000 |
parents | deebc6410d13 |
children |
comparison
equal
deleted
inserted
replaced
5:deebc6410d13 | 6:8e7e5a660942 |
---|---|
1 # Copyright Government of Canada 2018 | 1 # Copyright Government of Canada 2018 |
2 # | 2 # |
3 # Written by: National Microbiology Laboratory, Public Health Agency of Canada | 3 # Written by: National Microbiology Laboratory, Public Health Agency of Canada |
4 # | 4 # |
5 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use | 5 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use |
6 # this work except in compliance with the License. You may obtain a copy of the | 6 # this work except in compliance with the License. You may obtain a copy of the |
7 # License at: | 7 # License at: |
8 # | 8 # |
9 # http://www.apache.org/licenses/LICENSE-2.0 | 9 # http://www.apache.org/licenses/LICENSE-2.0 |
10 # | 10 # |
11 # Unless required by applicable law or agreed to in writing, software distributed | 11 # Unless required by applicable law or agreed to in writing, software distributed |
12 # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | 12 # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR |
13 # CONDITIONS OF ANY KIND, either express or implied. See the License for the | 13 # CONDITIONS OF ANY KIND, either express or implied. See the License for the |
14 # specific language governing permissions and limitations under the License. | 14 # specific language governing permissions and limitations under the License. |
15 | 15 |
16 | 16 |
17 # Parsing JSONs from Mykrobe Predict into CSV reports | 17 # Parsing JSONs from Mykrobe Predict into CSV reports |
18 # Take the JSON output from Mykrobe, rearrange, output for LIMS | 18 # Take the JSON output from Mykrobe, rearrange, output for LIMS |
19 # Adrian Zetner | 19 # Adrian Zetner |
20 # August 2018 | 20 # August 2018 |
21 # Updated August 2023 | |
21 | 22 |
22 # Libraries #### | 23 # Libraries #### |
23 | 24 |
24 sink(stdout(), type = "message") | 25 sink(stdout(), type = "message") |
25 | 26 |
33 library(optparse) | 34 library(optparse) |
34 }) | 35 }) |
35 | 36 |
36 # Define custom functions, variables, and paths. Collect and use CL arguments #### | 37 # Define custom functions, variables, and paths. Collect and use CL arguments #### |
37 | 38 |
38 # Here's a function to recreate that output table from the input JSON files | 39 # Here's a function to recreate that output table from the input JSON files for 2019 |
39 | 40 |
40 getResults <- function(listelement){ | 41 getResults2019 <- function(listelement) { |
41 # Define list levels for various elements of the json | 42 # Define list levels for various elements of the json |
42 species <- names(listelement[[1]][["phylogenetics"]][["species"]]) | 43 phylo_group <- |
43 lineage <- names(listelement[[1]][["phylogenetics"]][["lineage"]]) | 44 names(listelement[[1]][["phylogenetics"]][["phylo_group"]]) |
44 phylo_group <- names(listelement[[1]][["phylogenetics"]][["phylo_group"]]) | 45 if ("Non_tuberculosis_mycobacterium_complex" %in% phylo_group) { |
45 if("Non_tuberculosis_mycobacterium_complex" %in% phylo_group){ | 46 warning( |
46 warning(paste("Non-tuberculosis mycobacteria detected in file ", names(listelement), ". Skipping.", sep = "")) | 47 paste( |
47 return()} | 48 "Non-tuberculosis mycobacteria detected in file ", |
48 | 49 names(listelement), |
50 ". Skipping.", | |
51 sep = "" | |
52 ) | |
53 ) | |
54 return() | |
55 } | |
56 | |
57 species <- | |
58 names(listelement[[1]][["phylogenetics"]][["species"]]) | |
59 lineage <- | |
60 if (length(listelement[[1]][["phylogenetics"]][["lineage"]]) == 1) { | |
61 names(listelement[[1]][["phylogenetics"]][["lineage"]]) | |
62 } else { | |
63 listelement[[1]][["phylogenetics"]][["lineage"]][["lineage"]] | |
64 } | |
65 | |
49 # Start building a list of all your various elements | 66 # Start building a list of all your various elements |
50 temp <- list(mykrobe_version = listelement[[1]][["version"]][["mykrobe-predictor"]], | 67 temp <- |
51 file = names(listelement), # One element | 68 list( |
52 plate_name = "test", # This probably needs changing | 69 mykrobe_version = listelement[[1]][["version"]][["mykrobe-predictor"]], |
53 sample = "sequence_calls", # Likewise change this | 70 file = names(listelement), |
54 phylo_group = phylo_group, # As above | 71 # One element |
55 species = species, # As above | 72 plate_name = "test", |
56 lineage = lineage, # As above | 73 # This probably needs changing |
57 # The following expressions drill down into the list elements and pull out what is needed. | 74 sample = "sequence_calls", |
58 # It's inelegant and vulnerable to changes in the input formats but if they're consistent it'll work | 75 # Likewise change this |
59 phylo_group_per_covg = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["percent_coverage"]], | 76 phylo_group = phylo_group, |
60 species_per_covg = listelement[[1]][["phylogenetics"]][["species"]][[species]][["percent_coverage"]], | 77 # As above |
61 lineage_per_covg = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["percent_coverage"]], | 78 species = species, |
62 phylo_group_depth = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["median_depth"]], | 79 # As above |
63 species_depth = listelement[[1]][["phylogenetics"]][["species"]][[species]][["median_depth"]], | 80 lineage = lineage, |
64 lineage_depth = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["median_depth"]], | 81 # As above |
65 Mykrobe_Resistance_probe_set = basename(listelement[[1]][["probe_sets"]][2]) # Is it always the second? | 82 # The following expressions drill down into the list elements and pull out what is needed. |
66 ) | 83 # It's inelegant and vulnerable to changes in the input formats but if they're consistent it'll work |
67 | 84 phylo_group_per_covg = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["percent_coverage"]], |
85 species_per_covg = listelement[[1]][["phylogenetics"]][["species"]][[species]][["percent_coverage"]], | |
86 lineage_per_covg = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["percent_coverage"]], | |
87 phylo_group_depth = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["median_depth"]], | |
88 species_depth = listelement[[1]][["phylogenetics"]][["species"]][[species]][["median_depth"]], | |
89 lineage_depth = listelement[[1]][["phylogenetics"]][["lineage"]][[lineage]][["median_depth"]], | |
90 Mykrobe_Resistance_probe_set = basename(listelement[[1]][["probe_sets"]][2]) # Is it always the second? | |
91 ) | |
92 | |
68 # Super cool nested and vectorized (for SPEED!) functions to grab the predictions for drug sensitivity and gene variants | 93 # Super cool nested and vectorized (for SPEED!) functions to grab the predictions for drug sensitivity and gene variants |
69 # Both produce character vectors of the same length as the number of drugs tested in the same order | 94 # Both produce character vectors of the same length as the number of drugs tested in the same order |
70 # All of these also check if there are missing values in drug/susceptibility/variant elements and adds the column anyhow | 95 # All of these also check if there are missing values in drug/susceptibility/variant elements and adds the column anyhow |
71 | 96 |
72 if(length(map_chr(listelement[[1]][["susceptibility"]], "predict")) != 0){ | 97 if (length(map_chr(listelement[[1]][["susceptibility"]], "predict")) != 0) { |
73 temp$susceptibility <- map_chr(listelement[[1]][["susceptibility"]], "predict") | 98 temp$susceptibility <- |
74 }else{ | 99 map_chr(listelement[[1]][["susceptibility"]], "predict") |
100 } else { | |
75 temp$susceptibility <- NA | 101 temp$susceptibility <- NA |
76 } | 102 } |
77 | 103 |
78 if(length(names(listelement[[1]][["susceptibility"]])) != 0){ | 104 if (length(names(listelement[[1]][["susceptibility"]])) != 0) { |
79 temp$drug <- names(listelement[[1]][["susceptibility"]]) | 105 temp$drug <- names(listelement[[1]][["susceptibility"]]) |
80 }else{ | 106 } else { |
81 temp$drug <- NA | 107 temp$drug <- NA |
82 } | 108 } |
83 | 109 |
84 mapped.variants <- map(listelement[[1]][["susceptibility"]], # Dig into the lists, pull out variants and collapse into chr vector | 110 mapped.variants <- |
85 ~ imap(.x[["called_by"]], # imap is shorthand for map2(x, names(x), ...), calling .y gets you the name / index of the current element | 111 map( |
86 ~ paste(.y, | 112 listelement[[1]][["susceptibility"]], # Dig into the lists, pull out variants and collapse into chr vector |
87 .x[["info"]][["coverage"]][["alternate"]][["median_depth"]], | 113 ~ imap( |
88 .x[["info"]][["coverage"]][["reference"]][["median_depth"]], | 114 .x[["called_by"]], # imap is shorthand for map2(x, names(x), ...), calling .y gets you the name / index of the current element |
89 .x[["info"]][["conf"]], | 115 ~ paste(.y, |
90 sep = ":"))) %>% | 116 .x[["info"]][["coverage"]][["alternate"]][["median_depth"]], |
117 .x[["info"]][["coverage"]][["reference"]][["median_depth"]], | |
118 .x[["info"]][["conf"]], | |
119 sep = ":" | |
120 ) | |
121 ) | |
122 ) %>% | |
91 map_chr(~ paste(.x, collapse = "__")) | 123 map_chr(~ paste(.x, collapse = "__")) |
92 | 124 |
93 if(length(mapped.variants) != 0){ | 125 if (length(mapped.variants) != 0) { |
94 temp$`variants (gene:alt_depth:wt_depth:conf)` <- mapped.variants | 126 temp$`variants (gene:alt_depth:wt_depth:conf)` <- mapped.variants |
95 }else{ | 127 } else { |
96 temp$`variants (gene:alt_depth:wt_depth:conf)` <- NA | 128 temp$`variants (gene:alt_depth:wt_depth:conf)` <- NA |
97 } | 129 } |
98 | 130 |
99 temp$`genes (prot_mut-ref_mut:percent_covg:depth)` <- NA | 131 temp$`genes (prot_mut-ref_mut:percent_covg:depth)` <- NA |
100 | 132 |
101 # Take that list and mash all the elements together as columns in a tibble, recycling as needed to fill in space | 133 # Take that list and mash all the elements together as columns in a tibble, recycling as needed to fill in space |
102 # eg. phylo_group is repeated/recycled as many times as there are drugs tested | 134 # eg. phylo_group is repeated/recycled as many times as there are drugs tested |
103 as_tibble(temp) | 135 as_tibble(temp) |
104 } | 136 } |
105 | 137 |
138 # Here's a function to recreate that output table from the input JSON files for panel 2020 | |
139 | |
140 getResults2020 <- function(listelement) { | |
141 # Define list levels for various elements of the json | |
142 phylo_group <- | |
143 names(listelement[[1]][["phylogenetics"]][["phylo_group"]]) | |
144 if ("Non_tuberculosis_mycobacterium_complex" %in% phylo_group) { | |
145 warning( | |
146 paste( | |
147 "Non-tuberculosis mycobacteria detected in file ", | |
148 names(listelement), | |
149 ". Skipping.", | |
150 sep = "" | |
151 ) | |
152 ) | |
153 return() | |
154 } | |
155 | |
156 species <- | |
157 names(listelement[[1]][["phylogenetics"]][["species"]]) | |
158 | |
159 # Start building a list of all your various elements | |
160 temp <- | |
161 list( | |
162 mykrobe_version = listelement[[1]][["version"]][["mykrobe-predictor"]], | |
163 file = names(listelement), | |
164 # One element | |
165 plate_name = "test", | |
166 # This probably needs changing | |
167 sample = "sequence_calls", | |
168 # Likewise change this | |
169 phylo_group = phylo_group, | |
170 # As above | |
171 species = species, | |
172 # As above | |
173 # The following expressions drill down into the list elements and pull out what is needed. | |
174 # It's inelegant and vulnerable to changes in the input formats but if they're consistent it'll work | |
175 phylo_group_per_covg = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["percent_coverage"]], | |
176 species_per_covg = listelement[[1]][["phylogenetics"]][["species"]][[species]][["percent_coverage"]], | |
177 phylo_group_depth = listelement[[1]][["phylogenetics"]][["phylo_group"]][[phylo_group]][["median_depth"]], | |
178 species_depth = listelement[[1]][["phylogenetics"]][["species"]][[species]][["median_depth"]], | |
179 Mykrobe_Resistance_probe_set = basename(listelement[[1]][["probe_sets"]][2]) # Is it always the second? | |
180 ) | |
181 | |
182 # Super cool nested and vectorized (for SPEED!) functions to grab the predictions for drug sensitivity and gene variants | |
183 # Both produce character vectors of the same length as the number of drugs tested in the same order | |
184 # All of these also check if there are missing values in drug/susceptibility/variant elements and adds the column anyhow | |
185 | |
186 if (length(map_chr(listelement[[1]][["susceptibility"]], "predict")) != 0) { | |
187 temp$susceptibility <- | |
188 map_chr(listelement[[1]][["susceptibility"]], "predict") | |
189 } else { | |
190 temp$susceptibility <- NA | |
191 } | |
192 | |
193 if (length(names(listelement[[1]][["susceptibility"]])) != 0) { | |
194 temp$drug <- names(listelement[[1]][["susceptibility"]]) | |
195 } else { | |
196 temp$drug <- NA | |
197 } | |
198 | |
199 mapped.variants <- | |
200 map( | |
201 listelement[[1]][["susceptibility"]], # Dig into the lists, pull out variants and collapse into chr vector | |
202 ~ imap( | |
203 .x[["called_by"]], # imap is shorthand for map2(x, names(x), ...), calling .y gets you the name / index of the current element | |
204 ~ paste(.y, | |
205 .x[["info"]][["coverage"]][["alternate"]][["median_depth"]], | |
206 .x[["info"]][["coverage"]][["reference"]][["median_depth"]], | |
207 .x[["info"]][["conf"]], | |
208 sep = ":" | |
209 ) | |
210 ) | |
211 ) %>% | |
212 map_chr(~ paste(.x, collapse = "__")) | |
213 | |
214 if (length(mapped.variants) != 0) { | |
215 temp$`variants (gene:alt_depth:wt_depth:conf)` <- mapped.variants | |
216 } else { | |
217 temp$`variants (gene:alt_depth:wt_depth:conf)` <- NA | |
218 } | |
219 | |
220 temp$`genes (prot_mut-ref_mut:percent_covg:depth)` <- NA | |
221 | |
222 # Take that list and mash all the elements together as columns in a tibble, recycling as needed to fill in space | |
223 # eg. phylo_group is repeated/recycled as many times as there are drugs tested | |
224 as_tibble(temp) | |
225 } | |
226 | |
106 # Get command line arguments with optparse | 227 # Get command line arguments with optparse |
107 option_list = list( | 228 option_list <- list( |
108 make_option(c("-f", "--file"), | 229 make_option( |
109 type="character", | 230 c("-f", "--file"), |
110 default=NULL, | 231 type = "character", |
111 help='dataset file name or quoted comma separated names: eg. "file1,file2,file3"', | 232 default = NULL, |
112 metavar="character"), | 233 help = 'dataset file name or quoted comma separated names: eg. "file1,file2,file3"', |
113 make_option(c("-d", "--dir"), | 234 metavar = "character" |
114 type="character", | 235 ), |
115 default=NULL, | 236 make_option( |
116 help="directory location of json files", | 237 c("-d", "--dir"), |
117 metavar="character"), | 238 type = "character", |
118 make_option(c("-v", "--version"), | 239 default = NULL, |
119 type="character", | 240 help = "directory location of json files", |
120 default="", | 241 metavar = "character" |
121 help="Mykrobe Workflow Version", | 242 ), |
122 metavar="character"), | 243 make_option( |
123 make_option(c("-D", "--depth"), | 244 c("-v", "--version"), |
124 type="integer", | 245 type = "character", |
125 default=5, | 246 default = "", |
126 help="Minimum depth of coverage [default= %default]", | 247 help = "Mykrobe Workflow Version", |
127 metavar="integer"), | 248 metavar = "character" |
128 make_option(c("-c", "--conf"), | 249 ), |
129 type="integer", | 250 make_option( |
130 default=10, | 251 c("-p", "--panel"), |
131 help="Minimum genotype confidence for variant genotyping [default= %default]", | 252 type = "character", |
132 metavar="integer"), | 253 default = "2019", |
133 make_option(c("-n", "--name"), | 254 help = "Mykrobe Panel Version: 2019 or 2020. [default= %default]", |
134 type="character", | 255 metavar = "character" |
135 default="", | 256 ), |
136 help="Name of the run", | 257 make_option( |
137 metavar="character") | 258 c("-D", "--depth"), |
259 type = "integer", | |
260 default = 5, | |
261 help = "Minimum depth of coverage [default= %default]", | |
262 metavar = "integer" | |
263 ), | |
264 make_option( | |
265 c("-c", "--conf"), | |
266 type = "integer", | |
267 default = 10, | |
268 help = "Minimum genotype confidence for variant genotyping [default= %default]", | |
269 metavar = "integer" | |
270 ), | |
271 make_option( | |
272 c("-n", "--name"), | |
273 type = "character", | |
274 default = "", | |
275 help = "Name of the run", | |
276 metavar = "character" | |
277 ), | |
278 make_option( | |
279 c("-r", "--reportfile"), | |
280 type = "character", | |
281 default = "report", | |
282 help = "File name for susceptibility report data", | |
283 metavar = "character" | |
284 ), | |
285 make_option( | |
286 c("-s", "--speciationfile"), | |
287 type = "character", | |
288 default = "jsondata", | |
289 help = "File name for speciation data", | |
290 metavar = "character" | |
291 ) | |
138 ) | 292 ) |
139 | 293 |
140 opt_parser = OptionParser(option_list=option_list) | 294 opt_parser <- OptionParser(option_list = option_list) |
141 opt = parse_args(opt_parser) | 295 opt <- parse_args(opt_parser) |
142 | 296 |
143 if (is.null(opt$file) & is.null(opt$dir)){ | 297 if (is.null(opt$file) && is.null(opt$dir)) { |
144 print_help(opt_parser) | 298 print_help(opt_parser) |
145 stop("At least one argument must be supplied to input file or directory", call.=FALSE) | 299 stop("At least one argument must be supplied to input file or directory", |
300 call. = FALSE | |
301 ) | |
302 } | |
303 | |
304 if (opt$panel != "2019" && opt$panel != "2020") { | |
305 print_help(opt_parser) | |
306 stop("Panel must be one of 2019 or 2020", call. = FALSE) | |
146 } | 307 } |
147 | 308 |
148 # Parameters to take from Galaxy/CL as args or however works best | 309 # Parameters to take from Galaxy/CL as args or however works best |
149 params <- c("", # Lims_Comment | 310 params <- c( |
150 "", # Lims_INTComment | 311 "", # Lims_Comment |
151 opt$version, # Mykrobe_Workflow_Version | 312 "", # Lims_INTComment |
152 opt$depth, # Mykrobe_min_depth_default_5 | 313 opt$version, # Mykrobe_Workflow_Version |
153 opt$conf, # Mykrobe_min_conf_default_10 | 314 opt$panel, # Mykrobe Panel Version |
154 "", # LIMS_file - empty as it's an upload field in LIMS | 315 opt$depth, # Mykrobe_min_depth_default_5 |
155 opt$name) # Mutation_set_version | 316 opt$conf, # Mykrobe_min_conf_default_10 |
156 | 317 "", # LIMS_file - empty as it's an upload field in LIMS |
157 names(params) <- c("Lims_Comment", | 318 opt$name |
158 "Lims_INTComment", | 319 ) # Mutation_set_version |
159 "Mykrobe_Workflow_Version", | 320 |
160 "Mykrobe_min_depth_default_5", | 321 names(params) <- c( |
161 "Mykrobe_min_conf_default_10", | 322 "Lims_Comment", |
162 "LIMS_file", | 323 "Lims_INTComment", |
163 "Mutation_set_version") | 324 "Mykrobe_Workflow_Version", |
325 "Mykrobe_Panel_Version", | |
326 "Mykrobe_min_depth_default_5", | |
327 "Mykrobe_min_conf_default_10", | |
328 "LIMS_file", | |
329 "Mutation_set_version" | |
330 ) | |
164 | 331 |
165 | 332 |
166 # A default report in the order our LIMS requires | 333 # A default report in the order our LIMS requires |
167 | 334 |
168 # Make a default dataframe to combine the rest into and enforce column order / fill missing ones with NAs | 335 # Make a default dataframe to combine the rest into and enforce column order / fill missing ones with NAs |
169 columns <- c("file", | 336 columns <- c( |
170 "Mykrobe_fabG1", | 337 "file", |
171 "Mykrobe_katG", | 338 "Mykrobe_fabG1", |
172 "Mykrobe_ahpC", | 339 "Mykrobe_katG", |
173 "Mykrobe_inhA", | 340 "Mykrobe_ahpC", |
174 "Mykrobe_ndh", | 341 "Mykrobe_inhA", |
175 "Isoniazid_R_mutations", | 342 "Mykrobe_ndh", |
176 "Isoniazid_Prediction", | 343 "Isoniazid_R_mutations", |
177 "Mykrobe_rpoB", | 344 "Isoniazid_Prediction", |
178 "Rifampicin_R_mutations", | 345 "Mykrobe_rpoB", |
179 "Rifampicin_Prediction", | 346 "Rifampicin_R_mutations", |
180 "Mykrobe_embB", | 347 "Rifampicin_Prediction", |
181 "Mykrobe_embA", | 348 "Mykrobe_embB", |
182 "Ethambutol_R_mutations", | 349 "Mykrobe_embA", |
183 "Ethambutol_Prediction", | 350 "Ethambutol_R_mutations", |
184 "Mykrobe_pncA", | 351 "Ethambutol_Prediction", |
185 "Mykrobe_rpsA", | 352 "Mykrobe_pncA", |
186 "Pyrazinamide_R_mutations", | 353 "Mykrobe_rpsA", |
187 "Pyrazinamide_Prediction", | 354 "Pyrazinamide_R_mutations", |
188 "Mykrobe_Ofloxacin_gyrA", | 355 "Pyrazinamide_Prediction", |
189 "Ofloxacin_R_mutations", | 356 "Mykrobe_Ofloxacin_gyrA", |
190 "Ofloxacin_Prediction", | 357 "Ofloxacin_R_mutations", |
191 "Mykrobe_Moxifloxacin_gyrA", | 358 "Ofloxacin_Prediction", |
192 "Moxifloxacin_R_mutations", | 359 "Mykrobe_Moxifloxacin_gyrA", |
193 "Moxifloxacin_Prediction", | 360 "Moxifloxacin_R_mutations", |
194 "Mykrobe_rpsL", | 361 "Moxifloxacin_Prediction", |
195 "Mykrobe_Streptomycin_rrs", | 362 "Mykrobe_Ciprofloxacin_gyrA", |
196 "Mykrobe_Streptomycin_gid", | 363 "Ciprofloxacin_R_mutations", |
197 "Streptomycin_R_mutations", | 364 "Ciprofloxacin_Prediction", |
198 "Streptomycin_Prediction", | 365 "Mykrobe_rpsL", |
199 "Mykrobe_Amikacin_rrs", | 366 "Mykrobe_Streptomycin_rrs", |
200 "Amikacin_R_mutations", | 367 "Mykrobe_Streptomycin_gid", |
201 "Amikacin_Prediction", | 368 "Streptomycin_R_mutations", |
202 "Mykrobe_Capreomycin_rrs", | 369 "Streptomycin_Prediction", |
203 "Mykrobe_Capreomycin_tlyA", | 370 "Mykrobe_Amikacin_rrs", |
204 "Capreomycin_R_mutations", | 371 "Amikacin_R_mutations", |
205 "Capreomycin_Prediction", | 372 "Amikacin_Prediction", |
206 "Mykrobe_Kanamycin_rrs", | 373 "Mykrobe_Capreomycin_rrs", |
207 "Mykrobe_Kanamycin_eis", | 374 "Mykrobe_Capreomycin_tlyA", |
208 "Kanamycin_R_mutations", | 375 "Capreomycin_R_mutations", |
209 "Kanamycin_Prediction", | 376 "Capreomycin_Prediction", |
210 "Lims_Comment", | 377 "Mykrobe_Kanamycin_rrs", |
211 "Lims_INTComment", | 378 "Mykrobe_Kanamycin_eis", |
212 "Mykrobe_Workflow_Version", | 379 "Kanamycin_R_mutations", |
213 "mykrobe_version", | 380 "Kanamycin_Prediction", |
214 "Mykrobe_Resistance_probe_set", | 381 "Lims_Comment", |
215 "Mykrobe_min_depth_default_5", | 382 "Lims_INTComment", |
216 "Mykrobe_min_conf_default_10", | 383 "Mykrobe_Workflow_Version", |
217 "LIMS_file", | 384 "mykrobe_version", |
218 "Mutation_set_version") | 385 "Mykrobe_Resistance_probe_set", |
219 | 386 "Mykrobe_min_depth_default_5", |
220 report <- setNames(data.frame(matrix("", ncol = length(columns), nrow = 1), stringsAsFactors = F), columns) | 387 "Mykrobe_min_conf_default_10", |
221 | 388 "LIMS_file", |
389 "Mutation_set_version" | |
390 ) | |
391 | |
392 report <- | |
393 setNames(data.frame(matrix( | |
394 "", | |
395 ncol = length(columns), nrow = 1 | |
396 ), stringsAsFactors = FALSE), columns) | |
397 | |
398 report_cols <- c( | |
399 "file", | |
400 "phylo_group", | |
401 "species", | |
402 "lineage", | |
403 "phylo_group_per_covg", | |
404 "species_per_covg", | |
405 "lineage_per_covg", | |
406 "phylo_group_depth", | |
407 "species_depth", | |
408 "lineage_depth" | |
409 ) | |
222 | 410 |
223 # List of drugs that are tested | 411 # List of drugs that are tested |
224 all_drugs <- c("Isoniazid", | 412 all_drugs <- c( |
225 "Rifampicin", | 413 "Isoniazid", |
226 "Ethambutol", | 414 "Rifampicin", |
227 "Pyrazinamide", | 415 "Ethambutol", |
228 "Moxifloxacin", | 416 "Pyrazinamide", |
229 "Ofloxacin", | 417 "Moxifloxacin", |
230 "Streptomycin", | 418 "Ofloxacin", |
231 "Amikacin", | 419 "Streptomycin", |
232 "Capreomycin", | 420 "Amikacin", |
233 "Kanamycin") | 421 "Capreomycin", |
422 "Kanamycin" | |
423 ) | |
234 | 424 |
235 # Do Stuff #### | 425 # Do Stuff #### |
236 | 426 |
237 # Import all the JSON files into a list of lists format #### | 427 # Import all the JSON files into a list of lists format #### |
238 | 428 |
239 if (is.null(opt$file)){ | 429 if (is.null(opt$file)) { |
240 # opt$dir is used to get the list of files, a vector of non-duplicated files is then passed to map | 430 # opt$dir is used to get the list of files, a vector of non-duplicated files is then passed to map |
241 files <- list.files(path = opt$dir, | 431 files <- list.files( |
242 pattern = "*.json", | 432 path = opt$dir, |
243 full.names = T) | 433 pattern = "*.json", |
244 }else{ | 434 full.names = TRUE |
435 ) | |
436 } else { | |
245 files <- unlist(strsplit(opt$file, ",")) | 437 files <- unlist(strsplit(opt$file, ",")) |
246 } | 438 } |
247 | 439 |
248 files <- files[!duplicated(basename(files))] | 440 files <- files[!duplicated(basename(files))] |
249 | 441 |
250 list.of.json.files <- map(files, | 442 list.of.json.files <- map( |
251 ~ fromJSON(.x, simplifyDataFrame = F) | 443 files, |
444 ~ fromJSON(.x, simplifyDataFrame = FALSE) | |
252 ) | 445 ) |
253 | 446 |
254 | 447 |
255 # Apply that getResults function to each element in your list then bash it together into a final report | 448 # Apply the correct getResults function to each element in your list then bash it together into a final report |
256 | 449 |
257 temp <- map(list.of.json.files, getResults) %>% | 450 if (opt$panel == "2019") { |
258 bind_rows() | 451 temp <- map(list.of.json.files, getResults2019) %>% |
452 bind_rows() | |
453 } else if (opt$panel == "2020") { | |
454 temp <- map(list.of.json.files, getResults2020) %>% | |
455 bind_rows() | |
456 columns <- | |
457 setdiff( | |
458 columns, | |
459 c( | |
460 "Mykrobe_Ciprofloxacin_gyrA", | |
461 "Ciprofloxacin_R_mutations", | |
462 "Ciprofloxacin_Prediction" | |
463 ) | |
464 ) | |
465 report_cols <- setdiff( | |
466 report_cols, | |
467 c( | |
468 "lineage", | |
469 "lineage_per_covg", | |
470 "lineage_depth" | |
471 ) | |
472 ) | |
473 } else { | |
474 stop("Panel must be one of 2019 or 2020", call. = FALSE) | |
475 } | |
259 | 476 |
260 | 477 |
261 # Predictions of resistance or susceptibility | 478 # Predictions of resistance or susceptibility |
262 predictions.table <- | 479 |
480 predictions.table <- | |
263 temp %>% | 481 temp %>% |
264 select(file, drug, susceptibility) %>% | 482 select(file, drug, susceptibility) %>% |
265 mutate(drug = paste(drug, "_Prediction", sep = "")) %>% | 483 mutate(drug = paste(drug, "_Prediction", sep = "")) %>% |
266 spread(drug, susceptibility, fill = "failed") %>% | 484 spread(drug, susceptibility, fill = "failed") %>% |
267 select(-starts_with("NA")) | 485 select(-starts_with("NA")) |
268 | 486 |
269 if (length(predictions.table) == 1){ | 487 if (length(predictions.table) == 1) { |
270 print(predictions.table) | 488 print(predictions.table) |
271 stop("No susceptibility results in files specified. Did the testing fail?", call.=FALSE) | 489 stop("No susceptibility results in files specified. Did the testing fail?", |
490 call. = FALSE | |
491 ) | |
272 } | 492 } |
273 | 493 |
274 # Variants, if present | 494 # Variants, if present |
275 if (0 < predictions.table %>% | 495 num.variants <- |
276 select(ends_with("_Prediction")) %>% | 496 predictions.table %>% |
277 unlist(use.names = F) %>% | 497 select(ends_with("_Prediction")) %>% |
278 str_count("[R,r]") %>% | 498 unlist(use.names = FALSE) %>% |
279 sum()){ | 499 str_count("[R,r]") %>% |
280 | 500 sum() |
281 # Multiple resistance mutations and confidence per drug in the X_R_mutations column | 501 |
282 # Actual protein changes in Mykrobe_X columns | 502 if (num.variants > 0) { |
283 | 503 # Multiple resistance mutations and confidence per drug in the X_R_mutations column |
284 variants.temp <- | 504 # Actual protein changes in Mykrobe_X columns |
285 temp %>% | 505 |
286 select(file, drug, variants = `variants (gene:alt_depth:wt_depth:conf)`) %>% | 506 variants.temp <- |
287 mutate(variants = replace(variants, variants == "", NA)) %>% # Make missing data consistent... | 507 temp %>% |
288 filter(!is.na(variants)) %>% # ...Then get rid of it | 508 select(file, drug, variants = `variants (gene:alt_depth:wt_depth:conf)`) %>% |
289 mutate(tempcols = paste(drug, "R_mutations", sep = "_")) %>% | 509 mutate(variants = replace(variants, variants == "", NA)) %>% # Make missing data consistent... |
290 mutate(R_mutations = variants) %>% | 510 filter(!is.na(variants)) %>% # ...Then get rid of it |
291 mutate(variants = strsplit(variants, "__")) %>% # Split the mutations across rows (list first then split across rows) | 511 mutate(tempcols = paste(drug, "R_mutations", sep = "_")) %>% |
292 unnest(variants) %>% | 512 mutate(R_mutations = variants) %>% |
293 separate(variants, c("gene", "mutation"), "_") %>% | 513 mutate(variants = strsplit(variants, "__")) %>% # Split the mutations across rows (list first then split across rows) |
294 mutate(columnname = ifelse(gene %in% c("gyrA", "tlyA", "rrs", "eis", "gid"), # Check for columns that include the drug name or not and paste accordingly | 514 unnest(variants) %>% |
295 paste("Mykrobe", drug, gene, sep = "_"), | 515 separate(variants, c("gene", "mutation"), "_") %>% |
296 paste("Mykrobe", gene, sep = "_"))) %>% | 516 mutate(columnname = ifelse( |
297 # Extract out the mutation information with a regex that covers all potential genes | 517 gene %in% c("gyrA", "rrs", "eis", "gid"), |
298 # This regex looks for whatever is ahead of the first colon and after the last hyphen | 518 # Check for columns that include the drug name or not and paste accordingly |
299 mutate(mutation = str_match(mutation, "(.*)-.*:")[,2]) %>% | 519 paste("Mykrobe", drug, gene, sep = "_"), |
300 select(file, tempcols, R_mutations, columnname, mutation) | 520 paste("Mykrobe", gene, sep = "_") |
301 | 521 )) %>% |
302 # Split each kind of variants into its own temp table then merge | 522 # Extract out the mutation information with a regex that covers all potential genes |
303 variants.1 <- | 523 # This regex looks for whatever is ahead of the first colon and after the last hyphen |
304 variants.temp %>% | 524 mutate(mutation = str_match(mutation, "(.*)-.*:")[, 2]) %>% |
305 select(file, tempcols, R_mutations) %>% | 525 select(file, tempcols, R_mutations, columnname, mutation) |
306 distinct() %>% | 526 |
307 spread(tempcols, R_mutations) | 527 # Split each kind of variants into its own temp table then merge |
308 | 528 variants.1 <- |
309 variants.2 <- | 529 variants.temp %>% |
310 variants.temp %>% | 530 select(file, tempcols, R_mutations) %>% |
311 select(file, columnname, mutation) %>% | 531 distinct() %>% |
312 group_by(file, columnname) %>% | 532 spread(tempcols, R_mutations) |
313 summarise(mutation = paste(mutation, collapse = ";")) %>% | 533 |
314 spread(columnname, mutation) | 534 variants.2 <- |
315 | 535 variants.temp %>% |
316 variants.table <- full_join(variants.1, variants.2, by = "file") | 536 select(file, columnname, mutation) %>% |
317 }else{ | 537 group_by(file, columnname) %>% |
318 variants.table <- data.frame(file=predictions.table$file, stringsAsFactors = F) | 538 summarise(mutation = paste(mutation, collapse = ";")) %>% |
539 spread(columnname, mutation) | |
540 | |
541 variants.table <- | |
542 full_join(variants.1, variants.2, by = "file") | |
543 } else { | |
544 variants.table <- | |
545 data.frame(file = predictions.table$file, stringsAsFactors = FALSE) | |
319 } | 546 } |
320 | 547 |
321 | 548 |
322 # Make a report #### | 549 # Make a report #### |
323 | 550 |
324 report <- | 551 report <- |
325 temp %>% | 552 temp %>% |
326 select(file, mykrobe_version, Mykrobe_Resistance_probe_set) %>% # Get important info from initial table | 553 select(file, mykrobe_version, Mykrobe_Resistance_probe_set) %>% # Get important info from initial table |
327 distinct() %>% # Drop duped rows and combine all the tables together | 554 distinct() %>% # Drop duped rows and combine all the tables together |
328 full_join(variants.table) %>% | 555 full_join(variants.table) %>% |
329 full_join(predictions.table) %>% | 556 full_join(predictions.table) %>% |
330 bind_rows(report) %>% # Use bind_rows to add columns (eg. unteseted drugs) to the final output | 557 bind_rows(report) %>% # Use bind_rows to add columns (eg. unteseted drugs) to the final output |
331 filter(file != "") | 558 filter(file != "") |
332 | 559 |
333 # Only add the 'no mutation' replacement to the columns that actually have a result | 560 # Only add the 'no mutation' replacement to the columns that actually have a result |
334 report <- | 561 report <- |
335 report %>% | 562 report %>% |
336 filter_at(vars(ends_with("_Prediction")), any_vars(. != "failed")) %>% | 563 filter_at(vars(ends_with("_Prediction")), any_vars(. != "failed")) %>% |
337 mutate_at(vars(starts_with("Mykrobe_")), funs(replace(., is.na(.), "No Mutation"))) %>% | 564 mutate_at(vars(starts_with("Mykrobe_")), funs(replace(., is.na(.), "No Mutation"))) %>% |
338 full_join(anti_join(report, ., by = "file")) %>% | 565 full_join(anti_join(report, ., by = "file")) %>% |
339 select(columns) | 566 select(columns) |
340 | 567 |
341 | 568 |
342 # Add in the parameters fed from Galaxy using named character vector | 569 # Add in the parameters fed from Galaxy using named character vector |
343 report <- | 570 report <- |
344 report %>% | 571 report %>% |
345 mutate( | 572 mutate( |
346 Lims_Comment = params["Lims_Comment"], | 573 Lims_Comment = params["Lims_Comment"], |
347 Lims_INTComment = params["Lims_INTComment"], | 574 Lims_INTComment = params["Lims_INTComment"], |
348 Mykrobe_Workflow_Version = params["Mykrobe_Workflow_Version"], | 575 Mykrobe_Workflow_Version = params["Mykrobe_Workflow_Version"], |
349 Mykrobe_min_depth_default_5 = params["Mykrobe_min_depth_default_5"], | 576 Mykrobe_min_depth_default_5 = params["Mykrobe_min_depth_default_5"], |
350 Mykrobe_min_conf_default_10 = params["Mykrobe_min_conf_default_10"], | 577 Mykrobe_min_conf_default_10 = params["Mykrobe_min_conf_default_10"], |
351 LIMS_file = params["LIMS_file"], | 578 LIMS_file = params["LIMS_file"], |
352 Mutation_set_version = params["Mutation_set_version"] | 579 Mutation_set_version = params["Mutation_set_version"] |
353 ) | 580 ) |
354 | |
355 | |
356 #View(report) | |
357 | 581 |
358 # Write some output | 582 # Write some output |
359 # Report as is | 583 # Report as is |
360 write.csv(report, "output-report.csv", row.names = F) | 584 write.csv(report, "output-report.csv", row.names = FALSE) |
361 print("Writing Susceptibility report to CSV as output-report.csv") | 585 print("Writing Susceptibility report to CSV as output-report.csv") |
362 | 586 |
363 # Select specific columns from temp and output them | 587 # Select specific columns from temp and output them |
364 temp %>% | 588 # Addition of any_of accounts for both 2019 and 2020 panels |
365 select(file, | 589 |
366 phylo_group, | 590 temp %>% |
367 species, | 591 select_at( # This is a dplyr 0.8.3 function, superceded in newer versions but this tool is built around a number of specific deps |
368 lineage, | 592 report_cols |
369 phylo_group_per_covg, | 593 ) %>% |
370 species_per_covg, | |
371 lineage_per_covg, | |
372 phylo_group_depth, | |
373 species_depth, | |
374 lineage_depth) %>% | |
375 distinct() %>% | 594 distinct() %>% |
376 write.csv("output-jsondata.csv", row.names = F) | 595 write.csv(file = "output-jsondata.csv", row.names = FALSE) |
377 print("Writing JSON data to CSV as output-jsondata.csv") | 596 print("Writing JSON data to CSV as output-jsondata.csv") |
378 sink(NULL, type="message") # close the sink | 597 sink(NULL, type = "message") # close the sink |
379 | 598 |
380 quit() | 599 quit() |