comparison mykrobe_parser.R @ 2:f2608dccd3e0 draft

planemo upload for repository https://github.com/phac-nml/mykrobe-parser commit 1d77b6cae26ef3456ff6d469d71c61cab6a19906-dirty
author nml
date Tue, 23 Oct 2018 08:54:51 -0400
parents 6eae14751768
children 8529045f0fdf
comparison
equal deleted inserted replaced
1:05ca0dbc9f46 2:f2608dccd3e0
18 # Take the JSON output from Mykrobe, rearrange, output for LIMS 18 # Take the JSON output from Mykrobe, rearrange, output for LIMS
19 # Adrian Zetner 19 # Adrian Zetner
20 # August 2018 20 # August 2018
21 21
22 # Libraries #### 22 # Libraries ####
23 library(jsonlite, quietly = T) 23
24 library(here, quietly = T) 24 sink(stdout(), type = "message")
25 suppressMessages(library(dplyr, quietly = T)) 25
26 suppressMessages(library(purrr, quietly = T)) 26 suppressPackageStartupMessages({
27 library(tidyr, quietly = T) 27 library(jsonlite)
28 library(stringr, quietly = T) 28 library(here)
29 library(optparse, quietly = T) 29 library(dplyr)
30 library(purrr)
31 library(tidyr)
32 library(stringr)
33 library(optparse)
34 })
30 35
31 # Define custom functions, variables, and paths. Collect and use CL arguments #### 36 # Define custom functions, variables, and paths. Collect and use CL arguments ####
32 37
33 # Here's a function to recreate that output table from the input JSON files 38 # Here's a function to recreate that output table from the input JSON files
34 39
95 100
96 # Take that list and mash all the elements together as columns in a tibble, recycling as needed to fill in space 101 # Take that list and mash all the elements together as columns in a tibble, recycling as needed to fill in space
97 # eg. phylo_group is repeated/recycled as many times as there are drugs tested 102 # eg. phylo_group is repeated/recycled as many times as there are drugs tested
98 as_tibble(temp) 103 as_tibble(temp)
99 } 104 }
100
101 sink(stdout(), type = "message")
102
103 suppressPackageStartupMessages({
104 library(jsonlite)
105 library(here)
106 library(dplyr)
107 library(purrr)
108 library(tidyr)
109 library(stringr)
110 library(optparse)
111 })
112 105
113 # Get command line arguments with optparse 106 # Get command line arguments with optparse
114 option_list = list( 107 option_list = list(
115 make_option(c("-f", "--file"), 108 make_option(c("-f", "--file"),
116 type="character", 109 type="character",
272 if (length(predictions.table) == 1){ 265 if (length(predictions.table) == 1){
273 print(predictions.table) 266 print(predictions.table)
274 stop("No susceptibility results in files specified. Did the testing fail?", call.=FALSE) 267 stop("No susceptibility results in files specified. Did the testing fail?", call.=FALSE)
275 } 268 }
276 269
277 # Variants 270 # Variants, if present
278 # Multiple resistance mutations and confidence per drug in the X_R_mutations column 271 if (0 < predictions.table %>%
279 # Actual protein changes in Mykrobe_X columns 272 select(ends_with("_Prediction")) %>%
280 273 unlist(use.names = F) %>%
281 variants.temp <- 274 str_count("[R,r]") %>%
282 temp %>% 275 sum()){
283 select(file, drug, variants = `variants (gene:alt_depth:wt_depth:conf)`) %>% 276
284 mutate(variants = replace(variants, variants == "", NA)) %>% # Make missing data consistent... 277 # Multiple resistance mutations and confidence per drug in the X_R_mutations column
285 filter(!is.na(variants)) %>% # ...Then get rid of it 278 # Actual protein changes in Mykrobe_X columns
286 mutate(tempcols = paste(drug, "R_mutations", sep = "_")) %>% 279
287 mutate(R_mutations = variants) %>% 280 variants.temp <-
288 mutate(variants = strsplit(variants, "__")) %>% # Split the mutations across rows (list first then split across rows) 281 temp %>%
289 unnest(variants) %>% 282 select(file, drug, variants = `variants (gene:alt_depth:wt_depth:conf)`) %>%
290 separate(variants, c("gene", "mutation"), "_") %>% 283 mutate(variants = replace(variants, variants == "", NA)) %>% # Make missing data consistent...
291 mutate(columnname = ifelse(gene %in% c("tlyA", "rrs", "gid"), # Check for columns that include the drug name or not and paste accordingly 284 filter(!is.na(variants)) %>% # ...Then get rid of it
292 paste("Mykrobe", drug, gene, sep = "_"), 285 mutate(tempcols = paste(drug, "R_mutations", sep = "_")) %>%
293 paste("Mykrobe", gene, sep = "_"))) %>% 286 mutate(R_mutations = variants) %>%
294 # Extract out the mutation information with a regex that covers all potential genes 287 mutate(variants = strsplit(variants, "__")) %>% # Split the mutations across rows (list first then split across rows)
295 # This regex looks for whatever is ahead of the first colon and after the last hyphen 288 unnest(variants) %>%
296 mutate(mutation = str_match(mutation, "(.*)-.*:")[,2]) %>% 289 separate(variants, c("gene", "mutation"), "_") %>%
297 select(file, tempcols, R_mutations, columnname, mutation) 290 mutate(columnname = ifelse(gene %in% c("tlyA", "rrs", "gid"), # Check for columns that include the drug name or not and paste accordingly
298 291 paste("Mykrobe", drug, gene, sep = "_"),
299 # Split each kind of variants into its own temp table then merge 292 paste("Mykrobe", gene, sep = "_"))) %>%
300 variants.1 <- 293 # Extract out the mutation information with a regex that covers all potential genes
301 variants.temp %>% 294 # This regex looks for whatever is ahead of the first colon and after the last hyphen
302 select(file, tempcols, R_mutations) %>% 295 mutate(mutation = str_match(mutation, "(.*)-.*:")[,2]) %>%
303 distinct() %>% 296 select(file, tempcols, R_mutations, columnname, mutation)
304 spread(tempcols, R_mutations) 297
305 298 # Split each kind of variants into its own temp table then merge
306 variants.2 <- 299 variants.1 <-
307 variants.temp %>% 300 variants.temp %>%
308 select(file, columnname, mutation) %>% 301 select(file, tempcols, R_mutations) %>%
309 group_by(file, columnname) %>% 302 distinct() %>%
310 summarise(mutation = paste(mutation, collapse = ";")) %>% 303 spread(tempcols, R_mutations)
311 spread(columnname, mutation) 304
312 305 variants.2 <-
313 variants.table <- full_join(variants.1, variants.2, by = "file") 306 variants.temp %>%
307 select(file, columnname, mutation) %>%
308 group_by(file, columnname) %>%
309 summarise(mutation = paste(mutation, collapse = ";")) %>%
310 spread(columnname, mutation)
311
312 variants.table <- full_join(variants.1, variants.2, by = "file")
313 }else{
314 variants.table <- data.frame(file=predictions.table$file, stringsAsFactors = F)
315 }
316
314 317
315 # Make a report #### 318 # Make a report ####
316 319
317 report <- 320 report <-
318 temp %>% 321 temp %>%
367 phylo_group_depth, 370 phylo_group_depth,
368 species_depth, 371 species_depth,
369 lineage_depth) %>% 372 lineage_depth) %>%
370 distinct() %>% 373 distinct() %>%
371 write.csv("output-jsondata.csv", row.names = F) 374 write.csv("output-jsondata.csv", row.names = F)
372 print("Writing JSON data to CSV as output-jsondata.txt") 375 print("Writing JSON data to CSV as output-jsondata.csv")
373 sink(NULL, type="message") # close the sink 376 sink(NULL, type="message") # close the sink
374 377
375 quit() 378 quit()