comparison report.Rmd @ 0:315c2ed31af1 draft

planemo upload for repository https://github.com/Helmholtz-UFZ/ufz-galaxy-tools/blob/main/tools/phi-toolkit commit 45c746567f48e6c9bcc19ba4e94e87348df3ac7a
author ufz
date Wed, 04 Jun 2025 17:36:40 +0000
parents
children 3a7f73d638ba
comparison
equal deleted inserted replaced
-1:000000000000 0:315c2ed31af1
1 ---
2 title: "PHI Prophage-Host Interaction Toolkit report"
3 subtitle: "Toolkit for the Detection, Comparison, and Annotation of Prophages in Bacterial Genomes."
4 date: "`r format(Sys.Date(), '%B %d, %Y')`"
5 output:
6 html_document:
7 theme: flatly
8 toc: yes
9 toc_float: false
10 number_sections: yes
11 code_folding: none
12 fig_width: 12
13 fig_height: 8
14 fig_caption: true
15 df_print: paged
16 editor_options:
17 markdown:
18 wrap: 72
19 params:
20 outdir: "data"
21 ---
22
23
24 ------------------------------------------------------------------------
25
26 ```{r setup_env, include=FALSE}
27 knitr::opts_chunk$set(echo = FALSE)
28
29 cat("params$outdir:", params$outdir, "\n")
30 ```
31
32 ```{r setup_libraries, message=FALSE, warning=FALSE, echo=FALSE, results='asis'}
33 # Define required packages
34 required_packages <- c("tidyverse", "janitor", "here",
35 "kableExtra", "gmoviz", "circlize",
36 "GenomicRanges", "patchwork", "fs",
37 "tools", "scales", "formattable",
38 "pdftools", "base64")
39
40 # Load required packages
41 invisible(lapply(required_packages, library, character.only = TRUE))
42 ```
43
44 ```{r helper_functions, echo=FALSE}
45 log_file <- "debug.log"
46
47 log_debug <- function(message) {
48 if (!exists("log_initialized") || !log_initialized) {
49 cat(paste0(Sys.time(), " - DEBUG: ", message, "\n"), file = log_file, append = FALSE)
50 assign("log_initialized", TRUE, envir = .GlobalEnv)
51 } else {
52 cat(paste0(Sys.time(), " - DEBUG: ", message, "\n"), file = log_file, append = TRUE)
53 }
54 }
55
56 load_file <- function(path) {
57 log_debug(paste("Attempting to load:", path))
58 if (file.exists(path)) {
59 ext <- tools::file_ext(path)
60 if (ext %in% c("tsv", "csv")) {
61 data <- read_delim(path, delim = ifelse(ext == "csv", ",", "\t"), show_col_types = FALSE) %>% clean_names
62 log_debug(paste("Loaded", nrow(data), "rows from", path))
63 data
64 } else if (ext == "fna") {
65 data <- Biostrings::readDNAStringSet(path)
66 log_debug(paste("Loaded", length(data), "sequences from", path))
67 data
68 } else {
69 log_debug(paste("Skipping", path, "- unsupported file type"))
70 NULL
71 }
72 } else {
73 log_debug(paste("File does not exist:", path))
74 NULL
75 }
76 }
77
78 get_file_info <- function(path, loaded_data) {
79 log_debug(paste("Processing file info for:", path))
80 if (file.exists(path)) {
81 ext <- tools::file_ext(path)
82 if (ext %in% c("tsv", "csv", "fna")) {
83 data <- loaded_data[[basename(path)]]
84 rows <- if(ext == "fna") length(data) else nrow(data)
85 tibble(exists = TRUE, rows = rows, size = file.size(path), path = path)
86 } else {
87 tibble(exists = TRUE, rows = NA_integer_, size = file.size(path), path = path)
88 }
89 } else {
90 tibble(exists = FALSE, rows = NA_integer_, size = NA_real_, path = NA_character_)
91 }
92 }
93
94 process_genome_folder <- function(folder, host_analyses_dir, virus_analyses_dir) {
95 log_debug(paste("Processing folder:", folder))
96 genome_name <- basename(folder)
97
98 paths <- list(
99 genomad = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus_summary.tsv")),
100 genomad_phages = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus.fna")),
101 genomad_annotations = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus_genes.tsv")),
102 defense_finder = file.path(host_analyses_dir, "defense-finder", genome_name, paste0(genome_name, "_defense_finder_systems.tsv")),
103 checkv = file.path(virus_analyses_dir, "checkv", genome_name, "quality_summary.tsv"),
104 iphop = file.path(virus_analyses_dir, "iphop", genome_name, "Host_prediction_to_genome_m90.csv"),
105 drep = file.path(virus_analyses_dir, "drep_compare", genome_name, "data_tables", "Cdb.csv"),
106 phatyp = file.path(virus_analyses_dir, "phatyp", genome_name, "phatyp.csv"),
107 abricate = file.path(virus_analyses_dir, "abricate", genome_name, paste0(genome_name, "_virus_vfdb.tsv")),
108 vibrant = file.path(virus_analyses_dir, "vibrant", genome_name,
109 paste0("VIBRANT_", genome_name, "_virus"),
110 paste0("VIBRANT_results_", genome_name, "_virus"),
111 paste0("VIBRANT_AMG_individuals_", genome_name, "_virus.tsv"))
112 )
113
114 loaded_data <- map(paths, load_file)
115 file_info <- map_dfr(paths, ~get_file_info(.x, loaded_data), .id = "file_type")
116
117 virus_count <- if(!is.null(loaded_data$genomad)) {
118 count <- sum(loaded_data$genomad$virus_score > 0.5, na.rm = TRUE)
119 log_debug(paste("Virus count:", count))
120 count
121 } else {
122 log_debug("No genomad summary found, virus count set to 0")
123 0
124 }
125
126 log_debug("Returning results from process_genome_folder")
127 list(file_info = file_info, virus_count = virus_count, loaded_data = loaded_data)
128 }
129 ```
130
131
132 ```{r compile_results, message=FALSE, warning=FALSE, echo=FALSE}
133 compile_results <- function() {
134 base_dir <- params$outdir
135 log_debug(paste("Base directory:", base_dir))
136
137 host_analyses_dir <- file.path(base_dir, "host_analyses")
138 virus_analyses_dir <- file.path(base_dir, "virus_analyses")
139
140 # List all sample-level directories from all tools under virus_analyses
141 tool_dirs <- list.dirs(virus_analyses_dir, full.names = TRUE, recursive = FALSE)
142
143 genome_folders <- list.dirs(file.path(base_dir, "host_analyses", "genomad"),
144 full.names = TRUE, recursive = FALSE)
145
146 # cat(length(genome_folders), "sample(s) processed\n")
147
148 log_debug("Processing genome folders")
149 genome_data <- map(genome_folders, process_genome_folder,
150 host_analyses_dir = host_analyses_dir,
151 virus_analyses_dir = virus_analyses_dir) %>%
152 set_names(basename(genome_folders)) %>%
153 compact()
154
155 log_debug("Creating summary dataframe")
156 summary_df <- map_dfr(genome_data, ~{
157 file_info <- .x$file_info
158 tibble(
159 Sample = basename(file_info$path[1]),
160 Virus_Count = .x$virus_count,
161 geNomad = file_info$exists[file_info$file_type == "genomad"],
162 CheckV = file_info$exists[file_info$file_type == "checkv"],
163 VIBRANT = file_info$exists[file_info$file_type == "vibrant"],
164 dRep = file_info$exists[file_info$file_type == "drep"],
165 iPHOP = file_info$exists[file_info$file_type == "iphop"],
166 PhaTYP = file_info$exists[file_info$file_type == "phatyp"],
167 Defense_Finder = file_info$exists[file_info$file_type == "defense_finder"],
168 geNomad_Path = file_info$path[file_info$file_type == "genomad"],
169 CheckV_Path = file_info$path[file_info$file_type == "checkv"],
170 VIBRANT_Path = file_info$path[file_info$file_type == "vibrant"],
171 dRep_Path = file_info$path[file_info$file_type == "drep"],
172 PhaTYP_Path = file_info$path[file_info$file_type == "phatyp"],
173 Defense_Finder_Path = file_info$path[file_info$file_type == "defense_finder"],
174 Virus_Contigs = ifelse(file_info$exists[file_info$file_type == "genomad_phages"],
175 file_info$rows[file_info$file_type == "genomad_phages"],
176 0)
177 )
178 }) %>%
179 mutate(across(ends_with("_Path"), ~ifelse(is.na(.), "Not available", as.character(.))))
180
181 host_genomes_fasta <- list.files(
182 path = file.path(params$outdir, "genomes"),
183 pattern = "\\.fna$",
184 full.names = TRUE
185 )
186
187 host_genomes_paths <- tibble(
188 name = tools::file_path_sans_ext(basename(host_genomes_fasta)),
189 path = host_genomes_fasta
190 )
191
192 data_gtdbtk_host <- read_tsv(
193 file.path(params$outdir, "host_analyses/gtdbtk/gtdbtk.bac120.summary.tsv"),
194 show_col_types = FALSE
195 ) %>% clean_names()
196
197 data_checkm_host <- read_tsv(
198 file.path(params$outdir, "host_analyses/checkm2/quality_report.tsv"),
199 show_col_types = FALSE
200 ) %>% clean_names()
201
202 log_debug("Returning summary dataframe, genome data, and host data")
203
204 log_debug(paste("summary_df dimensions:", nrow(summary_df), "rows,", ncol(summary_df), "columns"))
205 log_debug(paste("summary_df column names:", paste(colnames(summary_df), collapse = ", ")))
206 log_debug(paste("genome_data length:", length(genome_data)))
207 log_debug(paste("genome_data names:", paste(names(genome_data), collapse = ", ")))
208 log_debug(paste("host_genomes_paths dimensions:", nrow(host_genomes_paths), "rows,", ncol(host_genomes_paths), "columns"))
209 log_debug(paste("host_genomes_paths column names:", paste(colnames(host_genomes_paths), collapse = ", ")))
210 log_debug(paste("data_gtdbtk_host dimensions:", nrow(data_gtdbtk_host), "rows,", ncol(data_gtdbtk_host), "columns"))
211 log_debug(paste("data_gtdbtk_host column names:", paste(colnames(data_gtdbtk_host), collapse = ", ")))
212 log_debug(paste("data_checkm_host dimensions:", nrow(data_checkm_host), "rows,", ncol(data_checkm_host), "columns"))
213 log_debug(paste("data_checkm_host column names:", paste(colnames(data_checkm_host), collapse = ", ")))
214
215 list(
216 summary = summary_df,
217 genome_data = genome_data,
218 host_genomes_paths = host_genomes_paths,
219 data_gtdbtk_host = data_gtdbtk_host,
220 data_checkm_host = data_checkm_host
221 )
222 }
223 ```
224
225 ```{r run_main_function, echo=FALSE, message=FALSE, warning=FALSE}
226 log_debug("Starting execution of main function")
227 result <- compile_results()
228
229 if (is.null(result)) {
230 log_debug("Main function execution failed")
231 stop("Main function execution failed")
232 }
233
234 summary_df <- result$summary
235 genome_data <- result$genome_data
236 host_genomes_paths <- result$host_genomes_paths
237 data_gtdbtk_host <- result$data_gtdbtk_host
238 data_checkm_host <- result$data_checkm_host
239 log_debug("Data extracted successfully")
240
241 # Remove any extensions from names in data gtdbtk and checm2
242 data_gtdbtk_host <- data_gtdbtk_host %>%
243 mutate(user_genome = str_remove(user_genome, "\\.[^.]+$"))
244
245 data_checkm_host <- data_checkm_host %>%
246 mutate(name = str_remove(name, "\\.[^.]+$"))
247
248 result$summary <- result$summary %>%
249 mutate(Sample = str_remove(Sample, "_virus_summary.tsv"))
250 ```
251
252 # Summary {.tabset .tabset-fade}
253
254 ## Overview Table
255
256 This table provides sample-by-sample information on detected viruses and key host genome statistics. It includes taxonomy, virus count, genome quality classification, CheckM2 metrics (completeness and contamination), and genome assembly statistics such as size and N50.
257
258 ```{r render_table, message=FALSE, warning=FALSE, echo=FALSE, results='asis'}
259 data <- result$summary
260
261 log_debug("Assigning checkm2 host data")
262 checkm_host_data <- data_checkm_host %>% clean_names() %>%
263 select(name, completeness, contamination,
264 contig_n50, genome_size)
265
266 log_debug("Assigning GTDB-Tk host data")
267 gtdbtk_data <- data_gtdbtk_host %>%
268 select(user_genome, classification)
269
270 log_debug("Defining color-blind friendly palette")
271 cb_friendly_colors <- list(
272 green = "#009E73",
273 blue = "#0072B2",
274 orange = "#E69F00",
275 red = "#D55E00",
276 grey = "#999999"
277 )
278
279 log_debug("Defining function to color cells")
280 color_cell <- function(values, color_true = cb_friendly_colors$green,
281 color_false = cb_friendly_colors$red) {
282 ifelse(values,
283 cell_spec("Yes", color = "white", bold = TRUE, background = color_true),
284 cell_spec("No", color = "white", bold = TRUE, background = color_false))
285 }
286
287 log_debug("Defining function to create bar plot")
288 create_bar_plot <- function(values, max_value, color = cb_friendly_colors$grey) {
289 sapply(values, function(value) {
290 if(is.na(value) || !is.numeric(value)) {
291 return("N/A")
292 }
293 bar_width <- min(max(value, 0), max_value) / max_value * 100
294 sprintf('<div style="background-color: %s; width: %f%%; height: 10px;"></div>%.1f%%',
295 color, bar_width, value)
296 })
297 }
298
299 log_debug("Defining function to format large numbers")
300 format_large_number <- function(x) {
301 sapply(x, function(value) {
302 if (is.na(value) || !is.numeric(value)) {
303 return("N/A")
304 } else if (value < 1000) {
305 return(as.character(value))
306 } else if (value < 1e6) {
307 return(paste0(round(value / 1e3, 1), "K"))
308 } else if (value < 1e9) {
309 return(paste0(round(value / 1e6, 1), "M"))
310 } else {
311 return(paste0(round(value / 1e9, 1), "G"))
312 }
313 })
314 }
315
316 log_debug("Defining function to extract last known taxonomy level")
317 extract_last_known_taxonomy <- function(classification) {
318 if (is.na(classification) || classification == "") {
319 return(list(level = "Unknown", name = "Unknown"))
320 }
321
322 parts <- strsplit(classification, ";")[[1]]
323 for (i in length(parts):1) {
324 level <- sub("^[a-z]__", "", parts[i])
325 if (level != "") {
326 prefix <- sub("__.*$", "", parts[i])
327 return(list(level = prefix, name = level))
328 }
329 }
330 return(list(level = "Unknown", name = "Unknown"))
331 }
332
333 log_debug("Defining function to format taxonomy")
334 format_taxonomy <- function(classification) {
335 result <- extract_last_known_taxonomy(classification)
336 if (result$level == "Unknown") {
337 return("Unknown")
338 } else if (result$level == "s") {
339 return(paste0("<i>", result$name, "</i>"))
340 } else {
341 genus <- str_replace_all(result$name, "_", " ")
342 return(paste0("<i>", genus, "</i> sp."))
343 }
344 }
345
346 log_debug("Defining function to calculate quality score and determine genome quality class")
347 calculate_quality_score_and_class <- function(completeness, contamination) {
348 if (is.na(completeness) || is.na(contamination)) {
349 return(list(
350 score = cell_spec("N/A", color = "white", bold = TRUE, background = cb_friendly_colors$grey),
351 class = cell_spec("Unknown", color = "white", bold = TRUE, background = cb_friendly_colors$grey),
352 numeric_score = NA
353 ))
354 }
355
356 quality_score <- completeness - (5 * contamination)
357 formatted_score <- sprintf("%.1f", quality_score)
358
359 if (completeness > 90 && contamination < 5) {
360 class <- "High-quality draft"
361 color <- cb_friendly_colors$green
362 } else if (completeness >= 50 && contamination < 10) {
363 class <- "Medium-quality draft"
364 color <- cb_friendly_colors$blue
365 } else {
366 class <- "Low-quality draft"
367 color <- cb_friendly_colors$red
368 }
369
370 list(
371 score = cell_spec(formatted_score, color = "white", bold = TRUE, background = color),
372 class = cell_spec(class, color = "white", bold = TRUE, background = color),
373 numeric_score = quality_score
374 )
375 }
376
377 log_debug("Preparing the data")
378
379 table_data <- data %>%
380 #mutate(Sample = basename(Sample) %>% trim_sample_name()) %>%
381 mutate(Sample = basename(Sample)) %>%
382 left_join(checkm_host_data, by = c("Sample" = "name")) %>%
383 left_join(gtdbtk_data, by = c("Sample" = "user_genome")) %>%
384 mutate(
385 quality_data = pmap(list(as.numeric(completeness),
386 as.numeric(contamination)),
387 calculate_quality_score_and_class),
388 Quality_Score = map_chr(quality_data, ~.$score),
389 Genome_Quality = map_chr(quality_data, ~.$class),
390 Quality_Score_Numeric = map_dbl(quality_data, ~.$numeric_score),
391 Virus_Count_Numeric = as.numeric(Virus_Count),
392 Virus_Count = cell_spec(
393 Virus_Count,
394 color = "white",
395 bold = TRUE,
396 background = case_when(
397 Virus_Count == 0 ~ cb_friendly_colors$red,
398 Virus_Count == 1 ~ cb_friendly_colors$blue,
399 Virus_Count > 1 ~ cb_friendly_colors$green
400 )
401 ),
402 Completeness_Numeric = as.numeric(completeness),
403 Completeness = create_bar_plot(as.numeric(completeness), 100),
404 Contamination = create_bar_plot(as.numeric(contamination), 100),
405 `N50 (contigs)` = format_large_number(as.numeric(contig_n50)),
406 `Genome size (bp)` = format_large_number(as.numeric(genome_size)),
407 `GTDB Taxonomy` = sapply(classification, format_taxonomy)
408 ) %>%
409 mutate(`#` = row_number()) %>%
410 select(`#`, Sample, `GTDB Taxonomy`, Virus_Count,
411 Quality_Score, Genome_Quality, Completeness, Contamination,
412 `Genome size (bp)`, `N50 (contigs)`)
413
414 log_debug("Creating the table")
415 kbl(table_data, escape = FALSE,
416 align = c("c", "l", "l", "c", rep("c", 2), rep("r", 2), rep("r", 2))) %>%
417 kable_paper(full_width = TRUE) %>%
418 column_spec(1, bold = TRUE, width = "2em") %>%
419 column_spec(2:3, bold = TRUE) %>%
420 column_spec(4:5, width = "5em") %>%
421 column_spec(6:7, width = "60px") %>%
422 column_spec(8:9, width = "4em") %>%
423 add_header_above(c(" " = 4, "Host Genome Quality" = 2, "CheckM Metrics" = 2,
424 "Statistics" = 2)) %>%
425 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
426 font_size = 9,
427 html_font = "Arial",
428 position = "left") %>%
429 row_spec(0, bold = TRUE, color = "white", background = "#333333") %>%
430 row_spec(0, extra_css = "border-bottom: 2px solid #000000;") %>%
431 column_spec(9, extra_css = "border-right: 2px solid #000000;") %>%
432 scroll_box(width = "100%", height = "100%",
433 extra_css = "overflow-x: auto; border: 1px solid #ccc; border-radius: 4px;")
434 ```
435
436 ## Tools Documentation
437
438 The following tools are utilized in this workflow. Each tool name below is a link to its respective documentation.
439
440 **Host-analyses**
441
442 - [**CheckM2 v1.1.0**](https://github.com/chklovski/CheckM2): Assesses the quality of the host. Most useful when working with assembled genomes.
443
444 - [**GTDB-Tk v2.3.2**](https://ecogenomics.github.io/GTDBTk/index.html): Assigns a taxonomy to the host genome.
445
446 - [**Defense-Finder v2.0.0, models 2.0.2**](https://ecogenomics.github.io/GTDBTk/index.html): Detects known anti-phage systems in the host.
447
448 - [**geNomad v1.7.1**](https://portal.nersc.gov/genomad/): Predicts and annotates proviruses.
449
450 **Virus-analyses**
451
452 - [**CheckV v1.0.1**](https://pypi.org/project/checkv/): Evaluates the quality of viral genomes.
453
454 - [**dRep v3.4.5**](https://drep.readthedocs.io/en/latest/): Compares viral genomes within the same host.
455
456 - [**Abricate v1.0.1**](https://github.com/tseemann/abricate): Identifies virulence genes in the prophage genomes with the [VFDB database](https://www.mgc.ac.cn/VFs/).
457
458 - [**iPHOP v1.3.3**](https://bitbucket.org/srouxjgi/iphop/src/main/): Predicts other potential hosts of viral genomes.
459
460 - [**VIBRANT v1.2.1**](https://github.com/AnantharamanLab/VIBRANT): Used to identify Auxiliary Metabolic Genes in the prophages.
461
462 ## Workflow
463
464 The workflow begins with the input of bacterial genomes by the user. These are processed by the **host-analyses** tools. Prophage prediction is
465 performed by **geNomad** only. Afterward, prophages identified by **geNomad** are processed by the **virus-analyses** tools.
466
467 If more than one prophage is recovered in the same sample, **dRep** is used to compare and determine if the viruses are identical or different within the same host.
468
469 *PLACEHOLDER FOR PIPELINE*
470
471 ## R Session Info
472
473 Information about the R session used to render this markdown document.
474
475 ```{r}
476 sessionInfo()
477 ```
478
479
480 # Results {.tabset .tabset-fade}
481
482 ```{r}
483 # Creating combined_unique object
484
485 combined_unique <- bind_rows(
486 checkm_host_data %>%
487 # select(bin_id) %>%
488 # dplyr::rename(Sample = bin_id),
489 select(name) %>%
490 dplyr::rename(Sample = name),
491
492 data %>%
493 #mutate(Sample = str_remove(Sample, "_virus_summary.tsv")) %>%
494 select(Sample)
495 ) %>%
496 distinct(Sample) %>%
497 arrange(Sample)
498
499 log_debug(paste("combined_unique samples:", paste(combined_unique$Sample, collapse = ", ")))
500 ```
501
502
503
504 ```{r main_workflow, fig.width=6, fig.height=6, out.height="100%", out.width='100%', dpi=300, fig.align='center', warning=FALSE, message=FALSE, results='asis'}
505 # Process proviruses data
506 process_proviruses <- function(data_genomad) {
507 proviruses <- data_genomad %>%
508 dplyr::filter(topology == "Provirus") %>%
509 dplyr::mutate(contig = sub("\\|provirus_.*", "", seq_name)) %>% # take everything before "|provirus"
510 dplyr::mutate(contig = paste0("c", as.numeric(factor(contig)))) %>% # map them to c_1, c_2, ...
511 dplyr::select(seq_name, coordinates, length, contig, virus_score, n_hallmarks)
512
513 proviruses <- proviruses %>%
514 tidyr::separate(coordinates, into = c("start", "end"), sep = "-")
515
516 proviruses$start <- as.integer(proviruses$start)
517 proviruses$end <- as.integer(proviruses$end)
518
519 proviruses_gr_features <- GRanges(seqnames = proviruses$contig,
520 ranges = IRanges(start = proviruses$start,
521 end = proviruses$end))
522 proviruses_gr_features$length <- proviruses_gr_features %>% ranges %>% width
523 proviruses_gr_features$score <- as.numeric(proviruses$virus_score)
524 proviruses_gr_features$n_hallmarks <- as.numeric(proviruses$n_hallmarks)
525
526 proviruses_gr_features$n_hallmarks_pos <-
527 abs(start(proviruses_gr_features) - end(proviruses_gr_features)) / 2
528
529 return(proviruses_gr_features)
530 }
531
532 plot_genome_ideogram <- function(genome_current, proviruses_gr_features) {
533 fasta_file_path <- file.path(params$outdir, "genomes", paste0(genome_current, ".fna"))
534 #cat(fasta_file_path, "\n\n")
535 genome_ideogram <- getIdeogramData(fasta_file = fasta_file_path)
536
537 # Replace any seqlevel to c_1, c_2, c_3, ...
538 new_seqlevels <- paste0("c", seq_along(seqlevels(genome_ideogram)))
539 names(new_seqlevels) <- seqlevels(genome_ideogram)
540 genome_ideogram <- GenomeInfoDb::renameSeqlevels(genome_ideogram, new_seqlevels)
541 colours <- rep("#a58bc5", length(seqlevels(genome_ideogram)))
542
543 par(mar = c(2, 2, 2, 2)) # minimal margins around the plot
544
545 gmovizInitialise(genome_ideogram,
546 sector_colours = colours,
547 sector_border_colours = colours,
548 sector_labels = FALSE
549 )
550
551 for (i in 1:length(proviruses_gr_features)) {
552 name <- as.character(seqnames(proviruses_gr_features[i]))
553 start <- as.numeric(start(proviruses_gr_features[i]))
554 end <- as.numeric(end(proviruses_gr_features[i]))
555 region <- data.frame(start = start, end = end)
556 circos.genomicRect(seqnames = name,
557 region,
558 ytop = .5,
559 ybottom = 0,
560 track.index = 1,
561 sector.index = name,
562 border = "#e9d27d",
563 col = "#e9d27d")
564 }
565
566 length <- as.numeric(proviruses_gr_features$length)
567 length <- ifelse(length > 1000000,
568 paste0(round(length/1000000, 2), "mb"),
569 paste0(round(length/1000, 2), "kb"))
570 labels <- paste0(as.character(seqnames(proviruses_gr_features)), " (", length, ")")
571 circos.labels(sectors = as.character(seqnames(proviruses_gr_features)),
572 x = as.numeric(start(proviruses_gr_features)),
573 labels,
574 facing = "clockwise")
575 }
576
577 process_sample <- function(sample, combined_unique, host_genomes_paths, genome_data) {
578 genome_current <- sample # Add this line
579 tryCatch({
580 log_debug(paste("Starting to process sample:", sample))
581
582 # Check if sample exists in genome_data
583 if (!(sample %in% names(genome_data))) {
584 log_debug(paste("Sample", sample, "not found in genome_data"))
585 cat(paste("Error: Sample", sample, "not found in genome_data\n\n"))
586 return()
587 }
588
589 cat(paste("## ", sample, "{.tabset .tabset-fade} \n\n"))
590
591 host_genome_path <- host_genomes_paths$path[host_genomes_paths$name == sample]
592 if (length(host_genome_path) == 0) {
593 log_debug(paste("Host genome path not found for sample:", sample))
594 cat(paste("Error: Host genome path not found for sample", sample, "\n\n"))
595 return()
596 }
597
598 host_genome_ideogram <- tryCatch({
599 getIdeogramData(fasta_file = host_genome_path)
600 }, error = function(e) {
601 log_debug(paste("Error loading host genome ideogram for sample", sample, ":", conditionMessage(e)))
602 NULL
603 })
604
605 if (is.null(host_genome_ideogram)) {
606 cat(paste("Error: Unable to load host genome ideogram for sample", sample, "\n\n"))
607 return()
608 }
609
610 sample_data <- genome_data[[sample]]$loaded_data
611 genomad_summary <- sample_data$genomad
612 genomad_annotation <- sample_data$genomad_annotations
613 checkv_data <- sample_data$checkv
614 defense_finder_data <- sample_data$defense_finder
615 abricate_data <- sample_data$abricate
616 iphop_data <- sample_data$iphop
617 vibrant_data <- sample_data$vibrant
618
619 cat("### Host Genome\n\n")
620
621 cat("**GTDB-Tk taxonomy**: \n\n")
622 data_gtdbtk_host %>% filter(user_genome == sample) %>%
623 select(classification) %>%
624 kbl() %>%
625 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
626 kable_paper("striped", full_width = TRUE) %>%
627 scroll_box(width = "100%", height = "100%") %>%
628 cat()
629
630 # Cat checkm summary for this genome
631 cat("**CheckM2 Summary**:\n\n")
632 #checkm_summary <- data_checkm_host %>% filter(`bin_id` == sample)
633 checkm_summary <- data_checkm_host %>% filter(`name` == sample)
634 checkm_summary %>% clean_names %>%
635 #select(number_contigs, n50_contigs, completeness, contamination, strain_heterogeneity) %>%
636 select(total_contigs, contig_n50, completeness, contamination) %>%
637 kbl() %>%
638 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
639 kable_paper("striped", full_width = TRUE) %>%
640 scroll_box(width = "100%", height = "100%") %>%
641 cat()
642
643 # Display defense-finder as a table
644 if (!is.null(defense_finder_data) && nrow(defense_finder_data) > 0) {
645 cat("**Defense-Finder Systems**:\n\n")
646
647 defense_finder_data %>%
648 select(sys_id, type, subtype, sys_beg, sys_end, protein_in_syst, genes_count, name_of_profiles_in_sys) %>%
649 kbl() %>%
650 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
651 kable_paper("striped", full_width = TRUE) %>%
652 scroll_box(width = "100%", height = "100%") %>%
653 cat()
654 } else {
655 cat("No Defense-Finder systems detected.\n\n")
656 }
657
658 if (is.null(genomad_summary) || nrow(genomad_summary) == 0) {
659 log_debug(paste("No geNomad summary data found for sample:", sample))
660 return()
661 }
662
663 if (length(seqlevels(host_genome_ideogram)) == 1) {
664 host_genome_size <- sum(width(host_genome_ideogram))
665 } else {
666 virus_containing_contigs <- unique(sub("\\|.*", "", genomad_summary$seq_name))
667 virus_containing_contigs <- paste0("c_", as.numeric(factor(virus_containing_contigs)))
668 filtered_host_genome <- subset_and_update_ideogram(host_genome_ideogram, virus_containing_contigs)
669 host_genome_size <- sum(width(filtered_host_genome))
670 }
671
672 # Process proviruses
673 proviruses_gr_features <- process_proviruses(genomad_summary)
674
675 cat("**Genomad and CheckV Summary**:\n\n")
676 genomad_summary %>%
677 select(seq_name, taxonomy, topology, coordinates, length) %>%
678 left_join(
679 checkv_data %>% select(contig_id, gene_count, viral_genes, checkv_quality, miuvig_quality),
680 by = c("seq_name" = "contig_id")) %>%
681 select(seq_name, length, gene_count, viral_genes, checkv_quality, miuvig_quality, taxonomy, topology, coordinates) %>%
682 kbl() %>%
683 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
684 kable_paper("striped", full_width = TRUE) %>%
685 scroll_box(width = "100%", height = "100%") %>%
686 cat()
687
688 cat("**Host Genome Ideogram with Phages**:\n\n")
689 plot_genome_ideogram(sample, proviruses_gr_features)
690 cat('In this circular plot, **"c"** indicates the contig, and the number that follows (e.g., **c1**) represents the contig number.
691 If multiple contigs are present in the genome, each will be shown with a distinct label (e.g., **c1**, **c2**, etc.).\n\n')
692 cat("\n\n")
693
694 # Process phage genomes
695 cat("### Prophages {.tabset .tabset-fade} \n\n")
696 cat("**Select prophage to show: ** \n\n")
697 for (i in seq_len(nrow(genomad_summary))) {
698 log_debug(paste("Processing phage", i, "of", nrow(genomad_summary), "for sample", sample))
699 process_phage(genomad_summary[i, ], genomad_summary, genomad_annotation, checkv_data, host_genome_size)
700 }
701
702 # Plot dREP if applicable
703 if (nrow(genomad_summary) > 1) {
704 cat("### vOTUs\n\n")
705 plot_drep(sample, genomad_summary)
706 }
707
708 # Creating table with Abricate data
709 if (nrow(abricate_data) > 0) {
710 cat("### Virulence Genes {.tabset .tabset-fade} \n\n")
711 cat("Screening of virulence genes present in the prophage contigs. \n\n")
712 abricate_data %>% select(-number_file) %>%
713 kbl() %>%
714 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
715 kable_paper("striped", full_width = TRUE) %>%
716 scroll_box(width = "100%", height = "100%") %>% cat()
717 cat("\n\n")
718 }
719
720 # Creating table with iPHOP
721 if (nrow(iphop_data) > 0) {
722 cat("### Prophage-Host Prediction {.tabset .tabset-fade} \n\n")
723 cat("Prediction of potential hosts for the prophage contigs. \n\n")
724 iphop_data %>%
725 kbl() %>%
726 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
727 kable_paper("striped", full_width = TRUE) %>%
728 scroll_box(width = "100%", height = "100%") %>% cat()
729 cat("\n\n")
730 }
731
732 # Creating table with VIBRANT AMGs
733 if (nrow(vibrant_data) > 0) {
734 cat("### AMG Predictions {.tabset .tabset-fade} \n\n")
735 cat("Prediction of auxiliary metabolic genes in the prophage contigs. \n\n")
736 vibrant_data %>%
737 kbl() %>%
738 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
739 kable_paper("striped", full_width = TRUE) %>%
740 scroll_box(width = "100%", height = "100%") %>% cat()
741 cat("\n\n")
742 }
743
744 log_debug(paste("Finished processing sample:", sample))
745 }, error = function(e) {
746 log_debug(paste("Error in process_sample for", sample, ":", conditionMessage(e)))
747 cat(paste("Error processing sample", sample, ":", conditionMessage(e), "\n\n"))
748 })
749 }
750
751 process_phage <- function(virus, genomad_summary, genomad_annotation, checkv_data, host_genome_size) {
752 cat(paste("#### Phage ID:", virus$seq_name, " {.tabset .tabset-fade} \n\n"))
753
754 current_contig <- sub("\\|.*", "", virus$seq_name)
755
756 provirus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", virus$seq_name))
757 provirus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", virus$seq_name))
758 virus_length <- provirus_end - provirus_start + 1
759
760 current_contig_base <- sub("\\|provirus_.*", "", virus$seq_name)
761 current_provirus_range <- sub(".*\\|provirus_", "", virus$seq_name)
762 current_annotations <- genomad_annotation[grepl(paste0(current_contig_base, "\\|provirus_", current_provirus_range, "_"),
763 genomad_annotation$gene, fixed = FALSE), ] %>%
764 mutate(arrow_pos = ifelse(strand == -1, "start", "end"))
765
766
767 cat("\n\n**Phage–Host Genome Ideogram:**\n\n")
768
769 plot_phage_circos(virus, genomad_summary, current_annotations, virus_length, host_genome_size, provirus_start, provirus_end, checkv_data)
770
771 cat("\n\n")
772 cat("\n\n**Genes Annotation (geNomad):**\n\n")
773
774 current_annotations %>%
775 select(gene, length, marker, annotation_accessions, annotation_description) %>%
776 kbl() %>%
777 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
778 kable_paper() %>%
779 cat()
780 cat("\n\n")
781 }
782
783 plot_phage_circos <- function(virus, genomad_summary, current_annotations, virus_length, host_genome_size, provirus_start, provirus_end, checkv_data) {
784 tryCatch({
785 log_debug("Starting plot_phage_circos function")
786 log_debug(paste("Current virus:", virus$seq_name))
787 log_debug(paste("Virus length:", virus_length))
788 log_debug(paste("Host genome size:", host_genome_size))
789 log_debug(paste("Provirus start:", provirus_start))
790 log_debug(paste("Provirus end:", provirus_end))
791
792 # Check for NA or invalid values in input parameters
793 if (is.na(virus_length) || virus_length <= 0) {
794 log_debug("Error: Invalid virus length")
795 return(NULL)
796 }
797 if (is.na(host_genome_size) || host_genome_size <= 0) {
798 log_debug("Error: Invalid host genome size")
799 return(NULL)
800 }
801 if (is.na(provirus_start) || provirus_start < 0) {
802 log_debug("Error: Invalid provirus start position")
803 return(NULL)
804 }
805 if (is.na(provirus_end) || provirus_end <= provirus_start) {
806 log_debug("Error: Invalid provirus end position")
807 return(NULL)
808 }
809
810 # Extract contig information
811 current_contig <- sub("\\|.*", "", virus$seq_name)
812 log_debug(paste("Current contig:", current_contig))
813
814 contig_viruses <- genomad_summary[grepl(paste0("^", current_contig), genomad_summary$seq_name), ]
815 if (nrow(contig_viruses) == 0) {
816 log_debug("Error: No viruses found for the current contig")
817 return(NULL)
818 }
819
820 contig_length <- max(as.numeric(sub(".*_(\\d+)$", "\\1", contig_viruses$seq_name)))
821 if (is.na(contig_length) || contig_length <= 0) {
822 contig_length <- virus_length # Use virus length as fallback if contig length is invalid
823 log_debug(paste("Using virus length as contig length:", contig_length))
824 } else {
825 log_debug(paste("Contig length:", contig_length))
826 }
827
828 if (provirus_end > contig_length) {
829 log_debug("Error: Provirus end position exceeds contig length")
830 return(NULL)
831 }
832
833 log_debug("Clearing circos")
834 circos.clear()
835
836 log_debug("Setting circos parameters")
837 circos.par(start.degree = 180, gap.degree = 10, track.margin = c(0.01, 0.01))
838
839 main_color <- "#a58bc5"
840 zoom_color <- "#e9d27d"
841
842 zoom_start <- (provirus_start / contig_length) * 100
843 zoom_end <- (provirus_end / contig_length) * 100
844
845 log_debug(paste("Zoom start:", zoom_start))
846 log_debug(paste("Zoom end:", zoom_end))
847
848 log_debug("Initializing circos")
849 circos.initialize(factors = c("Zoom", "Main"), xlim = c(0, 100))
850
851 format_genome_labels <- function(x) {
852 ifelse(x >= 1e6, paste0(round(x / 1e6, 2), " Mb"),
853 ifelse(x >= 1e3, paste0(round(x / 1e3, 2), " Kb"),
854 paste0(x, " bp")))
855 }
856
857 log_debug("Adding link")
858 tryCatch({
859 circos.link("Main", c(zoom_start, zoom_end), "Zoom", c(0, 100),
860 rou1 = 0.8,
861 rou2 = 0.97,
862 h.ratio = 0.55, # width?
863 lty = 2,
864 lwd = 0.5,
865 h2 = 1,
866 col = "grey99", border = "grey80")
867 }, error = function(e) {
868 log_debug(paste("Error in circos.link:", e$message))
869 })
870
871 log_debug("Adding zoom track")
872 circos.track(factors = "Zoom", ylim = c(0, 1), track.height = 0.15,
873 panel.fun = function(x, y) {
874 circos.rect(0, 0, 100, 1, col = zoom_color, border = NA)
875 axis_labels <- seq(0, virus_length, length.out = 6)
876 axis_positions <- seq(0, 100, length.out = 6)
877 circos.axis(h = "top", major.at = axis_positions,
878 labels = format_genome_labels(axis_labels),
879 labels.cex = 0.7, direction = "outside")
880
881 for (i in 1:nrow(current_annotations)) {
882 gene_start <- current_annotations$start[i]
883 gene_end <- current_annotations$end[i]
884 arrow_start <- (gene_start - provirus_start) / virus_length * 100
885 arrow_end <- (gene_end - provirus_start) / virus_length * 100
886
887 circos.arrow(arrow_start, arrow_end, y1 = 0, y2 = 1,
888 arrow.head.width = 0.75, arrow.head.length = cm_x(0.1),
889 arrow.position = current_annotations$arrow_pos[i],
890 col = ifelse(is.na(current_annotations$annotation_description[i]), "grey", "#7fbfff"),
891 border = ifelse(is.na(current_annotations$annotation_description[i]), "grey20", "darkblue"))
892 }
893 }, bg.border = NA)
894
895 log_debug("Adding main track")
896 circos.track(factors = "Main", ylim = c(0, 1), track.height = 0.1,
897 panel.fun = function(x, y) {
898 circos.rect(xleft = 0, ybottom = 0, xright = 100, ytop = 1, col = main_color, border = NA)
899
900 for (i in 1:nrow(contig_viruses)) {
901 virus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", contig_viruses$seq_name[i]))
902 virus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", contig_viruses$seq_name[i]))
903
904 virus_start_percent <- (virus_start / contig_length) * 100
905 virus_end_percent <- (virus_end / contig_length) * 100
906
907 rect_color <- if (contig_viruses$seq_name[i] == virus$seq_name) zoom_color else adjustcolor(zoom_color, alpha.f = 0.7)
908
909 circos.rect(xleft = virus_start_percent, ybottom = 0,
910 xright = virus_end_percent, ytop = 1,
911 col = rect_color, border = NA)
912 }
913
914 axis_labels <- seq(0, contig_length, length.out = 6)
915 axis_positions <- seq(0, 100, length.out = 6)
916 circos.axis(h = "top", major.at = axis_positions,
917 labels = format_genome_labels(axis_labels),
918 labels.cex = 0.7, direction = "outside")
919 }, bg.border = NA)
920
921 log_debug("Locating phage positions")
922 phage_positions <- sapply(1:nrow(contig_viruses), function(i) {
923 virus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", contig_viruses$seq_name[i]))
924 virus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", contig_viruses$seq_name[i]))
925 ((virus_start + virus_end) / 2 / contig_length) * 100
926 })
927
928 log_debug("Annotating names on phage positions")
929
930 # Extract start and end positions from sequence names
931 start_positions <- as.numeric(sub(".*provirus_([0-9]+)_.*", "\\1", contig_viruses$seq_name))
932 end_positions <- as.numeric(sub(".*provirus_[0-9]+_([0-9]+)", "\\1", contig_viruses$seq_name))
933
934 # Create phage labels with the desired format
935 phage_labels <- paste0(round(contig_viruses$length / 1e3, 2), " Kb")
936
937 # Apply labels to circos plot
938 circos.labels(
939 sectors = "Main",
940 x = phage_positions,
941 labels = phage_labels,
942 facing = "reverse.clockwise",
943 niceFacing = TRUE,
944 col = "black",
945 cex = 0.6,
946 side = "inside",
947 connection_height = 0.02,
948 line_col = "gray"
949 )
950
951 center_x <- 50
952 virus_name <- virus$seq_name
953 taxonomy <- virus$taxonomy
954
955 log_debug("Adding taxonomy and virus name to the plot")
956 circos.text(x = center_x, y = -0.2, labels = taxonomy,
957 sector.index = "Zoom", track.index = 1,
958 facing = "bending.inside", niceFacing = TRUE,
959 adj = c(0.5, 0.7), cex = 0.8)
960
961 checkv_info <- checkv_data[checkv_data$contig_id == virus$seq_name, ]
962 if (nrow(checkv_info) > 0) {
963 checkv_quality <- checkv_info$checkv_quality
964 gene_count <- checkv_info$gene_count
965 viral_genes <- checkv_info$viral_genes
966 host_genes <- checkv_info$host_genes
967 miuvig_quality <- checkv_info$miuvig_quality
968 completeness <- checkv_info$completeness
969 completeness_method <- checkv_info$completeness_method
970 contamination <- checkv_info$contamination
971
972 circos.text(
973 x = center_x, y = -0.5,
974 labels = paste("CheckV Quality:", checkv_quality, " - miuvig Quality:", miuvig_quality),
975 sector.index = "Zoom", track.index = 2,
976 facing = "bending.inside", niceFacing = TRUE,
977 adj = c(0.5, 0), cex = 0.7
978 )
979
980 circos.text(
981 x = center_x, y = -1.5,
982 labels = paste("Gene Count:", gene_count, " - Viral Genes:", viral_genes, " - Host Genes:", host_genes),
983 sector.index = "Zoom", track.index = 2,
984 facing = "bending.inside", niceFacing = TRUE,
985 adj = c(0.5, 0), cex = 0.7
986 )
987
988 circos.text(
989 x = center_x, y = -2.5,
990 labels = paste("Completeness:", completeness, " - Contamination:", contamination),
991 sector.index = "Zoom", track.index = 2,
992 facing = "bending.inside", niceFacing = TRUE,
993 adj = c(0.5, 0), cex = 0.7
994 )
995 }
996
997 log_debug("Adding legend")
998 # Add legend
999 legend("topright",
1000 legend = c("Annotated gene", "Unknown gene"),
1001 fill = c("#7fbfff", "grey"),
1002 border = c("darkblue", "grey20"),
1003 cex = 0.8,
1004 bty = "n")
1005
1006 log_debug("Clearing circos")
1007 circos.clear()
1008 log_debug("Finished plot_phage_circos function successfully")
1009 }, error = function(e) {
1010 log_debug(paste("Error in plot_phage_circos:", e$message))
1011 circos.clear()
1012 })
1013 }
1014
1015 plot_drep <- function(sample, genomad_summary) {
1016 drep_file_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "data_tables", "Cdb.csv")
1017 drep_data <- read_csv(drep_file_path) %>% clean_names()
1018 drep_data <- cbind(genomad_summary$seq_name, drep_data)
1019
1020 cat("When more than 1 phage is detected in the host genome, we perform a clustering step using the tool dRep.\n\n")
1021 cat("A threshold of 0.95 was applied to the ANI similarity index to define clusters of virus operational taxonomic units (vOTUs).")
1022
1023 cat("\n\n**Final cluster designations**\n\n")
1024 drep_data %>%
1025 kbl() %>%
1026 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
1027 kable_paper("striped", full_width = TRUE) %>%
1028 cat()
1029
1030 # Insert the PDF plot
1031 plot_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "figures", "Primary_clustering_dendrogram.pdf")
1032 png_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "figures", "Primary_clustering_dendrogram.png")
1033
1034 if (file.exists(plot_path)) {
1035 pdftools::pdf_convert(plot_path, format = "png", filenames = png_path, verbose = FALSE, dpi=150)
1036 base64_str <- base64enc::dataURI(file = png_path, mime = "image/png")
1037 cat("**Primary clustering plot**\n\n")
1038 cat(sprintf(
1039 '<div style="text-align: center;"><img src="%s" style="max-width: 100%%; height: auto; border:1px solid #ddd; border-radius:4px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); margin: 1em 0;"></div>',base64_str
1040 ))
1041 } else {
1042 cat("**No dRep clustering plot found.**\n\n")
1043 }
1044
1045 }
1046
1047 subset_and_update_ideogram <- function(ideogram, contigs) {
1048 filtered <- ideogram[seqnames(ideogram) %in% contigs]
1049 seqlevels(filtered) <- contigs
1050 seqinfo(filtered) <- seqinfo(filtered)[contigs]
1051 filtered
1052 }
1053
1054 render_all_samples <- function(test_mode = FALSE) {
1055 if (test_mode) {
1056 if (nrow(combined_unique) > 0) {
1057 cat("**Select sample to show:** \n\n\n")
1058 current_sample <- combined_unique$Sample[6]
1059 process_sample(current_sample, combined_unique, host_genomes_paths, genome_data)
1060 } else {
1061 print("No samples can be further analysed.")
1062 }
1063 } else {
1064 cat("**Select sample to show:** \n\n\n")
1065 for (i in seq_len(nrow(combined_unique))) {
1066 current_sample <- combined_unique$Sample[i]
1067 process_sample(current_sample, combined_unique, host_genomes_paths, genome_data)
1068 }
1069 }
1070 }
1071
1072 # Execute the main function
1073 # Test mode processes one sample only
1074 render_all_samples(test_mode = F)
1075 ```
1076
1077 # Citation
1078
1079 Cite this work: XXXXX
1080
1081