Mercurial > repos > ufz > phi_toolkit_report
comparison report.Rmd @ 0:315c2ed31af1 draft
planemo upload for repository https://github.com/Helmholtz-UFZ/ufz-galaxy-tools/blob/main/tools/phi-toolkit commit 45c746567f48e6c9bcc19ba4e94e87348df3ac7a
| author | ufz |
|---|---|
| date | Wed, 04 Jun 2025 17:36:40 +0000 |
| parents | |
| children | 3a7f73d638ba |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:315c2ed31af1 |
|---|---|
| 1 --- | |
| 2 title: "PHI Prophage-Host Interaction Toolkit report" | |
| 3 subtitle: "Toolkit for the Detection, Comparison, and Annotation of Prophages in Bacterial Genomes." | |
| 4 date: "`r format(Sys.Date(), '%B %d, %Y')`" | |
| 5 output: | |
| 6 html_document: | |
| 7 theme: flatly | |
| 8 toc: yes | |
| 9 toc_float: false | |
| 10 number_sections: yes | |
| 11 code_folding: none | |
| 12 fig_width: 12 | |
| 13 fig_height: 8 | |
| 14 fig_caption: true | |
| 15 df_print: paged | |
| 16 editor_options: | |
| 17 markdown: | |
| 18 wrap: 72 | |
| 19 params: | |
| 20 outdir: "data" | |
| 21 --- | |
| 22 | |
| 23 | |
| 24 ------------------------------------------------------------------------ | |
| 25 | |
| 26 ```{r setup_env, include=FALSE} | |
| 27 knitr::opts_chunk$set(echo = FALSE) | |
| 28 | |
| 29 cat("params$outdir:", params$outdir, "\n") | |
| 30 ``` | |
| 31 | |
| 32 ```{r setup_libraries, message=FALSE, warning=FALSE, echo=FALSE, results='asis'} | |
| 33 # Define required packages | |
| 34 required_packages <- c("tidyverse", "janitor", "here", | |
| 35 "kableExtra", "gmoviz", "circlize", | |
| 36 "GenomicRanges", "patchwork", "fs", | |
| 37 "tools", "scales", "formattable", | |
| 38 "pdftools", "base64") | |
| 39 | |
| 40 # Load required packages | |
| 41 invisible(lapply(required_packages, library, character.only = TRUE)) | |
| 42 ``` | |
| 43 | |
| 44 ```{r helper_functions, echo=FALSE} | |
| 45 log_file <- "debug.log" | |
| 46 | |
| 47 log_debug <- function(message) { | |
| 48 if (!exists("log_initialized") || !log_initialized) { | |
| 49 cat(paste0(Sys.time(), " - DEBUG: ", message, "\n"), file = log_file, append = FALSE) | |
| 50 assign("log_initialized", TRUE, envir = .GlobalEnv) | |
| 51 } else { | |
| 52 cat(paste0(Sys.time(), " - DEBUG: ", message, "\n"), file = log_file, append = TRUE) | |
| 53 } | |
| 54 } | |
| 55 | |
| 56 load_file <- function(path) { | |
| 57 log_debug(paste("Attempting to load:", path)) | |
| 58 if (file.exists(path)) { | |
| 59 ext <- tools::file_ext(path) | |
| 60 if (ext %in% c("tsv", "csv")) { | |
| 61 data <- read_delim(path, delim = ifelse(ext == "csv", ",", "\t"), show_col_types = FALSE) %>% clean_names | |
| 62 log_debug(paste("Loaded", nrow(data), "rows from", path)) | |
| 63 data | |
| 64 } else if (ext == "fna") { | |
| 65 data <- Biostrings::readDNAStringSet(path) | |
| 66 log_debug(paste("Loaded", length(data), "sequences from", path)) | |
| 67 data | |
| 68 } else { | |
| 69 log_debug(paste("Skipping", path, "- unsupported file type")) | |
| 70 NULL | |
| 71 } | |
| 72 } else { | |
| 73 log_debug(paste("File does not exist:", path)) | |
| 74 NULL | |
| 75 } | |
| 76 } | |
| 77 | |
| 78 get_file_info <- function(path, loaded_data) { | |
| 79 log_debug(paste("Processing file info for:", path)) | |
| 80 if (file.exists(path)) { | |
| 81 ext <- tools::file_ext(path) | |
| 82 if (ext %in% c("tsv", "csv", "fna")) { | |
| 83 data <- loaded_data[[basename(path)]] | |
| 84 rows <- if(ext == "fna") length(data) else nrow(data) | |
| 85 tibble(exists = TRUE, rows = rows, size = file.size(path), path = path) | |
| 86 } else { | |
| 87 tibble(exists = TRUE, rows = NA_integer_, size = file.size(path), path = path) | |
| 88 } | |
| 89 } else { | |
| 90 tibble(exists = FALSE, rows = NA_integer_, size = NA_real_, path = NA_character_) | |
| 91 } | |
| 92 } | |
| 93 | |
| 94 process_genome_folder <- function(folder, host_analyses_dir, virus_analyses_dir) { | |
| 95 log_debug(paste("Processing folder:", folder)) | |
| 96 genome_name <- basename(folder) | |
| 97 | |
| 98 paths <- list( | |
| 99 genomad = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus_summary.tsv")), | |
| 100 genomad_phages = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus.fna")), | |
| 101 genomad_annotations = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus_genes.tsv")), | |
| 102 defense_finder = file.path(host_analyses_dir, "defense-finder", genome_name, paste0(genome_name, "_defense_finder_systems.tsv")), | |
| 103 checkv = file.path(virus_analyses_dir, "checkv", genome_name, "quality_summary.tsv"), | |
| 104 iphop = file.path(virus_analyses_dir, "iphop", genome_name, "Host_prediction_to_genome_m90.csv"), | |
| 105 drep = file.path(virus_analyses_dir, "drep_compare", genome_name, "data_tables", "Cdb.csv"), | |
| 106 phatyp = file.path(virus_analyses_dir, "phatyp", genome_name, "phatyp.csv"), | |
| 107 abricate = file.path(virus_analyses_dir, "abricate", genome_name, paste0(genome_name, "_virus_vfdb.tsv")), | |
| 108 vibrant = file.path(virus_analyses_dir, "vibrant", genome_name, | |
| 109 paste0("VIBRANT_", genome_name, "_virus"), | |
| 110 paste0("VIBRANT_results_", genome_name, "_virus"), | |
| 111 paste0("VIBRANT_AMG_individuals_", genome_name, "_virus.tsv")) | |
| 112 ) | |
| 113 | |
| 114 loaded_data <- map(paths, load_file) | |
| 115 file_info <- map_dfr(paths, ~get_file_info(.x, loaded_data), .id = "file_type") | |
| 116 | |
| 117 virus_count <- if(!is.null(loaded_data$genomad)) { | |
| 118 count <- sum(loaded_data$genomad$virus_score > 0.5, na.rm = TRUE) | |
| 119 log_debug(paste("Virus count:", count)) | |
| 120 count | |
| 121 } else { | |
| 122 log_debug("No genomad summary found, virus count set to 0") | |
| 123 0 | |
| 124 } | |
| 125 | |
| 126 log_debug("Returning results from process_genome_folder") | |
| 127 list(file_info = file_info, virus_count = virus_count, loaded_data = loaded_data) | |
| 128 } | |
| 129 ``` | |
| 130 | |
| 131 | |
| 132 ```{r compile_results, message=FALSE, warning=FALSE, echo=FALSE} | |
| 133 compile_results <- function() { | |
| 134 base_dir <- params$outdir | |
| 135 log_debug(paste("Base directory:", base_dir)) | |
| 136 | |
| 137 host_analyses_dir <- file.path(base_dir, "host_analyses") | |
| 138 virus_analyses_dir <- file.path(base_dir, "virus_analyses") | |
| 139 | |
| 140 # List all sample-level directories from all tools under virus_analyses | |
| 141 tool_dirs <- list.dirs(virus_analyses_dir, full.names = TRUE, recursive = FALSE) | |
| 142 | |
| 143 genome_folders <- list.dirs(file.path(base_dir, "host_analyses", "genomad"), | |
| 144 full.names = TRUE, recursive = FALSE) | |
| 145 | |
| 146 # cat(length(genome_folders), "sample(s) processed\n") | |
| 147 | |
| 148 log_debug("Processing genome folders") | |
| 149 genome_data <- map(genome_folders, process_genome_folder, | |
| 150 host_analyses_dir = host_analyses_dir, | |
| 151 virus_analyses_dir = virus_analyses_dir) %>% | |
| 152 set_names(basename(genome_folders)) %>% | |
| 153 compact() | |
| 154 | |
| 155 log_debug("Creating summary dataframe") | |
| 156 summary_df <- map_dfr(genome_data, ~{ | |
| 157 file_info <- .x$file_info | |
| 158 tibble( | |
| 159 Sample = basename(file_info$path[1]), | |
| 160 Virus_Count = .x$virus_count, | |
| 161 geNomad = file_info$exists[file_info$file_type == "genomad"], | |
| 162 CheckV = file_info$exists[file_info$file_type == "checkv"], | |
| 163 VIBRANT = file_info$exists[file_info$file_type == "vibrant"], | |
| 164 dRep = file_info$exists[file_info$file_type == "drep"], | |
| 165 iPHOP = file_info$exists[file_info$file_type == "iphop"], | |
| 166 PhaTYP = file_info$exists[file_info$file_type == "phatyp"], | |
| 167 Defense_Finder = file_info$exists[file_info$file_type == "defense_finder"], | |
| 168 geNomad_Path = file_info$path[file_info$file_type == "genomad"], | |
| 169 CheckV_Path = file_info$path[file_info$file_type == "checkv"], | |
| 170 VIBRANT_Path = file_info$path[file_info$file_type == "vibrant"], | |
| 171 dRep_Path = file_info$path[file_info$file_type == "drep"], | |
| 172 PhaTYP_Path = file_info$path[file_info$file_type == "phatyp"], | |
| 173 Defense_Finder_Path = file_info$path[file_info$file_type == "defense_finder"], | |
| 174 Virus_Contigs = ifelse(file_info$exists[file_info$file_type == "genomad_phages"], | |
| 175 file_info$rows[file_info$file_type == "genomad_phages"], | |
| 176 0) | |
| 177 ) | |
| 178 }) %>% | |
| 179 mutate(across(ends_with("_Path"), ~ifelse(is.na(.), "Not available", as.character(.)))) | |
| 180 | |
| 181 host_genomes_fasta <- list.files( | |
| 182 path = file.path(params$outdir, "genomes"), | |
| 183 pattern = "\\.fna$", | |
| 184 full.names = TRUE | |
| 185 ) | |
| 186 | |
| 187 host_genomes_paths <- tibble( | |
| 188 name = tools::file_path_sans_ext(basename(host_genomes_fasta)), | |
| 189 path = host_genomes_fasta | |
| 190 ) | |
| 191 | |
| 192 data_gtdbtk_host <- read_tsv( | |
| 193 file.path(params$outdir, "host_analyses/gtdbtk/gtdbtk.bac120.summary.tsv"), | |
| 194 show_col_types = FALSE | |
| 195 ) %>% clean_names() | |
| 196 | |
| 197 data_checkm_host <- read_tsv( | |
| 198 file.path(params$outdir, "host_analyses/checkm2/quality_report.tsv"), | |
| 199 show_col_types = FALSE | |
| 200 ) %>% clean_names() | |
| 201 | |
| 202 log_debug("Returning summary dataframe, genome data, and host data") | |
| 203 | |
| 204 log_debug(paste("summary_df dimensions:", nrow(summary_df), "rows,", ncol(summary_df), "columns")) | |
| 205 log_debug(paste("summary_df column names:", paste(colnames(summary_df), collapse = ", "))) | |
| 206 log_debug(paste("genome_data length:", length(genome_data))) | |
| 207 log_debug(paste("genome_data names:", paste(names(genome_data), collapse = ", "))) | |
| 208 log_debug(paste("host_genomes_paths dimensions:", nrow(host_genomes_paths), "rows,", ncol(host_genomes_paths), "columns")) | |
| 209 log_debug(paste("host_genomes_paths column names:", paste(colnames(host_genomes_paths), collapse = ", "))) | |
| 210 log_debug(paste("data_gtdbtk_host dimensions:", nrow(data_gtdbtk_host), "rows,", ncol(data_gtdbtk_host), "columns")) | |
| 211 log_debug(paste("data_gtdbtk_host column names:", paste(colnames(data_gtdbtk_host), collapse = ", "))) | |
| 212 log_debug(paste("data_checkm_host dimensions:", nrow(data_checkm_host), "rows,", ncol(data_checkm_host), "columns")) | |
| 213 log_debug(paste("data_checkm_host column names:", paste(colnames(data_checkm_host), collapse = ", "))) | |
| 214 | |
| 215 list( | |
| 216 summary = summary_df, | |
| 217 genome_data = genome_data, | |
| 218 host_genomes_paths = host_genomes_paths, | |
| 219 data_gtdbtk_host = data_gtdbtk_host, | |
| 220 data_checkm_host = data_checkm_host | |
| 221 ) | |
| 222 } | |
| 223 ``` | |
| 224 | |
| 225 ```{r run_main_function, echo=FALSE, message=FALSE, warning=FALSE} | |
| 226 log_debug("Starting execution of main function") | |
| 227 result <- compile_results() | |
| 228 | |
| 229 if (is.null(result)) { | |
| 230 log_debug("Main function execution failed") | |
| 231 stop("Main function execution failed") | |
| 232 } | |
| 233 | |
| 234 summary_df <- result$summary | |
| 235 genome_data <- result$genome_data | |
| 236 host_genomes_paths <- result$host_genomes_paths | |
| 237 data_gtdbtk_host <- result$data_gtdbtk_host | |
| 238 data_checkm_host <- result$data_checkm_host | |
| 239 log_debug("Data extracted successfully") | |
| 240 | |
| 241 # Remove any extensions from names in data gtdbtk and checm2 | |
| 242 data_gtdbtk_host <- data_gtdbtk_host %>% | |
| 243 mutate(user_genome = str_remove(user_genome, "\\.[^.]+$")) | |
| 244 | |
| 245 data_checkm_host <- data_checkm_host %>% | |
| 246 mutate(name = str_remove(name, "\\.[^.]+$")) | |
| 247 | |
| 248 result$summary <- result$summary %>% | |
| 249 mutate(Sample = str_remove(Sample, "_virus_summary.tsv")) | |
| 250 ``` | |
| 251 | |
| 252 # Summary {.tabset .tabset-fade} | |
| 253 | |
| 254 ## Overview Table | |
| 255 | |
| 256 This table provides sample-by-sample information on detected viruses and key host genome statistics. It includes taxonomy, virus count, genome quality classification, CheckM2 metrics (completeness and contamination), and genome assembly statistics such as size and N50. | |
| 257 | |
| 258 ```{r render_table, message=FALSE, warning=FALSE, echo=FALSE, results='asis'} | |
| 259 data <- result$summary | |
| 260 | |
| 261 log_debug("Assigning checkm2 host data") | |
| 262 checkm_host_data <- data_checkm_host %>% clean_names() %>% | |
| 263 select(name, completeness, contamination, | |
| 264 contig_n50, genome_size) | |
| 265 | |
| 266 log_debug("Assigning GTDB-Tk host data") | |
| 267 gtdbtk_data <- data_gtdbtk_host %>% | |
| 268 select(user_genome, classification) | |
| 269 | |
| 270 log_debug("Defining color-blind friendly palette") | |
| 271 cb_friendly_colors <- list( | |
| 272 green = "#009E73", | |
| 273 blue = "#0072B2", | |
| 274 orange = "#E69F00", | |
| 275 red = "#D55E00", | |
| 276 grey = "#999999" | |
| 277 ) | |
| 278 | |
| 279 log_debug("Defining function to color cells") | |
| 280 color_cell <- function(values, color_true = cb_friendly_colors$green, | |
| 281 color_false = cb_friendly_colors$red) { | |
| 282 ifelse(values, | |
| 283 cell_spec("Yes", color = "white", bold = TRUE, background = color_true), | |
| 284 cell_spec("No", color = "white", bold = TRUE, background = color_false)) | |
| 285 } | |
| 286 | |
| 287 log_debug("Defining function to create bar plot") | |
| 288 create_bar_plot <- function(values, max_value, color = cb_friendly_colors$grey) { | |
| 289 sapply(values, function(value) { | |
| 290 if(is.na(value) || !is.numeric(value)) { | |
| 291 return("N/A") | |
| 292 } | |
| 293 bar_width <- min(max(value, 0), max_value) / max_value * 100 | |
| 294 sprintf('<div style="background-color: %s; width: %f%%; height: 10px;"></div>%.1f%%', | |
| 295 color, bar_width, value) | |
| 296 }) | |
| 297 } | |
| 298 | |
| 299 log_debug("Defining function to format large numbers") | |
| 300 format_large_number <- function(x) { | |
| 301 sapply(x, function(value) { | |
| 302 if (is.na(value) || !is.numeric(value)) { | |
| 303 return("N/A") | |
| 304 } else if (value < 1000) { | |
| 305 return(as.character(value)) | |
| 306 } else if (value < 1e6) { | |
| 307 return(paste0(round(value / 1e3, 1), "K")) | |
| 308 } else if (value < 1e9) { | |
| 309 return(paste0(round(value / 1e6, 1), "M")) | |
| 310 } else { | |
| 311 return(paste0(round(value / 1e9, 1), "G")) | |
| 312 } | |
| 313 }) | |
| 314 } | |
| 315 | |
| 316 log_debug("Defining function to extract last known taxonomy level") | |
| 317 extract_last_known_taxonomy <- function(classification) { | |
| 318 if (is.na(classification) || classification == "") { | |
| 319 return(list(level = "Unknown", name = "Unknown")) | |
| 320 } | |
| 321 | |
| 322 parts <- strsplit(classification, ";")[[1]] | |
| 323 for (i in length(parts):1) { | |
| 324 level <- sub("^[a-z]__", "", parts[i]) | |
| 325 if (level != "") { | |
| 326 prefix <- sub("__.*$", "", parts[i]) | |
| 327 return(list(level = prefix, name = level)) | |
| 328 } | |
| 329 } | |
| 330 return(list(level = "Unknown", name = "Unknown")) | |
| 331 } | |
| 332 | |
| 333 log_debug("Defining function to format taxonomy") | |
| 334 format_taxonomy <- function(classification) { | |
| 335 result <- extract_last_known_taxonomy(classification) | |
| 336 if (result$level == "Unknown") { | |
| 337 return("Unknown") | |
| 338 } else if (result$level == "s") { | |
| 339 return(paste0("<i>", result$name, "</i>")) | |
| 340 } else { | |
| 341 genus <- str_replace_all(result$name, "_", " ") | |
| 342 return(paste0("<i>", genus, "</i> sp.")) | |
| 343 } | |
| 344 } | |
| 345 | |
| 346 log_debug("Defining function to calculate quality score and determine genome quality class") | |
| 347 calculate_quality_score_and_class <- function(completeness, contamination) { | |
| 348 if (is.na(completeness) || is.na(contamination)) { | |
| 349 return(list( | |
| 350 score = cell_spec("N/A", color = "white", bold = TRUE, background = cb_friendly_colors$grey), | |
| 351 class = cell_spec("Unknown", color = "white", bold = TRUE, background = cb_friendly_colors$grey), | |
| 352 numeric_score = NA | |
| 353 )) | |
| 354 } | |
| 355 | |
| 356 quality_score <- completeness - (5 * contamination) | |
| 357 formatted_score <- sprintf("%.1f", quality_score) | |
| 358 | |
| 359 if (completeness > 90 && contamination < 5) { | |
| 360 class <- "High-quality draft" | |
| 361 color <- cb_friendly_colors$green | |
| 362 } else if (completeness >= 50 && contamination < 10) { | |
| 363 class <- "Medium-quality draft" | |
| 364 color <- cb_friendly_colors$blue | |
| 365 } else { | |
| 366 class <- "Low-quality draft" | |
| 367 color <- cb_friendly_colors$red | |
| 368 } | |
| 369 | |
| 370 list( | |
| 371 score = cell_spec(formatted_score, color = "white", bold = TRUE, background = color), | |
| 372 class = cell_spec(class, color = "white", bold = TRUE, background = color), | |
| 373 numeric_score = quality_score | |
| 374 ) | |
| 375 } | |
| 376 | |
| 377 log_debug("Preparing the data") | |
| 378 | |
| 379 table_data <- data %>% | |
| 380 #mutate(Sample = basename(Sample) %>% trim_sample_name()) %>% | |
| 381 mutate(Sample = basename(Sample)) %>% | |
| 382 left_join(checkm_host_data, by = c("Sample" = "name")) %>% | |
| 383 left_join(gtdbtk_data, by = c("Sample" = "user_genome")) %>% | |
| 384 mutate( | |
| 385 quality_data = pmap(list(as.numeric(completeness), | |
| 386 as.numeric(contamination)), | |
| 387 calculate_quality_score_and_class), | |
| 388 Quality_Score = map_chr(quality_data, ~.$score), | |
| 389 Genome_Quality = map_chr(quality_data, ~.$class), | |
| 390 Quality_Score_Numeric = map_dbl(quality_data, ~.$numeric_score), | |
| 391 Virus_Count_Numeric = as.numeric(Virus_Count), | |
| 392 Virus_Count = cell_spec( | |
| 393 Virus_Count, | |
| 394 color = "white", | |
| 395 bold = TRUE, | |
| 396 background = case_when( | |
| 397 Virus_Count == 0 ~ cb_friendly_colors$red, | |
| 398 Virus_Count == 1 ~ cb_friendly_colors$blue, | |
| 399 Virus_Count > 1 ~ cb_friendly_colors$green | |
| 400 ) | |
| 401 ), | |
| 402 Completeness_Numeric = as.numeric(completeness), | |
| 403 Completeness = create_bar_plot(as.numeric(completeness), 100), | |
| 404 Contamination = create_bar_plot(as.numeric(contamination), 100), | |
| 405 `N50 (contigs)` = format_large_number(as.numeric(contig_n50)), | |
| 406 `Genome size (bp)` = format_large_number(as.numeric(genome_size)), | |
| 407 `GTDB Taxonomy` = sapply(classification, format_taxonomy) | |
| 408 ) %>% | |
| 409 mutate(`#` = row_number()) %>% | |
| 410 select(`#`, Sample, `GTDB Taxonomy`, Virus_Count, | |
| 411 Quality_Score, Genome_Quality, Completeness, Contamination, | |
| 412 `Genome size (bp)`, `N50 (contigs)`) | |
| 413 | |
| 414 log_debug("Creating the table") | |
| 415 kbl(table_data, escape = FALSE, | |
| 416 align = c("c", "l", "l", "c", rep("c", 2), rep("r", 2), rep("r", 2))) %>% | |
| 417 kable_paper(full_width = TRUE) %>% | |
| 418 column_spec(1, bold = TRUE, width = "2em") %>% | |
| 419 column_spec(2:3, bold = TRUE) %>% | |
| 420 column_spec(4:5, width = "5em") %>% | |
| 421 column_spec(6:7, width = "60px") %>% | |
| 422 column_spec(8:9, width = "4em") %>% | |
| 423 add_header_above(c(" " = 4, "Host Genome Quality" = 2, "CheckM Metrics" = 2, | |
| 424 "Statistics" = 2)) %>% | |
| 425 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), | |
| 426 font_size = 9, | |
| 427 html_font = "Arial", | |
| 428 position = "left") %>% | |
| 429 row_spec(0, bold = TRUE, color = "white", background = "#333333") %>% | |
| 430 row_spec(0, extra_css = "border-bottom: 2px solid #000000;") %>% | |
| 431 column_spec(9, extra_css = "border-right: 2px solid #000000;") %>% | |
| 432 scroll_box(width = "100%", height = "100%", | |
| 433 extra_css = "overflow-x: auto; border: 1px solid #ccc; border-radius: 4px;") | |
| 434 ``` | |
| 435 | |
| 436 ## Tools Documentation | |
| 437 | |
| 438 The following tools are utilized in this workflow. Each tool name below is a link to its respective documentation. | |
| 439 | |
| 440 **Host-analyses** | |
| 441 | |
| 442 - [**CheckM2 v1.1.0**](https://github.com/chklovski/CheckM2): Assesses the quality of the host. Most useful when working with assembled genomes. | |
| 443 | |
| 444 - [**GTDB-Tk v2.3.2**](https://ecogenomics.github.io/GTDBTk/index.html): Assigns a taxonomy to the host genome. | |
| 445 | |
| 446 - [**Defense-Finder v2.0.0, models 2.0.2**](https://ecogenomics.github.io/GTDBTk/index.html): Detects known anti-phage systems in the host. | |
| 447 | |
| 448 - [**geNomad v1.7.1**](https://portal.nersc.gov/genomad/): Predicts and annotates proviruses. | |
| 449 | |
| 450 **Virus-analyses** | |
| 451 | |
| 452 - [**CheckV v1.0.1**](https://pypi.org/project/checkv/): Evaluates the quality of viral genomes. | |
| 453 | |
| 454 - [**dRep v3.4.5**](https://drep.readthedocs.io/en/latest/): Compares viral genomes within the same host. | |
| 455 | |
| 456 - [**Abricate v1.0.1**](https://github.com/tseemann/abricate): Identifies virulence genes in the prophage genomes with the [VFDB database](https://www.mgc.ac.cn/VFs/). | |
| 457 | |
| 458 - [**iPHOP v1.3.3**](https://bitbucket.org/srouxjgi/iphop/src/main/): Predicts other potential hosts of viral genomes. | |
| 459 | |
| 460 - [**VIBRANT v1.2.1**](https://github.com/AnantharamanLab/VIBRANT): Used to identify Auxiliary Metabolic Genes in the prophages. | |
| 461 | |
| 462 ## Workflow | |
| 463 | |
| 464 The workflow begins with the input of bacterial genomes by the user. These are processed by the **host-analyses** tools. Prophage prediction is | |
| 465 performed by **geNomad** only. Afterward, prophages identified by **geNomad** are processed by the **virus-analyses** tools. | |
| 466 | |
| 467 If more than one prophage is recovered in the same sample, **dRep** is used to compare and determine if the viruses are identical or different within the same host. | |
| 468 | |
| 469 *PLACEHOLDER FOR PIPELINE* | |
| 470 | |
| 471 ## R Session Info | |
| 472 | |
| 473 Information about the R session used to render this markdown document. | |
| 474 | |
| 475 ```{r} | |
| 476 sessionInfo() | |
| 477 ``` | |
| 478 | |
| 479 | |
| 480 # Results {.tabset .tabset-fade} | |
| 481 | |
| 482 ```{r} | |
| 483 # Creating combined_unique object | |
| 484 | |
| 485 combined_unique <- bind_rows( | |
| 486 checkm_host_data %>% | |
| 487 # select(bin_id) %>% | |
| 488 # dplyr::rename(Sample = bin_id), | |
| 489 select(name) %>% | |
| 490 dplyr::rename(Sample = name), | |
| 491 | |
| 492 data %>% | |
| 493 #mutate(Sample = str_remove(Sample, "_virus_summary.tsv")) %>% | |
| 494 select(Sample) | |
| 495 ) %>% | |
| 496 distinct(Sample) %>% | |
| 497 arrange(Sample) | |
| 498 | |
| 499 log_debug(paste("combined_unique samples:", paste(combined_unique$Sample, collapse = ", "))) | |
| 500 ``` | |
| 501 | |
| 502 | |
| 503 | |
| 504 ```{r main_workflow, fig.width=6, fig.height=6, out.height="100%", out.width='100%', dpi=300, fig.align='center', warning=FALSE, message=FALSE, results='asis'} | |
| 505 # Process proviruses data | |
| 506 process_proviruses <- function(data_genomad) { | |
| 507 proviruses <- data_genomad %>% | |
| 508 dplyr::filter(topology == "Provirus") %>% | |
| 509 dplyr::mutate(contig = sub("\\|provirus_.*", "", seq_name)) %>% # take everything before "|provirus" | |
| 510 dplyr::mutate(contig = paste0("c", as.numeric(factor(contig)))) %>% # map them to c_1, c_2, ... | |
| 511 dplyr::select(seq_name, coordinates, length, contig, virus_score, n_hallmarks) | |
| 512 | |
| 513 proviruses <- proviruses %>% | |
| 514 tidyr::separate(coordinates, into = c("start", "end"), sep = "-") | |
| 515 | |
| 516 proviruses$start <- as.integer(proviruses$start) | |
| 517 proviruses$end <- as.integer(proviruses$end) | |
| 518 | |
| 519 proviruses_gr_features <- GRanges(seqnames = proviruses$contig, | |
| 520 ranges = IRanges(start = proviruses$start, | |
| 521 end = proviruses$end)) | |
| 522 proviruses_gr_features$length <- proviruses_gr_features %>% ranges %>% width | |
| 523 proviruses_gr_features$score <- as.numeric(proviruses$virus_score) | |
| 524 proviruses_gr_features$n_hallmarks <- as.numeric(proviruses$n_hallmarks) | |
| 525 | |
| 526 proviruses_gr_features$n_hallmarks_pos <- | |
| 527 abs(start(proviruses_gr_features) - end(proviruses_gr_features)) / 2 | |
| 528 | |
| 529 return(proviruses_gr_features) | |
| 530 } | |
| 531 | |
| 532 plot_genome_ideogram <- function(genome_current, proviruses_gr_features) { | |
| 533 fasta_file_path <- file.path(params$outdir, "genomes", paste0(genome_current, ".fna")) | |
| 534 #cat(fasta_file_path, "\n\n") | |
| 535 genome_ideogram <- getIdeogramData(fasta_file = fasta_file_path) | |
| 536 | |
| 537 # Replace any seqlevel to c_1, c_2, c_3, ... | |
| 538 new_seqlevels <- paste0("c", seq_along(seqlevels(genome_ideogram))) | |
| 539 names(new_seqlevels) <- seqlevels(genome_ideogram) | |
| 540 genome_ideogram <- GenomeInfoDb::renameSeqlevels(genome_ideogram, new_seqlevels) | |
| 541 colours <- rep("#a58bc5", length(seqlevels(genome_ideogram))) | |
| 542 | |
| 543 par(mar = c(2, 2, 2, 2)) # minimal margins around the plot | |
| 544 | |
| 545 gmovizInitialise(genome_ideogram, | |
| 546 sector_colours = colours, | |
| 547 sector_border_colours = colours, | |
| 548 sector_labels = FALSE | |
| 549 ) | |
| 550 | |
| 551 for (i in 1:length(proviruses_gr_features)) { | |
| 552 name <- as.character(seqnames(proviruses_gr_features[i])) | |
| 553 start <- as.numeric(start(proviruses_gr_features[i])) | |
| 554 end <- as.numeric(end(proviruses_gr_features[i])) | |
| 555 region <- data.frame(start = start, end = end) | |
| 556 circos.genomicRect(seqnames = name, | |
| 557 region, | |
| 558 ytop = .5, | |
| 559 ybottom = 0, | |
| 560 track.index = 1, | |
| 561 sector.index = name, | |
| 562 border = "#e9d27d", | |
| 563 col = "#e9d27d") | |
| 564 } | |
| 565 | |
| 566 length <- as.numeric(proviruses_gr_features$length) | |
| 567 length <- ifelse(length > 1000000, | |
| 568 paste0(round(length/1000000, 2), "mb"), | |
| 569 paste0(round(length/1000, 2), "kb")) | |
| 570 labels <- paste0(as.character(seqnames(proviruses_gr_features)), " (", length, ")") | |
| 571 circos.labels(sectors = as.character(seqnames(proviruses_gr_features)), | |
| 572 x = as.numeric(start(proviruses_gr_features)), | |
| 573 labels, | |
| 574 facing = "clockwise") | |
| 575 } | |
| 576 | |
| 577 process_sample <- function(sample, combined_unique, host_genomes_paths, genome_data) { | |
| 578 genome_current <- sample # Add this line | |
| 579 tryCatch({ | |
| 580 log_debug(paste("Starting to process sample:", sample)) | |
| 581 | |
| 582 # Check if sample exists in genome_data | |
| 583 if (!(sample %in% names(genome_data))) { | |
| 584 log_debug(paste("Sample", sample, "not found in genome_data")) | |
| 585 cat(paste("Error: Sample", sample, "not found in genome_data\n\n")) | |
| 586 return() | |
| 587 } | |
| 588 | |
| 589 cat(paste("## ", sample, "{.tabset .tabset-fade} \n\n")) | |
| 590 | |
| 591 host_genome_path <- host_genomes_paths$path[host_genomes_paths$name == sample] | |
| 592 if (length(host_genome_path) == 0) { | |
| 593 log_debug(paste("Host genome path not found for sample:", sample)) | |
| 594 cat(paste("Error: Host genome path not found for sample", sample, "\n\n")) | |
| 595 return() | |
| 596 } | |
| 597 | |
| 598 host_genome_ideogram <- tryCatch({ | |
| 599 getIdeogramData(fasta_file = host_genome_path) | |
| 600 }, error = function(e) { | |
| 601 log_debug(paste("Error loading host genome ideogram for sample", sample, ":", conditionMessage(e))) | |
| 602 NULL | |
| 603 }) | |
| 604 | |
| 605 if (is.null(host_genome_ideogram)) { | |
| 606 cat(paste("Error: Unable to load host genome ideogram for sample", sample, "\n\n")) | |
| 607 return() | |
| 608 } | |
| 609 | |
| 610 sample_data <- genome_data[[sample]]$loaded_data | |
| 611 genomad_summary <- sample_data$genomad | |
| 612 genomad_annotation <- sample_data$genomad_annotations | |
| 613 checkv_data <- sample_data$checkv | |
| 614 defense_finder_data <- sample_data$defense_finder | |
| 615 abricate_data <- sample_data$abricate | |
| 616 iphop_data <- sample_data$iphop | |
| 617 vibrant_data <- sample_data$vibrant | |
| 618 | |
| 619 cat("### Host Genome\n\n") | |
| 620 | |
| 621 cat("**GTDB-Tk taxonomy**: \n\n") | |
| 622 data_gtdbtk_host %>% filter(user_genome == sample) %>% | |
| 623 select(classification) %>% | |
| 624 kbl() %>% | |
| 625 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
| 626 kable_paper("striped", full_width = TRUE) %>% | |
| 627 scroll_box(width = "100%", height = "100%") %>% | |
| 628 cat() | |
| 629 | |
| 630 # Cat checkm summary for this genome | |
| 631 cat("**CheckM2 Summary**:\n\n") | |
| 632 #checkm_summary <- data_checkm_host %>% filter(`bin_id` == sample) | |
| 633 checkm_summary <- data_checkm_host %>% filter(`name` == sample) | |
| 634 checkm_summary %>% clean_names %>% | |
| 635 #select(number_contigs, n50_contigs, completeness, contamination, strain_heterogeneity) %>% | |
| 636 select(total_contigs, contig_n50, completeness, contamination) %>% | |
| 637 kbl() %>% | |
| 638 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
| 639 kable_paper("striped", full_width = TRUE) %>% | |
| 640 scroll_box(width = "100%", height = "100%") %>% | |
| 641 cat() | |
| 642 | |
| 643 # Display defense-finder as a table | |
| 644 if (!is.null(defense_finder_data) && nrow(defense_finder_data) > 0) { | |
| 645 cat("**Defense-Finder Systems**:\n\n") | |
| 646 | |
| 647 defense_finder_data %>% | |
| 648 select(sys_id, type, subtype, sys_beg, sys_end, protein_in_syst, genes_count, name_of_profiles_in_sys) %>% | |
| 649 kbl() %>% | |
| 650 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
| 651 kable_paper("striped", full_width = TRUE) %>% | |
| 652 scroll_box(width = "100%", height = "100%") %>% | |
| 653 cat() | |
| 654 } else { | |
| 655 cat("No Defense-Finder systems detected.\n\n") | |
| 656 } | |
| 657 | |
| 658 if (is.null(genomad_summary) || nrow(genomad_summary) == 0) { | |
| 659 log_debug(paste("No geNomad summary data found for sample:", sample)) | |
| 660 return() | |
| 661 } | |
| 662 | |
| 663 if (length(seqlevels(host_genome_ideogram)) == 1) { | |
| 664 host_genome_size <- sum(width(host_genome_ideogram)) | |
| 665 } else { | |
| 666 virus_containing_contigs <- unique(sub("\\|.*", "", genomad_summary$seq_name)) | |
| 667 virus_containing_contigs <- paste0("c_", as.numeric(factor(virus_containing_contigs))) | |
| 668 filtered_host_genome <- subset_and_update_ideogram(host_genome_ideogram, virus_containing_contigs) | |
| 669 host_genome_size <- sum(width(filtered_host_genome)) | |
| 670 } | |
| 671 | |
| 672 # Process proviruses | |
| 673 proviruses_gr_features <- process_proviruses(genomad_summary) | |
| 674 | |
| 675 cat("**Genomad and CheckV Summary**:\n\n") | |
| 676 genomad_summary %>% | |
| 677 select(seq_name, taxonomy, topology, coordinates, length) %>% | |
| 678 left_join( | |
| 679 checkv_data %>% select(contig_id, gene_count, viral_genes, checkv_quality, miuvig_quality), | |
| 680 by = c("seq_name" = "contig_id")) %>% | |
| 681 select(seq_name, length, gene_count, viral_genes, checkv_quality, miuvig_quality, taxonomy, topology, coordinates) %>% | |
| 682 kbl() %>% | |
| 683 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
| 684 kable_paper("striped", full_width = TRUE) %>% | |
| 685 scroll_box(width = "100%", height = "100%") %>% | |
| 686 cat() | |
| 687 | |
| 688 cat("**Host Genome Ideogram with Phages**:\n\n") | |
| 689 plot_genome_ideogram(sample, proviruses_gr_features) | |
| 690 cat('In this circular plot, **"c"** indicates the contig, and the number that follows (e.g., **c1**) represents the contig number. | |
| 691 If multiple contigs are present in the genome, each will be shown with a distinct label (e.g., **c1**, **c2**, etc.).\n\n') | |
| 692 cat("\n\n") | |
| 693 | |
| 694 # Process phage genomes | |
| 695 cat("### Prophages {.tabset .tabset-fade} \n\n") | |
| 696 cat("**Select prophage to show: ** \n\n") | |
| 697 for (i in seq_len(nrow(genomad_summary))) { | |
| 698 log_debug(paste("Processing phage", i, "of", nrow(genomad_summary), "for sample", sample)) | |
| 699 process_phage(genomad_summary[i, ], genomad_summary, genomad_annotation, checkv_data, host_genome_size) | |
| 700 } | |
| 701 | |
| 702 # Plot dREP if applicable | |
| 703 if (nrow(genomad_summary) > 1) { | |
| 704 cat("### vOTUs\n\n") | |
| 705 plot_drep(sample, genomad_summary) | |
| 706 } | |
| 707 | |
| 708 # Creating table with Abricate data | |
| 709 if (nrow(abricate_data) > 0) { | |
| 710 cat("### Virulence Genes {.tabset .tabset-fade} \n\n") | |
| 711 cat("Screening of virulence genes present in the prophage contigs. \n\n") | |
| 712 abricate_data %>% select(-number_file) %>% | |
| 713 kbl() %>% | |
| 714 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
| 715 kable_paper("striped", full_width = TRUE) %>% | |
| 716 scroll_box(width = "100%", height = "100%") %>% cat() | |
| 717 cat("\n\n") | |
| 718 } | |
| 719 | |
| 720 # Creating table with iPHOP | |
| 721 if (nrow(iphop_data) > 0) { | |
| 722 cat("### Prophage-Host Prediction {.tabset .tabset-fade} \n\n") | |
| 723 cat("Prediction of potential hosts for the prophage contigs. \n\n") | |
| 724 iphop_data %>% | |
| 725 kbl() %>% | |
| 726 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
| 727 kable_paper("striped", full_width = TRUE) %>% | |
| 728 scroll_box(width = "100%", height = "100%") %>% cat() | |
| 729 cat("\n\n") | |
| 730 } | |
| 731 | |
| 732 # Creating table with VIBRANT AMGs | |
| 733 if (nrow(vibrant_data) > 0) { | |
| 734 cat("### AMG Predictions {.tabset .tabset-fade} \n\n") | |
| 735 cat("Prediction of auxiliary metabolic genes in the prophage contigs. \n\n") | |
| 736 vibrant_data %>% | |
| 737 kbl() %>% | |
| 738 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
| 739 kable_paper("striped", full_width = TRUE) %>% | |
| 740 scroll_box(width = "100%", height = "100%") %>% cat() | |
| 741 cat("\n\n") | |
| 742 } | |
| 743 | |
| 744 log_debug(paste("Finished processing sample:", sample)) | |
| 745 }, error = function(e) { | |
| 746 log_debug(paste("Error in process_sample for", sample, ":", conditionMessage(e))) | |
| 747 cat(paste("Error processing sample", sample, ":", conditionMessage(e), "\n\n")) | |
| 748 }) | |
| 749 } | |
| 750 | |
| 751 process_phage <- function(virus, genomad_summary, genomad_annotation, checkv_data, host_genome_size) { | |
| 752 cat(paste("#### Phage ID:", virus$seq_name, " {.tabset .tabset-fade} \n\n")) | |
| 753 | |
| 754 current_contig <- sub("\\|.*", "", virus$seq_name) | |
| 755 | |
| 756 provirus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", virus$seq_name)) | |
| 757 provirus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", virus$seq_name)) | |
| 758 virus_length <- provirus_end - provirus_start + 1 | |
| 759 | |
| 760 current_contig_base <- sub("\\|provirus_.*", "", virus$seq_name) | |
| 761 current_provirus_range <- sub(".*\\|provirus_", "", virus$seq_name) | |
| 762 current_annotations <- genomad_annotation[grepl(paste0(current_contig_base, "\\|provirus_", current_provirus_range, "_"), | |
| 763 genomad_annotation$gene, fixed = FALSE), ] %>% | |
| 764 mutate(arrow_pos = ifelse(strand == -1, "start", "end")) | |
| 765 | |
| 766 | |
| 767 cat("\n\n**Phage–Host Genome Ideogram:**\n\n") | |
| 768 | |
| 769 plot_phage_circos(virus, genomad_summary, current_annotations, virus_length, host_genome_size, provirus_start, provirus_end, checkv_data) | |
| 770 | |
| 771 cat("\n\n") | |
| 772 cat("\n\n**Genes Annotation (geNomad):**\n\n") | |
| 773 | |
| 774 current_annotations %>% | |
| 775 select(gene, length, marker, annotation_accessions, annotation_description) %>% | |
| 776 kbl() %>% | |
| 777 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
| 778 kable_paper() %>% | |
| 779 cat() | |
| 780 cat("\n\n") | |
| 781 } | |
| 782 | |
| 783 plot_phage_circos <- function(virus, genomad_summary, current_annotations, virus_length, host_genome_size, provirus_start, provirus_end, checkv_data) { | |
| 784 tryCatch({ | |
| 785 log_debug("Starting plot_phage_circos function") | |
| 786 log_debug(paste("Current virus:", virus$seq_name)) | |
| 787 log_debug(paste("Virus length:", virus_length)) | |
| 788 log_debug(paste("Host genome size:", host_genome_size)) | |
| 789 log_debug(paste("Provirus start:", provirus_start)) | |
| 790 log_debug(paste("Provirus end:", provirus_end)) | |
| 791 | |
| 792 # Check for NA or invalid values in input parameters | |
| 793 if (is.na(virus_length) || virus_length <= 0) { | |
| 794 log_debug("Error: Invalid virus length") | |
| 795 return(NULL) | |
| 796 } | |
| 797 if (is.na(host_genome_size) || host_genome_size <= 0) { | |
| 798 log_debug("Error: Invalid host genome size") | |
| 799 return(NULL) | |
| 800 } | |
| 801 if (is.na(provirus_start) || provirus_start < 0) { | |
| 802 log_debug("Error: Invalid provirus start position") | |
| 803 return(NULL) | |
| 804 } | |
| 805 if (is.na(provirus_end) || provirus_end <= provirus_start) { | |
| 806 log_debug("Error: Invalid provirus end position") | |
| 807 return(NULL) | |
| 808 } | |
| 809 | |
| 810 # Extract contig information | |
| 811 current_contig <- sub("\\|.*", "", virus$seq_name) | |
| 812 log_debug(paste("Current contig:", current_contig)) | |
| 813 | |
| 814 contig_viruses <- genomad_summary[grepl(paste0("^", current_contig), genomad_summary$seq_name), ] | |
| 815 if (nrow(contig_viruses) == 0) { | |
| 816 log_debug("Error: No viruses found for the current contig") | |
| 817 return(NULL) | |
| 818 } | |
| 819 | |
| 820 contig_length <- max(as.numeric(sub(".*_(\\d+)$", "\\1", contig_viruses$seq_name))) | |
| 821 if (is.na(contig_length) || contig_length <= 0) { | |
| 822 contig_length <- virus_length # Use virus length as fallback if contig length is invalid | |
| 823 log_debug(paste("Using virus length as contig length:", contig_length)) | |
| 824 } else { | |
| 825 log_debug(paste("Contig length:", contig_length)) | |
| 826 } | |
| 827 | |
| 828 if (provirus_end > contig_length) { | |
| 829 log_debug("Error: Provirus end position exceeds contig length") | |
| 830 return(NULL) | |
| 831 } | |
| 832 | |
| 833 log_debug("Clearing circos") | |
| 834 circos.clear() | |
| 835 | |
| 836 log_debug("Setting circos parameters") | |
| 837 circos.par(start.degree = 180, gap.degree = 10, track.margin = c(0.01, 0.01)) | |
| 838 | |
| 839 main_color <- "#a58bc5" | |
| 840 zoom_color <- "#e9d27d" | |
| 841 | |
| 842 zoom_start <- (provirus_start / contig_length) * 100 | |
| 843 zoom_end <- (provirus_end / contig_length) * 100 | |
| 844 | |
| 845 log_debug(paste("Zoom start:", zoom_start)) | |
| 846 log_debug(paste("Zoom end:", zoom_end)) | |
| 847 | |
| 848 log_debug("Initializing circos") | |
| 849 circos.initialize(factors = c("Zoom", "Main"), xlim = c(0, 100)) | |
| 850 | |
| 851 format_genome_labels <- function(x) { | |
| 852 ifelse(x >= 1e6, paste0(round(x / 1e6, 2), " Mb"), | |
| 853 ifelse(x >= 1e3, paste0(round(x / 1e3, 2), " Kb"), | |
| 854 paste0(x, " bp"))) | |
| 855 } | |
| 856 | |
| 857 log_debug("Adding link") | |
| 858 tryCatch({ | |
| 859 circos.link("Main", c(zoom_start, zoom_end), "Zoom", c(0, 100), | |
| 860 rou1 = 0.8, | |
| 861 rou2 = 0.97, | |
| 862 h.ratio = 0.55, # width? | |
| 863 lty = 2, | |
| 864 lwd = 0.5, | |
| 865 h2 = 1, | |
| 866 col = "grey99", border = "grey80") | |
| 867 }, error = function(e) { | |
| 868 log_debug(paste("Error in circos.link:", e$message)) | |
| 869 }) | |
| 870 | |
| 871 log_debug("Adding zoom track") | |
| 872 circos.track(factors = "Zoom", ylim = c(0, 1), track.height = 0.15, | |
| 873 panel.fun = function(x, y) { | |
| 874 circos.rect(0, 0, 100, 1, col = zoom_color, border = NA) | |
| 875 axis_labels <- seq(0, virus_length, length.out = 6) | |
| 876 axis_positions <- seq(0, 100, length.out = 6) | |
| 877 circos.axis(h = "top", major.at = axis_positions, | |
| 878 labels = format_genome_labels(axis_labels), | |
| 879 labels.cex = 0.7, direction = "outside") | |
| 880 | |
| 881 for (i in 1:nrow(current_annotations)) { | |
| 882 gene_start <- current_annotations$start[i] | |
| 883 gene_end <- current_annotations$end[i] | |
| 884 arrow_start <- (gene_start - provirus_start) / virus_length * 100 | |
| 885 arrow_end <- (gene_end - provirus_start) / virus_length * 100 | |
| 886 | |
| 887 circos.arrow(arrow_start, arrow_end, y1 = 0, y2 = 1, | |
| 888 arrow.head.width = 0.75, arrow.head.length = cm_x(0.1), | |
| 889 arrow.position = current_annotations$arrow_pos[i], | |
| 890 col = ifelse(is.na(current_annotations$annotation_description[i]), "grey", "#7fbfff"), | |
| 891 border = ifelse(is.na(current_annotations$annotation_description[i]), "grey20", "darkblue")) | |
| 892 } | |
| 893 }, bg.border = NA) | |
| 894 | |
| 895 log_debug("Adding main track") | |
| 896 circos.track(factors = "Main", ylim = c(0, 1), track.height = 0.1, | |
| 897 panel.fun = function(x, y) { | |
| 898 circos.rect(xleft = 0, ybottom = 0, xright = 100, ytop = 1, col = main_color, border = NA) | |
| 899 | |
| 900 for (i in 1:nrow(contig_viruses)) { | |
| 901 virus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", contig_viruses$seq_name[i])) | |
| 902 virus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", contig_viruses$seq_name[i])) | |
| 903 | |
| 904 virus_start_percent <- (virus_start / contig_length) * 100 | |
| 905 virus_end_percent <- (virus_end / contig_length) * 100 | |
| 906 | |
| 907 rect_color <- if (contig_viruses$seq_name[i] == virus$seq_name) zoom_color else adjustcolor(zoom_color, alpha.f = 0.7) | |
| 908 | |
| 909 circos.rect(xleft = virus_start_percent, ybottom = 0, | |
| 910 xright = virus_end_percent, ytop = 1, | |
| 911 col = rect_color, border = NA) | |
| 912 } | |
| 913 | |
| 914 axis_labels <- seq(0, contig_length, length.out = 6) | |
| 915 axis_positions <- seq(0, 100, length.out = 6) | |
| 916 circos.axis(h = "top", major.at = axis_positions, | |
| 917 labels = format_genome_labels(axis_labels), | |
| 918 labels.cex = 0.7, direction = "outside") | |
| 919 }, bg.border = NA) | |
| 920 | |
| 921 log_debug("Locating phage positions") | |
| 922 phage_positions <- sapply(1:nrow(contig_viruses), function(i) { | |
| 923 virus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", contig_viruses$seq_name[i])) | |
| 924 virus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", contig_viruses$seq_name[i])) | |
| 925 ((virus_start + virus_end) / 2 / contig_length) * 100 | |
| 926 }) | |
| 927 | |
| 928 log_debug("Annotating names on phage positions") | |
| 929 | |
| 930 # Extract start and end positions from sequence names | |
| 931 start_positions <- as.numeric(sub(".*provirus_([0-9]+)_.*", "\\1", contig_viruses$seq_name)) | |
| 932 end_positions <- as.numeric(sub(".*provirus_[0-9]+_([0-9]+)", "\\1", contig_viruses$seq_name)) | |
| 933 | |
| 934 # Create phage labels with the desired format | |
| 935 phage_labels <- paste0(round(contig_viruses$length / 1e3, 2), " Kb") | |
| 936 | |
| 937 # Apply labels to circos plot | |
| 938 circos.labels( | |
| 939 sectors = "Main", | |
| 940 x = phage_positions, | |
| 941 labels = phage_labels, | |
| 942 facing = "reverse.clockwise", | |
| 943 niceFacing = TRUE, | |
| 944 col = "black", | |
| 945 cex = 0.6, | |
| 946 side = "inside", | |
| 947 connection_height = 0.02, | |
| 948 line_col = "gray" | |
| 949 ) | |
| 950 | |
| 951 center_x <- 50 | |
| 952 virus_name <- virus$seq_name | |
| 953 taxonomy <- virus$taxonomy | |
| 954 | |
| 955 log_debug("Adding taxonomy and virus name to the plot") | |
| 956 circos.text(x = center_x, y = -0.2, labels = taxonomy, | |
| 957 sector.index = "Zoom", track.index = 1, | |
| 958 facing = "bending.inside", niceFacing = TRUE, | |
| 959 adj = c(0.5, 0.7), cex = 0.8) | |
| 960 | |
| 961 checkv_info <- checkv_data[checkv_data$contig_id == virus$seq_name, ] | |
| 962 if (nrow(checkv_info) > 0) { | |
| 963 checkv_quality <- checkv_info$checkv_quality | |
| 964 gene_count <- checkv_info$gene_count | |
| 965 viral_genes <- checkv_info$viral_genes | |
| 966 host_genes <- checkv_info$host_genes | |
| 967 miuvig_quality <- checkv_info$miuvig_quality | |
| 968 completeness <- checkv_info$completeness | |
| 969 completeness_method <- checkv_info$completeness_method | |
| 970 contamination <- checkv_info$contamination | |
| 971 | |
| 972 circos.text( | |
| 973 x = center_x, y = -0.5, | |
| 974 labels = paste("CheckV Quality:", checkv_quality, " - miuvig Quality:", miuvig_quality), | |
| 975 sector.index = "Zoom", track.index = 2, | |
| 976 facing = "bending.inside", niceFacing = TRUE, | |
| 977 adj = c(0.5, 0), cex = 0.7 | |
| 978 ) | |
| 979 | |
| 980 circos.text( | |
| 981 x = center_x, y = -1.5, | |
| 982 labels = paste("Gene Count:", gene_count, " - Viral Genes:", viral_genes, " - Host Genes:", host_genes), | |
| 983 sector.index = "Zoom", track.index = 2, | |
| 984 facing = "bending.inside", niceFacing = TRUE, | |
| 985 adj = c(0.5, 0), cex = 0.7 | |
| 986 ) | |
| 987 | |
| 988 circos.text( | |
| 989 x = center_x, y = -2.5, | |
| 990 labels = paste("Completeness:", completeness, " - Contamination:", contamination), | |
| 991 sector.index = "Zoom", track.index = 2, | |
| 992 facing = "bending.inside", niceFacing = TRUE, | |
| 993 adj = c(0.5, 0), cex = 0.7 | |
| 994 ) | |
| 995 } | |
| 996 | |
| 997 log_debug("Adding legend") | |
| 998 # Add legend | |
| 999 legend("topright", | |
| 1000 legend = c("Annotated gene", "Unknown gene"), | |
| 1001 fill = c("#7fbfff", "grey"), | |
| 1002 border = c("darkblue", "grey20"), | |
| 1003 cex = 0.8, | |
| 1004 bty = "n") | |
| 1005 | |
| 1006 log_debug("Clearing circos") | |
| 1007 circos.clear() | |
| 1008 log_debug("Finished plot_phage_circos function successfully") | |
| 1009 }, error = function(e) { | |
| 1010 log_debug(paste("Error in plot_phage_circos:", e$message)) | |
| 1011 circos.clear() | |
| 1012 }) | |
| 1013 } | |
| 1014 | |
| 1015 plot_drep <- function(sample, genomad_summary) { | |
| 1016 drep_file_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "data_tables", "Cdb.csv") | |
| 1017 drep_data <- read_csv(drep_file_path) %>% clean_names() | |
| 1018 drep_data <- cbind(genomad_summary$seq_name, drep_data) | |
| 1019 | |
| 1020 cat("When more than 1 phage is detected in the host genome, we perform a clustering step using the tool dRep.\n\n") | |
| 1021 cat("A threshold of 0.95 was applied to the ANI similarity index to define clusters of virus operational taxonomic units (vOTUs).") | |
| 1022 | |
| 1023 cat("\n\n**Final cluster designations**\n\n") | |
| 1024 drep_data %>% | |
| 1025 kbl() %>% | |
| 1026 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
| 1027 kable_paper("striped", full_width = TRUE) %>% | |
| 1028 cat() | |
| 1029 | |
| 1030 # Insert the PDF plot | |
| 1031 plot_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "figures", "Primary_clustering_dendrogram.pdf") | |
| 1032 png_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "figures", "Primary_clustering_dendrogram.png") | |
| 1033 | |
| 1034 if (file.exists(plot_path)) { | |
| 1035 pdftools::pdf_convert(plot_path, format = "png", filenames = png_path, verbose = FALSE, dpi=150) | |
| 1036 base64_str <- base64enc::dataURI(file = png_path, mime = "image/png") | |
| 1037 cat("**Primary clustering plot**\n\n") | |
| 1038 cat(sprintf( | |
| 1039 '<div style="text-align: center;"><img src="%s" style="max-width: 100%%; height: auto; border:1px solid #ddd; border-radius:4px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); margin: 1em 0;"></div>',base64_str | |
| 1040 )) | |
| 1041 } else { | |
| 1042 cat("**No dRep clustering plot found.**\n\n") | |
| 1043 } | |
| 1044 | |
| 1045 } | |
| 1046 | |
| 1047 subset_and_update_ideogram <- function(ideogram, contigs) { | |
| 1048 filtered <- ideogram[seqnames(ideogram) %in% contigs] | |
| 1049 seqlevels(filtered) <- contigs | |
| 1050 seqinfo(filtered) <- seqinfo(filtered)[contigs] | |
| 1051 filtered | |
| 1052 } | |
| 1053 | |
| 1054 render_all_samples <- function(test_mode = FALSE) { | |
| 1055 if (test_mode) { | |
| 1056 if (nrow(combined_unique) > 0) { | |
| 1057 cat("**Select sample to show:** \n\n\n") | |
| 1058 current_sample <- combined_unique$Sample[6] | |
| 1059 process_sample(current_sample, combined_unique, host_genomes_paths, genome_data) | |
| 1060 } else { | |
| 1061 print("No samples can be further analysed.") | |
| 1062 } | |
| 1063 } else { | |
| 1064 cat("**Select sample to show:** \n\n\n") | |
| 1065 for (i in seq_len(nrow(combined_unique))) { | |
| 1066 current_sample <- combined_unique$Sample[i] | |
| 1067 process_sample(current_sample, combined_unique, host_genomes_paths, genome_data) | |
| 1068 } | |
| 1069 } | |
| 1070 } | |
| 1071 | |
| 1072 # Execute the main function | |
| 1073 # Test mode processes one sample only | |
| 1074 render_all_samples(test_mode = F) | |
| 1075 ``` | |
| 1076 | |
| 1077 # Citation | |
| 1078 | |
| 1079 Cite this work: XXXXX | |
| 1080 | |
| 1081 |
