Mercurial > repos > iuc > snpfreqplot
comparison heatmap_for_variants.R @ 0:1062d6ad6503 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/snpfreqplot/ commit 1f35303af979c16d9a3126dbc882a59f686ace5d"
| author | iuc |
|---|---|
| date | Wed, 02 Dec 2020 21:23:06 +0000 |
| parents | |
| children | e362b3143cde |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1062d6ad6503 |
|---|---|
| 1 #!/usr/bin/env R | |
| 2 | |
| 3 suppressPackageStartupMessages(library(pheatmap)) | |
| 4 suppressPackageStartupMessages(library(RColorBrewer)) | |
| 5 suppressPackageStartupMessages(library(tidyverse)) | |
| 6 | |
| 7 fapply <- function(vect_ids, func) { | |
| 8 #' List apply but preserve the names | |
| 9 res <- lapply(vect_ids, func) | |
| 10 names(res) <- vect_ids | |
| 11 return(res) | |
| 12 } | |
| 13 | |
| 14 # M A I N | |
| 15 stopifnot(exists("samples")) | |
| 16 variant_files <- fapply(samples$ids, read_and_process) # nolint | |
| 17 | |
| 18 extractall_data <- function(id) { | |
| 19 variants <- variant_files[[id]] | |
| 20 tmp <- variants %>% | |
| 21 mutate(posalt = uni_select) %>% | |
| 22 select(posalt, AF) | |
| 23 colnames(tmp) <- c("Mutation", id) | |
| 24 return(tmp) | |
| 25 } | |
| 26 | |
| 27 extractall_annots <- function(id) { | |
| 28 variants <- variant_files[[id]] | |
| 29 tmp <- variants %>% | |
| 30 mutate(posalt = uni_select, | |
| 31 effect = EFF....EFFECT, gene = EFF....GENE) %>% | |
| 32 select(posalt, effect, gene) | |
| 33 return(tmp) | |
| 34 } | |
| 35 | |
| 36 # process allele frequencies | |
| 37 processed_files <- fapply(samples$ids, extractall_data) | |
| 38 final <- as_tibble( | |
| 39 processed_files %>% | |
| 40 reduce(full_join, by = "Mutation", copy = T)) | |
| 41 | |
| 42 final <- final[str_order(final$Mutation, numeric = T), ] %>% | |
| 43 column_to_rownames("Mutation") ## sort and set rownames | |
| 44 final[final < variant_frequency] <- NA ## adjust the variant frequency: | |
| 45 final <- final[rowSums(is.na(final)) != ncol(final), ] | |
| 46 final <- t(final) | |
| 47 final[is.na(final)] <- 0 | |
| 48 class(final) <- "numeric" | |
| 49 | |
| 50 # add annotations | |
| 51 ## readout annotations | |
| 52 processed_annots <- fapply(samples$ids, extractall_annots) | |
| 53 ann_final <- processed_annots %>% | |
| 54 reduce(function(x, y) { | |
| 55 unique(rbind(x, y))}) %>% | |
| 56 filter(posalt %in% colnames(final)) ## apply frequency filter | |
| 57 ann_final <- as_tibble(ann_final[str_order( | |
| 58 ann_final$posalt, numeric = T), ]) %>% | |
| 59 column_to_rownames("posalt") ## sort | |
| 60 | |
| 61 # rename annotations | |
| 62 trans <- function(x, mapping, replace_missing=NULL) { | |
| 63 # helper function for translating effects | |
| 64 mapped <- mapping[[x]] | |
| 65 if (is.null(mapped)) { | |
| 66 if (is.null(replace_missing)) x else replace_missing | |
| 67 } else { | |
| 68 mapped | |
| 69 } | |
| 70 } | |
| 71 | |
| 72 # handle translation of classic SnpEff effects to sequence ontology terms | |
| 73 # The following list defines the complete mapping between classic and So effect | |
| 74 # terms even if not all of these are likely to appear in viral variant data. | |
| 75 classic_snpeff_effects_to_so <- list( | |
| 76 "coding_sequence_variant", "coding_sequence_variant", "disruptive_inframe_deletion", "disruptive_inframe_insertion", "inframe_deletion", "inframe_insertion", "downstream_gene_variant", "exon_variant", "exon_loss_variant", "frameshift_variant", "gene_variant", "intergenic_variant", "intergenic_region", "conserved_intergenic_variant", "intragenic_variant", "intron_variant", "conserved_intron_variant", "missense_variant", "rare_amino_acid_variant", "splice_acceptor_variant", "splice_donor_variant", "splice_region_variant", "5_prime_UTR_premature_start_codon_variant", "start_lost", "stop_gained", "stop_lost", "synonymous_variant", "start_retained_variant", "stop_retained_variant", "transcript_variant", "upstream_gene_variant", "3_prime_UTR_truncation_+_exon_loss_variant", "3_prime_UTR_variant", "5_prime_UTR_truncation_+_exon_loss_variant", "5_prime_UTR_variant", "initiator_codon_variant", "None", "chromosomal_deletion" | |
| 77 ) | |
| 78 names(classic_snpeff_effects_to_so) <- c( | |
| 79 "CDS", "CODON_CHANGE", "CODON_CHANGE_PLUS_CODON_DELETION", "CODON_CHANGE_PLUS_CODON_INSERTION", "CODON_DELETION", "CODON_INSERTION", "DOWNSTREAM", "EXON", "EXON_DELETED", "FRAME_SHIFT", "GENE", "INTERGENIC", "INTERGENIC_REGION", "INTERGENIC_CONSERVED", "INTRAGENIC", "INTRON", "INTRON_CONSERVED", "NON_SYNONYMOUS_CODING", "RARE_AMINO_ACID", "SPLICE_SITE_ACCEPTOR", "SPLICE_SITE_DONOR", "SPLICE_SITE_REGION", "START_GAINED", "START_LOST", "STOP_GAINED", "STOP_LOST", "SYNONYMOUS_CODING", "SYNONYMOUS_START", "SYNONYMOUS_STOP", "TRANSCRIPT", "UPSTREAM", "UTR_3_DELETED", "UTR_3_PRIME", "UTR_5_DELETED", "UTR_5_PRIME", "NON_SYNONYMOUS_START", "NONE", "CHROMOSOME_LARGE_DELETION" | |
| 80 ) | |
| 81 # translate classic effects into SO terms leaving unknown terms (possibly SO already) as is | |
| 82 so_effects <- sapply(ann_final$effect, function(x) trans(x, classic_snpeff_effects_to_so)) | |
| 83 | |
| 84 # handle further translation of effects we care about | |
| 85 so_effects_translation <- list( | |
| 86 "non-syn", "syn", | |
| 87 "deletion", "deletion", "deletion", | |
| 88 "insertion", "insertion", "frame shift", | |
| 89 "stop gained", "stop lost" | |
| 90 ) | |
| 91 names(so_effects_translation) <- c( | |
| 92 "missense_variant", "synonymous_variant", | |
| 93 "disruptive_inframe_deletion", "inframe_deletion", "chromosomal_deletion", | |
| 94 "disruptive_inframe_insertion", "inframe_insertion", "frameshift_variant", | |
| 95 "stop_gained", "stop_lost" | |
| 96 ) | |
| 97 # translate to our simple terms turning undefined terms into '?' | |
| 98 simple_effects <- sapply(so_effects, function(x) trans(x, so_effects_translation, replace_missing = "?")) | |
| 99 # complex variant effects (those that do more than one thing) are concatenated | |
| 100 # with either '+' (for classic terms) or '&' (for SO terms) | |
| 101 simple_effects[grepl("+", so_effects, fixed = TRUE)] <- "complex" | |
| 102 simple_effects[grepl("&", so_effects, fixed = TRUE)] <- "complex" | |
| 103 simple_effects[so_effects == ""] <- "non-coding" | |
| 104 | |
| 105 ann_final$effect <- simple_effects | |
| 106 ann_final$gene <- sub("^$", "NCR", ann_final$gene) | |
| 107 | |
| 108 ## automatically determine gaps for the heatmap | |
| 109 gap_vector <- which(!(ann_final$gene[1:length(ann_final$gene) - 1] == # nolint | |
| 110 ann_final$gene[2:length(ann_final$gene)])) | |
| 111 | |
| 112 # colormanagement | |
| 113 my_colors <- colorRampPalette(c("grey93", "brown", "black")) #heatmap | |
| 114 count <- length(unique(ann_final$gene)) #annotations (genes) | |
| 115 gene_color <- c(brewer.pal(brewer_color_gene_annotation, n = count)) | |
| 116 names(gene_color) <- unique(ann_final$gene) | |
| 117 | |
| 118 # colormanagement annotations (effect) | |
| 119 ## Define the full set of colors for each effect that we can encounter | |
| 120 ## This is not bulletproof. The effect names given here were swapped into the | |
| 121 ## data (see above substitutions in ann_final$effect) and so are hard-coded, | |
| 122 ## as well as their preferred colors. | |
| 123 | |
| 124 all_colors <- data.frame( | |
| 125 color = c("white", "green", "orange", "red", | |
| 126 "black", "grey", "yellow", "blue", "purple", "brown"), | |
| 127 name = c("non-coding", "syn", "non-syn", "deletion", | |
| 128 "frame shift", "stop gained", "stop lost", "insertion", | |
| 129 "complex", "?")) | |
| 130 ## Reduce the full set to just those that we want | |
| 131 detected_effects <- unique(ann_final$effect) | |
| 132 subset_colors <- subset(all_colors, name %in% detected_effects) | |
| 133 effect_color <- subset_colors$color | |
| 134 names(effect_color) <- subset_colors$name | |
| 135 color_list <- list(gene_color = gene_color, effect_color = effect_color) | |
| 136 names(color_list) <- c("gene", "effect") | |
| 137 | |
| 138 # visualize heatmap | |
| 139 if (pheat_number_of_clusters > length(samples$ids)) { | |
| 140 print(paste0("[INFO] Number of clusters: User-specified clusters (", | |
| 141 pheat_number_of_clusters, | |
| 142 ") is greater than the number of samples (", | |
| 143 length(samples$ids), ")")) | |
| 144 pheat_number_of_clusters <- length(samples$ids) | |
| 145 print(paste0("[INFO] Number of clusters: now set to ", | |
| 146 pheat_number_of_clusters)) | |
| 147 } | |
| 148 | |
| 149 get_plot_dims <- function(heat_map) { | |
| 150 ## get the dimensions of a pheatmap object | |
| 151 ## useful for plot formats that can't be written to a file directly, but | |
| 152 ## for which we need to set up a plotting device | |
| 153 ## source: https://stackoverflow.com/a/61876386 | |
| 154 plot_height <- sum(sapply(heat_map$gtable$heights, | |
| 155 grid::convertHeight, "in")) | |
| 156 plot_width <- sum(sapply(heat_map$gtable$widths, | |
| 157 grid::convertWidth, "in")) | |
| 158 return(list(height = plot_height, width = plot_width)) | |
| 159 } | |
| 160 | |
| 161 height <- round(max(c(max(c( | |
| 162 16 * (length(unique(ann_final$effect)) + | |
| 163 length(unique(ann_final$gene))), 160)) / | |
| 164 nrow(final), 15))) | |
| 165 width <- round(ratio * height) | |
| 166 | |
| 167 | |
| 168 if (!(out_ext %in% c("svg", "jpeg", "png", "pdf"))) { | |
| 169 stop("Unknown extension: ", ext, ", aborting.") | |
| 170 } | |
| 171 plot_device <- get(out_ext) | |
| 172 | |
| 173 | |
| 174 ## A constant scaling factor based on the calculated dimensions | |
| 175 ## above does not work for PNG, so we resort to feeding pheatmap | |
| 176 ## with a direct filename | |
| 177 plot_filename <- NA | |
| 178 if (out_ext %in% c("jpeg", "png")) { | |
| 179 plot_filename <- out_file | |
| 180 } | |
| 181 | |
| 182 ## SVG is not a format pheatmap knows how to write to a file directly. | |
| 183 ## As a workaround we | |
| 184 ## 1. create the plot object | |
| 185 ## 2. get its dimensions | |
| 186 ## 3. set up a svg plotting device with these dimensions | |
| 187 ## 4. print the heatmap object to the device | |
| 188 hm <- pheatmap( | |
| 189 final, | |
| 190 color = my_colors(100), | |
| 191 cellwidth = width, | |
| 192 cellheight = height, | |
| 193 fontsize_col = round(1 / 3 * width), | |
| 194 fontsize_row = round(1 / 3 * min(c(height, width))), | |
| 195 clustering_method = pheat_clustering_method, | |
| 196 cluster_rows = pheat_clustering, | |
| 197 cluster_cols = F, | |
| 198 cutree_rows = pheat_number_of_clusters, | |
| 199 annotation_col = ann_final, | |
| 200 annotation_colors = color_list, | |
| 201 filename = plot_filename, | |
| 202 gaps_col = gap_vector | |
| 203 ) | |
| 204 | |
| 205 if (out_ext %in% c("pdf", "svg")) { | |
| 206 plot_dims <- get_plot_dims(hm) | |
| 207 plot_device(out_file, | |
| 208 width = plot_dims$width, | |
| 209 height = plot_dims$height) | |
| 210 print(hm) | |
| 211 dev.off() | |
| 212 } |
