Mercurial > repos > petr-novak > repeatrxplorer
comparison lib/create_annotation.R @ 0:1d1b9e1b2e2f draft
Uploaded
author | petr-novak |
---|---|
date | Thu, 19 Dec 2019 10:24:45 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1d1b9e1b2e2f |
---|---|
1 #!/usr/bin/env Rscript | |
2 Sys.setlocale("LC_CTYPE", "en_US.UTF-8") # this is necessary for handling unicode characters (data.tree package) | |
3 suppressPackageStartupMessages(library(data.tree)) | |
4 suppressPackageStartupMessages(library(stringr)) | |
5 suppressPackageStartupMessages(library(R2HTML)) | |
6 suppressPackageStartupMessages(library(hwriter)) | |
7 suppressPackageStartupMessages(library(DT)) | |
8 suppressPackageStartupMessages(library(tools)) | |
9 suppressPackageStartupMessages(library(scales)) | |
10 suppressPackageStartupMessages(library(igraph)) | |
11 suppressPackageStartupMessages(library(plotrix)) | |
12 suppressPackageStartupMessages(library(png)) | |
13 | |
14 source("htmlheader.R") | |
15 source("config.R") # load tandem ranks info | |
16 source("utils.R") | |
17 DT_OPTIONS = options = list(pageLength = 1000, lengthMenu = c(10,50,100,1000,5000,10000)) | |
18 WD = getwd() # to get script directory when run from Rserve | |
19 HTMLHEADER = htmlheader ## header (character) loaded from htmlheader.R | |
20 htmlheader = gsub("Superclusters summary","TAREAN summary", htmlheader) | |
21 | |
22 evaluate_LTR_detection = function(f){ | |
23 NO_LTR=NULL | |
24 if (length(readLines(f)) == 11 ){ | |
25 return(NO_LTR) | |
26 } | |
27 df=read.table(f,as.is=TRUE,sep="\t", skip=11,fill=TRUE) | |
28 if (ncol(df) != 23){ | |
29 #df is smaller if no pbs is detected! | |
30 return(NO_LTR) | |
31 } | |
32 df=df[!df$V13 == "",,drop=FALSE] | |
33 if (nrow(df)==0){ | |
34 return(NO_LTR) | |
35 } | |
36 # criteria: | |
37 df_part=df[df$V15 >=12 & df$V20 == 23 & df$V21<df$V20,,drop=FALSE] | |
38 if (nrow(df_part) == 0){ | |
39 return(NO_LTR) | |
40 } | |
41 PBS_type = gsub("_","",(str_extract_all(df_part$V13, pattern="_([A-Z][a-z]{2})", simplify=TRUE))) %>% | |
42 paste(collapse=" ") | |
43 return(PBS_type) | |
44 } | |
45 | |
46 | |
47 | |
48 ## annotate superclusters | |
49 select_reads_id = function(index, search_type = c("cluster","supercluster")) { | |
50 ## select read if base on the supecluster index need database connection | |
51 ## HITSORTDB! | |
52 search_type = match.arg(search_type) | |
53 x = dbGetQuery(HITSORTDB, | |
54 paste0("SELECT vertexname FROM vertices WHERE vertexindex IN ", | |
55 "(SELECT vertexindex FROM communities ", | |
56 "WHERE ", search_type,"=\"", index, | |
57 "\")")) | |
58 return(x$vertexname) | |
59 } | |
60 | |
61 | |
62 get_reads_annotation = function(reads_id) { | |
63 ## select annotation from tables in SEQDB which has name in format *_database | |
64 annot_types = grep("_database", dbListTables(SEQDB), value = TRUE) | |
65 annot = list() | |
66 for (i in annot_types) { | |
67 query = paste0("SELECT * FROM ", i, " WHERE name IN (", paste0("\"", reads_id, | |
68 "\"", collapse = ", "), ")") | |
69 annot[[i]] = dbGetQuery(SEQDB, query) | |
70 } | |
71 return(annot) | |
72 } | |
73 | |
74 supercluster_size = function(supercluster) { | |
75 x = dbGetQuery(HITSORTDB, paste0("SELECT count(*) FROM vertices WHERE vertexindex IN ", | |
76 "(SELECT vertexindex FROM communities ", "WHERE supercluster=\"", supercluster, | |
77 "\")")) | |
78 return(x$"count(*)") | |
79 } | |
80 | |
81 | |
82 cluster_annotation = function(cluster, search_type = c("cluster", "supercluster")){ | |
83 ## searcheither for cluster or supercluster annotation | |
84 ## read annotation from sqlite databases database is access though SEQDB (sequence | |
85 ## annotation) and HITSORTDB - clustering information | |
86 search_type = match.arg(search_type) | |
87 reads_id = select_reads_id(cluster, search_type) | |
88 annot = get_reads_annotation(reads_id) | |
89 return(annot) | |
90 } | |
91 | |
92 get_tarean_info = function(cluster, search_type = c("cluster", "supercluster")){ | |
93 search_type = match.arg(search_type) | |
94 if (search_type == "cluster") { | |
95 search_type = "[index]" | |
96 } | |
97 tarean_info = dbGetQuery(HITSORTDB, | |
98 paste0( | |
99 "SELECT [index], supercluster, satellite_probability, tandem_rank, size_real, satellite FROM cluster_info WHERE ", | |
100 search_type, " = ", cluster)) | |
101 nhits = sum(tarean_info$size_real[tarean_info$tandem_rank %in% 1:2]) | |
102 proportion = nhits/sum(tarean_info$size_real) | |
103 return( list( | |
104 nhits = nhits, | |
105 proportion = proportion, | |
106 tarean_info = tarean_info) | |
107 ) | |
108 | |
109 } | |
110 | |
111 get_ltr_info = function(supercluster){ | |
112 ltr_info = dbGetQuery(HITSORTDB, | |
113 paste0( | |
114 "SELECT [index], supercluster, ltr_detection, size_real FROM cluster_info WHERE ", | |
115 "supercluster", " = ", supercluster)) | |
116 return(ltr_info) | |
117 } | |
118 | |
119 summarize_annotation = function(annot, size = NULL){ | |
120 db_id = do.call(rbind, annot)$db_id | |
121 cl_string = gsub("^.+/","",gsub(":.+$", "", gsub("^.+#", "", db_id))) | |
122 weight = do.call(rbind, annot)$bitscore | |
123 domain = str_replace(str_extract(str_replace(db_id, "^.+#", ""), ":.+"), ":", | |
124 "") | |
125 domain[is.na(domain)] = "" | |
126 domain_table = table(cl_string, domain) | |
127 dt = as.data.frame(domain_table) | |
128 ## remove no count | |
129 dt = dt[dt$Freq != 0, ,drop = FALSE] | |
130 rownames(dt) = paste(dt$cl_string,dt$domain) | |
131 if (!is.null(size)){ | |
132 dt$proportion = dt$Freq / size | |
133 | |
134 } | |
135 return(dt) | |
136 } | |
137 | |
138 annot2colors = function(annot, ids, base_color = "#00000050"){ | |
139 annot_table = do.call(rbind, annot) | |
140 domains = str_replace(str_extract(str_replace(annot_table$db_id, "^.+#", ""), ":.+"), ":","") | |
141 other_annot = str_replace(annot_table$db_id, "^.+#", "") | |
142 complete_annot = ifelse(is.na(domains), other_annot, domains) | |
143 names(complete_annot) = annot_table$name | |
144 unique_complete_annot = names (table(complete_annot) %>% sort(decreasing = TRUE)) | |
145 color_key = unique_colors[seq_along(unique_complete_annot)] | |
146 names(color_key) = unique_complete_annot | |
147 color_table = rep(base_color, length(ids)) | |
148 names(color_table) = ids | |
149 color_table[names(complete_annot)] = color_key[complete_annot] | |
150 return(list( color_table = color_table, legend = color_key)) | |
151 } | |
152 | |
153 | |
154 read_annotation_to_tree = function(supercluster, TREE_TEMPLATE = CLS_TREE) { | |
155 ## CLS_TREE is datatree containing 'taxonomy' information of repeats, attribute of | |
156 ## each node/ leave if filled up from annotation annotation is added to leaves | |
157 ## only!! Inner node are NA or NULL | |
158 annot0 = cluster_annotation(supercluster, search_type = "supercluster") | |
159 ## keep only built in annotation | |
160 annot = list(protein_databse = annot0$protein_database, dna_database = annot0$dna_database) | |
161 annot$dna_database$bitscore = annot$dna_database$bitscore / 2 #DNA bitscore corection, blastx - more weight | |
162 annot = do.call(rbind, annot) | |
163 ## remove duplicated hits, keep best | |
164 annot_clean = annot[order(annot$bitscore, decreasing = TRUE), ] | |
165 annot_clean = annot_clean[!duplicated(annot_clean$name), ] | |
166 tarean_info = get_tarean_info(supercluster, search_type = "supercluster") | |
167 ltr_info = get_ltr_info(supercluster) | |
168 db_id = annot_clean$db_id | |
169 cl_string = gsub(":.+$", "", gsub("^.+#", "", db_id)) | |
170 weight = annot_clean$bitscore | |
171 domain = str_replace(str_extract(str_replace(db_id, "^.+#", ""), ":.+"), ":", | |
172 "") | |
173 domain[is.na(domain)] = "NA" | |
174 mean_weight_table = by(weight, INDICES = list(cl_string, domain), FUN = function(x) signif(mean(x))) | |
175 mean_weight_table[is.na(mean_weight_table)] = 0 | |
176 total_weight_table = by(weight, INDICES = list(cl_string, domain), FUN = sum) | |
177 total_weight_table[is.na(total_weight_table)] = 0 | |
178 cl_table = table(cl_string) | |
179 domain_table = table(cl_string, domain) | |
180 cls_tree = Clone(TREE_TEMPLATE) | |
181 cls_tree$size = supercluster_size(supercluster) | |
182 cls_tree$ltr_info = ltr_info | |
183 for (i in cls_tree$leaves) { | |
184 if (i$full_name %in% names(cl_table)) { | |
185 i$nhits = cl_table[[i$full_name]] | |
186 i$domains = domain_table[i$full_name, ] | |
187 names(i$domains) = colnames(domain_table) | |
188 i$mean_weight = mean_weight_table[i$full_name, ] | |
189 i$total_weight = total_weight_table[i$full_name, ] | |
190 i$proportion = signif(i$nhits/cls_tree$size, 3) | |
191 } else { | |
192 if (i$name == "satellite"){ | |
193 i$nhits = tarean_info$nhits | |
194 i$tandem_rank = tarean_info$tarean_info$tandem_rank | |
195 i$proportion = tarean_info$proportion | |
196 i$tarean_info = tarean_info$tarean_info | |
197 }else{ | |
198 i$proportion = 0 | |
199 } | |
200 } | |
201 } | |
202 ## create domain string for printing: | |
203 for (i in Traverse(cls_tree)) { | |
204 if (is.null(i$domains)) { | |
205 i$features = "" | |
206 } else if (sum(i$domains) == 0) { | |
207 i$features = "" | |
208 } else { | |
209 dom = i$domains[!names(i$domains) == "NA"] | |
210 i$features = gsub(" *\\(\\)", "", pasteDomains(dom)) | |
211 } | |
212 } | |
213 ## add LTR info: | |
214 if (any(!cls_tree$ltr_info$ltr_detection %in% "None")){ | |
215 tr = FindNode(cls_tree, "LTR") | |
216 tr$features = "LTR/PBS" | |
217 } | |
218 return(cls_tree) | |
219 } | |
220 | |
221 pasteDomains = function(dom){ | |
222 d = dom[dom != 0] | |
223 if (length(d) == 0){ | |
224 return("") | |
225 }else{ | |
226 paste0(d, " (", names(d), ")", sep = ", ", collapse="") | |
227 } | |
228 | |
229 } | |
230 | |
231 rescale = function(x, newrange) { | |
232 # taken from plotrix package | |
233 if (missing(x) | missing(newrange)) { | |
234 usage.string <- paste("Usage: rescale(x,newrange)\n", "\twhere x is a numeric object and newrange is the new min and max\n", | |
235 sep = "", collapse = "") | |
236 stop(usage.string) | |
237 } | |
238 if (is.numeric(x) && is.numeric(newrange)) { | |
239 xna <- is.na(x) | |
240 if (all(xna)) | |
241 return(x) | |
242 if (any(xna)) | |
243 xrange <- range(x[!xna]) else xrange <- range(x) | |
244 if (xrange[1] == xrange[2]) | |
245 return(x) | |
246 mfac <- (newrange[2] - newrange[1])/(xrange[2] - xrange[1]) | |
247 return(newrange[1] + (x - xrange[1]) * mfac) | |
248 } else { | |
249 warning("Only numeric objects can be rescaled") | |
250 return(x) | |
251 } | |
252 } | |
253 | |
254 add_value_to_nodes = function(tr, value_names = c("nhits", "domains", "total_weight", | |
255 "proportion")) { | |
256 ## propagate values from leaves to nodes, assume that nodes are not defined | |
257 ## (either NA or NULL) leaves coud be be either numeric of list, if list values | |
258 ## are summed up based on the names | |
259 for (vn in value_names) { | |
260 for (i in Traverse(tr)) { | |
261 w = add_leaves_value(i$leaves, vn) | |
262 i[[vn]] = w | |
263 } | |
264 } | |
265 } | |
266 | |
267 add_leaves_value = function(x, name) { | |
268 ## check if annotation is multivalue or single value, propagates values from | |
269 ## leaves to root | |
270 n = unique(unlist(lapply(lapply(x, "[[", name), names))) | |
271 if (is.null(n)) { | |
272 ## no names given, we can sum everything to one value | |
273 total = sum(unlist(sapply(x, "[[", name)), na.rm = TRUE) | |
274 return(total) | |
275 } else { | |
276 total = numeric(length(n)) | |
277 names(total) = n | |
278 xvals = lapply(x, "[[", name) | |
279 N = 0 | |
280 for (i in xvals) { | |
281 for (j in names(i)) { | |
282 total[j] = total[j] + i[j] | |
283 N = N + 1 | |
284 } | |
285 } | |
286 return(total) | |
287 } | |
288 } | |
289 | |
290 | |
291 trmap = function(tr, xl = 0, yb = 0, xr = 1, yt = 1, first = TRUE, vertical = TRUE) { | |
292 # treemap for data.tree plotting - experimental | |
293 if (first) { | |
294 plot(xl:xr, yb:yt, type = "n", axes = FALSE, xlab = "", ylab = "") | |
295 } | |
296 if (tr$nhits > 0) { | |
297 width = 2 * (6 - tr$level) | |
298 rect(xl, yb, xr, yt, col = "#00008805", lwd = width, border = "#00000080") | |
299 if (tr$level != 1) { | |
300 text(x = (xl + xr)/2, y = (yb + yt)/2, labels = tr$name, cex = width/4) | |
301 } | |
302 } else { | |
303 return(NULL) | |
304 } | |
305 Nchildren = length(tr$children) | |
306 cat("children:", tr$name, tr$level, "\n") | |
307 if (Nchildren == 0) { | |
308 return(NULL) | |
309 } | |
310 sizes = sapply(tr$children, "[[", "nhits") | |
311 rng = c(sizes, 0) / sum(sizes) | |
312 if (vertical) { | |
313 ## x does not change | |
314 xl2 = rep(xl, Nchildren) | |
315 xr2 = rep(xr, Nchildren) | |
316 yb2 = rescale(rng, c(yb, yt))[1:(Nchildren + 1)] | |
317 yt2 = rescale(rng, c(yb, yt))[-1] | |
318 } else { | |
319 yb2 = rep(yb, Nchildren) | |
320 yt2 = rep(yt, Nchildren) | |
321 xr2 = rescale(rng, c(xl, xr))[1:(Nchildren + 1)] | |
322 xl2 = rescale(rng, c(xl, xr))[-1] | |
323 } | |
324 for (i in 1:Nchildren) { | |
325 trmap(tr$children[[i]], xl2[i], yb2[i], xr2[i], yt2[i], first = FALSE, vertical = !vertical) | |
326 } | |
327 | |
328 } | |
329 | |
330 filter_tree = function(tr) { | |
331 ## keep only nodes with positive nhits values | |
332 ## must be used on trees where leaves were already propagated | |
333 ## using add_values_to_nodes | |
334 tr_filtered = Clone(tr) | |
335 Prune(tr_filtered, function(x) x$nhits > 0 | containLTR(x)) | |
336 return(tr_filtered) | |
337 } | |
338 | |
339 containLTR = function(tr){ | |
340 for (i in Traverse(tr)){ | |
341 if (i$features == "LTR/PBS"){ | |
342 return(TRUE) | |
343 } | |
344 } | |
345 return(FALSE) | |
346 } | |
347 | |
348 filter_tree2 = function(tr) { | |
349 tr_filtered = Clone(tr) | |
350 Prune(tr_filtered, function(x) x$parent$nhits > 0) | |
351 return(tr_filtered) | |
352 } | |
353 | |
354 ## this is for testing purposesm in futurem each node will have its function | |
355 ## named find_best_hit, it will work like decisin tree. | |
356 ## functions will be set manually based on training data | |
357 formatWidth = function(x, w){ | |
358 if (length (dim(x)) > 1){ | |
359 for (i in seq_along(x)){ | |
360 delta = nchar(x[,i], type="bytes") - nchar(x[,i], type="char") | |
361 ## delta correction is neccessary for utf-8 correct formating | |
362 x[,i] = sprintf(paste0("%-",w[i] + delta,"s"), x[,i]) | |
363 } | |
364 return(x) | |
365 }else{ | |
366 delta = nchar(x, type="bytes") - nchar(x, type="char") | |
367 sprintf(paste0("%",w + delta,"s"), | |
368 x) %>% return | |
369 } | |
370 } | |
371 | |
372 | |
373 find_best_hit_repeat = function(cls_tree){ | |
374 ## three children: | |
375 ## repeat | |
376 ## |--rDNA | |
377 ## |--satellite this need special handling, rank 1 or 2 | |
378 ## |--mobile_element | |
379 ## | |
380 ## in this case we require that satellite has proportion 0.95 | |
381 ## params of good hit | |
382 min_prop_of_positive = 0.7 | |
383 min_prop_second_best = 0.9 | |
384 min_proportion_x_nhits = 2.5 # based on 0.05 x 50 | |
385 min_satellite_proportion = 0.95 | |
386 if (isLeaf(cls_tree)) { | |
387 return(cls_tree) | |
388 } | |
389 cond1 = cls_tree$nhits * cls_tree$proportion < min_proportion_x_nhits | |
390 if (cond1){ | |
391 return(cls_tree) | |
392 } | |
393 nhits = sapply(cls_tree$children, "[[", "nhits") | |
394 all_hits = cls_tree$root$nhits | |
395 name_of_best_hit = cls_tree$children[[which.max(nhits)]]$name | |
396 best_hit_child = cls_tree$children[[which.max(nhits)]] | |
397 second_max_hits = ifelse(length(nhits) == 1, 0, max(nhits[-which.max(nhits)])) | |
398 cond2 = max(nhits) / sum(nhits) > min_prop_of_positive | |
399 cond3 = max(nhits) / (second_max_hits + max(nhits)) > min_prop_second_best | |
400 cond_satellite = best_hit_child$proportion > min_satellite_proportion & name_of_best_hit == "satellite" | |
401 cond_other = ! name_of_best_hit == "satellite" | |
402 cat(cond2, cond3, cond_satellite, cond_other, "\n") | |
403 if (cond2 & cond3 & (cond_satellite | cond_other)) { | |
404 ## clear case satellite or other | |
405 if ("find_best_hit" %in% names(best_hit_child)){ | |
406 ## custom rules for node is defined | |
407 best_hit = best_hit_child$find_best_hit(cls_tree$children[[which.max(nhits)]]) | |
408 }else{ | |
409 # use generic rules | |
410 best_hit = find_best_hit(cls_tree$children[[which.max(nhits)]]) | |
411 } | |
412 }else{ | |
413 ## do more specific tests for rDNA, or mobile element | |
414 ## rDNA o mobile_elements must be above threshold! | |
415 cond_sat_rank = any(cls_tree$satellite$tandem_rank == 2) & cls_tree$satellite$nhits != 0 | |
416 cond_rdna = cls_tree$rDNA$nhits * cls_tree$rDNA$proportion > min_proportion_x_nhits | |
417 cond_mobile = cls_tree$mobile_element$nhits * cls_tree$mobile_element$proportion > min_proportion_x_nhits | |
418 cat(cond_sat_rank, "\n") | |
419 cat(cond_rdna,"\n") | |
420 cat(cond_mobile, "\n") | |
421 if (cond_sat_rank & (cond_rdna | cond_mobile) ){ | |
422 cls_tree$satellite$nhits = 0 | |
423 best_hit = find_best_hit(cls_tree) | |
424 }else{ | |
425 return(cls_tree) | |
426 } | |
427 } | |
428 return(best_hit) | |
429 } | |
430 | |
431 find_best_hit = function(cls_tree){ | |
432 ## general params of good hit | |
433 min_prop_of_positive = 0.7 | |
434 min_prop_second_best = 0.8 | |
435 min_proportion_x_nhits = 2.5 # based on 0.05 x 50 | |
436 if (isLeaf(cls_tree)) { | |
437 return(cls_tree) | |
438 } | |
439 | |
440 | |
441 cond1 = cls_tree$nhits * cls_tree$proportion < min_proportion_x_nhits | |
442 | |
443 if (cond1){ | |
444 ## return if proportions is lower than threshold | |
445 return(cls_tree) | |
446 } | |
447 | |
448 nhits = sapply(cls_tree$children, "[[", "nhits") | |
449 best_hit_child = cls_tree$children[[which.max(nhits)]] | |
450 ## more special cases: | |
451 second_max_hits = ifelse(length(nhits) == 1, 0, max(nhits[-which.max(nhits)])) | |
452 cond2 = max(nhits) / sum(nhits) > min_prop_of_positive | |
453 cond3 = max(nhits) / (second_max_hits + max(nhits)) > min_prop_second_best | |
454 if (cond2 & cond3) { | |
455 if ("find_best_hit" %in% names(best_hit_child)){ | |
456 ## custom rules for node is defined | |
457 best_hit = best_hit_child$find_best_hit(cls_tree$children[[which.max(nhits)]]) | |
458 }else{ | |
459 # use generic rules | |
460 best_hit = find_best_hit(cls_tree$children[[which.max(nhits)]]) | |
461 } | |
462 } else { | |
463 return(cls_tree) | |
464 } | |
465 return(best_hit) | |
466 } | |
467 | |
468 | |
469 get_annotation_groups = function(tr){ | |
470 tr_all = Clone(tr) | |
471 Prune(tr_all, pruneFun=function(...)FALSE) ## prune everithing -> keep root | |
472 tr_all$name = "Unclassified repeat (No evidence)" | |
473 tr_contamination = Clone(tr)$contamination | |
474 tr_organelle = Clone(tr)$organelle | |
475 tr_repeat = Clone(tr)$"repeat" | |
476 tr_repeat$name = "Unclassified_repeat (conflicting evidences)" | |
477 return( | |
478 list( | |
479 tr_repeat = tr_repeat, | |
480 tr_organelle = tr_organelle, | |
481 tr_all = tr_all, | |
482 tr_contamination = tr_contamination | |
483 ) | |
484 ) | |
485 } | |
486 | |
487 | |
488 | |
489 format_tree = function(cls_tree, ...) { | |
490 df = ToDataFrameTree(cls_tree, ...) | |
491 ## try to get rid off unicode characters | |
492 df[,1] = gsub("\U00B0", "'", df[,1], fixed = TRUE, useBytes = TRUE) | |
493 df[,1] = gsub("\U00A6", "|", df[,1], fixed = TRUE, useBytes = TRUE) | |
494 colnames(df)[1] = " " ## originally levelName | |
495 if ("proportion" %in% colnames(df)){ | |
496 df$proportion = signif(df$proportion,2) | |
497 } | |
498 if ("Proportion[%]" %in% colnames(df)){ | |
499 df$"Proportion[%]" = round(df$"Proportion[%]", 2) | |
500 } | |
501 ## format header | |
502 w1 = apply(df, 2, function(x) max(nchar(x))) | |
503 w2 = nchar(colnames(df)) | |
504 # use whatever with is bigger for formating | |
505 w = ifelse(w1 > w2, w1, w2) | |
506 | |
507 out = character() | |
508 header = mapply(FUN = function(x, y) formatWidth(x, w = y), colnames(df), w) %>% | |
509 paste0(collapse=" | ") | |
510 hr_line = gsub(".","-",header) | |
511 # create output | |
512 # classification lines | |
513 class_lines = formatWidth(df, w) %>% | |
514 apply(1, FUN = function(x) paste0(x,collapse = " | ")) %>% | |
515 paste(collapse = "\n") | |
516 paste( | |
517 header, | |
518 hr_line, | |
519 class_lines, | |
520 sep="\n") %>% return | |
521 } | |
522 | |
523 | |
524 make_final_annotation_template = function( | |
525 annot_attributes = list( | |
526 Nreads = 0, | |
527 Nclusters = 0, | |
528 Nsuperclusters = 0, | |
529 "Proportion[%]" = 0, | |
530 cluster_list = c()) | |
531 ) | |
532 { | |
533 ct = Clone(CLS_TREE) | |
534 for (i in Traverse(ct)){ | |
535 for (j in names(annot_attributes)){ | |
536 i[[j]] = annot_attributes[[j]] | |
537 } | |
538 } | |
539 return(ct) | |
540 } | |
541 | |
542 | |
543 common_ancestor = function(tr1, tr2){ | |
544 a1 = tr1$Get('name', traversal = "ancestor") | |
545 a2 = tr2$Get('name', traversal = "ancestor") | |
546 ancestor = intersect(a1,a2)[1] | |
547 return(ancestor) | |
548 } | |
549 | |
550 create_all_superclusters_report = function(max_supercluster = 100, | |
551 paths, | |
552 libdir, | |
553 superclusters_dir, | |
554 seqdb, hitsortdb, | |
555 classification_hierarchy_file, | |
556 HTML_LINKS) | |
557 { | |
558 ## connect to sqlite databases | |
559 ## seqdb and hitsortdb are path to sqlite files | |
560 HTML_LINKS = nested2named_list(HTML_LINKS) | |
561 paths = nested2named_list(paths) | |
562 report_file = paths[['supercluster_report_html']] | |
563 ## create SEQDB, HTISORTDB and CLS_TREE | |
564 connect_to_databases(seqdb, hitsortdb, classification_hierarchy_file) | |
565 | |
566 ## append specific classication rules | |
567 CLS_TREE$"repeat"$find_best_hit = find_best_hit_repeat | |
568 | |
569 ### | |
570 cat("Superclusters summary\n", file = report_file) | |
571 cat("---------------------\n\n", file = report_file, append = TRUE) | |
572 empty = rep(NA,max_supercluster) | |
573 SC_table = data.frame( | |
574 Supercluster = 1 : max_supercluster, "Number of reads" = empty, Automatic_annotation = empty, | |
575 Similarity_hits = empty, TAREAN_annotation = empty, | |
576 Clusters = empty, stringsAsFactors = FALSE, check.names = FALSE | |
577 ) | |
578 SC_csv = SC_table # table as spreadsheet - no html formatings | |
579 final_cluster_annotation = make_final_annotation_template() | |
580 | |
581 for (sc in 1 : max_supercluster) { | |
582 cat("supercluster: ",sc,"\n") | |
583 cls_tree = read_annotation_to_tree(sc) | |
584 sc_summary = get_supercluster_summary(sc) | |
585 add_value_to_nodes(cls_tree) | |
586 cls_tree_filtered = filter_tree(cls_tree) | |
587 best_hit = find_best_hit(cls_tree) | |
588 ## exception to decision tree put here. | |
589 ## All -> class I ? | |
590 if (best_hit$name == "All"){ | |
591 ## check LTR | |
592 if (any(!(unique(cls_tree$ltr_info$ltr_detection) %in% "None"))){ | |
593 ## if LTR found - move classification to Class I | |
594 | |
595 LTR = FindNode(best_hit, "LTR") | |
596 nhits_without_class_I = best_hit$nhits - LTR$nhits | |
597 prop_without_class_I = best_hit$proportion - LTR$proportion | |
598 cond3 = nhits_without_class_I * prop_without_class_I < 1 | |
599 cond2 = best_hit$nhits * best_hit$proportion < 0.5 # other hits weak | |
600 cond1 = LTR$nhits >= (0.95 * best_hit$nhits) # hits are pre in sub class_I | |
601 | |
602 if (cond1 | cond2 | cond3){ | |
603 best_hit = LTR | |
604 } | |
605 } | |
606 }else{ | |
607 ## check is conflict os best hit with LTR/PBS exists | |
608 if (any(!(unique(cls_tree$ltr_info$ltr_detection) %in% "None"))){ | |
609 LTR = FindNode(cls_tree, "LTR") | |
610 ca_name = common_ancestor(LTR, best_hit) | |
611 if (ca_name != "LTR"){ | |
612 ## reclassify | |
613 best_hit = FindNode(cls_tree, ca_name) | |
614 } | |
615 } | |
616 } | |
617 | |
618 best_hit_name = best_hit$name | |
619 best_hit_path = paste(best_hit$path, collapse = "/") | |
620 ## add best hit do database: | |
621 sqlcmd = paste0( | |
622 "UPDATE cluster_info SET supercluster_best_hit = \"", best_hit_path, "\" ", | |
623 " WHERE supercluster = ", sc | |
624 ) | |
625 dbExecute(HITSORTDB, sqlcmd) | |
626 # add annotation annotation summary stored in final_cluster_annotation - summing up | |
627 best_hit_node = FindNode(final_cluster_annotation, best_hit_name) | |
628 best_hit_node$Nsuperclusters = best_hit_node$Nsuperclusters + 1 | |
629 best_hit_node$Nclusters = best_hit_node$Nclusters + sc_summary$Nclusters | |
630 best_hit_node$Nreads = best_hit_node$Nreads + sc_summary$Nreads | |
631 best_hit_node$"Proportion[%]" = best_hit_node$"Proportion[%]" + sc_summary$proportion*100 | |
632 best_hit_node$cluster_list = append(best_hit_node$cluster_list, sc_summary$cluster_list) | |
633 best_hit_label = ifelse(best_hit_name == "All", "NA", best_hit_name) | |
634 SC_csv$Automatic_annotation[sc] = best_hit_label | |
635 SC_csv$"Number of reads"[sc] = SC_table$"Number of reads"[sc] = cls_tree$size | |
636 SC_csv$Similarity_hits[sc] = format_tree(cls_tree_filtered, | |
637 "nhits", "proportion", "features") | |
638 SC_table$Similarity_hits[sc] = format_tree(cls_tree_filtered, | |
639 "nhits", "proportion", "features") %>% | |
640 preformatted | |
641 | |
642 SC_table$Automatic_annotation[sc] = best_hit_label | |
643 | |
644 G = get_supercluster_graph( | |
645 sc=sc, seqdb = seqdb,hitsortdb = hitsortdb, | |
646 classification_hierarchy_file = classification_hierarchy_file | |
647 ) | |
648 ## append tarean annotation | |
649 clist = paste(V(G)$name, collapse =",") | |
650 cl_rank = dbGetQuery(HITSORTDB, | |
651 paste("SELECT [index], tandem_rank FROM cluster_info WHERE [index] IN (",clist, | |
652 ") AND tandem_rank IN (1,2)")) | |
653 ## add information about TR clusters is any | |
654 if (nrow(cl_rank) > 0){ | |
655 tarean_annot = paste( | |
656 cl_rank$index, "-", | |
657 RANKS_TANDEM[cl_rank$tandem_rank] | |
658 ) | |
659 SC_csv$TAREAN_annotation[sc] = paste(tarean_annot,collapse="\n") | |
660 SC_table$TAREAN_annotation[sc] = mapply( | |
661 hwrite, tarean_annot, | |
662 link = sprintf(HTML_LINKS$ROOT_TO_TAREAN,cl_rank$index)) %>% paste(collapse="<br>") | |
663 } | |
664 ## add links to clusters | |
665 SC_csv$Clusters[sc] = paste((V(G)$name), collapse = ",") | |
666 links = mapply(hwrite, V(G)$name, link = sprintf(HTML_LINKS$ROOT_TO_CLUSTER, as.integer(V(G)$name))) | |
667 SC_table$Clusters[sc] = paste(links, collapse =", ") | |
668 create_single_supercluster_report(G, superclusters_dir) | |
669 } | |
670 | |
671 | |
672 | |
673 ## add html links | |
674 SC_table$Supercluster = mapply(hwrite, SC_table$Supercluster, | |
675 link = sprintf(HTML_LINKS$ROOT_TO_SUPERCLUSTER, | |
676 SC_table$Supercluster)) | |
677 | |
678 | |
679 | |
680 write.table(SC_csv, file = paths[["superclusters_csv_summary"]], | |
681 sep = "\t", row.names = FALSE) | |
682 | |
683 | |
684 DT_instance = datatable(SC_table, escape = FALSE, options = DT_OPTIONS) | |
685 saveWidget(DT_instance, file = normalizePath(report_file), | |
686 normalizePath(libdir), selfcontained = FALSE) | |
687 add_preamble(normalizePath(report_file), | |
688 preamble='<h2>Supercluster annotation</h2> <p><a href="documentation.html#superclust"> For table legend see documentation. <a> </p>') | |
689 | |
690 annot_groups = get_annotation_groups(final_cluster_annotation) | |
691 saveRDS(object=annot_groups, file = paths[['repeat_annotation_summary_rds']]) | |
692 final_cluster_annotation_formated = paste( | |
693 sapply(annot_groups, format_tree, 'Proportion[%]', 'Nsuperclusters', 'Nclusters', "Nreads"), | |
694 collapse = "\n\n\n" | |
695 ) | |
696 ## TODO add singleton counts, total counts and extra text - csv output | |
697 ## export annotation summary | |
698 html_summary = start_html(paths[['summarized_annotation_html']], gsub("PAGE_TITLE", "Repeat Annotation Summary", HTMLHEADER)) | |
699 html_summary("Repeat annotation summary", HTML.title, HR=2) | |
700 html_summary("This table summarizes the automatic annotations of superclusters that should be verified and manually corrected if necessary. Thus, the table should not be used as the final output of the analysis without critical evaluation.", HTML.title, HR=3) | |
701 html_summary(preformatted(final_cluster_annotation_formated), cat) | |
702 | |
703 | |
704 | |
705 return() | |
706 } | |
707 ## for testing | |
708 if (FALSE){ | |
709 create_supercluster_report(1:100, report_file, seqdb, hitsortdb, class_file) | |
710 } | |
711 | |
712 create_single_supercluster_report = function(G, superclusters_dir){ | |
713 sc_dir = paste0(superclusters_dir, "/dir_SC",sprintf("%04d", G$name)) | |
714 htmlfile = paste0(sc_dir, "/index.html") | |
715 dir.create(sc_dir) | |
716 title = paste("Supercluster no. ",G$name) | |
717 htmlheader = gsub("PAGE_TITLE", title, HTMLHEADER) | |
718 cat(htmlheader, file = htmlfile) | |
719 file.copy(paste0(WD,"/style1.css"), sc_dir) | |
720 HTML.title(title, file = htmlfile) | |
721 HTML("Simmilarity hits (only hits with proportion above 0.1% in at least one cluster are shown) ", file = htmlfile) | |
722 img_file = paste0(sc_dir,"/SC",sprintf("%0d",G$name), ".png") | |
723 png(filename = img_file, width = 1400, height=1200) | |
724 coords = plot_supercluster(G = G) | |
725 dev.off() | |
726 ## basename - make link relative | |
727 href = paste0("../../clusters/dir_CL",sprintf("%04d", as.numeric(V(G)$name)),"/index.html") | |
728 html_insert_image(basename(img_file), htmlfile,coords=coords, href = href) | |
729 | |
730 if (is_comparative()){ | |
731 HTML("Comparative analysis", file = htmlfile) | |
732 img_file = paste0(sc_dir,"/SC",sprintf("%0d",G$name), "comparative.png") | |
733 png(filename = img_file, width = 1400, height=1200) | |
734 coords = plot_supercluster(G = G, "comparative") | |
735 mtext("comparative analysis") | |
736 dev.off() | |
737 ## basename - make link relative | |
738 href = paste0("../../clusters/dir_CL",sprintf("%04d", as.numeric(V(G)$name)),"/index.html") | |
739 html_insert_image(basename(img_file), htmlfile,coords=coords, href = href) | |
740 | |
741 } | |
742 } | |
743 | |
744 html_insert_image = function(img_file, htmlfile, coords=NULL, href=NULL) { | |
745 img_tag = sprintf('<p> <img src="%s" usemap="#clustermap" > \n</p>\n<br>\n', img_file) | |
746 cat(img_tag, file = htmlfile, append = TRUE) | |
747 if (!is.null(coords)){ | |
748 formated_links = character() | |
749 for (i in seq_along(href)){ | |
750 formated_links[i] = sprintf('<area shape="circle" coords="%f,%f,%f" href="%s" >\n', | |
751 coords[i,1],coords[i,2],coords[,3],href[i]) | |
752 } | |
753 cat('<map name="clustermap" >\n', | |
754 formated_links,"</map>\n", file=htmlfile, append = TRUE) | |
755 | |
756 } | |
757 } | |
758 | |
759 html_insert_floating_image = function(img_file, | |
760 htmlfile, | |
761 width=NULL, | |
762 title= "", | |
763 footer = "" | |
764 ){ | |
765 if (is.null(width)){ | |
766 width="" | |
767 } | |
768 tag = paste( | |
769 '<div class="floating_img">\n', | |
770 ' <p>', title,'</p>\n', | |
771 ' <a href=',img_file, ">", | |
772 ' <img src="',img_file, '" alt="image" width="',width,'">\n', | |
773 ' </a>', | |
774 ' <p> ',footer,'</p>\n', | |
775 '</div>\n' | |
776 ,sep = "") | |
777 cat(tag, file = htmlfile, append = TRUE) | |
778 } | |
779 | |
780 get_supercluster_graph = function(sc, seqdb, hitsortdb, classification_hierarchy_file){ | |
781 ## sc - superclusted index | |
782 ## seqdb, hitsortdb - path to sqlite databases | |
783 ## classificationn_hierarchy_file - path data.tree rds file | |
784 ## SIZE of image | |
785 SIZE = 1000 | |
786 connect_to_databases(seqdb, hitsortdb, classification_hierarchy_file) | |
787 clusters = dbGetQuery(HITSORTDB, | |
788 paste0("SELECT cluster, size FROM superclusters WHERE supercluster=",sc) | |
789 ) %>% as.data.frame | |
790 ## string for sql query: | |
791 cluster_list = paste0( "(", paste0(clusters$cluster, collapse = ","),")") | |
792 supercluster_ncol = dbGetQuery(HITSORTDB, | |
793 paste("SELECT c1,c2,w, k FROM cluster_mate_bond where c1 IN ", | |
794 cluster_list, " AND c2 IN ", cluster_list, "AND k > 0.1" | |
795 ) | |
796 ) | |
797 if (nrow(supercluster_ncol)>0){ | |
798 ## at least two clusters | |
799 G = graph.data.frame(supercluster_ncol, directed=FALSE) | |
800 L = layout_with_kk(G) | |
801 ## layout is rescaled for easier linking html | |
802 L[,1] = scales::rescale(L[,1], to = c(1,SIZE) ) | |
803 L[,2] = scales::rescale(L[,2], to = c(1,SIZE) ) | |
804 G$L = L | |
805 }else{ | |
806 ## only one cluster in supercluster | |
807 G = graph.full(n = 1) | |
808 V(G)$name = as.character(clusters$cluster) | |
809 G$L = matrix(c(SIZE/2, SIZE/2), ncol=2) | |
810 } | |
811 G = set_vertex_attr(G, "label", value = paste0("CL",names(V(G)))) | |
812 G$name=sc ## names is supercluster | |
813 # create annotation of individual clusters, will be attached as node attribudes | |
814 annot = get_cluster_annotation_summary(clusters) | |
815 ## annotation of nodes(clusters) | |
816 G = set_vertex_attr(G, "annotation", value = annot$clusters[names(V(G))]) | |
817 G = set_vertex_attr(G, "size", | |
818 value = clusters$size[match(names(V(G)), as.character(clusters$cluster))]) | |
819 G$annotation = annot$supercluster | |
820 G$clusters = clusters | |
821 # TODO if comparative analysis - add proportion of species | |
822 if (is_comparative()){ | |
823 comparative_counts = get_cluster_comparative_counts(cluster_list) | |
824 NACluster = comparative_counts$supercluster | |
825 # some clusters do not have comparative data - they are bellow threshold! | |
826 NACluster[,] = NA | |
827 counts_adjusted = comparative_counts$clusters[names(V(G))] | |
828 for (i in names(V(G))){ | |
829 if(is.null(counts_adjusted[[i]])){ | |
830 ## get count from database | |
831 seqid = dbGetQuery(HITSORTDB, | |
832 paste( | |
833 "SELECT vertexname FROM vertex_cluster where cluster = ", | |
834 i) | |
835 )$vertexname | |
836 codes = get_comparative_codes()$prefix | |
837 x = table(factor(substring(seqid, 1,nchar(codes[1])), levels = codes)) | |
838 counts_adjusted[[i]] = data.frame( | |
839 Freq = as.numeric(x), | |
840 proportion = as.numeric(x/sum(x)), | |
841 row.names = names(x)) | |
842 | |
843 } | |
844 } | |
845 | |
846 G = set_vertex_attr(G, "comparative_counts", | |
847 value = counts_adjusted[V(G)$name] | |
848 ) | |
849 # for whole supercluster | |
850 G$comparative = comparative_counts$supercluster | |
851 } | |
852 | |
853 return(G) | |
854 } | |
855 | |
856 get_cluster_comparative_counts = function(cluster_list){ | |
857 counts = dbGetQuery( | |
858 HITSORTDB, | |
859 paste0("SELECT * FROM comparative_counts WHERE clusterindex IN", | |
860 cluster_list | |
861 ) | |
862 ) | |
863 comparative_counts = apply(counts, 1, function(x) | |
864 y = data.frame(Freq = x[-1], proportion = x[-1]/sum(x[-1])) | |
865 ) | |
866 names(comparative_counts) = counts$clusterindex | |
867 total_counts = data.frame( | |
868 Freq = colSums(counts[,-1]), | |
869 proportion = colSums(counts[,-1])/sum(counts[,-1])) | |
870 return( | |
871 list( | |
872 clusters = comparative_counts, | |
873 supercluster = total_counts | |
874 ) | |
875 ) | |
876 | |
877 } | |
878 | |
879 get_cluster_annotation_summary = function(clusters){ | |
880 ## clusters - table of clusters, col: cluster, size | |
881 annot_list = apply(clusters,1,FUN = function(x)summarize_annotation(cluster_annotation(x[1]),x[2])) | |
882 ## if empty - not annotation at all | |
883 if (sum(sapply(annot_list, nrow)) == 0){ | |
884 return(list (clusters = NULL, | |
885 superclusters = NULL | |
886 )) | |
887 } | |
888 names(annot_list) = as.character(clusters$cluster) | |
889 annot_all = do.call(rbind,annot_list) | |
890 total = by(annot_all$Freq,INDICES=with(annot_all, paste(cl_string,domain)), FUN=sum, simplify=TRUE) | |
891 total_df = data.frame(Freq = c(total), proportion = c(total) /sum(clusters$size)) | |
892 ## for ploting it need to contain all annot categories as in s upercluster | |
893 annot_list_full = list() | |
894 for (i in seq_along(annot_list)){ | |
895 annot_list_full[[i]] = total_df | |
896 for (j in rownames(total_df)){ | |
897 if (!j %in% rownames(annot_list[[i]])){ | |
898 annot_list_full[[i]][j,c('Freq','proportion')] = c(0,0) | |
899 }else{ | |
900 annot_list_full[[i]][j,c('Freq','proportion')] = annot_list[[i]][j,c('Freq','proportion')] | |
901 } | |
902 } | |
903 } | |
904 names(annot_list_full) = names(annot_list) | |
905 return(list (clusters = annot_list_full, | |
906 supercluster = total_df | |
907 ) | |
908 ) | |
909 } | |
910 | |
911 plot_supercluster = function(G,color_scheme=c("annotation","comparative")){ | |
912 ## create plot in coords y: 1-1200, x: 1 - 1400 | |
913 ## this is fixed for href to work in exact image areas! | |
914 color_scheme = match.arg(color_scheme) | |
915 LIMS0 = matrix(c (1,1000,1,1000), ncol = 2) | |
916 OFFSET_H = 100; OFFSET_W = 200 | |
917 lims = LIMS0 + c(0,OFFSET_W * 2, 0, OFFSET_H * 2) | |
918 ## use full range | |
919 par(mar=c(0,0,0,0),xaxs="i", yaxs="i") | |
920 ## proportion of repeats | |
921 if (color_scheme == "annotation"){ | |
922 prop = sapply (V(G)$annotation, FUN = function(x)x$proportion) | |
923 brv = c("#BBBBBB",unique_colors) | |
924 }else{ | |
925 prop = sapply (V(G)$comparative_counts, FUN = function(x)x$proportion) | |
926 brv = unique_colors | |
927 } | |
928 ## handle special cases - single cluster with annot or single annotation group | |
929 if (is.null(dim(prop))){ | |
930 prop=matrix(prop, nrow = 1) | |
931 } | |
932 if (length(prop)==0){ | |
933 ## special case - not annotation at all | |
934 prop = matrix(rep(1,vcount(G)),ncol = vcount(G), dimnames=list("NAN", NULL)) | |
935 }else{ | |
936 if (color_scheme == "annotation"){ | |
937 rownames(prop) = rownames(G$annotation) | |
938 }else{ | |
939 rownames(prop) = rownames(G$comparative) | |
940 } | |
941 ## reorder by prop | |
942 if (color_scheme =="annotation"){ | |
943 prop = prop[order(prop[,1], decreasing = TRUE),,drop=FALSE] | |
944 NAN = 1 - colSums(prop) | |
945 prop = rbind(NAN, prop) | |
946 brv = c("#BBBBBB",unique_colors) | |
947 }else{ | |
948 } | |
949 } | |
950 L = G$L + c(OFFSET_H) | |
951 ## for href - necessary convert coordinater - image is flipped vertically | |
952 include = rowSums(prop>0.001, na.rm = TRUE)>=1 | |
953 include[1] = TRUE | |
954 prop = prop[include, , drop=FALSE] | |
955 plot(0,0,type='n', , xlim = lims[,1], ylim = lims[,2]) | |
956 plot_edges(G, L, lwd = 8) | |
957 RADIUS = radius_size(G) | |
958 coords = cbind(G$L[,1,drop=FALSE] + 100, 1100 - G$L[,2,drop=FALSE], RADIUS) | |
959 pieScatter(L[,1], L[,2], t(prop), col = brv, | |
960 radius = RADIUS, new.plot = FALSE, border=NA) | |
961 ## add ledend | |
962 legend("topright", legend = rownames(prop), | |
963 col = brv, pch = 15, cex= 1.5, pt.cex= 3) | |
964 ## add cluster labels | |
965 text(x = L[,1], y = L[,2], labels = paste(V(G)$label,"\n",V(G)$size)) | |
966 return(coords) | |
967 } | |
968 | |
969 | |
970 radius_size = function(G){ | |
971 if (vcount(G) == 1) RADIUS=120 | |
972 if (vcount(G) == 2) RADIUS=50 | |
973 if (vcount(G) %in% 3:8) RADIUS=40 | |
974 if (vcount(G) > 8) RADIUS=30 | |
975 return(RADIUS) | |
976 } | |
977 | |
978 | |
979 plot_edges = function(G,L,col="#33000040", lwd = 1){ | |
980 e = get.edgelist(G, names = F) | |
981 X0 = L[e[, 1], 1] | |
982 Y0 = L[e[, 1], 2] | |
983 X1 = L[e[, 2], 1] | |
984 Y1 = L[e[, 2], 2] | |
985 segments(X0, Y0, X1, Y1, lwd = lwd,col = col) | |
986 } | |
987 | |
988 | |
989 pieScatter=function(x, y, prop,radius=1, col=NULL, edges=100, new.plot = TRUE, ...){ | |
990 if (new.plot){ | |
991 plot(x,y,type='n') | |
992 } | |
993 for (i in seq_along(x)){ | |
994 p=prop[i,] | |
995 if (length(radius)==1){ | |
996 r=radius | |
997 }else{ | |
998 r=radius[i] | |
999 } | |
1000 include = p != 0 | |
1001 floating.pie(x[i], y[i],p[include], | |
1002 col=col[include], radius=r,edges=50,...) | |
1003 } | |
1004 } | |
1005 | |
1006 unique_colors = c("#00FF00", "#0000FF", "#FF0000", | |
1007 "#FFA6FE", "#FFDB66", "#006401", "#010067", "#95003A", | |
1008 "#007DB5", "#FF00F6", "#FFEEE8", "#774D00", "#90FB92", "#01FFFE", | |
1009 "#0076FF", "#D5FF00", "#FF937E", "#6A826C", "#FF029D", | |
1010 "#FE8900", "#7A4782", "#7E2DD2", "#85A900", "#FF0056", | |
1011 "#A42400", "#00AE7E", "#683D3B", "#BDC6FF", "#263400", | |
1012 "#BDD393", "#00B917", "#9E008E", "#001544", "#C28C9F", | |
1013 "#FF74A3", "#01D0FF", "#004754", "#E56FFE", "#788231", | |
1014 "#0E4CA1", "#91D0CB", "#BE9970", "#968AE8", "#BB8800", | |
1015 "#43002C", "#DEFF74", "#00FFC6", "#FFE502", "#620E00", | |
1016 "#008F9C", "#98FF52", "#7544B1", "#B500FF", "#00FF78", | |
1017 "#FF6E41", "#005F39", "#6B6882", "#5FAD4E", "#A75740", | |
1018 "#A5FFD2", "#FFB167", "#009BFF", "#E85EBE") | |
1019 | |
1020 if (FALSE){ | |
1021 ## For testing | |
1022 for ( i in 1:100){ | |
1023 G = get_supercluster_graph( | |
1024 sc=i, seqdb = seqdb, | |
1025 hitsortdb = hitsortdb, classification_hierarchy_file = class_file | |
1026 ) | |
1027 png(paste0("/mnt/raid/users/petr/tmp/test.figure_",i,".png"), width = 1400, height=1000) | |
1028 plot_supercluster(G) | |
1029 dev.off() | |
1030 print(i) | |
1031 } | |
1032 | |
1033 } | |
1034 | |
1035 plotg = function(GG,LL,wlim=NULL,...){ | |
1036 ## quick ploting function | |
1037 e = get.edgelist(GG, names=F) | |
1038 w = E(GG)$weight | |
1039 if (!is.null(wlim)) {e = e[w > wlim,]; w = w[w > wlim]} | |
1040 X0 = LL[e[,1],1] | |
1041 Y0 = LL[e[,1],2] | |
1042 X1 = LL[e[,2],1] | |
1043 Y1 = LL[e[,2],2] | |
1044 plot(range(LL[,1]),range(LL[,2]),xlab="",ylab="",axes=FALSE,type="n",...) | |
1045 brv = 'grey' | |
1046 segments(X0,Y0,X1,Y1,lwd=.5,col=brv) | |
1047 points(LL,pch=18,cex=.8,...) | |
1048 } | |
1049 | |
1050 | |
1051 | |
1052 | |
1053 get_cluster_info = function(index){ | |
1054 ## must be connected to hitsort database ( HITSORT) | |
1055 x = dbGetQuery(HITSORTDB, | |
1056 paste0("SELECT * FROM cluster_info WHERE [index]=",index)) | |
1057 return(as.list(x)) | |
1058 } | |
1059 | |
1060 get_supercluster_info = function(sc){ | |
1061 ## must be connected to hitsort database ( HITSORT) | |
1062 x = dbGetQuery(HITSORTDB, | |
1063 paste0("SELECT * FROM cluster_info WHERE supercluster=",sc)) | |
1064 return(x) | |
1065 } | |
1066 | |
1067 get_supercluster_summary = function(sc){ | |
1068 info = get_supercluster_info(sc) | |
1069 Nclusters = nrow(info) | |
1070 Nreads = supercluster_size(sc) | |
1071 N = dbGetQuery(SEQDB, "SELECT count(*) from sequences") | |
1072 proportion = Nreads/N | |
1073 cluster_list = info$index | |
1074 return(list( | |
1075 Nreads = Nreads, | |
1076 Nclusters = Nclusters, | |
1077 proportion = proportion, | |
1078 cluster_list = cluster_list | |
1079 )) | |
1080 } | |
1081 | |
1082 | |
1083 get_cluster_connection_info = function(index, search_type=c("pair", "similarity")){ | |
1084 search_type = match.arg(search_type) | |
1085 if (search_type == "pair"){ | |
1086 x = dbGetQuery(HITSORTDB, | |
1087 sprintf( | |
1088 "SELECT * FROM cluster_mate_connections WHERE c1==%d OR c2=%d", | |
1089 index, index) | |
1090 ) | |
1091 if (nrow(x) == 0 ){ | |
1092 ## no connections | |
1093 return(NULL) | |
1094 } | |
1095 other_cl = ifelse(x$c1 == index, x$c2, x$c1) | |
1096 out = data.frame(cl = other_cl, N = x$N) | |
1097 out$ids = unlist(strsplit(x$ids, split = ", ")) | |
1098 k = dbGetQuery(HITSORTDB, | |
1099 sprintf( | |
1100 "SELECT * FROM cluster_mate_bond WHERE c1==%d OR c2=%d", | |
1101 index, index) | |
1102 ) | |
1103 if (k$c1 != x$c1 || k$c2 !=x$c2){ | |
1104 ## tables not in same order | |
1105 stop("tables cluster_mate_bond and cluster_mate_connection are not in the same order") | |
1106 } | |
1107 out$k = k$k | |
1108 return(out) | |
1109 }else{ | |
1110 x = dbGetQuery(HITSORTDB, | |
1111 sprintf( | |
1112 "SELECT * FROM cluster_connections WHERE c1==%d OR c2=%d", | |
1113 index, index) | |
1114 ) | |
1115 if (nrow(x) == 0 ){ | |
1116 ## no connections | |
1117 return(NULL) | |
1118 } | |
1119 other_cl = ifelse(x$c1 == index, x$c2, x$c1) | |
1120 out = data.frame(cl = other_cl, N = x$"count(*)") | |
1121 return(out) | |
1122 } | |
1123 } | |
1124 | |
1125 | |
1126 format_clinfo=function(x){ | |
1127 include = c( | |
1128 "size", | |
1129 "size_real", | |
1130 "ecount", | |
1131 "supercluster", | |
1132 "annotations_summary", | |
1133 "ltr_detection", | |
1134 "pair_completeness", | |
1135 "TR_score", | |
1136 "TR_monomer_length", | |
1137 "loop_index", | |
1138 "satellite_probability", | |
1139 "TR_consensus", | |
1140 "tandem_rank", | |
1141 "orientation_score" | |
1142 ) | |
1143 new_names = c( | |
1144 "TR_consensus" = "TAREAN consensus", | |
1145 "tandem_rank" = "TAREAN_annotation", | |
1146 "ltr_detection" = "PBS/LTR", | |
1147 'size' = 'number of reads in the graph', | |
1148 'size_real' = 'the total number of reads in the cluster', | |
1149 'ecount' = "number of edges of the graph:", | |
1150 "annotations_summary" = "similarity based annotation" | |
1151 ) | |
1152 values = unlist(x[include]) %>% gsub("\n","<br>", .) | |
1153 include = replace(include, match(names(new_names),include), new_names) | |
1154 names(values) = include | |
1155 ## replace tandem rank with annotation description | |
1156 values["TAREAN_annotation"] = RANKS_TANDEM[values["TAREAN_annotation"]] | |
1157 ## create HTML table without header | |
1158 desc = df2html(data.frame(names(values), values)) | |
1159 return(desc) | |
1160 } | |
1161 | |
1162 create_cluster_report = function(index, | |
1163 seqdb, hitsortdb, | |
1164 classification_hierarchy_file, | |
1165 HTML_LINKS){ | |
1166 ## index - index of cluster, integer | |
1167 ## seqdb, hitsortdb - sqlite databases | |
1168 ## classification_hierarchy_file - rds file with classification | |
1169 ## other information about cluster is collected from HITSORTDB | |
1170 connect_to_databases(seqdb, hitsortdb, classification_hierarchy_file) | |
1171 ## basic info about cluster | |
1172 clinfo = get_cluster_info(index) | |
1173 HTML_LINKS = nested2named_list(HTML_LINKS) | |
1174 PNGWIDTH = 1000 # this must be kept so than link in png work correctly | |
1175 PNGHEIGHT = 1000 | |
1176 LEGENDWIDTH = 300 | |
1177 PS = 30 ## pointsize for plotting | |
1178 MAX_N = 10 ## to show mates or similarities connections | |
1179 | |
1180 ################################################################################# | |
1181 ## create HTML title: | |
1182 title = paste("Cluster no. ",index) | |
1183 htmlheader = gsub("PAGE_TITLE", title, HTMLHEADER) | |
1184 cat(htmlheader, file = clinfo$html_report_main) | |
1185 file.copy(paste0(WD,"/style1.css"), clinfo$dir) | |
1186 HTML.title(title, file = clinfo$html_report_main) | |
1187 ## print basic info to HTML header: | |
1188 ## make link back to cluster table | |
1189 hwrite("Go back to cluster table <br><br>\n", link=HTML_LINKS$CLUSTER_TO_CLUSTER_TABLE) %>% | |
1190 cat(file = clinfo$html_report_main, append =TRUE) | |
1191 ## create ling to corresponding supercluster | |
1192 cat("Cluster is part of ", | |
1193 hwrite (paste("supercluster: ", clinfo$supercluster), | |
1194 link=sprintf(HTML_LINKS$CLUSTER_TO_SUPERCLUSTER, clinfo$supercluster)), | |
1195 "\n<br><br>\n", | |
1196 file = clinfo$html_report_main, append =TRUE) | |
1197 HTML.title("Cluster characteristics:", HR = 3, file = clinfo$html_report_main) | |
1198 cat(format_clinfo(clinfo), file = clinfo$html_report_main, append =TRUE) | |
1199 ################################################################################# | |
1200 ## add link to image with graph layout with domains | |
1201 fs = list( | |
1202 graph_domains = paste0(clinfo$html_report_files,"/graph_domains.png"), | |
1203 graph_mates = paste0(clinfo$html_report_files,"/graph_mates%04d_%04d.png"), | |
1204 graph_base = paste0(clinfo$html_report_files,"/graph_base.png"), | |
1205 graph_comparative = paste0(clinfo$html_report_files,"/graph_comparative.png") | |
1206 ) | |
1207 fs_relative = list( | |
1208 graph_domains = paste0(basename(clinfo$html_report_files),"/graph_domains.png"), | |
1209 graph_comparative = paste0(basename(clinfo$html_report_files),"/graph_comparative.png"), | |
1210 graph_mates = paste0(basename(clinfo$html_report_files),"/graph_mates%04d_%04d.png") | |
1211 ) | |
1212 | |
1213 dir.create(clinfo$html_report_files) | |
1214 load(clinfo$graph_file) | |
1215 annot = cluster_annotation(index) | |
1216 annot_summary = summarize_annotation(annot, clinfo$size_real) | |
1217 vertex_colors = annot2colors(annot,ids = V(GL$G)$name) | |
1218 | |
1219 color_proportions = sort(table(vertex_colors$color_table)) / clinfo$size_real | |
1220 ## hits with smaller proportion are not shown! | |
1221 exclude_colors = names(color_proportions)[color_proportions < 0.001] | |
1222 vertex_colors$color_table[vertex_colors$color_table %in% exclude_colors] = "#00000050" | |
1223 vertex_colors$legend = vertex_colors$legend[!vertex_colors$legend %in% exclude_colors] | |
1224 | |
1225 if (is_comparative()){ | |
1226 comp_codes = get_comparative_codes()$prefix | |
1227 colors = unique_colors[seq_along(comp_codes)] | |
1228 names(colors) = comp_codes | |
1229 species = substring(V(GL$G)$name,1, nchar(comp_codes[1])) | |
1230 ## create image with highlighted domains | |
1231 png(fs$graph_comparative, width = PNGWIDTH + LEGENDWIDTH, height = PNGHEIGHT, pointsize = PS) | |
1232 layout(matrix(1:2, ncol = 2), width = c(10, 3)) | |
1233 plotg(GL$G,GL$L, col = colors[species]) | |
1234 par(mar = c(0, 0, 0, 0)) | |
1235 plot.new() | |
1236 legend("topleft", col=colors, | |
1237 legend = names(colors), | |
1238 pch = 15, cex = 0.7) | |
1239 dev.off() | |
1240 HTML.title("comparative analysis:", HR=4, file = clinfo$html_report_main) | |
1241 html_insert_image( | |
1242 img_file = fs_relative$graph_comparative, | |
1243 htmlfile = clinfo$html_report_main) | |
1244 ## comparative analysis - calculate statistics: | |
1245 V1V2 = substring(get.edgelist(GL$G),1, nchar(comp_codes[1])) | |
1246 C_observed = table(data.frame( | |
1247 V1=factor(V1V2[,1],levels = comp_codes), | |
1248 V2=factor(V1V2[,2],levels = comp_codes) | |
1249 )) | |
1250 C_observed = as.data.frame.matrix(C_observed + t(C_observed)) | |
1251 P_observed = as.data.frame.matrix(C_observed/sum(C_observed)) | |
1252 Edge_propotions = table(factor(V1V2, levels = comp_codes))/(length(V1V2)) | |
1253 P_expected = matrix(Edge_propotions,ncol=1) %*% matrix(Edge_propotions, nrow=1) | |
1254 | |
1255 spec_counts = df2html( | |
1256 data.frame(table(factor(species, levels=comp_codes))), | |
1257 header = c("Species", "Read count") | |
1258 ) | |
1259 | |
1260 edge_count = df2html( | |
1261 data.frame(species = comp_codes, (C_observed/2)[,]), | |
1262 header = c("", comp_codes) | |
1263 ) | |
1264 | |
1265 P_OE = df2html( | |
1266 data.frame(species=comp_codes,(P_observed/P_expected[,])), | |
1267 header = c("", comp_codes) | |
1268 ) | |
1269 | |
1270 HTML.title("Comparative analysis - species read counts:", HR =5, file = clinfo$html_report_main) | |
1271 cat(spec_counts, | |
1272 file = clinfo$html_report_main, append =TRUE) | |
1273 | |
1274 # show connection only if more than one species in cluster | |
1275 if (length(unique(species))>1){ | |
1276 HTML.title("comparative analysis - number of edges between species:", | |
1277 HR =5, file = clinfo$html_report_main) | |
1278 cat(edge_count, | |
1279 file = clinfo$html_report_main, append = TRUE) | |
1280 | |
1281 HTML.title("comparative analysis - observed/expected number of edges between species", | |
1282 HR =5, file = clinfo$html_report_main) | |
1283 cat(P_OE, | |
1284 file = clinfo$html_report_main, append = TRUE) | |
1285 } | |
1286 | |
1287 } | |
1288 | |
1289 | |
1290 ## create image with highlighted domains | |
1291 png(fs$graph_domains, width = PNGWIDTH + LEGENDWIDTH, height = PNGHEIGHT, pointsize = PS) | |
1292 layout(matrix(1:2, ncol = 2), width = c(10, 3)) | |
1293 plotg(GL$G,GL$L, col = vertex_colors$color_table) | |
1294 domains_detected = length(vertex_colors$legend) > 0 | |
1295 par(mar = c(0, 0, 0, 0)) | |
1296 plot.new() | |
1297 if (domains_detected){ | |
1298 # domains found | |
1299 legend("topleft", col=vertex_colors$legend, | |
1300 legend = names(vertex_colors$legend), | |
1301 pch = 15, cex = 0.7) | |
1302 } | |
1303 dev.off() | |
1304 | |
1305 HTML.title("protein domains:", HR=4, file = clinfo$html_report_main) | |
1306 if (!domains_detected){ | |
1307 HTML("No protein domains detected", file = clinfo$html_report_main) | |
1308 } | |
1309 HTML("protein domains:", HR=4, file = clinfo$html_report_main) | |
1310 html_insert_image( | |
1311 img_file = fs_relative$graph_domains, | |
1312 htmlfile = clinfo$html_report_main) | |
1313 | |
1314 ############################################################################# | |
1315 if (nrow(annot_summary) == 0){ | |
1316 HTML.title("Reads annotation summary", HR = 3, file = clinfo$html_report_main) | |
1317 HTML("No similarity hits to repeat databases found", file = clinfo$html_report_main) | |
1318 }else{ | |
1319 HTML(annot_summary, file = clinfo$html_report_main, align = "left") | |
1320 } | |
1321 | |
1322 | |
1323 ## similarity and mate cluster | |
1324 mate_clusters = get_cluster_connection_info(index, search_type="pair") | |
1325 similar_clusters = get_cluster_connection_info(index, search_type="similarity") | |
1326 ## report mate and similarity clusters | |
1327 if (!is.null(similar_clusters)){ | |
1328 HTML.title("clusters with similarity:", file =clinfo$html_report_main, HR = 3) | |
1329 cat(df2html( | |
1330 similar_clusters, | |
1331 header=c("Cluster","Number of similarity hits"), | |
1332 sort_col = "N", scroling = TRUE | |
1333 ), | |
1334 file =clinfo$html_report_main, append=TRUE) | |
1335 } | |
1336 if (!is.null(mate_clusters)){ | |
1337 HTML.title("clusters connected through mates:", file =clinfo$html_report_main, HR = 3) | |
1338 cat(df2html( | |
1339 mate_clusters[,c('cl','N','k')], | |
1340 header = c('Cluster','Number of shared<br> read pairs','k'), | |
1341 sort_col = "N", scroling = TRUE | |
1342 ), | |
1343 file = clinfo$html_report_main,append = TRUE | |
1344 ) | |
1345 | |
1346 ## create base graph images - it will serve as background for | |
1347 ## mate clusters plots | |
1348 png(fs$graph_base, | |
1349 width = PNGWIDTH, height = PNGHEIGHT, pointsize = PS) | |
1350 par(mar=c(0,0,0,0),xaxs="i",yaxs="i") | |
1351 plotg(GL$G,GL$L, col = "#00000050") | |
1352 dev.off() | |
1353 ## load base as raster image | |
1354 base_image = readPNG(fs$graph_base) | |
1355 | |
1356 for (i in order(mate_clusters$N, decreasing = TRUE)){ | |
1357 mate_ids = unlist(strsplit(mate_clusters$ids[[i]],split=",")) | |
1358 ## print only graph above MAX_N mates | |
1359 if (length(mate_ids) < MAX_N){ # TODO - use constant | |
1360 next | |
1361 } | |
1362 png(sprintf(fs$graph_mates,index, mate_clusters$cl[i]), | |
1363 width = PNGWIDTH, height = PNGHEIGHT, pointsize = PS) | |
1364 color_mate = gsub(".$","",V(GL$G)$name) %in% mate_ids %>% | |
1365 ifelse("#FF0000FF", "#000000AA") | |
1366 par(mar=c(0,0,0,0),xaxs="i",yaxs="i") | |
1367 plot(range(GL$L[,1]), range(GL$L[,2]), type = "n", xlab = "", ylab = "", axes = FALSE, | |
1368 main = paste0("CL",index," ----> CL",mate_clusters$cl[i])) | |
1369 rasterImage(base_image, | |
1370 range(GL$L[,1])[1], range(GL$L[,2])[1], | |
1371 range(GL$L[,1])[2], range(GL$L[,2])[2] | |
1372 ) | |
1373 points(GL$L[,1:2], col = color_mate,pch=18,cex=.8) | |
1374 dev.off() | |
1375 title = paste0("CL",index," ----> CL",mate_clusters$cl[i]) | |
1376 footer = paste0("No. of shared pairs: :", mate_clusters$N[i]) | |
1377 html_insert_floating_image( | |
1378 img_file = sprintf(fs_relative$graph_mates, index, mate_clusters$cl[i]), | |
1379 htmlfile = clinfo$html_report_main, width = 200, | |
1380 title = title, footer = footer | |
1381 ) | |
1382 } | |
1383 } | |
1384 } |