comparison lib/create_annotation.R @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1d1b9e1b2e2f
1 #!/usr/bin/env Rscript
2 Sys.setlocale("LC_CTYPE", "en_US.UTF-8") # this is necessary for handling unicode characters (data.tree package)
3 suppressPackageStartupMessages(library(data.tree))
4 suppressPackageStartupMessages(library(stringr))
5 suppressPackageStartupMessages(library(R2HTML))
6 suppressPackageStartupMessages(library(hwriter))
7 suppressPackageStartupMessages(library(DT))
8 suppressPackageStartupMessages(library(tools))
9 suppressPackageStartupMessages(library(scales))
10 suppressPackageStartupMessages(library(igraph))
11 suppressPackageStartupMessages(library(plotrix))
12 suppressPackageStartupMessages(library(png))
13
14 source("htmlheader.R")
15 source("config.R") # load tandem ranks info
16 source("utils.R")
17 DT_OPTIONS = options = list(pageLength = 1000, lengthMenu = c(10,50,100,1000,5000,10000))
18 WD = getwd() # to get script directory when run from Rserve
19 HTMLHEADER = htmlheader ## header (character) loaded from htmlheader.R
20 htmlheader = gsub("Superclusters summary","TAREAN summary", htmlheader)
21
22 evaluate_LTR_detection = function(f){
23 NO_LTR=NULL
24 if (length(readLines(f)) == 11 ){
25 return(NO_LTR)
26 }
27 df=read.table(f,as.is=TRUE,sep="\t", skip=11,fill=TRUE)
28 if (ncol(df) != 23){
29 #df is smaller if no pbs is detected!
30 return(NO_LTR)
31 }
32 df=df[!df$V13 == "",,drop=FALSE]
33 if (nrow(df)==0){
34 return(NO_LTR)
35 }
36 # criteria:
37 df_part=df[df$V15 >=12 & df$V20 == 23 & df$V21<df$V20,,drop=FALSE]
38 if (nrow(df_part) == 0){
39 return(NO_LTR)
40 }
41 PBS_type = gsub("_","",(str_extract_all(df_part$V13, pattern="_([A-Z][a-z]{2})", simplify=TRUE))) %>%
42 paste(collapse=" ")
43 return(PBS_type)
44 }
45
46
47
48 ## annotate superclusters
49 select_reads_id = function(index, search_type = c("cluster","supercluster")) {
50 ## select read if base on the supecluster index need database connection
51 ## HITSORTDB!
52 search_type = match.arg(search_type)
53 x = dbGetQuery(HITSORTDB,
54 paste0("SELECT vertexname FROM vertices WHERE vertexindex IN ",
55 "(SELECT vertexindex FROM communities ",
56 "WHERE ", search_type,"=\"", index,
57 "\")"))
58 return(x$vertexname)
59 }
60
61
62 get_reads_annotation = function(reads_id) {
63 ## select annotation from tables in SEQDB which has name in format *_database
64 annot_types = grep("_database", dbListTables(SEQDB), value = TRUE)
65 annot = list()
66 for (i in annot_types) {
67 query = paste0("SELECT * FROM ", i, " WHERE name IN (", paste0("\"", reads_id,
68 "\"", collapse = ", "), ")")
69 annot[[i]] = dbGetQuery(SEQDB, query)
70 }
71 return(annot)
72 }
73
74 supercluster_size = function(supercluster) {
75 x = dbGetQuery(HITSORTDB, paste0("SELECT count(*) FROM vertices WHERE vertexindex IN ",
76 "(SELECT vertexindex FROM communities ", "WHERE supercluster=\"", supercluster,
77 "\")"))
78 return(x$"count(*)")
79 }
80
81
82 cluster_annotation = function(cluster, search_type = c("cluster", "supercluster")){
83 ## searcheither for cluster or supercluster annotation
84 ## read annotation from sqlite databases database is access though SEQDB (sequence
85 ## annotation) and HITSORTDB - clustering information
86 search_type = match.arg(search_type)
87 reads_id = select_reads_id(cluster, search_type)
88 annot = get_reads_annotation(reads_id)
89 return(annot)
90 }
91
92 get_tarean_info = function(cluster, search_type = c("cluster", "supercluster")){
93 search_type = match.arg(search_type)
94 if (search_type == "cluster") {
95 search_type = "[index]"
96 }
97 tarean_info = dbGetQuery(HITSORTDB,
98 paste0(
99 "SELECT [index], supercluster, satellite_probability, tandem_rank, size_real, satellite FROM cluster_info WHERE ",
100 search_type, " = ", cluster))
101 nhits = sum(tarean_info$size_real[tarean_info$tandem_rank %in% 1:2])
102 proportion = nhits/sum(tarean_info$size_real)
103 return( list(
104 nhits = nhits,
105 proportion = proportion,
106 tarean_info = tarean_info)
107 )
108
109 }
110
111 get_ltr_info = function(supercluster){
112 ltr_info = dbGetQuery(HITSORTDB,
113 paste0(
114 "SELECT [index], supercluster, ltr_detection, size_real FROM cluster_info WHERE ",
115 "supercluster", " = ", supercluster))
116 return(ltr_info)
117 }
118
119 summarize_annotation = function(annot, size = NULL){
120 db_id = do.call(rbind, annot)$db_id
121 cl_string = gsub("^.+/","",gsub(":.+$", "", gsub("^.+#", "", db_id)))
122 weight = do.call(rbind, annot)$bitscore
123 domain = str_replace(str_extract(str_replace(db_id, "^.+#", ""), ":.+"), ":",
124 "")
125 domain[is.na(domain)] = ""
126 domain_table = table(cl_string, domain)
127 dt = as.data.frame(domain_table)
128 ## remove no count
129 dt = dt[dt$Freq != 0, ,drop = FALSE]
130 rownames(dt) = paste(dt$cl_string,dt$domain)
131 if (!is.null(size)){
132 dt$proportion = dt$Freq / size
133
134 }
135 return(dt)
136 }
137
138 annot2colors = function(annot, ids, base_color = "#00000050"){
139 annot_table = do.call(rbind, annot)
140 domains = str_replace(str_extract(str_replace(annot_table$db_id, "^.+#", ""), ":.+"), ":","")
141 other_annot = str_replace(annot_table$db_id, "^.+#", "")
142 complete_annot = ifelse(is.na(domains), other_annot, domains)
143 names(complete_annot) = annot_table$name
144 unique_complete_annot = names (table(complete_annot) %>% sort(decreasing = TRUE))
145 color_key = unique_colors[seq_along(unique_complete_annot)]
146 names(color_key) = unique_complete_annot
147 color_table = rep(base_color, length(ids))
148 names(color_table) = ids
149 color_table[names(complete_annot)] = color_key[complete_annot]
150 return(list( color_table = color_table, legend = color_key))
151 }
152
153
154 read_annotation_to_tree = function(supercluster, TREE_TEMPLATE = CLS_TREE) {
155 ## CLS_TREE is datatree containing 'taxonomy' information of repeats, attribute of
156 ## each node/ leave if filled up from annotation annotation is added to leaves
157 ## only!! Inner node are NA or NULL
158 annot0 = cluster_annotation(supercluster, search_type = "supercluster")
159 ## keep only built in annotation
160 annot = list(protein_databse = annot0$protein_database, dna_database = annot0$dna_database)
161 annot$dna_database$bitscore = annot$dna_database$bitscore / 2 #DNA bitscore corection, blastx - more weight
162 annot = do.call(rbind, annot)
163 ## remove duplicated hits, keep best
164 annot_clean = annot[order(annot$bitscore, decreasing = TRUE), ]
165 annot_clean = annot_clean[!duplicated(annot_clean$name), ]
166 tarean_info = get_tarean_info(supercluster, search_type = "supercluster")
167 ltr_info = get_ltr_info(supercluster)
168 db_id = annot_clean$db_id
169 cl_string = gsub(":.+$", "", gsub("^.+#", "", db_id))
170 weight = annot_clean$bitscore
171 domain = str_replace(str_extract(str_replace(db_id, "^.+#", ""), ":.+"), ":",
172 "")
173 domain[is.na(domain)] = "NA"
174 mean_weight_table = by(weight, INDICES = list(cl_string, domain), FUN = function(x) signif(mean(x)))
175 mean_weight_table[is.na(mean_weight_table)] = 0
176 total_weight_table = by(weight, INDICES = list(cl_string, domain), FUN = sum)
177 total_weight_table[is.na(total_weight_table)] = 0
178 cl_table = table(cl_string)
179 domain_table = table(cl_string, domain)
180 cls_tree = Clone(TREE_TEMPLATE)
181 cls_tree$size = supercluster_size(supercluster)
182 cls_tree$ltr_info = ltr_info
183 for (i in cls_tree$leaves) {
184 if (i$full_name %in% names(cl_table)) {
185 i$nhits = cl_table[[i$full_name]]
186 i$domains = domain_table[i$full_name, ]
187 names(i$domains) = colnames(domain_table)
188 i$mean_weight = mean_weight_table[i$full_name, ]
189 i$total_weight = total_weight_table[i$full_name, ]
190 i$proportion = signif(i$nhits/cls_tree$size, 3)
191 } else {
192 if (i$name == "satellite"){
193 i$nhits = tarean_info$nhits
194 i$tandem_rank = tarean_info$tarean_info$tandem_rank
195 i$proportion = tarean_info$proportion
196 i$tarean_info = tarean_info$tarean_info
197 }else{
198 i$proportion = 0
199 }
200 }
201 }
202 ## create domain string for printing:
203 for (i in Traverse(cls_tree)) {
204 if (is.null(i$domains)) {
205 i$features = ""
206 } else if (sum(i$domains) == 0) {
207 i$features = ""
208 } else {
209 dom = i$domains[!names(i$domains) == "NA"]
210 i$features = gsub(" *\\(\\)", "", pasteDomains(dom))
211 }
212 }
213 ## add LTR info:
214 if (any(!cls_tree$ltr_info$ltr_detection %in% "None")){
215 tr = FindNode(cls_tree, "LTR")
216 tr$features = "LTR/PBS"
217 }
218 return(cls_tree)
219 }
220
221 pasteDomains = function(dom){
222 d = dom[dom != 0]
223 if (length(d) == 0){
224 return("")
225 }else{
226 paste0(d, " (", names(d), ")", sep = ", ", collapse="")
227 }
228
229 }
230
231 rescale = function(x, newrange) {
232 # taken from plotrix package
233 if (missing(x) | missing(newrange)) {
234 usage.string <- paste("Usage: rescale(x,newrange)\n", "\twhere x is a numeric object and newrange is the new min and max\n",
235 sep = "", collapse = "")
236 stop(usage.string)
237 }
238 if (is.numeric(x) && is.numeric(newrange)) {
239 xna <- is.na(x)
240 if (all(xna))
241 return(x)
242 if (any(xna))
243 xrange <- range(x[!xna]) else xrange <- range(x)
244 if (xrange[1] == xrange[2])
245 return(x)
246 mfac <- (newrange[2] - newrange[1])/(xrange[2] - xrange[1])
247 return(newrange[1] + (x - xrange[1]) * mfac)
248 } else {
249 warning("Only numeric objects can be rescaled")
250 return(x)
251 }
252 }
253
254 add_value_to_nodes = function(tr, value_names = c("nhits", "domains", "total_weight",
255 "proportion")) {
256 ## propagate values from leaves to nodes, assume that nodes are not defined
257 ## (either NA or NULL) leaves coud be be either numeric of list, if list values
258 ## are summed up based on the names
259 for (vn in value_names) {
260 for (i in Traverse(tr)) {
261 w = add_leaves_value(i$leaves, vn)
262 i[[vn]] = w
263 }
264 }
265 }
266
267 add_leaves_value = function(x, name) {
268 ## check if annotation is multivalue or single value, propagates values from
269 ## leaves to root
270 n = unique(unlist(lapply(lapply(x, "[[", name), names)))
271 if (is.null(n)) {
272 ## no names given, we can sum everything to one value
273 total = sum(unlist(sapply(x, "[[", name)), na.rm = TRUE)
274 return(total)
275 } else {
276 total = numeric(length(n))
277 names(total) = n
278 xvals = lapply(x, "[[", name)
279 N = 0
280 for (i in xvals) {
281 for (j in names(i)) {
282 total[j] = total[j] + i[j]
283 N = N + 1
284 }
285 }
286 return(total)
287 }
288 }
289
290
291 trmap = function(tr, xl = 0, yb = 0, xr = 1, yt = 1, first = TRUE, vertical = TRUE) {
292 # treemap for data.tree plotting - experimental
293 if (first) {
294 plot(xl:xr, yb:yt, type = "n", axes = FALSE, xlab = "", ylab = "")
295 }
296 if (tr$nhits > 0) {
297 width = 2 * (6 - tr$level)
298 rect(xl, yb, xr, yt, col = "#00008805", lwd = width, border = "#00000080")
299 if (tr$level != 1) {
300 text(x = (xl + xr)/2, y = (yb + yt)/2, labels = tr$name, cex = width/4)
301 }
302 } else {
303 return(NULL)
304 }
305 Nchildren = length(tr$children)
306 cat("children:", tr$name, tr$level, "\n")
307 if (Nchildren == 0) {
308 return(NULL)
309 }
310 sizes = sapply(tr$children, "[[", "nhits")
311 rng = c(sizes, 0) / sum(sizes)
312 if (vertical) {
313 ## x does not change
314 xl2 = rep(xl, Nchildren)
315 xr2 = rep(xr, Nchildren)
316 yb2 = rescale(rng, c(yb, yt))[1:(Nchildren + 1)]
317 yt2 = rescale(rng, c(yb, yt))[-1]
318 } else {
319 yb2 = rep(yb, Nchildren)
320 yt2 = rep(yt, Nchildren)
321 xr2 = rescale(rng, c(xl, xr))[1:(Nchildren + 1)]
322 xl2 = rescale(rng, c(xl, xr))[-1]
323 }
324 for (i in 1:Nchildren) {
325 trmap(tr$children[[i]], xl2[i], yb2[i], xr2[i], yt2[i], first = FALSE, vertical = !vertical)
326 }
327
328 }
329
330 filter_tree = function(tr) {
331 ## keep only nodes with positive nhits values
332 ## must be used on trees where leaves were already propagated
333 ## using add_values_to_nodes
334 tr_filtered = Clone(tr)
335 Prune(tr_filtered, function(x) x$nhits > 0 | containLTR(x))
336 return(tr_filtered)
337 }
338
339 containLTR = function(tr){
340 for (i in Traverse(tr)){
341 if (i$features == "LTR/PBS"){
342 return(TRUE)
343 }
344 }
345 return(FALSE)
346 }
347
348 filter_tree2 = function(tr) {
349 tr_filtered = Clone(tr)
350 Prune(tr_filtered, function(x) x$parent$nhits > 0)
351 return(tr_filtered)
352 }
353
354 ## this is for testing purposesm in futurem each node will have its function
355 ## named find_best_hit, it will work like decisin tree.
356 ## functions will be set manually based on training data
357 formatWidth = function(x, w){
358 if (length (dim(x)) > 1){
359 for (i in seq_along(x)){
360 delta = nchar(x[,i], type="bytes") - nchar(x[,i], type="char")
361 ## delta correction is neccessary for utf-8 correct formating
362 x[,i] = sprintf(paste0("%-",w[i] + delta,"s"), x[,i])
363 }
364 return(x)
365 }else{
366 delta = nchar(x, type="bytes") - nchar(x, type="char")
367 sprintf(paste0("%",w + delta,"s"),
368 x) %>% return
369 }
370 }
371
372
373 find_best_hit_repeat = function(cls_tree){
374 ## three children:
375 ## repeat
376 ## |--rDNA
377 ## |--satellite this need special handling, rank 1 or 2
378 ## |--mobile_element
379 ##
380 ## in this case we require that satellite has proportion 0.95
381 ## params of good hit
382 min_prop_of_positive = 0.7
383 min_prop_second_best = 0.9
384 min_proportion_x_nhits = 2.5 # based on 0.05 x 50
385 min_satellite_proportion = 0.95
386 if (isLeaf(cls_tree)) {
387 return(cls_tree)
388 }
389 cond1 = cls_tree$nhits * cls_tree$proportion < min_proportion_x_nhits
390 if (cond1){
391 return(cls_tree)
392 }
393 nhits = sapply(cls_tree$children, "[[", "nhits")
394 all_hits = cls_tree$root$nhits
395 name_of_best_hit = cls_tree$children[[which.max(nhits)]]$name
396 best_hit_child = cls_tree$children[[which.max(nhits)]]
397 second_max_hits = ifelse(length(nhits) == 1, 0, max(nhits[-which.max(nhits)]))
398 cond2 = max(nhits) / sum(nhits) > min_prop_of_positive
399 cond3 = max(nhits) / (second_max_hits + max(nhits)) > min_prop_second_best
400 cond_satellite = best_hit_child$proportion > min_satellite_proportion & name_of_best_hit == "satellite"
401 cond_other = ! name_of_best_hit == "satellite"
402 cat(cond2, cond3, cond_satellite, cond_other, "\n")
403 if (cond2 & cond3 & (cond_satellite | cond_other)) {
404 ## clear case satellite or other
405 if ("find_best_hit" %in% names(best_hit_child)){
406 ## custom rules for node is defined
407 best_hit = best_hit_child$find_best_hit(cls_tree$children[[which.max(nhits)]])
408 }else{
409 # use generic rules
410 best_hit = find_best_hit(cls_tree$children[[which.max(nhits)]])
411 }
412 }else{
413 ## do more specific tests for rDNA, or mobile element
414 ## rDNA o mobile_elements must be above threshold!
415 cond_sat_rank = any(cls_tree$satellite$tandem_rank == 2) & cls_tree$satellite$nhits != 0
416 cond_rdna = cls_tree$rDNA$nhits * cls_tree$rDNA$proportion > min_proportion_x_nhits
417 cond_mobile = cls_tree$mobile_element$nhits * cls_tree$mobile_element$proportion > min_proportion_x_nhits
418 cat(cond_sat_rank, "\n")
419 cat(cond_rdna,"\n")
420 cat(cond_mobile, "\n")
421 if (cond_sat_rank & (cond_rdna | cond_mobile) ){
422 cls_tree$satellite$nhits = 0
423 best_hit = find_best_hit(cls_tree)
424 }else{
425 return(cls_tree)
426 }
427 }
428 return(best_hit)
429 }
430
431 find_best_hit = function(cls_tree){
432 ## general params of good hit
433 min_prop_of_positive = 0.7
434 min_prop_second_best = 0.8
435 min_proportion_x_nhits = 2.5 # based on 0.05 x 50
436 if (isLeaf(cls_tree)) {
437 return(cls_tree)
438 }
439
440
441 cond1 = cls_tree$nhits * cls_tree$proportion < min_proportion_x_nhits
442
443 if (cond1){
444 ## return if proportions is lower than threshold
445 return(cls_tree)
446 }
447
448 nhits = sapply(cls_tree$children, "[[", "nhits")
449 best_hit_child = cls_tree$children[[which.max(nhits)]]
450 ## more special cases:
451 second_max_hits = ifelse(length(nhits) == 1, 0, max(nhits[-which.max(nhits)]))
452 cond2 = max(nhits) / sum(nhits) > min_prop_of_positive
453 cond3 = max(nhits) / (second_max_hits + max(nhits)) > min_prop_second_best
454 if (cond2 & cond3) {
455 if ("find_best_hit" %in% names(best_hit_child)){
456 ## custom rules for node is defined
457 best_hit = best_hit_child$find_best_hit(cls_tree$children[[which.max(nhits)]])
458 }else{
459 # use generic rules
460 best_hit = find_best_hit(cls_tree$children[[which.max(nhits)]])
461 }
462 } else {
463 return(cls_tree)
464 }
465 return(best_hit)
466 }
467
468
469 get_annotation_groups = function(tr){
470 tr_all = Clone(tr)
471 Prune(tr_all, pruneFun=function(...)FALSE) ## prune everithing -> keep root
472 tr_all$name = "Unclassified repeat (No evidence)"
473 tr_contamination = Clone(tr)$contamination
474 tr_organelle = Clone(tr)$organelle
475 tr_repeat = Clone(tr)$"repeat"
476 tr_repeat$name = "Unclassified_repeat (conflicting evidences)"
477 return(
478 list(
479 tr_repeat = tr_repeat,
480 tr_organelle = tr_organelle,
481 tr_all = tr_all,
482 tr_contamination = tr_contamination
483 )
484 )
485 }
486
487
488
489 format_tree = function(cls_tree, ...) {
490 df = ToDataFrameTree(cls_tree, ...)
491 ## try to get rid off unicode characters
492 df[,1] = gsub("\U00B0", "'", df[,1], fixed = TRUE, useBytes = TRUE)
493 df[,1] = gsub("\U00A6", "|", df[,1], fixed = TRUE, useBytes = TRUE)
494 colnames(df)[1] = " " ## originally levelName
495 if ("proportion" %in% colnames(df)){
496 df$proportion = signif(df$proportion,2)
497 }
498 if ("Proportion[%]" %in% colnames(df)){
499 df$"Proportion[%]" = round(df$"Proportion[%]", 2)
500 }
501 ## format header
502 w1 = apply(df, 2, function(x) max(nchar(x)))
503 w2 = nchar(colnames(df))
504 # use whatever with is bigger for formating
505 w = ifelse(w1 > w2, w1, w2)
506
507 out = character()
508 header = mapply(FUN = function(x, y) formatWidth(x, w = y), colnames(df), w) %>%
509 paste0(collapse=" | ")
510 hr_line = gsub(".","-",header)
511 # create output
512 # classification lines
513 class_lines = formatWidth(df, w) %>%
514 apply(1, FUN = function(x) paste0(x,collapse = " | ")) %>%
515 paste(collapse = "\n")
516 paste(
517 header,
518 hr_line,
519 class_lines,
520 sep="\n") %>% return
521 }
522
523
524 make_final_annotation_template = function(
525 annot_attributes = list(
526 Nreads = 0,
527 Nclusters = 0,
528 Nsuperclusters = 0,
529 "Proportion[%]" = 0,
530 cluster_list = c())
531 )
532 {
533 ct = Clone(CLS_TREE)
534 for (i in Traverse(ct)){
535 for (j in names(annot_attributes)){
536 i[[j]] = annot_attributes[[j]]
537 }
538 }
539 return(ct)
540 }
541
542
543 common_ancestor = function(tr1, tr2){
544 a1 = tr1$Get('name', traversal = "ancestor")
545 a2 = tr2$Get('name', traversal = "ancestor")
546 ancestor = intersect(a1,a2)[1]
547 return(ancestor)
548 }
549
550 create_all_superclusters_report = function(max_supercluster = 100,
551 paths,
552 libdir,
553 superclusters_dir,
554 seqdb, hitsortdb,
555 classification_hierarchy_file,
556 HTML_LINKS)
557 {
558 ## connect to sqlite databases
559 ## seqdb and hitsortdb are path to sqlite files
560 HTML_LINKS = nested2named_list(HTML_LINKS)
561 paths = nested2named_list(paths)
562 report_file = paths[['supercluster_report_html']]
563 ## create SEQDB, HTISORTDB and CLS_TREE
564 connect_to_databases(seqdb, hitsortdb, classification_hierarchy_file)
565
566 ## append specific classication rules
567 CLS_TREE$"repeat"$find_best_hit = find_best_hit_repeat
568
569 ###
570 cat("Superclusters summary\n", file = report_file)
571 cat("---------------------\n\n", file = report_file, append = TRUE)
572 empty = rep(NA,max_supercluster)
573 SC_table = data.frame(
574 Supercluster = 1 : max_supercluster, "Number of reads" = empty, Automatic_annotation = empty,
575 Similarity_hits = empty, TAREAN_annotation = empty,
576 Clusters = empty, stringsAsFactors = FALSE, check.names = FALSE
577 )
578 SC_csv = SC_table # table as spreadsheet - no html formatings
579 final_cluster_annotation = make_final_annotation_template()
580
581 for (sc in 1 : max_supercluster) {
582 cat("supercluster: ",sc,"\n")
583 cls_tree = read_annotation_to_tree(sc)
584 sc_summary = get_supercluster_summary(sc)
585 add_value_to_nodes(cls_tree)
586 cls_tree_filtered = filter_tree(cls_tree)
587 best_hit = find_best_hit(cls_tree)
588 ## exception to decision tree put here.
589 ## All -> class I ?
590 if (best_hit$name == "All"){
591 ## check LTR
592 if (any(!(unique(cls_tree$ltr_info$ltr_detection) %in% "None"))){
593 ## if LTR found - move classification to Class I
594
595 LTR = FindNode(best_hit, "LTR")
596 nhits_without_class_I = best_hit$nhits - LTR$nhits
597 prop_without_class_I = best_hit$proportion - LTR$proportion
598 cond3 = nhits_without_class_I * prop_without_class_I < 1
599 cond2 = best_hit$nhits * best_hit$proportion < 0.5 # other hits weak
600 cond1 = LTR$nhits >= (0.95 * best_hit$nhits) # hits are pre in sub class_I
601
602 if (cond1 | cond2 | cond3){
603 best_hit = LTR
604 }
605 }
606 }else{
607 ## check is conflict os best hit with LTR/PBS exists
608 if (any(!(unique(cls_tree$ltr_info$ltr_detection) %in% "None"))){
609 LTR = FindNode(cls_tree, "LTR")
610 ca_name = common_ancestor(LTR, best_hit)
611 if (ca_name != "LTR"){
612 ## reclassify
613 best_hit = FindNode(cls_tree, ca_name)
614 }
615 }
616 }
617
618 best_hit_name = best_hit$name
619 best_hit_path = paste(best_hit$path, collapse = "/")
620 ## add best hit do database:
621 sqlcmd = paste0(
622 "UPDATE cluster_info SET supercluster_best_hit = \"", best_hit_path, "\" ",
623 " WHERE supercluster = ", sc
624 )
625 dbExecute(HITSORTDB, sqlcmd)
626 # add annotation annotation summary stored in final_cluster_annotation - summing up
627 best_hit_node = FindNode(final_cluster_annotation, best_hit_name)
628 best_hit_node$Nsuperclusters = best_hit_node$Nsuperclusters + 1
629 best_hit_node$Nclusters = best_hit_node$Nclusters + sc_summary$Nclusters
630 best_hit_node$Nreads = best_hit_node$Nreads + sc_summary$Nreads
631 best_hit_node$"Proportion[%]" = best_hit_node$"Proportion[%]" + sc_summary$proportion*100
632 best_hit_node$cluster_list = append(best_hit_node$cluster_list, sc_summary$cluster_list)
633 best_hit_label = ifelse(best_hit_name == "All", "NA", best_hit_name)
634 SC_csv$Automatic_annotation[sc] = best_hit_label
635 SC_csv$"Number of reads"[sc] = SC_table$"Number of reads"[sc] = cls_tree$size
636 SC_csv$Similarity_hits[sc] = format_tree(cls_tree_filtered,
637 "nhits", "proportion", "features")
638 SC_table$Similarity_hits[sc] = format_tree(cls_tree_filtered,
639 "nhits", "proportion", "features") %>%
640 preformatted
641
642 SC_table$Automatic_annotation[sc] = best_hit_label
643
644 G = get_supercluster_graph(
645 sc=sc, seqdb = seqdb,hitsortdb = hitsortdb,
646 classification_hierarchy_file = classification_hierarchy_file
647 )
648 ## append tarean annotation
649 clist = paste(V(G)$name, collapse =",")
650 cl_rank = dbGetQuery(HITSORTDB,
651 paste("SELECT [index], tandem_rank FROM cluster_info WHERE [index] IN (",clist,
652 ") AND tandem_rank IN (1,2)"))
653 ## add information about TR clusters is any
654 if (nrow(cl_rank) > 0){
655 tarean_annot = paste(
656 cl_rank$index, "-",
657 RANKS_TANDEM[cl_rank$tandem_rank]
658 )
659 SC_csv$TAREAN_annotation[sc] = paste(tarean_annot,collapse="\n")
660 SC_table$TAREAN_annotation[sc] = mapply(
661 hwrite, tarean_annot,
662 link = sprintf(HTML_LINKS$ROOT_TO_TAREAN,cl_rank$index)) %>% paste(collapse="<br>")
663 }
664 ## add links to clusters
665 SC_csv$Clusters[sc] = paste((V(G)$name), collapse = ",")
666 links = mapply(hwrite, V(G)$name, link = sprintf(HTML_LINKS$ROOT_TO_CLUSTER, as.integer(V(G)$name)))
667 SC_table$Clusters[sc] = paste(links, collapse =", ")
668 create_single_supercluster_report(G, superclusters_dir)
669 }
670
671
672
673 ## add html links
674 SC_table$Supercluster = mapply(hwrite, SC_table$Supercluster,
675 link = sprintf(HTML_LINKS$ROOT_TO_SUPERCLUSTER,
676 SC_table$Supercluster))
677
678
679
680 write.table(SC_csv, file = paths[["superclusters_csv_summary"]],
681 sep = "\t", row.names = FALSE)
682
683
684 DT_instance = datatable(SC_table, escape = FALSE, options = DT_OPTIONS)
685 saveWidget(DT_instance, file = normalizePath(report_file),
686 normalizePath(libdir), selfcontained = FALSE)
687 add_preamble(normalizePath(report_file),
688 preamble='<h2>Supercluster annotation</h2> <p><a href="documentation.html#superclust"> For table legend see documentation. <a> </p>')
689
690 annot_groups = get_annotation_groups(final_cluster_annotation)
691 saveRDS(object=annot_groups, file = paths[['repeat_annotation_summary_rds']])
692 final_cluster_annotation_formated = paste(
693 sapply(annot_groups, format_tree, 'Proportion[%]', 'Nsuperclusters', 'Nclusters', "Nreads"),
694 collapse = "\n\n\n"
695 )
696 ## TODO add singleton counts, total counts and extra text - csv output
697 ## export annotation summary
698 html_summary = start_html(paths[['summarized_annotation_html']], gsub("PAGE_TITLE", "Repeat Annotation Summary", HTMLHEADER))
699 html_summary("Repeat annotation summary", HTML.title, HR=2)
700 html_summary("This table summarizes the automatic annotations of superclusters that should be verified and manually corrected if necessary. Thus, the table should not be used as the final output of the analysis without critical evaluation.", HTML.title, HR=3)
701 html_summary(preformatted(final_cluster_annotation_formated), cat)
702
703
704
705 return()
706 }
707 ## for testing
708 if (FALSE){
709 create_supercluster_report(1:100, report_file, seqdb, hitsortdb, class_file)
710 }
711
712 create_single_supercluster_report = function(G, superclusters_dir){
713 sc_dir = paste0(superclusters_dir, "/dir_SC",sprintf("%04d", G$name))
714 htmlfile = paste0(sc_dir, "/index.html")
715 dir.create(sc_dir)
716 title = paste("Supercluster no. ",G$name)
717 htmlheader = gsub("PAGE_TITLE", title, HTMLHEADER)
718 cat(htmlheader, file = htmlfile)
719 file.copy(paste0(WD,"/style1.css"), sc_dir)
720 HTML.title(title, file = htmlfile)
721 HTML("Simmilarity hits (only hits with proportion above 0.1% in at least one cluster are shown) ", file = htmlfile)
722 img_file = paste0(sc_dir,"/SC",sprintf("%0d",G$name), ".png")
723 png(filename = img_file, width = 1400, height=1200)
724 coords = plot_supercluster(G = G)
725 dev.off()
726 ## basename - make link relative
727 href = paste0("../../clusters/dir_CL",sprintf("%04d", as.numeric(V(G)$name)),"/index.html")
728 html_insert_image(basename(img_file), htmlfile,coords=coords, href = href)
729
730 if (is_comparative()){
731 HTML("Comparative analysis", file = htmlfile)
732 img_file = paste0(sc_dir,"/SC",sprintf("%0d",G$name), "comparative.png")
733 png(filename = img_file, width = 1400, height=1200)
734 coords = plot_supercluster(G = G, "comparative")
735 mtext("comparative analysis")
736 dev.off()
737 ## basename - make link relative
738 href = paste0("../../clusters/dir_CL",sprintf("%04d", as.numeric(V(G)$name)),"/index.html")
739 html_insert_image(basename(img_file), htmlfile,coords=coords, href = href)
740
741 }
742 }
743
744 html_insert_image = function(img_file, htmlfile, coords=NULL, href=NULL) {
745 img_tag = sprintf('<p> <img src="%s" usemap="#clustermap" > \n</p>\n<br>\n', img_file)
746 cat(img_tag, file = htmlfile, append = TRUE)
747 if (!is.null(coords)){
748 formated_links = character()
749 for (i in seq_along(href)){
750 formated_links[i] = sprintf('<area shape="circle" coords="%f,%f,%f" href="%s" >\n',
751 coords[i,1],coords[i,2],coords[,3],href[i])
752 }
753 cat('<map name="clustermap" >\n',
754 formated_links,"</map>\n", file=htmlfile, append = TRUE)
755
756 }
757 }
758
759 html_insert_floating_image = function(img_file,
760 htmlfile,
761 width=NULL,
762 title= "",
763 footer = ""
764 ){
765 if (is.null(width)){
766 width=""
767 }
768 tag = paste(
769 '<div class="floating_img">\n',
770 ' <p>', title,'</p>\n',
771 ' <a href=',img_file, ">",
772 ' <img src="',img_file, '" alt="image" width="',width,'">\n',
773 ' </a>',
774 ' <p> ',footer,'</p>\n',
775 '</div>\n'
776 ,sep = "")
777 cat(tag, file = htmlfile, append = TRUE)
778 }
779
780 get_supercluster_graph = function(sc, seqdb, hitsortdb, classification_hierarchy_file){
781 ## sc - superclusted index
782 ## seqdb, hitsortdb - path to sqlite databases
783 ## classificationn_hierarchy_file - path data.tree rds file
784 ## SIZE of image
785 SIZE = 1000
786 connect_to_databases(seqdb, hitsortdb, classification_hierarchy_file)
787 clusters = dbGetQuery(HITSORTDB,
788 paste0("SELECT cluster, size FROM superclusters WHERE supercluster=",sc)
789 ) %>% as.data.frame
790 ## string for sql query:
791 cluster_list = paste0( "(", paste0(clusters$cluster, collapse = ","),")")
792 supercluster_ncol = dbGetQuery(HITSORTDB,
793 paste("SELECT c1,c2,w, k FROM cluster_mate_bond where c1 IN ",
794 cluster_list, " AND c2 IN ", cluster_list, "AND k > 0.1"
795 )
796 )
797 if (nrow(supercluster_ncol)>0){
798 ## at least two clusters
799 G = graph.data.frame(supercluster_ncol, directed=FALSE)
800 L = layout_with_kk(G)
801 ## layout is rescaled for easier linking html
802 L[,1] = scales::rescale(L[,1], to = c(1,SIZE) )
803 L[,2] = scales::rescale(L[,2], to = c(1,SIZE) )
804 G$L = L
805 }else{
806 ## only one cluster in supercluster
807 G = graph.full(n = 1)
808 V(G)$name = as.character(clusters$cluster)
809 G$L = matrix(c(SIZE/2, SIZE/2), ncol=2)
810 }
811 G = set_vertex_attr(G, "label", value = paste0("CL",names(V(G))))
812 G$name=sc ## names is supercluster
813 # create annotation of individual clusters, will be attached as node attribudes
814 annot = get_cluster_annotation_summary(clusters)
815 ## annotation of nodes(clusters)
816 G = set_vertex_attr(G, "annotation", value = annot$clusters[names(V(G))])
817 G = set_vertex_attr(G, "size",
818 value = clusters$size[match(names(V(G)), as.character(clusters$cluster))])
819 G$annotation = annot$supercluster
820 G$clusters = clusters
821 # TODO if comparative analysis - add proportion of species
822 if (is_comparative()){
823 comparative_counts = get_cluster_comparative_counts(cluster_list)
824 NACluster = comparative_counts$supercluster
825 # some clusters do not have comparative data - they are bellow threshold!
826 NACluster[,] = NA
827 counts_adjusted = comparative_counts$clusters[names(V(G))]
828 for (i in names(V(G))){
829 if(is.null(counts_adjusted[[i]])){
830 ## get count from database
831 seqid = dbGetQuery(HITSORTDB,
832 paste(
833 "SELECT vertexname FROM vertex_cluster where cluster = ",
834 i)
835 )$vertexname
836 codes = get_comparative_codes()$prefix
837 x = table(factor(substring(seqid, 1,nchar(codes[1])), levels = codes))
838 counts_adjusted[[i]] = data.frame(
839 Freq = as.numeric(x),
840 proportion = as.numeric(x/sum(x)),
841 row.names = names(x))
842
843 }
844 }
845
846 G = set_vertex_attr(G, "comparative_counts",
847 value = counts_adjusted[V(G)$name]
848 )
849 # for whole supercluster
850 G$comparative = comparative_counts$supercluster
851 }
852
853 return(G)
854 }
855
856 get_cluster_comparative_counts = function(cluster_list){
857 counts = dbGetQuery(
858 HITSORTDB,
859 paste0("SELECT * FROM comparative_counts WHERE clusterindex IN",
860 cluster_list
861 )
862 )
863 comparative_counts = apply(counts, 1, function(x)
864 y = data.frame(Freq = x[-1], proportion = x[-1]/sum(x[-1]))
865 )
866 names(comparative_counts) = counts$clusterindex
867 total_counts = data.frame(
868 Freq = colSums(counts[,-1]),
869 proportion = colSums(counts[,-1])/sum(counts[,-1]))
870 return(
871 list(
872 clusters = comparative_counts,
873 supercluster = total_counts
874 )
875 )
876
877 }
878
879 get_cluster_annotation_summary = function(clusters){
880 ## clusters - table of clusters, col: cluster, size
881 annot_list = apply(clusters,1,FUN = function(x)summarize_annotation(cluster_annotation(x[1]),x[2]))
882 ## if empty - not annotation at all
883 if (sum(sapply(annot_list, nrow)) == 0){
884 return(list (clusters = NULL,
885 superclusters = NULL
886 ))
887 }
888 names(annot_list) = as.character(clusters$cluster)
889 annot_all = do.call(rbind,annot_list)
890 total = by(annot_all$Freq,INDICES=with(annot_all, paste(cl_string,domain)), FUN=sum, simplify=TRUE)
891 total_df = data.frame(Freq = c(total), proportion = c(total) /sum(clusters$size))
892 ## for ploting it need to contain all annot categories as in s upercluster
893 annot_list_full = list()
894 for (i in seq_along(annot_list)){
895 annot_list_full[[i]] = total_df
896 for (j in rownames(total_df)){
897 if (!j %in% rownames(annot_list[[i]])){
898 annot_list_full[[i]][j,c('Freq','proportion')] = c(0,0)
899 }else{
900 annot_list_full[[i]][j,c('Freq','proportion')] = annot_list[[i]][j,c('Freq','proportion')]
901 }
902 }
903 }
904 names(annot_list_full) = names(annot_list)
905 return(list (clusters = annot_list_full,
906 supercluster = total_df
907 )
908 )
909 }
910
911 plot_supercluster = function(G,color_scheme=c("annotation","comparative")){
912 ## create plot in coords y: 1-1200, x: 1 - 1400
913 ## this is fixed for href to work in exact image areas!
914 color_scheme = match.arg(color_scheme)
915 LIMS0 = matrix(c (1,1000,1,1000), ncol = 2)
916 OFFSET_H = 100; OFFSET_W = 200
917 lims = LIMS0 + c(0,OFFSET_W * 2, 0, OFFSET_H * 2)
918 ## use full range
919 par(mar=c(0,0,0,0),xaxs="i", yaxs="i")
920 ## proportion of repeats
921 if (color_scheme == "annotation"){
922 prop = sapply (V(G)$annotation, FUN = function(x)x$proportion)
923 brv = c("#BBBBBB",unique_colors)
924 }else{
925 prop = sapply (V(G)$comparative_counts, FUN = function(x)x$proportion)
926 brv = unique_colors
927 }
928 ## handle special cases - single cluster with annot or single annotation group
929 if (is.null(dim(prop))){
930 prop=matrix(prop, nrow = 1)
931 }
932 if (length(prop)==0){
933 ## special case - not annotation at all
934 prop = matrix(rep(1,vcount(G)),ncol = vcount(G), dimnames=list("NAN", NULL))
935 }else{
936 if (color_scheme == "annotation"){
937 rownames(prop) = rownames(G$annotation)
938 }else{
939 rownames(prop) = rownames(G$comparative)
940 }
941 ## reorder by prop
942 if (color_scheme =="annotation"){
943 prop = prop[order(prop[,1], decreasing = TRUE),,drop=FALSE]
944 NAN = 1 - colSums(prop)
945 prop = rbind(NAN, prop)
946 brv = c("#BBBBBB",unique_colors)
947 }else{
948 }
949 }
950 L = G$L + c(OFFSET_H)
951 ## for href - necessary convert coordinater - image is flipped vertically
952 include = rowSums(prop>0.001, na.rm = TRUE)>=1
953 include[1] = TRUE
954 prop = prop[include, , drop=FALSE]
955 plot(0,0,type='n', , xlim = lims[,1], ylim = lims[,2])
956 plot_edges(G, L, lwd = 8)
957 RADIUS = radius_size(G)
958 coords = cbind(G$L[,1,drop=FALSE] + 100, 1100 - G$L[,2,drop=FALSE], RADIUS)
959 pieScatter(L[,1], L[,2], t(prop), col = brv,
960 radius = RADIUS, new.plot = FALSE, border=NA)
961 ## add ledend
962 legend("topright", legend = rownames(prop),
963 col = brv, pch = 15, cex= 1.5, pt.cex= 3)
964 ## add cluster labels
965 text(x = L[,1], y = L[,2], labels = paste(V(G)$label,"\n",V(G)$size))
966 return(coords)
967 }
968
969
970 radius_size = function(G){
971 if (vcount(G) == 1) RADIUS=120
972 if (vcount(G) == 2) RADIUS=50
973 if (vcount(G) %in% 3:8) RADIUS=40
974 if (vcount(G) > 8) RADIUS=30
975 return(RADIUS)
976 }
977
978
979 plot_edges = function(G,L,col="#33000040", lwd = 1){
980 e = get.edgelist(G, names = F)
981 X0 = L[e[, 1], 1]
982 Y0 = L[e[, 1], 2]
983 X1 = L[e[, 2], 1]
984 Y1 = L[e[, 2], 2]
985 segments(X0, Y0, X1, Y1, lwd = lwd,col = col)
986 }
987
988
989 pieScatter=function(x, y, prop,radius=1, col=NULL, edges=100, new.plot = TRUE, ...){
990 if (new.plot){
991 plot(x,y,type='n')
992 }
993 for (i in seq_along(x)){
994 p=prop[i,]
995 if (length(radius)==1){
996 r=radius
997 }else{
998 r=radius[i]
999 }
1000 include = p != 0
1001 floating.pie(x[i], y[i],p[include],
1002 col=col[include], radius=r,edges=50,...)
1003 }
1004 }
1005
1006 unique_colors = c("#00FF00", "#0000FF", "#FF0000",
1007 "#FFA6FE", "#FFDB66", "#006401", "#010067", "#95003A",
1008 "#007DB5", "#FF00F6", "#FFEEE8", "#774D00", "#90FB92", "#01FFFE",
1009 "#0076FF", "#D5FF00", "#FF937E", "#6A826C", "#FF029D",
1010 "#FE8900", "#7A4782", "#7E2DD2", "#85A900", "#FF0056",
1011 "#A42400", "#00AE7E", "#683D3B", "#BDC6FF", "#263400",
1012 "#BDD393", "#00B917", "#9E008E", "#001544", "#C28C9F",
1013 "#FF74A3", "#01D0FF", "#004754", "#E56FFE", "#788231",
1014 "#0E4CA1", "#91D0CB", "#BE9970", "#968AE8", "#BB8800",
1015 "#43002C", "#DEFF74", "#00FFC6", "#FFE502", "#620E00",
1016 "#008F9C", "#98FF52", "#7544B1", "#B500FF", "#00FF78",
1017 "#FF6E41", "#005F39", "#6B6882", "#5FAD4E", "#A75740",
1018 "#A5FFD2", "#FFB167", "#009BFF", "#E85EBE")
1019
1020 if (FALSE){
1021 ## For testing
1022 for ( i in 1:100){
1023 G = get_supercluster_graph(
1024 sc=i, seqdb = seqdb,
1025 hitsortdb = hitsortdb, classification_hierarchy_file = class_file
1026 )
1027 png(paste0("/mnt/raid/users/petr/tmp/test.figure_",i,".png"), width = 1400, height=1000)
1028 plot_supercluster(G)
1029 dev.off()
1030 print(i)
1031 }
1032
1033 }
1034
1035 plotg = function(GG,LL,wlim=NULL,...){
1036 ## quick ploting function
1037 e = get.edgelist(GG, names=F)
1038 w = E(GG)$weight
1039 if (!is.null(wlim)) {e = e[w > wlim,]; w = w[w > wlim]}
1040 X0 = LL[e[,1],1]
1041 Y0 = LL[e[,1],2]
1042 X1 = LL[e[,2],1]
1043 Y1 = LL[e[,2],2]
1044 plot(range(LL[,1]),range(LL[,2]),xlab="",ylab="",axes=FALSE,type="n",...)
1045 brv = 'grey'
1046 segments(X0,Y0,X1,Y1,lwd=.5,col=brv)
1047 points(LL,pch=18,cex=.8,...)
1048 }
1049
1050
1051
1052
1053 get_cluster_info = function(index){
1054 ## must be connected to hitsort database ( HITSORT)
1055 x = dbGetQuery(HITSORTDB,
1056 paste0("SELECT * FROM cluster_info WHERE [index]=",index))
1057 return(as.list(x))
1058 }
1059
1060 get_supercluster_info = function(sc){
1061 ## must be connected to hitsort database ( HITSORT)
1062 x = dbGetQuery(HITSORTDB,
1063 paste0("SELECT * FROM cluster_info WHERE supercluster=",sc))
1064 return(x)
1065 }
1066
1067 get_supercluster_summary = function(sc){
1068 info = get_supercluster_info(sc)
1069 Nclusters = nrow(info)
1070 Nreads = supercluster_size(sc)
1071 N = dbGetQuery(SEQDB, "SELECT count(*) from sequences")
1072 proportion = Nreads/N
1073 cluster_list = info$index
1074 return(list(
1075 Nreads = Nreads,
1076 Nclusters = Nclusters,
1077 proportion = proportion,
1078 cluster_list = cluster_list
1079 ))
1080 }
1081
1082
1083 get_cluster_connection_info = function(index, search_type=c("pair", "similarity")){
1084 search_type = match.arg(search_type)
1085 if (search_type == "pair"){
1086 x = dbGetQuery(HITSORTDB,
1087 sprintf(
1088 "SELECT * FROM cluster_mate_connections WHERE c1==%d OR c2=%d",
1089 index, index)
1090 )
1091 if (nrow(x) == 0 ){
1092 ## no connections
1093 return(NULL)
1094 }
1095 other_cl = ifelse(x$c1 == index, x$c2, x$c1)
1096 out = data.frame(cl = other_cl, N = x$N)
1097 out$ids = unlist(strsplit(x$ids, split = ", "))
1098 k = dbGetQuery(HITSORTDB,
1099 sprintf(
1100 "SELECT * FROM cluster_mate_bond WHERE c1==%d OR c2=%d",
1101 index, index)
1102 )
1103 if (k$c1 != x$c1 || k$c2 !=x$c2){
1104 ## tables not in same order
1105 stop("tables cluster_mate_bond and cluster_mate_connection are not in the same order")
1106 }
1107 out$k = k$k
1108 return(out)
1109 }else{
1110 x = dbGetQuery(HITSORTDB,
1111 sprintf(
1112 "SELECT * FROM cluster_connections WHERE c1==%d OR c2=%d",
1113 index, index)
1114 )
1115 if (nrow(x) == 0 ){
1116 ## no connections
1117 return(NULL)
1118 }
1119 other_cl = ifelse(x$c1 == index, x$c2, x$c1)
1120 out = data.frame(cl = other_cl, N = x$"count(*)")
1121 return(out)
1122 }
1123 }
1124
1125
1126 format_clinfo=function(x){
1127 include = c(
1128 "size",
1129 "size_real",
1130 "ecount",
1131 "supercluster",
1132 "annotations_summary",
1133 "ltr_detection",
1134 "pair_completeness",
1135 "TR_score",
1136 "TR_monomer_length",
1137 "loop_index",
1138 "satellite_probability",
1139 "TR_consensus",
1140 "tandem_rank",
1141 "orientation_score"
1142 )
1143 new_names = c(
1144 "TR_consensus" = "TAREAN consensus",
1145 "tandem_rank" = "TAREAN_annotation",
1146 "ltr_detection" = "PBS/LTR",
1147 'size' = 'number of reads in the graph',
1148 'size_real' = 'the total number of reads in the cluster',
1149 'ecount' = "number of edges of the graph:",
1150 "annotations_summary" = "similarity based annotation"
1151 )
1152 values = unlist(x[include]) %>% gsub("\n","<br>", .)
1153 include = replace(include, match(names(new_names),include), new_names)
1154 names(values) = include
1155 ## replace tandem rank with annotation description
1156 values["TAREAN_annotation"] = RANKS_TANDEM[values["TAREAN_annotation"]]
1157 ## create HTML table without header
1158 desc = df2html(data.frame(names(values), values))
1159 return(desc)
1160 }
1161
1162 create_cluster_report = function(index,
1163 seqdb, hitsortdb,
1164 classification_hierarchy_file,
1165 HTML_LINKS){
1166 ## index - index of cluster, integer
1167 ## seqdb, hitsortdb - sqlite databases
1168 ## classification_hierarchy_file - rds file with classification
1169 ## other information about cluster is collected from HITSORTDB
1170 connect_to_databases(seqdb, hitsortdb, classification_hierarchy_file)
1171 ## basic info about cluster
1172 clinfo = get_cluster_info(index)
1173 HTML_LINKS = nested2named_list(HTML_LINKS)
1174 PNGWIDTH = 1000 # this must be kept so than link in png work correctly
1175 PNGHEIGHT = 1000
1176 LEGENDWIDTH = 300
1177 PS = 30 ## pointsize for plotting
1178 MAX_N = 10 ## to show mates or similarities connections
1179
1180 #################################################################################
1181 ## create HTML title:
1182 title = paste("Cluster no. ",index)
1183 htmlheader = gsub("PAGE_TITLE", title, HTMLHEADER)
1184 cat(htmlheader, file = clinfo$html_report_main)
1185 file.copy(paste0(WD,"/style1.css"), clinfo$dir)
1186 HTML.title(title, file = clinfo$html_report_main)
1187 ## print basic info to HTML header:
1188 ## make link back to cluster table
1189 hwrite("Go back to cluster table <br><br>\n", link=HTML_LINKS$CLUSTER_TO_CLUSTER_TABLE) %>%
1190 cat(file = clinfo$html_report_main, append =TRUE)
1191 ## create ling to corresponding supercluster
1192 cat("Cluster is part of ",
1193 hwrite (paste("supercluster: ", clinfo$supercluster),
1194 link=sprintf(HTML_LINKS$CLUSTER_TO_SUPERCLUSTER, clinfo$supercluster)),
1195 "\n<br><br>\n",
1196 file = clinfo$html_report_main, append =TRUE)
1197 HTML.title("Cluster characteristics:", HR = 3, file = clinfo$html_report_main)
1198 cat(format_clinfo(clinfo), file = clinfo$html_report_main, append =TRUE)
1199 #################################################################################
1200 ## add link to image with graph layout with domains
1201 fs = list(
1202 graph_domains = paste0(clinfo$html_report_files,"/graph_domains.png"),
1203 graph_mates = paste0(clinfo$html_report_files,"/graph_mates%04d_%04d.png"),
1204 graph_base = paste0(clinfo$html_report_files,"/graph_base.png"),
1205 graph_comparative = paste0(clinfo$html_report_files,"/graph_comparative.png")
1206 )
1207 fs_relative = list(
1208 graph_domains = paste0(basename(clinfo$html_report_files),"/graph_domains.png"),
1209 graph_comparative = paste0(basename(clinfo$html_report_files),"/graph_comparative.png"),
1210 graph_mates = paste0(basename(clinfo$html_report_files),"/graph_mates%04d_%04d.png")
1211 )
1212
1213 dir.create(clinfo$html_report_files)
1214 load(clinfo$graph_file)
1215 annot = cluster_annotation(index)
1216 annot_summary = summarize_annotation(annot, clinfo$size_real)
1217 vertex_colors = annot2colors(annot,ids = V(GL$G)$name)
1218
1219 color_proportions = sort(table(vertex_colors$color_table)) / clinfo$size_real
1220 ## hits with smaller proportion are not shown!
1221 exclude_colors = names(color_proportions)[color_proportions < 0.001]
1222 vertex_colors$color_table[vertex_colors$color_table %in% exclude_colors] = "#00000050"
1223 vertex_colors$legend = vertex_colors$legend[!vertex_colors$legend %in% exclude_colors]
1224
1225 if (is_comparative()){
1226 comp_codes = get_comparative_codes()$prefix
1227 colors = unique_colors[seq_along(comp_codes)]
1228 names(colors) = comp_codes
1229 species = substring(V(GL$G)$name,1, nchar(comp_codes[1]))
1230 ## create image with highlighted domains
1231 png(fs$graph_comparative, width = PNGWIDTH + LEGENDWIDTH, height = PNGHEIGHT, pointsize = PS)
1232 layout(matrix(1:2, ncol = 2), width = c(10, 3))
1233 plotg(GL$G,GL$L, col = colors[species])
1234 par(mar = c(0, 0, 0, 0))
1235 plot.new()
1236 legend("topleft", col=colors,
1237 legend = names(colors),
1238 pch = 15, cex = 0.7)
1239 dev.off()
1240 HTML.title("comparative analysis:", HR=4, file = clinfo$html_report_main)
1241 html_insert_image(
1242 img_file = fs_relative$graph_comparative,
1243 htmlfile = clinfo$html_report_main)
1244 ## comparative analysis - calculate statistics:
1245 V1V2 = substring(get.edgelist(GL$G),1, nchar(comp_codes[1]))
1246 C_observed = table(data.frame(
1247 V1=factor(V1V2[,1],levels = comp_codes),
1248 V2=factor(V1V2[,2],levels = comp_codes)
1249 ))
1250 C_observed = as.data.frame.matrix(C_observed + t(C_observed))
1251 P_observed = as.data.frame.matrix(C_observed/sum(C_observed))
1252 Edge_propotions = table(factor(V1V2, levels = comp_codes))/(length(V1V2))
1253 P_expected = matrix(Edge_propotions,ncol=1) %*% matrix(Edge_propotions, nrow=1)
1254
1255 spec_counts = df2html(
1256 data.frame(table(factor(species, levels=comp_codes))),
1257 header = c("Species", "Read count")
1258 )
1259
1260 edge_count = df2html(
1261 data.frame(species = comp_codes, (C_observed/2)[,]),
1262 header = c("", comp_codes)
1263 )
1264
1265 P_OE = df2html(
1266 data.frame(species=comp_codes,(P_observed/P_expected[,])),
1267 header = c("", comp_codes)
1268 )
1269
1270 HTML.title("Comparative analysis - species read counts:", HR =5, file = clinfo$html_report_main)
1271 cat(spec_counts,
1272 file = clinfo$html_report_main, append =TRUE)
1273
1274 # show connection only if more than one species in cluster
1275 if (length(unique(species))>1){
1276 HTML.title("comparative analysis - number of edges between species:",
1277 HR =5, file = clinfo$html_report_main)
1278 cat(edge_count,
1279 file = clinfo$html_report_main, append = TRUE)
1280
1281 HTML.title("comparative analysis - observed/expected number of edges between species",
1282 HR =5, file = clinfo$html_report_main)
1283 cat(P_OE,
1284 file = clinfo$html_report_main, append = TRUE)
1285 }
1286
1287 }
1288
1289
1290 ## create image with highlighted domains
1291 png(fs$graph_domains, width = PNGWIDTH + LEGENDWIDTH, height = PNGHEIGHT, pointsize = PS)
1292 layout(matrix(1:2, ncol = 2), width = c(10, 3))
1293 plotg(GL$G,GL$L, col = vertex_colors$color_table)
1294 domains_detected = length(vertex_colors$legend) > 0
1295 par(mar = c(0, 0, 0, 0))
1296 plot.new()
1297 if (domains_detected){
1298 # domains found
1299 legend("topleft", col=vertex_colors$legend,
1300 legend = names(vertex_colors$legend),
1301 pch = 15, cex = 0.7)
1302 }
1303 dev.off()
1304
1305 HTML.title("protein domains:", HR=4, file = clinfo$html_report_main)
1306 if (!domains_detected){
1307 HTML("No protein domains detected", file = clinfo$html_report_main)
1308 }
1309 HTML("protein domains:", HR=4, file = clinfo$html_report_main)
1310 html_insert_image(
1311 img_file = fs_relative$graph_domains,
1312 htmlfile = clinfo$html_report_main)
1313
1314 #############################################################################
1315 if (nrow(annot_summary) == 0){
1316 HTML.title("Reads annotation summary", HR = 3, file = clinfo$html_report_main)
1317 HTML("No similarity hits to repeat databases found", file = clinfo$html_report_main)
1318 }else{
1319 HTML(annot_summary, file = clinfo$html_report_main, align = "left")
1320 }
1321
1322
1323 ## similarity and mate cluster
1324 mate_clusters = get_cluster_connection_info(index, search_type="pair")
1325 similar_clusters = get_cluster_connection_info(index, search_type="similarity")
1326 ## report mate and similarity clusters
1327 if (!is.null(similar_clusters)){
1328 HTML.title("clusters with similarity:", file =clinfo$html_report_main, HR = 3)
1329 cat(df2html(
1330 similar_clusters,
1331 header=c("Cluster","Number of similarity hits"),
1332 sort_col = "N", scroling = TRUE
1333 ),
1334 file =clinfo$html_report_main, append=TRUE)
1335 }
1336 if (!is.null(mate_clusters)){
1337 HTML.title("clusters connected through mates:", file =clinfo$html_report_main, HR = 3)
1338 cat(df2html(
1339 mate_clusters[,c('cl','N','k')],
1340 header = c('Cluster','Number of shared<br> read pairs','k'),
1341 sort_col = "N", scroling = TRUE
1342 ),
1343 file = clinfo$html_report_main,append = TRUE
1344 )
1345
1346 ## create base graph images - it will serve as background for
1347 ## mate clusters plots
1348 png(fs$graph_base,
1349 width = PNGWIDTH, height = PNGHEIGHT, pointsize = PS)
1350 par(mar=c(0,0,0,0),xaxs="i",yaxs="i")
1351 plotg(GL$G,GL$L, col = "#00000050")
1352 dev.off()
1353 ## load base as raster image
1354 base_image = readPNG(fs$graph_base)
1355
1356 for (i in order(mate_clusters$N, decreasing = TRUE)){
1357 mate_ids = unlist(strsplit(mate_clusters$ids[[i]],split=","))
1358 ## print only graph above MAX_N mates
1359 if (length(mate_ids) < MAX_N){ # TODO - use constant
1360 next
1361 }
1362 png(sprintf(fs$graph_mates,index, mate_clusters$cl[i]),
1363 width = PNGWIDTH, height = PNGHEIGHT, pointsize = PS)
1364 color_mate = gsub(".$","",V(GL$G)$name) %in% mate_ids %>%
1365 ifelse("#FF0000FF", "#000000AA")
1366 par(mar=c(0,0,0,0),xaxs="i",yaxs="i")
1367 plot(range(GL$L[,1]), range(GL$L[,2]), type = "n", xlab = "", ylab = "", axes = FALSE,
1368 main = paste0("CL",index," ----> CL",mate_clusters$cl[i]))
1369 rasterImage(base_image,
1370 range(GL$L[,1])[1], range(GL$L[,2])[1],
1371 range(GL$L[,1])[2], range(GL$L[,2])[2]
1372 )
1373 points(GL$L[,1:2], col = color_mate,pch=18,cex=.8)
1374 dev.off()
1375 title = paste0("CL",index," ----> CL",mate_clusters$cl[i])
1376 footer = paste0("No. of shared pairs: :", mate_clusters$N[i])
1377 html_insert_floating_image(
1378 img_file = sprintf(fs_relative$graph_mates, index, mate_clusters$cl[i]),
1379 htmlfile = clinfo$html_report_main, width = 200,
1380 title = title, footer = footer
1381 )
1382 }
1383 }
1384 }