view lib/create_annotation.R @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500

#!/usr/bin/env Rscript
Sys.setlocale("LC_CTYPE", "en_US.UTF-8")  # this is necessary for handling unicode characters (data.tree package)
suppressPackageStartupMessages(library(data.tree))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(R2HTML))
suppressPackageStartupMessages(library(hwriter))
suppressPackageStartupMessages(library(DT))
suppressPackageStartupMessages(library(tools))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(igraph))
suppressPackageStartupMessages(library(plotrix))
suppressPackageStartupMessages(library(png))

source("htmlheader.R")
source("config.R")  # load tandem ranks info
source("utils.R")  
DT_OPTIONS = list(pageLength = 1000, lengthMenu = c(10, 50, 100, 1000, 5000, 10000))
WD = getwd()   # working directory; equals the script directory when run from Rserve
HTMLHEADER = htmlheader  ## header (character) loaded from htmlheader.R
htmlheader = gsub("Superclusters summary","TAREAN summary", htmlheader)

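## evaluate_LTR_detection parses the output of an LTR/PBS detection tool (the exact
## format - an 11-line header followed by a 23-column table - is assumed from the
## parsing below) and returns a string with the detected PBS tRNA types extracted
## from column 13, or NULL when no hit passes the filtering criteria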
evaluate_LTR_detection = function(f){
	NO_LTR=NULL
	if (length(readLines(f)) == 11 ){
		return(NO_LTR)
	}
	df=read.table(f,as.is=TRUE,sep="\t", skip=11,fill=TRUE)
	if (ncol(df) != 23){
		# df has fewer columns if no PBS is detected
		return(NO_LTR)
	}
  df=df[!df$V13 == "",,drop=FALSE]
	if (nrow(df)==0){
		return(NO_LTR)
	}
	# criteria:
	df_part=df[df$V15 >=12 & df$V20 == 23 & df$V21<df$V20,,drop=FALSE]
	if (nrow(df_part) == 0){
		return(NO_LTR)
	}
  PBS_type = gsub("_","",(str_extract_all(df_part$V13, pattern="_([A-Z][a-z]{2})", simplify=TRUE))) %>%
      paste(collapse=" ")
	return(PBS_type)
}



## annotate superclusters
select_reads_id = function(index, search_type = c("cluster","supercluster")) {
    ## select read IDs based on the cluster/supercluster index;
    ## requires a connection to the HITSORTDB database
    search_type = match.arg(search_type)
    x = dbGetQuery(HITSORTDB,
                   paste0("SELECT vertexname FROM vertices WHERE vertexindex IN ",
                          "(SELECT vertexindex  FROM communities ",
                          "WHERE ", search_type,"=\"", index,
                          "\")"))
    return(x$vertexname)
}


get_reads_annotation = function(reads_id) {
    ## select annotation from tables in SEQDB whose names match the format *_database
    annot_types = grep("_database", dbListTables(SEQDB), value = TRUE)
    annot = list()
    for (i in annot_types) {
        query = paste0("SELECT * FROM ", i, " WHERE name IN (", paste0("\"", reads_id, 
            "\"", collapse = ", "), ")")
        annot[[i]] = dbGetQuery(SEQDB, query)
    }
    return(annot)
}

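## supercluster_size returns the number of reads (vertices) belonging to a given
## supercluster; requires a connection to HITSORTDB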
supercluster_size = function(supercluster) {
    x = dbGetQuery(HITSORTDB, paste0("SELECT count(*) FROM vertices WHERE vertexindex IN ", 
        "(SELECT vertexindex  FROM communities ", "WHERE supercluster=\"", supercluster, 
        "\")"))
    return(x$"count(*)")
}


cluster_annotation = function(cluster, search_type = c("cluster", "supercluster")){
    ## search either for cluster or supercluster annotation;
    ## annotation is read from sqlite databases accessed through SEQDB (sequence
    ## annotation) and HITSORTDB (clustering information)
    search_type = match.arg(search_type)
    reads_id = select_reads_id(cluster, search_type)
    annot = get_reads_annotation(reads_id)
    return(annot)
}

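## get_tarean_info collects TAREAN results (satellite probability, tandem rank,
## cluster size) for a cluster or supercluster and reports the number and
## proportion of reads falling into clusters with tandem_rank 1 or 2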
get_tarean_info = function(cluster, search_type = c("cluster", "supercluster")){
    search_type = match.arg(search_type)
    if (search_type == "cluster") {
        search_type = "[index]"
    }
    tarean_info = dbGetQuery(HITSORTDB,
                         paste0(
                             "SELECT [index], supercluster, satellite_probability, tandem_rank, size_real, satellite FROM cluster_info WHERE ",
                             search_type, " = ", cluster))
    nhits = sum(tarean_info$size_real[tarean_info$tandem_rank %in% 1:2])
    proportion = nhits/sum(tarean_info$size_real)
    return( list(
        nhits = nhits,
        proportion = proportion,
        tarean_info = tarean_info)
        )

}

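## get_ltr_info returns the ltr_detection status and size of all clusters
## belonging to the given supercluster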
get_ltr_info = function(supercluster){
  ltr_info = dbGetQuery(HITSORTDB,
                        paste0(
                          "SELECT [index], supercluster, ltr_detection, size_real FROM cluster_info WHERE ",
                          "supercluster", " = ", supercluster))
  return(ltr_info)
}

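## summarize_annotation converts the per-read database hits into a table of counts
## per classification string and protein domain; if the cluster size is supplied,
## each count is also expressed as a proportion of that size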
summarize_annotation = function(annot, size = NULL){
    db_id = do.call(rbind, annot)$db_id
    cl_string = gsub("^.+/","",gsub(":.+$", "", gsub("^.+#", "", db_id)))
    weight = do.call(rbind, annot)$bitscore
    domain = str_replace(str_extract(str_replace(db_id, "^.+#", ""), ":.+"), ":", 
                                                          "")
    domain[is.na(domain)] = ""
    domain_table = table(cl_string, domain)
    dt = as.data.frame(domain_table)
    ## remove zero counts
    dt = dt[dt$Freq != 0, ,drop = FALSE]
    rownames(dt) = paste(dt$cl_string,dt$domain)
    if (!is.null(size)){
        dt$proportion = dt$Freq / size

    }
    return(dt)
}

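## annot2colors assigns a color to every read id based on its best annotation
## (protein domain if available, otherwise the classification string); reads
## without annotation keep base_color. Returns the per-read color table and a
## color key usable as a plot legend.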
annot2colors = function(annot, ids, base_color = "#00000050"){
    annot_table = do.call(rbind, annot)
    domains = str_replace(str_extract(str_replace(annot_table$db_id, "^.+#", ""), ":.+"), ":","")
    other_annot = str_replace(annot_table$db_id, "^.+#", "")
    complete_annot = ifelse(is.na(domains), other_annot, domains)
    names(complete_annot) = annot_table$name
    unique_complete_annot = names (table(complete_annot) %>% sort(decreasing = TRUE))
    color_key = unique_colors[seq_along(unique_complete_annot)]
    names(color_key) = unique_complete_annot
    color_table = rep(base_color, length(ids))
    names(color_table) = ids
    color_table[names(complete_annot)] = color_key[complete_annot]
    return(list( color_table = color_table, legend = color_key))
}


 read_annotation_to_tree = function(supercluster, TREE_TEMPLATE = CLS_TREE) {
     ## CLS_TREE is a data.tree holding the 'taxonomy' of repeats; the attributes of
     ## each node/leaf are filled from the annotation. Annotation is added to leaves
     ## only - inner nodes stay NA or NULL.
     annot0 = cluster_annotation(supercluster, search_type = "supercluster")
     ## keep only the built-in annotations
     annot = list(protein_database = annot0$protein_database, dna_database = annot0$dna_database)
     annot$dna_database$bitscore = annot$dna_database$bitscore / 2 # DNA bitscore correction; blastx hits carry more weight
     annot = do.call(rbind, annot)
     ## remove duplicated hits, keep best
     annot_clean = annot[order(annot$bitscore, decreasing = TRUE), ]
     annot_clean = annot_clean[!duplicated(annot_clean$name), ]
     tarean_info = get_tarean_info(supercluster, search_type = "supercluster")
     ltr_info = get_ltr_info(supercluster)
     db_id = annot_clean$db_id
     cl_string = gsub(":.+$", "", gsub("^.+#", "", db_id))
     weight = annot_clean$bitscore
     domain = str_replace(str_extract(str_replace(db_id, "^.+#", ""), ":.+"), ":", 
        "")
     domain[is.na(domain)] = "NA"
     mean_weight_table = by(weight, INDICES = list(cl_string, domain), FUN = function(x) signif(mean(x)))
     mean_weight_table[is.na(mean_weight_table)] = 0
     total_weight_table = by(weight, INDICES = list(cl_string, domain), FUN = sum)
     total_weight_table[is.na(total_weight_table)] = 0
     cl_table = table(cl_string)
     domain_table = table(cl_string, domain)
     cls_tree = Clone(TREE_TEMPLATE)
     cls_tree$size = supercluster_size(supercluster)
     cls_tree$ltr_info = ltr_info
     for (i in cls_tree$leaves) {
         if (i$full_name %in% names(cl_table)) {
             i$nhits = cl_table[[i$full_name]]
             i$domains = domain_table[i$full_name, ]
             names(i$domains) = colnames(domain_table)
             i$mean_weight = mean_weight_table[i$full_name, ]
             i$total_weight = total_weight_table[i$full_name, ]
             i$proportion = signif(i$nhits/cls_tree$size, 3)
         } else {
             if (i$name == "satellite"){
               i$nhits = tarean_info$nhits
                 i$tandem_rank = tarean_info$tarean_info$tandem_rank
                 i$proportion = tarean_info$proportion
                 i$tarean_info = tarean_info$tarean_info
             }else{
                 i$proportion = 0
             }
         }
     }
     ## create domain string for printing:
     for (i in Traverse(cls_tree)) {
         if (is.null(i$domains)) {
             i$features = ""
         } else if (sum(i$domains) == 0) {
             i$features = ""
         } else {
             dom = i$domains[!names(i$domains) == "NA"]
             i$features = gsub(" *\\(\\)", "", pasteDomains(dom))
         }
     }
     ## add LTR info:
     if  (any(!cls_tree$ltr_info$ltr_detection %in% "None")){
       tr = FindNode(cls_tree, "LTR")
       tr$features = "LTR/PBS"
      }
     return(cls_tree)
 }

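## pasteDomains formats a named vector of domain counts into a single string,
## e.g. (illustrative) pasteDomains(c(RT = 3, INT = 2)) gives "3 (RT), 2 (INT)";
## zero counts are dropped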
pasteDomains = function(dom){
    d = dom[dom != 0]
    if (length(d) == 0){
        return("")
    }else{
        paste0(d, " (", names(d), ")", collapse = ", ")
    }
    
}

rescale = function(x, newrange) {
    # taken from plotrix package
    if (missing(x) | missing(newrange)) {
        usage.string <- paste("Usage: rescale(x,newrange)\n", "\twhere x is a numeric object and newrange is the new min and max\n", 
            sep = "", collapse = "")
        stop(usage.string)
    }
    if (is.numeric(x) && is.numeric(newrange)) {
        xna <- is.na(x)
        if (all(xna)) 
            return(x)
        if (any(xna)) 
            xrange <- range(x[!xna]) else xrange <- range(x)
        if (xrange[1] == xrange[2]) 
            return(x)
        mfac <- (newrange[2] - newrange[1])/(xrange[2] - xrange[1])
        return(newrange[1] + (x - xrange[1]) * mfac)
    } else {
        warning("Only numeric objects can be rescaled")
        return(x)
    }
}

add_value_to_nodes = function(tr, value_names = c("nhits", "domains", "total_weight", 
    "proportion")) {
    ## propagate values from leaves to inner nodes; assumes that inner nodes are not
    ## defined (either NA or NULL). Leaf values can be either numeric or a named list;
    ## list values are summed up based on their names
    for (vn in value_names) {
        for (i in Traverse(tr)) {
          w = add_leaves_value(i$leaves, vn)
          i[[vn]] = w
        }
    }
}

add_leaves_value = function(x, name) {
    ## check whether the annotation is multi-value or single-value and propagate
    ## values from the leaves towards the root
    n = unique(unlist(lapply(lapply(x, "[[", name), names)))
    if (is.null(n)) {
      ## no names given, we can sum everything to one value
        total = sum(unlist(sapply(x, "[[", name)), na.rm = TRUE)
        return(total)
    } else {
        total = numeric(length(n))
        names(total) = n
        xvals = lapply(x, "[[", name)
        N = 0
        for (i in xvals) {
            for (j in names(i)) {
                total[j] = total[j] + i[j]
                N = N + 1
            }
        }
        return(total)
    }
}


trmap = function(tr, xl = 0, yb = 0, xr = 1, yt = 1, first = TRUE, vertical = TRUE) {
    # treemap for data.tree plotting - experimental
    if (first) {
        plot(xl:xr, yb:yt, type = "n", axes = FALSE, xlab = "", ylab = "")
    }
    if (tr$nhits > 0) {
        width = 2 * (6 - tr$level)
        rect(xl, yb, xr, yt, col = "#00008805", lwd = width, border = "#00000080")
        if (tr$level != 1) {
            text(x = (xl + xr)/2, y = (yb + yt)/2, labels = tr$name, cex = width/4)
        }
    } else {
        return(NULL)
    }
    Nchildren = length(tr$children)
    cat("children:", tr$name, tr$level, "\n")
    if (Nchildren == 0) {
        return(NULL)
    }
    sizes = sapply(tr$children, "[[", "nhits")
    rng = c(sizes, 0) / sum(sizes)
    if (vertical) {
        ## x does not change
        xl2 = rep(xl, Nchildren)
        xr2 = rep(xr, Nchildren)
        yb2 = rescale(rng, c(yb, yt))[1:(Nchildren + 1)]
        yt2 = rescale(rng, c(yb, yt))[-1]
    } else {
        yb2 = rep(yb, Nchildren)
        yt2 = rep(yt, Nchildren)
        xr2 = rescale(rng, c(xl, xr))[1:(Nchildren + 1)]
        xl2 = rescale(rng, c(xl, xr))[-1]
    }
    for (i in 1:Nchildren) {
        trmap(tr$children[[i]], xl2[i], yb2[i], xr2[i], yt2[i], first = FALSE, vertical = !vertical)
    }
    
}

filter_tree = function(tr) {
    ## keep only nodes with positive nhits values;
    ## must be used on trees where leaf values were already propagated
    ## to inner nodes using add_value_to_nodes
    tr_filtered = Clone(tr)
    Prune(tr_filtered, function(x) x$nhits > 0 | containLTR(x))
    return(tr_filtered)
}

containLTR = function(tr){
  for (i in Traverse(tr)){
    if (i$features == "LTR/PBS"){
      return(TRUE)
    }
  }
  return(FALSE)
}

filter_tree2 = function(tr) {
    tr_filtered = Clone(tr)
    Prune(tr_filtered, function(x) x$parent$nhits > 0)
    return(tr_filtered)
}

## this is for testing purposes; in the future each node will have its own function
## named find_best_hit and it will work like a decision tree.
## functions will be set manually based on training data
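
## formatWidth pads a character vector (or every column of a data frame) to a fixed
## display width, e.g. (illustrative) formatWidth("abc", 6) gives "   abc";
## the byte/char delta keeps multi-byte utf-8 strings aligned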
formatWidth =  function(x, w){
    if (length (dim(x)) > 1){
        for (i in seq_along(x)){
            delta = nchar(x[,i], type="bytes") - nchar(x[,i], type="char")
            ## delta correction is neccessary for utf-8 correct formating
            x[,i] = sprintf(paste0("%-",w[i] + delta,"s"), x[,i])
        }
        return(x)
    }else{
        delta = nchar(x, type="bytes") - nchar(x, type="char")
        sprintf(paste0("%",w + delta,"s"),
            x) %>% return
    }
}


find_best_hit_repeat = function(cls_tree){
  ## three children:
  ## repeat
  ##  |--rDNA
  ##  |--satellite      this needs special handling, rank 1 or 2
  ##  |--mobile_element
  ##
  ## in this case we require that satellite has a proportion above 0.95
  ## params of good hit
  min_prop_of_positive = 0.7
  min_prop_second_best = 0.9
  min_proportion_x_nhits = 2.5  # based on 0.05 x 50
  min_satellite_proportion = 0.95
  if (isLeaf(cls_tree)) {
    return(cls_tree)
  }
  cond1 = cls_tree$nhits * cls_tree$proportion < min_proportion_x_nhits
  if (cond1){
    return(cls_tree)
  }
  nhits = sapply(cls_tree$children, "[[", "nhits")
  all_hits = cls_tree$root$nhits
  name_of_best_hit = cls_tree$children[[which.max(nhits)]]$name
  best_hit_child = cls_tree$children[[which.max(nhits)]]
  second_max_hits = ifelse(length(nhits) == 1, 0, max(nhits[-which.max(nhits)]))
  cond2 = max(nhits) / sum(nhits) > min_prop_of_positive
  cond3 = max(nhits) / (second_max_hits + max(nhits)) > min_prop_second_best
  cond_satellite = best_hit_child$proportion > min_satellite_proportion & name_of_best_hit == "satellite"
  cond_other = ! name_of_best_hit == "satellite"
  cat(cond2, cond3, cond_satellite, cond_other, "\n")
  if (cond2 & cond3 & (cond_satellite | cond_other)) {
    ## clear case satellite or other
    if ("find_best_hit" %in% names(best_hit_child)){
      ## a custom rule for the node is defined
      best_hit = best_hit_child$find_best_hit(cls_tree$children[[which.max(nhits)]])
    }else{
                                        # use generic rules
      best_hit = find_best_hit(cls_tree$children[[which.max(nhits)]])
    }
  }else{
    ## do more specific tests for rDNA or mobile element;
    ## rDNA or mobile_element hits must be above the threshold!
    cond_sat_rank = any(cls_tree$satellite$tandem_rank == 2) & cls_tree$satellite$nhits != 0
    cond_rdna = cls_tree$rDNA$nhits *  cls_tree$rDNA$proportion > min_proportion_x_nhits
    cond_mobile = cls_tree$mobile_element$nhits * cls_tree$mobile_element$proportion  > min_proportion_x_nhits
    cat(cond_sat_rank, "\n")
    cat(cond_rdna,"\n")
    cat(cond_mobile, "\n")
    if (cond_sat_rank  & (cond_rdna | cond_mobile) ){
      cls_tree$satellite$nhits = 0
      best_hit = find_best_hit(cls_tree)
    }else{
      return(cls_tree)
    }
  }
  return(best_hit)
}

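## find_best_hit walks down the classification tree and returns the deepest node
## whose hit counts satisfy the generic thresholds below; nodes may override the
## descent with their own find_best_hit function (see find_best_hit_repeat)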
find_best_hit = function(cls_tree){
    ## general params of good hit
    min_prop_of_positive = 0.7
    min_prop_second_best = 0.8
    min_proportion_x_nhits = 2.5   # based on 0.05 x 50
    if (isLeaf(cls_tree)) {
        return(cls_tree)
    }


    cond1 = cls_tree$nhits * cls_tree$proportion < min_proportion_x_nhits

    if (cond1){
        ## return if the proportion is lower than the threshold
        return(cls_tree)
    }

    nhits = sapply(cls_tree$children, "[[", "nhits")
    best_hit_child = cls_tree$children[[which.max(nhits)]]
  ## more special cases:
    second_max_hits = ifelse(length(nhits) == 1, 0, max(nhits[-which.max(nhits)]))
    cond2 = max(nhits) / sum(nhits) > min_prop_of_positive
    cond3 = max(nhits) / (second_max_hits + max(nhits)) > min_prop_second_best
    if (cond2 & cond3) {
        if ("find_best_hit" %in% names(best_hit_child)){
            ## a custom rule for the node is defined
            best_hit = best_hit_child$find_best_hit(cls_tree$children[[which.max(nhits)]])
        }else{
            # use generic rules
            best_hit = find_best_hit(cls_tree$children[[which.max(nhits)]])
        }
    } else {
        return(cls_tree)
    }
    return(best_hit)
}


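## get_annotation_groups splits the summarized annotation tree into the subtrees
## reported separately in the final summary: repeats, organelles, contamination and
## the bare root (superclusters without any evidence)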
get_annotation_groups = function(tr){
    tr_all = Clone(tr)
    Prune(tr_all, pruneFun = function(...) FALSE) ## prune everything -> keep only the root
    tr_all$name = "Unclassified repeat (No evidence)"
    tr_contamination = Clone(tr)$contamination
    tr_organelle = Clone(tr)$organelle
    tr_repeat = Clone(tr)$"repeat"
    tr_repeat$name = "Unclassified_repeat (conflicting evidences)"
    return(
        list(
            tr_repeat = tr_repeat,
            tr_organelle = tr_organelle,
            tr_all = tr_all,
            tr_contamination = tr_contamination
        )
    )
}



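## format_tree renders a data.tree object and the selected attributes (passed via ...)
## as a fixed-width text table with a header and separator line, suitable for
## preformatted HTML output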
format_tree = function(cls_tree, ...) {
    df = ToDataFrameTree(cls_tree, ...)
    ## try to get rid of unicode characters
    df[,1] = gsub("\U00B0", "'", df[,1], fixed = TRUE, useBytes = TRUE)
    df[,1] = gsub("\U00A6", "|", df[,1], fixed = TRUE, useBytes = TRUE)
    colnames(df)[1] = "                                               "  ## originally levelName
    if ("proportion" %in% colnames(df)){
        df$proportion = signif(df$proportion,2)
    }
    if ("Proportion[%]" %in% colnames(df)){
        df$"Proportion[%]" = round(df$"Proportion[%]", 2)
    }
    ## format header
    w1 = apply(df, 2, function(x) max(nchar(x)))
    w2 = nchar(colnames(df))
    # use whichever width is bigger for formatting
    w = ifelse(w1 > w2, w1, w2)
    
    out = character()
    header = mapply(FUN = function(x, y) formatWidth(x, w = y), colnames(df), w) %>%
        paste0(collapse=" | ")
    hr_line = gsub(".","-",header)
    # create output
    # classification lines
    class_lines = formatWidth(df, w) %>%
        apply(1, FUN = function(x) paste0(x,collapse = " | ")) %>%
        paste(collapse = "\n")
    paste(
        header,
        hr_line,
        class_lines,
        sep="\n") %>% return
}


make_final_annotation_template = function(
                                          annot_attributes = list(
                                              Nreads = 0,
                                              Nclusters = 0,
                                              Nsuperclusters = 0,
                                              "Proportion[%]" = 0,
                                              cluster_list = c())
                                          )
{
    ct = Clone(CLS_TREE)
    for (i in Traverse(ct)){
        for (j in names(annot_attributes)){
            i[[j]] = annot_attributes[[j]]
        }
    }
    return(ct)
}


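## common_ancestor returns the name of the closest node shared by the ancestor
## paths of two tree nodes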
common_ancestor = function(tr1, tr2){
  a1 = tr1$Get('name', traversal = "ancestor")
  a2 = tr2$Get('name', traversal = "ancestor")
  ancestor = intersect(a1,a2)[1]
  return(ancestor)
}

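## create_all_superclusters_report is the main entry point for supercluster
## annotation: for each supercluster it builds the annotation tree, derives the
## automatic classification, stores it in HITSORTDB, and writes the HTML/CSV
## summaries plus one report per supercluster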
create_all_superclusters_report = function(max_supercluster = 100,
                                           paths,
                                           libdir,
                                           superclusters_dir,
                                           seqdb, hitsortdb,
                                           classification_hierarchy_file,
                                           HTML_LINKS)
{
    ## connect to sqlite databases;
    ## seqdb and hitsortdb are paths to the sqlite files
    HTML_LINKS = nested2named_list(HTML_LINKS)
    paths = nested2named_list(paths)
    report_file = paths[['supercluster_report_html']]
    ## create SEQDB, HITSORTDB and CLS_TREE
    connect_to_databases(seqdb, hitsortdb, classification_hierarchy_file)

    ## append specific classification rules
    CLS_TREE$"repeat"$find_best_hit = find_best_hit_repeat

    ###
  cat("Superclusters summary\n", file = report_file)
  cat("---------------------\n\n", file = report_file, append = TRUE)
    empty = rep(NA,max_supercluster)
    SC_table = data.frame(
        Supercluster = 1 : max_supercluster, "Number of reads" = empty, Automatic_annotation = empty,
        Similarity_hits = empty, TAREAN_annotation = empty,
        Clusters = empty, stringsAsFactors = FALSE, check.names = FALSE
    )
    SC_csv = SC_table  # table as spreadsheet - no html formatting
    final_cluster_annotation = make_final_annotation_template()

  for (sc in 1 : max_supercluster) {
        cat("supercluster: ",sc,"\n")
        cls_tree = read_annotation_to_tree(sc)
        sc_summary = get_supercluster_summary(sc)
        add_value_to_nodes(cls_tree)
        cls_tree_filtered = filter_tree(cls_tree)
        best_hit = find_best_hit(cls_tree)
        ## exceptions to the decision tree are handled here
        ## All -> Class I ?
        if (best_hit$name == "All"){
          ## check LTR
          if (any(!(unique(cls_tree$ltr_info$ltr_detection) %in% "None"))){
            ## if LTR found - move classification to Class I
            
            LTR = FindNode(best_hit, "LTR")
            nhits_without_class_I = best_hit$nhits - LTR$nhits
            prop_without_class_I = best_hit$proportion - LTR$proportion
            cond3 = nhits_without_class_I * prop_without_class_I < 1
            cond2 = best_hit$nhits * best_hit$proportion < 0.5  # other hits weak
            cond1 = LTR$nhits >= (0.95 * best_hit$nhits) # hits are predominantly in the LTR subtree of Class I

            if (cond1 | cond2 | cond3){
              best_hit = LTR
            }
          }
        }else{
          ## check whether a conflict of the best hit with LTR/PBS exists
          if (any(!(unique(cls_tree$ltr_info$ltr_detection) %in% "None"))){
            LTR = FindNode(cls_tree, "LTR")
            ca_name = common_ancestor(LTR, best_hit)
            if (ca_name != "LTR"){
              ## reclassify
              best_hit = FindNode(cls_tree, ca_name)
            }
          }
        }
  
        best_hit_name = best_hit$name
        best_hit_path = paste(best_hit$path, collapse = "/")
        ## add best hit to database:
        sqlcmd = paste0(
            "UPDATE cluster_info SET supercluster_best_hit = \"", best_hit_path, "\" ",
            " WHERE supercluster = ", sc
        )
        dbExecute(HITSORTDB, sqlcmd)
        # add to the annotation summary stored in final_cluster_annotation - summing up counts
        best_hit_node = FindNode(final_cluster_annotation, best_hit_name)
        best_hit_node$Nsuperclusters = best_hit_node$Nsuperclusters + 1
        best_hit_node$Nclusters = best_hit_node$Nclusters + sc_summary$Nclusters
        best_hit_node$Nreads = best_hit_node$Nreads + sc_summary$Nreads
        best_hit_node$"Proportion[%]" = best_hit_node$"Proportion[%]" + sc_summary$proportion*100
        best_hit_node$cluster_list = append(best_hit_node$cluster_list, sc_summary$cluster_list)
        best_hit_label = ifelse(best_hit_name == "All", "NA", best_hit_name)
        SC_csv$Automatic_annotation[sc] = best_hit_label
        SC_csv$"Number of reads"[sc] = SC_table$"Number of reads"[sc] = cls_tree$size
        SC_csv$Similarity_hits[sc] = format_tree(cls_tree_filtered,
                                                             "nhits", "proportion", "features")
        SC_table$Similarity_hits[sc] = format_tree(cls_tree_filtered,
                                                               "nhits", "proportion", "features") %>%
            preformatted

        SC_table$Automatic_annotation[sc] = best_hit_label

        G = get_supercluster_graph(
            sc=sc, seqdb = seqdb,hitsortdb = hitsortdb,
            classification_hierarchy_file = classification_hierarchy_file
        )
        ## append tarean annotation
        clist = paste(V(G)$name, collapse =",")
        cl_rank = dbGetQuery(HITSORTDB,
                             paste("SELECT [index], tandem_rank FROM cluster_info WHERE [index] IN (",clist,
                                   ") AND tandem_rank IN (1,2)"))
        ## add information about TR clusters, if any
        if (nrow(cl_rank) > 0){
            tarean_annot = paste(
                cl_rank$index, "-",
                RANKS_TANDEM[cl_rank$tandem_rank]
            )
            SC_csv$TAREAN_annotation[sc] = paste(tarean_annot,collapse="\n")
            SC_table$TAREAN_annotation[sc] = mapply(
                hwrite, tarean_annot,
                link = sprintf(HTML_LINKS$ROOT_TO_TAREAN,cl_rank$index)) %>% paste(collapse="<br>")
        }
        ## add links to clusters
        SC_csv$Clusters[sc] = paste((V(G)$name), collapse = ",")
        links = mapply(hwrite, V(G)$name, link = sprintf(HTML_LINKS$ROOT_TO_CLUSTER, as.integer(V(G)$name)))
        SC_table$Clusters[sc] = paste(links, collapse =", ")
        create_single_supercluster_report(G, superclusters_dir)
    }



    ## add html links
    SC_table$Supercluster = mapply(hwrite, SC_table$Supercluster,
                                   link = sprintf(HTML_LINKS$ROOT_TO_SUPERCLUSTER,
                                                  SC_table$Supercluster))



    write.table(SC_csv, file = paths[["superclusters_csv_summary"]],
                sep = "\t", row.names = FALSE)

    
    DT_instance = datatable(SC_table, escape = FALSE, options = DT_OPTIONS)
    saveWidget(DT_instance, file = normalizePath(report_file),
               normalizePath(libdir), selfcontained = FALSE)
    add_preamble(normalizePath(report_file),
                 preamble='<h2>Supercluster annotation</h2> <p><a href="documentation.html#superclust"> For table legend see documentation. <a> </p>')

    annot_groups = get_annotation_groups(final_cluster_annotation)
    saveRDS(object=annot_groups, file = paths[['repeat_annotation_summary_rds']])
    final_cluster_annotation_formated = paste(
        sapply(annot_groups, format_tree, 'Proportion[%]', 'Nsuperclusters', 'Nclusters', "Nreads"),
        collapse = "\n\n\n"
    )
    ## TODO add singleton counts, total counts and extra text - csv output
    ## export annotation summary
    html_summary = start_html(paths[['summarized_annotation_html']], gsub("PAGE_TITLE", "Repeat Annotation Summary", HTMLHEADER))
    html_summary("Repeat annotation summary", HTML.title, HR=2)
    html_summary("This table summarizes the automatic annotations of superclusters that should be verified and manually corrected if necessary. Thus, the table should not be used as the final output of the analysis without critical evaluation.", HTML.title, HR=3)
    html_summary(preformatted(final_cluster_annotation_formated), cat)



    return()
}
## for testing
if (FALSE){
    create_supercluster_report(1:100, report_file, seqdb, hitsortdb, class_file)
}

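## create_single_supercluster_report writes the per-supercluster HTML page with the
## cluster graph image(s) and clickable image-map links to the individual cluster
## reports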
create_single_supercluster_report = function(G, superclusters_dir){
    sc_dir = paste0(superclusters_dir, "/dir_SC",sprintf("%04d", G$name))
    htmlfile = paste0(sc_dir, "/index.html")
    dir.create(sc_dir)
    title = paste("Supercluster no. ",G$name)
    htmlheader = gsub("PAGE_TITLE", title, HTMLHEADER)
    cat(htmlheader, file = htmlfile)
    file.copy(paste0(WD,"/style1.css"), sc_dir)
    HTML.title(title, file = htmlfile)
    HTML("Simmilarity hits (only hits with proportion above 0.1% in at least one cluster are shown) ", file = htmlfile)
    img_file = paste0(sc_dir,"/SC",sprintf("%0d",G$name), ".png")
    png(filename = img_file, width = 1400, height=1200)
    coords = plot_supercluster(G = G)
    dev.off()
    ## basename - make link relative
    href = paste0("../../clusters/dir_CL",sprintf("%04d", as.numeric(V(G)$name)),"/index.html")
    html_insert_image(basename(img_file), htmlfile,coords=coords, href = href)

    if (is_comparative()){
        HTML("Comparative analysis", file = htmlfile)
        img_file = paste0(sc_dir,"/SC",sprintf("%0d",G$name), "comparative.png")
        png(filename = img_file, width = 1400, height=1200)
        coords = plot_supercluster(G = G, "comparative")
        mtext("comparative analysis")
        dev.off()
        ## basename - make link relative
        href = paste0("../../clusters/dir_CL",sprintf("%04d", as.numeric(V(G)$name)),"/index.html")
        html_insert_image(basename(img_file), htmlfile,coords=coords, href = href)

    }
}

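## html_insert_image appends an <img> tag to htmlfile; when coords and href are
## given, a circular image map linking graph nodes to cluster reports is added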
html_insert_image = function(img_file, htmlfile, coords=NULL, href=NULL) {
    img_tag = sprintf('<p> <img src="%s" usemap="#clustermap" > \n</p>\n<br>\n', img_file)
    cat(img_tag, file = htmlfile, append = TRUE)
    if (!is.null(coords)){
        formated_links = character()
        for (i in seq_along(href)){
            formated_links[i] = sprintf('<area shape="circle" coords="%f,%f,%f" href="%s" >\n',
                                        coords[i,1], coords[i,2], coords[i,3], href[i])
        }
        cat('<map name="clustermap" >\n',
            formated_links,"</map>\n", file=htmlfile, append = TRUE)

    }
}

html_insert_floating_image = function(img_file,
                                      htmlfile,
                                      width=NULL,
                                      title= "",
                                      footer = ""
                                      ){
    if (is.null(width)){
        width=""
    }
    tag = paste(
        '<div class="floating_img">\n',
        '  <p>', title,'</p>\n',
        '   <a href=',img_file, ">",
        '    <img src="',img_file, '" alt="image" width="',width,'">\n',
        '   </a>',
        '  <p> ',footer,'</p>\n',
        '</div>\n'
       ,sep = "")
    cat(tag, file = htmlfile, append = TRUE)
}

get_supercluster_graph = function(sc, seqdb, hitsortdb, classification_hierarchy_file){
    ## sc - supercluster index
    ## seqdb, hitsortdb - paths to sqlite databases
    ## classification_hierarchy_file - path to the data.tree rds file
    ## SIZE of image
    SIZE = 1000
    connect_to_databases(seqdb, hitsortdb, classification_hierarchy_file)
    clusters = dbGetQuery(HITSORTDB,
                          paste0("SELECT cluster, size FROM superclusters WHERE supercluster=",sc)
                          ) %>% as.data.frame
    ## string for sql query:
    cluster_list = paste0( "(", paste0(clusters$cluster, collapse = ","),")")
    supercluster_ncol = dbGetQuery(HITSORTDB,
                                   paste("SELECT c1,c2,w, k FROM cluster_mate_bond where c1 IN ",
                                         cluster_list, " AND c2 IN ", cluster_list, "AND k > 0.1" 
                                         )
                                   )
    if (nrow(supercluster_ncol)>0){
        ## at least two clusters
        G = graph.data.frame(supercluster_ncol, directed=FALSE)
        L = layout_with_kk(G)
        ## layout is rescaled to make HTML image-map linking easier
        L[,1] = scales::rescale(L[,1], to = c(1,SIZE) )
        L[,2] = scales::rescale(L[,2], to = c(1,SIZE) )
        G$L = L
    }else{
        ## only one cluster in supercluster
        G = graph.full(n = 1)
        V(G)$name = as.character(clusters$cluster)
        G$L = matrix(c(SIZE/2, SIZE/2), ncol=2)
    }
    G = set_vertex_attr(G, "label", value = paste0("CL",names(V(G))))
    G$name = sc ## graph name is the supercluster index
    # create annotation of individual clusters; it will be attached as node attributes
    annot = get_cluster_annotation_summary(clusters)
    ## annotation of nodes(clusters)
    G = set_vertex_attr(G, "annotation", value = annot$clusters[names(V(G))])
    G = set_vertex_attr(G, "size",
                        value = clusters$size[match(names(V(G)), as.character(clusters$cluster))])
    G$annotation = annot$supercluster
    G$clusters = clusters
    # TODO if comparative analysis - add proportion of species
    if (is_comparative()){
        comparative_counts = get_cluster_comparative_counts(cluster_list)
        NACluster = comparative_counts$supercluster
        # some clusters do not have comparative data - they are below the threshold!
        NACluster[,] = NA
        counts_adjusted = comparative_counts$clusters[names(V(G))]
        for (i in names(V(G))){
            if(is.null(counts_adjusted[[i]])){
                ## get count from database
                seqid = dbGetQuery(HITSORTDB,
                                   paste(
                                       "SELECT vertexname FROM vertex_cluster where cluster = ",
                                       i)
                                   )$vertexname
                codes = get_comparative_codes()$prefix
                x = table(factor(substring(seqid, 1,nchar(codes[1])), levels = codes))
                counts_adjusted[[i]] = data.frame(
                    Freq = as.numeric(x),
                    proportion = as.numeric(x/sum(x)),
                    row.names = names(x))
                
            }
        }

        G = set_vertex_attr(G, "comparative_counts",
                            value = counts_adjusted[V(G)$name]
                            )
        # for whole supercluster
        G$comparative = comparative_counts$supercluster
    }

    return(G)
}

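## get_cluster_comparative_counts returns, for each cluster in cluster_list, the
## read counts and proportions per species together with the totals for the whole
## supercluster (comparative analysis only)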
get_cluster_comparative_counts = function(cluster_list){
    counts = dbGetQuery(
        HITSORTDB,
        paste0("SELECT * FROM comparative_counts WHERE clusterindex IN",
               cluster_list
               )
    )
    comparative_counts = apply(counts, 1, function(x)
        data.frame(Freq = x[-1], proportion = x[-1]/sum(x[-1]))
    )
    names(comparative_counts) = counts$clusterindex
    total_counts = data.frame(
        Freq = colSums(counts[,-1]),
        proportion = colSums(counts[,-1])/sum(counts[,-1]))
    return(
        list(
            clusters = comparative_counts,
            supercluster = total_counts
            )
    )

}

get_cluster_annotation_summary = function(clusters){
    ## clusters - table of clusters, col: cluster, size
    annot_list = apply(clusters,1,FUN = function(x)summarize_annotation(cluster_annotation(x[1]),x[2]))
    ## if empty - no annotation at all
    if (sum(sapply(annot_list, nrow)) == 0){
        return(list(clusters = NULL,
                 supercluster = NULL
                 ))
    }
    names(annot_list) = as.character(clusters$cluster)
    annot_all = do.call(rbind,annot_list)
    total = by(annot_all$Freq,INDICES=with(annot_all, paste(cl_string,domain)), FUN=sum, simplify=TRUE)
    total_df = data.frame(Freq = c(total), proportion = c(total) /sum(clusters$size))
    ## for plotting it needs to contain all annotation categories present in the supercluster
    annot_list_full = list()
    for (i in seq_along(annot_list)){
        annot_list_full[[i]] = total_df
        for (j in rownames(total_df)){
            if (!j %in% rownames(annot_list[[i]])){
                annot_list_full[[i]][j,c('Freq','proportion')] = c(0,0)
            }else{
                annot_list_full[[i]][j,c('Freq','proportion')] = annot_list[[i]][j,c('Freq','proportion')]
            }
        }
    }
    names(annot_list_full) = names(annot_list)
    return(list (clusters = annot_list_full,
                 supercluster = total_df
                 )
           )
}

plot_supercluster = function(G,color_scheme=c("annotation","comparative")){
    ## create plot in coords y: 1-1200, x: 1 - 1400
    ## this is fixed for href to work in exact image areas!
    color_scheme = match.arg(color_scheme)
    LIMS0 = matrix(c (1,1000,1,1000), ncol = 2)
    OFFSET_H = 100; OFFSET_W = 200
    lims = LIMS0 + c(0,OFFSET_W * 2, 0, OFFSET_H * 2)
    ## use full range
    par(mar=c(0,0,0,0),xaxs="i", yaxs="i")
    ## proportion of repeats
    if (color_scheme == "annotation"){
        prop = sapply (V(G)$annotation, FUN = function(x)x$proportion)
        brv = c("#BBBBBB",unique_colors)
    }else{
        prop = sapply (V(G)$comparative_counts, FUN = function(x)x$proportion)
        brv = unique_colors
    }
    ## handle special cases - single cluster with annot or single annotation group
    if (is.null(dim(prop))){
        prop=matrix(prop, nrow = 1)
    }
    if (length(prop)==0){
        ## special case - no annotation at all
        prop = matrix(rep(1,vcount(G)),ncol = vcount(G), dimnames=list("NAN", NULL))
    }else{
        if (color_scheme == "annotation"){
            rownames(prop) = rownames(G$annotation)
        }else{
            rownames(prop) = rownames(G$comparative)
        }
        ## reorder by prop
        if (color_scheme =="annotation"){
            prop = prop[order(prop[,1], decreasing = TRUE),,drop=FALSE]
            NAN = 1 - colSums(prop)
            prop = rbind(NAN, prop)
            brv = c("#BBBBBB",unique_colors)
        }
    }
    L = G$L  + c(OFFSET_H)
  ## for href it is necessary to convert coordinates - the image is flipped vertically
    include = rowSums(prop>0.001, na.rm = TRUE)>=1
    include[1] = TRUE
    prop = prop[include, , drop=FALSE]
    plot(0, 0, type = 'n', xlim = lims[,1], ylim = lims[,2])
    plot_edges(G, L, lwd = 8)
    RADIUS = radius_size(G)
    coords = cbind(G$L[,1,drop=FALSE] + 100, 1100 - G$L[,2,drop=FALSE], RADIUS)
    pieScatter(L[,1], L[,2], t(prop), col = brv,
               radius = RADIUS, new.plot = FALSE, border=NA)
    ## add legend
    legend("topright", legend = rownames(prop),
           col = brv, pch = 15, cex= 1.5, pt.cex= 3)
    ## add cluster labels
    text(x = L[,1], y = L[,2], labels = paste(V(G)$label,"\n",V(G)$size))
    return(coords)
}


radius_size = function(G){
    if (vcount(G) == 1) RADIUS=120
    if (vcount(G) == 2) RADIUS=50
    if (vcount(G) %in% 3:8) RADIUS=40
    if (vcount(G) > 8) RADIUS=30
    return(RADIUS)
}


plot_edges = function(G,L,col="#33000040", lwd = 1){
	e = get.edgelist(G, names = F)
	X0 = L[e[, 1], 1]
	Y0 = L[e[, 1], 2]
	X1 = L[e[, 2], 1]
	Y1 = L[e[, 2], 2]
	segments(X0, Y0, X1, Y1, lwd = lwd,col = col)
}


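## pieScatter draws a pie chart at each (x, y) position with slice proportions
## taken from the corresponding row of prop; used to show the annotation or species
## composition of clusters in the supercluster graph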
pieScatter=function(x, y, prop,radius=1, col=NULL, edges=100, new.plot = TRUE, ...){
    if (new.plot){
        plot(x,y,type='n')
    }
    for (i in seq_along(x)){
        p=prop[i,]
        if (length(radius)==1){
            r=radius
        }else{
            r=radius[i]
        }
        include = p != 0
        floating.pie(x[i], y[i], p[include],
                     col = col[include], radius = r, edges = edges, ...)
    }
}

unique_colors = c("#00FF00", "#0000FF", "#FF0000", 
                  "#FFA6FE", "#FFDB66", "#006401", "#010067", "#95003A",
                  "#007DB5", "#FF00F6", "#FFEEE8", "#774D00", "#90FB92", "#01FFFE",
                  "#0076FF", "#D5FF00", "#FF937E", "#6A826C", "#FF029D",
                  "#FE8900", "#7A4782", "#7E2DD2", "#85A900", "#FF0056",
                  "#A42400", "#00AE7E", "#683D3B", "#BDC6FF", "#263400",
                  "#BDD393", "#00B917", "#9E008E", "#001544", "#C28C9F",
                  "#FF74A3", "#01D0FF", "#004754", "#E56FFE", "#788231",
                  "#0E4CA1", "#91D0CB", "#BE9970", "#968AE8", "#BB8800",
                  "#43002C", "#DEFF74", "#00FFC6", "#FFE502", "#620E00",
                  "#008F9C", "#98FF52", "#7544B1", "#B500FF", "#00FF78",
                  "#FF6E41", "#005F39", "#6B6882", "#5FAD4E", "#A75740",
                  "#A5FFD2", "#FFB167", "#009BFF", "#E85EBE")

if (FALSE){
## For testing
    for ( i in 1:100){
        G = get_supercluster_graph(
            sc=i, seqdb = seqdb,
            hitsortdb = hitsortdb, classification_hierarchy_file = class_file
        )
        png(paste0("/mnt/raid/users/petr/tmp/test.figure_",i,".png"), width = 1400, height=1000)
        plot_supercluster(G)
        dev.off()
        print(i)
    }

}

plotg = function(GG,LL,wlim=NULL,...){
    ## quick plotting function
    e = get.edgelist(GG, names=F)
    w = E(GG)$weight
    if (!is.null(wlim)) {e = e[w > wlim,]; w = w[w > wlim]}
    X0 = LL[e[,1],1]
    Y0 = LL[e[,1],2]
    X1 = LL[e[,2],1]
    Y1 = LL[e[,2],2]
    plot(range(LL[,1]),range(LL[,2]),xlab="",ylab="",axes=FALSE,type="n",...)
    brv = 'grey'
    segments(X0,Y0,X1,Y1,lwd=.5,col=brv)
    points(LL,pch=18,cex=.8,...)
}




get_cluster_info = function(index){
    ## must be connected to the hitsort database (HITSORTDB)
    x = dbGetQuery(HITSORTDB,
                   paste0("SELECT * FROM cluster_info WHERE [index]=",index))
    return(as.list(x))
}

get_supercluster_info = function(sc){
    ## must be connected to the hitsort database (HITSORTDB)
    x = dbGetQuery(HITSORTDB,
                   paste0("SELECT * FROM cluster_info WHERE supercluster=",sc))
    return(x)
}

get_supercluster_summary = function(sc){
    info = get_supercluster_info(sc)
    Nclusters = nrow(info)
    Nreads = supercluster_size(sc)
    N = dbGetQuery(SEQDB, "SELECT count(*) from sequences")[1, 1]
    proportion = Nreads/N
    cluster_list = info$index
    return(list(
        Nreads = Nreads,
        Nclusters = Nclusters,
        proportion = proportion,
        cluster_list = cluster_list
    ))
}


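## get_cluster_connection_info lists clusters connected to the given cluster either
## through shared mate pairs ("pair", including the k bond strength) or through
## similarity hits ("similarity")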
get_cluster_connection_info = function(index, search_type=c("pair", "similarity")){
    search_type = match.arg(search_type)
    if (search_type == "pair"){
        x = dbGetQuery(HITSORTDB,
                       sprintf(
                           "SELECT * FROM cluster_mate_connections WHERE c1==%d OR c2=%d",
                           index, index)
                       )
        if (nrow(x) == 0 ){
            ## no connections
            return(NULL)
        }
        other_cl = ifelse(x$c1 == index, x$c2, x$c1)
        out = data.frame(cl = other_cl, N = x$N)
        out$ids = unlist(strsplit(x$ids, split = ", "))
        k = dbGetQuery(HITSORTDB,
                       sprintf(
                           "SELECT * FROM cluster_mate_bond WHERE c1==%d OR c2=%d",
                           index, index)
                       )
        if (any(k$c1 != x$c1) || any(k$c2 != x$c2)){
            ## tables not in same order
            stop("tables cluster_mate_bond and cluster_mate_connection are not in the same order")
        }
        out$k = k$k
        return(out)
    }else{
        x = dbGetQuery(HITSORTDB,
                       sprintf(
                           "SELECT * FROM cluster_connections WHERE c1==%d OR c2=%d",
                           index, index)
                       )
        if (nrow(x) == 0 ){
            ## no connections
            return(NULL)
        }
        other_cl = ifelse(x$c1 == index, x$c2, x$c1)
        out = data.frame(cl = other_cl, N = x$"count(*)")
        return(out)
    }
}


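## format_clinfo selects the cluster_info fields listed below, renames them to
## human readable labels and returns them as an HTML table without a header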
format_clinfo=function(x){
    include = c(
        "size",
        "size_real",
        "ecount",
        "supercluster",
        "annotations_summary",
        "ltr_detection",
        "pair_completeness",
        "TR_score",
        "TR_monomer_length",
        "loop_index",
        "satellite_probability",
        "TR_consensus",
        "tandem_rank",
        "orientation_score"
    )
    new_names = c(
      "TR_consensus" = "TAREAN consensus",
      "tandem_rank" = "TAREAN_annotation",
      "ltr_detection" = "PBS/LTR",
      'size' = 'number of reads in the graph',
      'size_real' = 'the total number of reads in the cluster',
      'ecount' = "number of edges of the graph:",
      "annotations_summary" = "similarity based annotation"
    )
    values = unlist(x[include]) %>% gsub("\n","<br>", .)
    include = replace(include, match(names(new_names),include), new_names)
    names(values) = include
    ## replace tandem rank with annotation description
    values["TAREAN_annotation"] = RANKS_TANDEM[values["TAREAN_annotation"]]
    ## create HTML table without header
    desc = df2html(data.frame(names(values), values))
    return(desc)
}

create_cluster_report = function(index,
                                 seqdb, hitsortdb,
                                 classification_hierarchy_file,
                                 HTML_LINKS){
    ## index - index of cluster, integer
    ## seqdb, hitsortdb - sqlite databases
    ## classification_hierarchy_file - rds file with classification
    ## other information about cluster is collected from HITSORTDB
    connect_to_databases(seqdb, hitsortdb, classification_hierarchy_file)
    ## basic info about cluster
    clinfo = get_cluster_info(index)
    HTML_LINKS = nested2named_list(HTML_LINKS)
    PNGWIDTH = 1000  # this must be kept so that links in the png work correctly
    PNGHEIGHT = 1000
    LEGENDWIDTH = 300
    PS = 30  ## pointsize for plotting
    MAX_N = 10 ## minimum number of shared mate reads required to plot a mate-cluster connection

    #################################################################################
    ## create HTML title:
    title = paste("Cluster no. ",index)
    htmlheader = gsub("PAGE_TITLE", title, HTMLHEADER)
    cat(htmlheader, file = clinfo$html_report_main)
    file.copy(paste0(WD,"/style1.css"), clinfo$dir)
    HTML.title(title, file = clinfo$html_report_main)
    ## print basic info to HTML header:
    ## make link back to cluster table
    hwrite("Go back to cluster table <br><br>\n", link=HTML_LINKS$CLUSTER_TO_CLUSTER_TABLE) %>%
        cat(file = clinfo$html_report_main, append =TRUE)
    ## create link to the corresponding supercluster
    cat("Cluster is part of ",
        hwrite (paste("supercluster: ", clinfo$supercluster),
                link=sprintf(HTML_LINKS$CLUSTER_TO_SUPERCLUSTER, clinfo$supercluster)),
        "\n<br><br>\n",
        file = clinfo$html_report_main, append =TRUE)
    HTML.title("Cluster characteristics:", HR = 3, file = clinfo$html_report_main)
    cat(format_clinfo(clinfo), file = clinfo$html_report_main, append =TRUE)
    #################################################################################
    ## add link to image with graph layout with domains
    fs = list(
        graph_domains = paste0(clinfo$html_report_files,"/graph_domains.png"),
        graph_mates = paste0(clinfo$html_report_files,"/graph_mates%04d_%04d.png"),
        graph_base = paste0(clinfo$html_report_files,"/graph_base.png"),
        graph_comparative = paste0(clinfo$html_report_files,"/graph_comparative.png")
    )
    fs_relative = list(
        graph_domains = paste0(basename(clinfo$html_report_files),"/graph_domains.png"),
        graph_comparative = paste0(basename(clinfo$html_report_files),"/graph_comparative.png"),
        graph_mates = paste0(basename(clinfo$html_report_files),"/graph_mates%04d_%04d.png")
    )

    dir.create(clinfo$html_report_files)
    load(clinfo$graph_file)
    annot = cluster_annotation(index)
    annot_summary = summarize_annotation(annot, clinfo$size_real)
    vertex_colors = annot2colors(annot,ids = V(GL$G)$name)

    color_proportions = sort(table(vertex_colors$color_table)) / clinfo$size_real
    ## hits with smaller proportion are not shown!
    exclude_colors = names(color_proportions)[color_proportions < 0.001]
    vertex_colors$color_table[vertex_colors$color_table %in% exclude_colors] = "#00000050"
    vertex_colors$legend = vertex_colors$legend[!vertex_colors$legend %in% exclude_colors]

    if (is_comparative()){
        comp_codes = get_comparative_codes()$prefix
        colors = unique_colors[seq_along(comp_codes)]
        names(colors) = comp_codes
        species = substring(V(GL$G)$name,1, nchar(comp_codes[1]))
        ## create image with reads colored by species
        png(fs$graph_comparative, width = PNGWIDTH + LEGENDWIDTH, height = PNGHEIGHT, pointsize = PS)
        layout(matrix(1:2, ncol = 2), width = c(10, 3))
        plotg(GL$G,GL$L, col = colors[species])
        par(mar = c(0, 0, 0, 0))
        plot.new()
        legend("topleft", col=colors,
                   legend = names(colors),
                   pch = 15, cex = 0.7)
        dev.off()
        HTML.title("comparative analysis:", HR=4, file = clinfo$html_report_main)
        html_insert_image(
            img_file = fs_relative$graph_comparative,
            htmlfile = clinfo$html_report_main)
        ## comparative analysis - calculate statistics:
        V1V2 = substring(get.edgelist(GL$G),1, nchar(comp_codes[1]))
        C_observed = table(data.frame(
            V1=factor(V1V2[,1],levels = comp_codes),
            V2=factor(V1V2[,2],levels = comp_codes)
        ))
        C_observed = as.data.frame.matrix(C_observed + t(C_observed))
        P_observed = as.data.frame.matrix(C_observed/sum(C_observed))
        Edge_propotions = table(factor(V1V2, levels = comp_codes))/(length(V1V2))
        P_expected = matrix(Edge_propotions,ncol=1) %*% matrix(Edge_propotions, nrow=1)

        spec_counts = df2html(
            data.frame(table(factor(species, levels=comp_codes))),
            header =  c("Species", "Read count")
        )
        
        edge_count = df2html(
            data.frame(species = comp_codes, (C_observed/2)[,]),
            header = c("", comp_codes)
        )

        P_OE = df2html(
            data.frame(species=comp_codes,(P_observed/P_expected[,])),
            header = c("", comp_codes)
        )

        HTML.title("Comparative analysis - species read counts:", HR =5, file = clinfo$html_report_main)
        cat(spec_counts,
            file = clinfo$html_report_main, append =TRUE)

        # show connection only if more than one species in cluster
        if (length(unique(species))>1){
            HTML.title("comparative analysis - number of edges between species:",
                       HR =5, file = clinfo$html_report_main)
            cat(edge_count,
                file = clinfo$html_report_main, append = TRUE)

            HTML.title("comparative analysis - observed/expected number of edges between species",
                       HR =5, file = clinfo$html_report_main)
            cat(P_OE,
                file = clinfo$html_report_main, append = TRUE)
        }
        
    }


    ## create image with highlighted domains
    png(fs$graph_domains, width = PNGWIDTH + LEGENDWIDTH, height = PNGHEIGHT, pointsize = PS)
    layout(matrix(1:2, ncol = 2), width = c(10, 3))
    plotg(GL$G,GL$L, col = vertex_colors$color_table)
    domains_detected = length(vertex_colors$legend) > 0
    par(mar = c(0, 0, 0, 0))
    plot.new()
    if (domains_detected){
        # domains found
        legend("topleft", col=vertex_colors$legend,
               legend = names(vertex_colors$legend),
               pch = 15, cex = 0.7)
    }
    dev.off()

    HTML.title("protein domains:", HR=4, file = clinfo$html_report_main)
    if (!domains_detected){
        HTML("No protein domains detected", file = clinfo$html_report_main)
    }
    HTML("protein domains:", HR=4, file = clinfo$html_report_main)
    html_insert_image(
        img_file = fs_relative$graph_domains,
        htmlfile = clinfo$html_report_main)

    #############################################################################
    if (nrow(annot_summary) == 0){
        HTML.title("Reads annotation summary", HR = 3, file = clinfo$html_report_main)
        HTML("No similarity hits to repeat databases found", file = clinfo$html_report_main)
    }else{
        HTML(annot_summary, file = clinfo$html_report_main, align = "left")
    }


    ## similarity and mate cluster
    mate_clusters = get_cluster_connection_info(index, search_type="pair")
    similar_clusters = get_cluster_connection_info(index, search_type="similarity")
    ## report mate and similarity clusters
    if (!is.null(similar_clusters)){
        HTML.title("clusters with similarity:", file =clinfo$html_report_main, HR = 3)
        cat(df2html(
            similar_clusters,
            header=c("Cluster","Number of similarity hits"),
            sort_col = "N", scroling = TRUE
            ),
        file =clinfo$html_report_main, append=TRUE)
    }
    if (!is.null(mate_clusters)){
        HTML.title("clusters connected through mates:", file =clinfo$html_report_main, HR = 3)
        cat(df2html(
            mate_clusters[,c('cl','N','k')],
            header = c('Cluster','Number of shared<br> read pairs','k'),
            sort_col = "N", scroling = TRUE
            ),
        file = clinfo$html_report_main,append = TRUE
        )

        ## create base graph images - it will serve as background for
        ## mate clusters plots
        png(fs$graph_base,
            width = PNGWIDTH, height = PNGHEIGHT, pointsize = PS)
        par(mar=c(0,0,0,0),xaxs="i",yaxs="i")
        plotg(GL$G,GL$L, col = "#00000050")
        dev.off()
        ## load base as raster image
        base_image = readPNG(fs$graph_base)

        for (i in order(mate_clusters$N, decreasing = TRUE)){
            mate_ids = unlist(strsplit(mate_clusters$ids[[i]],split=","))
            ## plot the mate graph only if the clusters share at least MAX_N mate reads
            if (length(mate_ids) < MAX_N){
                next
            }
            png(sprintf(fs$graph_mates,index, mate_clusters$cl[i]),
                width = PNGWIDTH, height = PNGHEIGHT, pointsize = PS)
            color_mate = gsub(".$","",V(GL$G)$name) %in% mate_ids %>%
                ifelse("#FF0000FF", "#000000AA")
            par(mar=c(0,0,0,0),xaxs="i",yaxs="i")
            plot(range(GL$L[,1]), range(GL$L[,2]), type = "n", xlab = "", ylab = "", axes = FALSE,
                 main = paste0("CL",index," ----> CL",mate_clusters$cl[i]))
            rasterImage(base_image,
                        range(GL$L[,1])[1], range(GL$L[,2])[1],
                        range(GL$L[,1])[2], range(GL$L[,2])[2]
                        )
            points(GL$L[,1:2], col = color_mate,pch=18,cex=.8)
            dev.off()
            title = paste0("CL",index," ----> CL",mate_clusters$cl[i])
            footer = paste0("No. of shared pairs: :", mate_clusters$N[i])
            html_insert_floating_image(
                img_file = sprintf(fs_relative$graph_mates, index, mate_clusters$cl[i]),
                htmlfile = clinfo$html_report_main, width = 200,
                title = title, footer = footer
            )
        }
    }
}