diff lib/utils.R @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/utils.R	Thu Dec 19 10:24:45 2019 -0500
@@ -0,0 +1,341 @@
+#!/usr/bin/env Rscript
+suppressPackageStartupMessages(library(DBI))
+suppressPackageStartupMessages(library(RSQLite))
+ 
+CONNECTED = FALSE
+if (FALSE) {
+    ## for testing
+    seqdb = "/mnt/raid/spolecny/petr/RE2/comparative_test/sequences.db"
+    hitsortdb = "/mnt/raid/spolecny/petr/RE2/comparative_test/hitsort.db"
+    class_file = "/mnt/raid/users/petr/workspace/repex_tarean/databases/classification_tree.rds"
+    ## connect to sqlite databases
+    SEQDB = dbConnect(RSQLite::SQLite(), seqdb)
+    HITSORTDB = dbConnect(RSQLite::SQLite(), hitsortdb)
+    CLS_TREE = readRDS(class_file)
+}
+
+connect_to_databases = function(seqdb, hitsortdb,classification_hierarchy_file = NULL){
+    if (!CONNECTED){
+        SEQDB <<- dbConnect(RSQLite::SQLite(), seqdb)
+        HITSORTDB <<- dbConnect(RSQLite::SQLite(), hitsortdb)
+        if (!is.null(classification_hierarchy_file)){
+            CLS_TREE <<- readRDS(classification_hierarchy_file)
+        }
+        CONNECTED <<- TRUE
+    }
+}
+
+disconnect_database = function(){
+    if (CONNECTED){
+        dbDisconnect(SEQDB)
+        dbDisconnect(HITSORTDB)
+        CONNECTED <<- FALSE
+    }
+}
+
+nested2named_list = function(x){
+    y = as.list(unlist(x[[1]]))
+    names(y) = unlist(x[[2]])
+    return(y)
+}
+
+is_comparative = function(){
+    prefix_codes = dbGetQuery(SEQDB,"SELECT * FROM prefix_codes")
+    if (nrow(prefix_codes) == 0){
+        return(FALSE)
+    }else{
+        return(TRUE)
+    }
+}
+
+get_comparative_codes = function(){
+    prefix_codes = dbGetQuery(SEQDB,"SELECT * FROM prefix_codes")
+    return(prefix_codes)
+}
+
+add_preamble = function(html_file, preamble){
+  html_content=readLines(html_file)
+  modified_html_content = gsub("<body>",
+       paste("<body>\n", preamble,"\n"),
+       html_content)
+  cat(modified_html_content, file = html_file, sep="\n")
+}
+
+
+df2html = function(df, header = NULL, sort_col = NULL, digits = 3, rounding_function=signif, decreasing = TRUE, scroling = FALSE, width = 300){
+    if (!is.null(sort_col)){
+        df = df[order(df[,sort_col], decreasing = decreasing),]
+    }
+    if (!is.null(digits)){
+        for (i in seq_along(df)){
+            if(is.numeric(df[,i])){
+                df[,i] = rounding_function(df[,i], digits)
+            }
+        }
+    }
+    if (is.null(header)){
+        h = ""
+    }else{
+        h = paste0("    <th>",header,"</th>\n", collapse="") %>%
+            paste0(" <tr>\n", .,"  </tr>\n")
+    }
+    x = apply(df,1,function(x)paste0("    <td>",x,"</td>\n", collapse="")) %>%
+        paste0("  <tr>\n", .,"  </tr>\n", collapse = "")
+    if (scroling){
+        cols = paste0('<col width="',rep(round(100/ncol(df)),ncol(df)),'%">\n',collapse ="")
+        height = min(200, 22 * nrow(df))
+        out = paste0(
+            '<table cellspacing="0" cellpadding="0" border="0" width="',width,'">\n',
+            '  <tr>\n',
+            '    <td>\n',
+            '      <table cellspacing="0" cellpadding="1" border="1" width="', width,'" >\n',
+            cols,
+            h,
+            '      </table>\n',
+            '   </td>\n',
+            ' </tr>\n',
+            ' <tr>\n',
+            '   <td>\n',
+            '     <div style="width:',width,'px; height:',height,'px; overflow:auto;">\n',
+            '       <table cellspacing="0" cellpadding="1" border="1" width="',width,'" >\n',
+            cols,
+            x,
+            '       </table>\n',
+            '     </div>\n',
+            '  </td>\n',
+            ' </tr>\n',
+            '</table>\n'
+        )
+
+    }else{
+        out = paste ("<table>\n", h,x, "</table>\n")
+    }
+    return(out)
+}
+
+start_html = function(filename, header){
+    cat(header, file = filename)
+    html_writer = function(content, fn=HTML, ...){
+        fn(content, append = TRUE, file = filename, ...)
+    }
+}
+
+preformatted = function(x){
+    ## make preformatted html text
+    return(
+        paste(
+        "<pre>\n",
+        x,
+        "</pre>"
+        ,sep="")
+    )
+}
+
+
+summary_histogram = function(fn, N_clustering, N_omit=0, size_adjusted=NULL, top_clusters_prop){
+    ## assume connection do databases
+    communities = dbGetQuery(
+        HITSORTDB,
+        "SELECT DISTINCT cluster, size, supercluster, supercluster_size FROM communities ORDER BY supercluster" 
+    )
+    if (N_omit != 0){
+      ## adjust communities and cluster sizes:
+      cluster_to_adjust = which(
+        communities$size[order(communities$cluster)][1:length(size_adjusted)] != size_adjusted
+      )
+      ## keep original value:
+      communities$size_original = communities$size
+      superclusters_to_adjust = unique(communities$supercluster[communities$cluster %in% cluster_to_adjust])
+      for (cl in cluster_to_adjust){
+        communities[communities$cluster == cl,'size'] = size_adjusted[cl]
+      }
+      for (cl in superclusters_to_adjust){
+        communities[communities$supercluster == cl,'supercluster_size'] =
+          sum(communities[communities$supercluster == cl,'size'])
+      }
+    }else{
+      cluster_to_adjust=NULL
+    }
+     singlets = N_clustering - sum(communities$size)
+
+     supercluster_size = sort(unique(communities[, c('supercluster', 'supercluster_size')])$supercluster_size, decreasing =  TRUE) 
+
+
+   clid2size = sort(communities$size, decreasing = TRUE)
+
+    cluster_id = split(communities$cluster, communities$supercluster)
+    cluster_id_sort = lapply(cluster_id, function(x)x[order(clid2size[x], decreasing = FALSE)])
+
+  cluster_size_unsorted = split(communities$size, communities$supercluster)
+    cluster_size_sort = lapply(cluster_size_unsorted, function(x) (sort(x)))
+    ## reorder by size of superclusters
+    cluster_size_sort_sort = cluster_size_sort[order(sapply(cluster_size_sort, sum), decreasing = TRUE)]
+    cluster_id_sort_sort = cluster_id_sort[order(sapply(cluster_size_sort, sum), decreasing = TRUE)]
+
+  
+    Nmax = max(sapply(cluster_size_sort_sort, length))
+    M = cbind(
+        sapply(cluster_size_sort_sort, function(x)y = c(x, rep(0, 1 + Nmax - length(x)))),
+        c(1, rep(0, Nmax)) 
+    )
+
+    Mid =  cbind(
+        sapply(cluster_id_sort_sort, function(x)y = c(x, rep(0, 1 + Nmax - length(x)))),
+        c(1, rep(0, Nmax)) 
+    )
+
+    recolor = matrix(ifelse(Mid %in% cluster_to_adjust,TRUE,FALSE), ncol=ncol(Mid))
+    indices = which(recolor, arr.ind = TRUE)
+
+  
+    png(fn, width = 1200, height = 700, pointsize = 20)
+    barplot(M,
+            width = c(supercluster_size, singlets),
+            space = 0, ylim = c(0, max(supercluster_size) * 1.2),
+            col = c("#CCCCCC"), names.arg = rep("", ncol(M)))
+    plot(0,
+         xlim = c(0, sum(c(supercluster_size, singlets))),
+         ylim = c(0, max(supercluster_size) * 1.2),
+         type = "n", yaxs = 'i', axes = FALSE, xlab = "", ylab = "")
+
+    rect(0, 0,
+         sum(supercluster_size),
+         max(supercluster_size) * 1.2,
+         col = "#0000FF10")
+
+    rect(sum(supercluster_size), 0,
+         sum(supercluster_size) + singlets,
+         max(supercluster_size) * 1.2,
+         col = "#FFAAFF10")
+
+    barplot(M,
+            width = c(supercluster_size, singlets),
+            space = 0, ylim = c(0, max(supercluster_size) * 1.2),
+            col = "#AAAAAA", names.arg = rep("", ncol(M)), add = TRUE,
+            xlab = "Proportion of reads [%]", ylab = "Number of reads",
+            main = paste(N_clustering, "reads total"))
+
+
+    for (i in seq_along(indices[,1])){
+      y1 = sum(M[1:indices[i,'row'],indices[i,'col']])
+      x1 = sum(M[,1:indices[i,'col']])
+      if(indices[i,'row'] == 1){
+        y0=0
+      }else{
+        y0 = sum(M[1:(indices[i,'row']-1),indices[i,'col']])
+      }
+      if (indices[i,'col']==1){
+        x0=0
+      }else{
+        x0 = sum(M[,1:(indices[i,'col']-1)])
+      }
+      rect(x0,y0,x1,y1, col="#88FF88")
+    }
+    abline(v=top_clusters_prop * sum(c(supercluster_size, singlets)), col="#00000088", lwd=3, lty=3)
+
+    text(sum(supercluster_size) / 2,
+         max(supercluster_size) * 1.05,
+         labels = paste0(sum(supercluster_size), " reads in\n",
+                      length(supercluster_size), " supeclusters (", nrow(communities), " clusters)")
+         )
+
+    
+    text(sum(supercluster_size) + singlets / 2,
+         max(supercluster_size) * 1.05,
+         labels = paste(singlets, "singlets"))
+    
+    axis(1,at=seq(0,N_clustering,length.out=11),label=seq(0,100,by=10))
+  dev.off()
+  clustering_info = list(
+    Number_of_reads_in_clusters = sum(supercluster_size),
+    Number_of_clusters =  nrow(communities),
+    Number_of_superclusters = length(supercluster_size),
+    Number_of_singlets = singlets
+  )
+  return(clustering_info)
+}
+
+
+rectMap=function(x,scale.by='row',col=1,xlab="",ylab="",grid=TRUE,axis_pos=c(1,4),cexx=NULL,cexy=NULL){
+  if (scale.by=='row'){
+                                        #x=(x)/rowSums(x)
+    x=(x)/apply(x,1,max)
+  }
+  if (scale.by=='column'){
+    x=t(t(x)/apply(x,2,max))
+  }
+  nc=ncol(x)
+  nr=nrow(x)
+  coords=expand.grid(1:nr,1:nc)
+  plot(coords[,1],coords[,2],type='n',axes=F,xlim=range(coords[,1])+c(-.5,.5),ylim=range(coords[,2])+c(-.5,.5),xlab=xlab,ylab=ylab)
+  axis(axis_pos[1],at=1:nr,labels=rownames(x),lty=0,tick=FALSE,line=0,cex.axis=0.5/log10(nr))
+  axis(axis_pos[2],at=1:nc,labels=colnames(x),lty=0,tick=FALSE,las=2,line=0 ,hadj=0, cex.axis=0.7)
+  axis(2,at=1:nc,labels=colnames(x),lty=0,tick=FALSE,las=2,line=0 ,hadj=1, cex.axis=0.7)
+
+  mtext(side = 1, "Cluster id", las=1, line = 3, cex = 0.5)
+  line = 1.5 + log10(nr)
+  mtext(side = 2, "Proportions of individual samples", las =0, line = line, cex = 0.5)
+  s=c(x)/2  # to get it proportional
+  w = c(x)/2
+  rect(coords[,1]-0.5,coords[,2]-s,coords[,1]+0.5,coords[,2]+s,col=col,border=NA)
+  if (grid){
+    abline(v=0:(nr)+.5,h=0:(nc)+.5,lty=2,col="#60606030")
+  }
+  box(col="#60606030",lty=2)
+}
+
+plot_rect_map = function(read_counts,cluster_annotation, output_file,Xcoef=1,Ycoef=1){
+  counts = read.table(read_counts,header=TRUE,as.is=TRUE)
+  annot = read.table(cluster_annotation, sep="\t",header=FALSE,as.is=TRUE)
+  N = nrow(annot)
+  colnames(annot) = c("cluster", "Automatic.classification")
+  annot$number.of.reads = rowSums(counts[1 : nrow(annot) ,-1])
+  unique_repeats = names(sort(table(c(annot$Automatic.classification,rep('nd',N))),decreasing = TRUE))
+
+  M = as.matrix(counts[1:N,-(1:2)])
+  rownames(M) = paste0("CL",rownames(M))
+  Mn1=(M)/apply(M,1,max)
+  Mn2=M/max(M)
+  Mn2=M/apply(M,1,sum)
+
+  ord1 = hclust(dist(Mn1),method = "ward.D")$order
+  ord2 = hclust(dist(t(Mn2)))$order
+  wdth = (400 + N*10 ) * Xcoef
+  hgt = (600 + ncol(M)*50) * Ycoef
+  ptsize = round((wdth*hgt)^(1/4))
+  png(output_file, width=wdth,height=hgt, pointsize = ptsize)  # was 50
+  ploting_area_width = 3 + log10(N)*3
+  ploting_area_sides = 1
+  layout(matrix(c(4,2,3,4,1,3),ncol=3,byrow = TRUE),
+         width=c(ploting_area_sides,ploting_area_width,ploting_area_sides),
+         height=c(3,ncol(M)*0.5))
+  par(xaxs='i', yaxs = 'i')
+  par(las=2,mar=c(4,0,0,0),cex.axis=0.5)
+  rectMap(Mn2[ord1,ord2],scale.by='none',col=1, grid=TRUE)
+  par(las=2,mar=c(1,0,1,0), mgp = c(2,0.5,0))
+  barplot(annot$number.of.reads[ord1], col = 1)
+  mtext(side = 2, "Cluster size", las = 3, line = 2, cex = 0.5)
+  par(mar=c(0,0,10,0))
+  plot.new()
+  st = dev.off()
+  ## calculate coordinated if boxes to create hyperlink
+  X0 = wdth/(ploting_area_sides * 2 + ploting_area_width)* ploting_area_sides
+  X1 = wdth/(ploting_area_sides * 2 + ploting_area_width)*(ploting_area_sides + ploting_area_width)
+  L = round(seq(X0,X1, length.out = N + 1)[1:N])
+  R = round(seq(X0,X1, length.out = N + 1)[2:(N + 1)])
+  cn = rownames(Mn2[ord1,ord2])
+  cluster_links = paste0(
+    "seqclust/clustering/clusters/dir_CL",
+    sprintf("%04d", as.integer(substring(cn,3 ))),
+    "/index.html")
+  coords = paste0(L, ",", 1, ",", R, ",", hgt)
+  clustermap = paste0(
+    '\n<map name="clustermap"> \n',
+    paste0(
+      '<area shape="rect"\n      coords="',coords, '"\n',
+      '      href="', cluster_links, '"\n',
+      '      title="', cn, '"/>\n',
+      collapse = ""),
+    "</map>\n")
+  return(clustermap)
+}