diff lib/tarean/methods.R @ 0:1d1b9e1b2e2f draft
Uploaded
author   | petr-novak
date     | Thu, 19 Dec 2019 10:24:45 -0500
parents  |
children |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/tarean/methods.R	Thu Dec 19 10:24:45 2019 -0500
@@ -0,0 +1,1536 @@
#!/usr/bin/env Rscript

suppressPackageStartupMessages(library(igraph))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(Biostrings))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(hwriter))
suppressPackageStartupMessages(library(R2HTML))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(dplyr))

max_ORF_length = function(s) {
    ## check all frames
    L = 0
    for (i in 1:3) {
        L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(s, i))), "*",
            fixed = TRUE))))
        L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(reverseComplement(s),
            i))), "*", fixed = TRUE))))
    }
    return(L)
}

kmers2graph = function(kmers, mode = "strong", prop = NULL) {
    kmerLength = nchar(kmers[1, 1])
    if (ncol(kmers) == 2) {
        kmers$size = kmers[, 2]/sum(kmers[, 2])
    }
    colnames(kmers) = c("name", "count", "size")
    if (!is.null(prop)) { # this is not used (prop is NULL), and it is probably wrong - filtering is done before sorting!!
        p = cumsum(kmers$size)
        kmers = kmers[p < prop, ]
    }
    N = dim(kmers)[1]
    kmers = kmers[order(kmers$size), ]
    ## convert kmers to fasta file
    kms = data.frame(kmer = substring(kmers$name, 1, kmerLength - 1), ids = 1:nrow(kmers), stringsAsFactors = FALSE)
    kme = data.frame(kmer = substring(kmers$name, 2), ide = 1:nrow(kmers), stringsAsFactors = FALSE)

    ## df = merge(kms, kme, by = 'kmer', all = FALSE)[, c(2, 3, 1)]
    df = inner_join(kme, kms, by = 'kmer')[, c(2, 3)]

    ## names(kms) = seq_along(kms)
    ## kme = substring(kmers$name, 2)
    ## names(kme) = seq_along(kme)
    ## ## use new blast!
    ## database = tempfile()
    ## query = tempfile()
    ## output = tempfile()
    ## writeXStringSet(DNAStringSet(kms), filepath = database, format = "fasta")
    ## writeXStringSet(DNAStringSet(kme), filepath = query, format = "fasta")
    ## cmd = paste("makeblastdb -in", database, "-dbtype nucl")
    ## system(cmd, ignore.stdout = TRUE)
    ## cmd = paste("blastn -outfmt '6 qseqid sseqid pident' -strand plus -dust no -perc_identity 100 -query ",
    ##     query, "-db", database, "-word_size", kmerLength - 1, "-out", output)
    ## system(cmd)
    ## df = try({
    ##     read.table(output, as.is = TRUE)
    ## })
    ## if (class(df) == "try-error") {
    ##     print("creation of kmer graph failed")
    ##     print(query)
    ##     print(output)
    ##     print(database)
    ##     return(NULL)
    ## }
    ## unlink(query)
    ## unlink(paste(database, "*", sep = ""))
    ## unlist(output)
    gm_mean = function(x, na.rm = TRUE) {
        exp(sum(log(x[x > 0]), na.rm = na.rm)/length(x))
    }

    whg = apply(cbind(kmers[df[, 1], 2], V2 = kmers[df[, 2], 2]), 1, gm_mean)
    G = graph.data.frame(data.frame(V1 = kmers$name[df[, 1]], V2 = kmers$name[df[,
        2]], weight = whg), vertices = kmers[, 1:3])
    ## separate into connected components:
    ccs = clusters(G, mode = mode)$membership
    sel_cls = which(tabulate(ccs) > 1)
    Gs = list()
    for (i in seq_along(sel_cls)) {
        Gs[[i]] = induced.subgraph(G, vids = which(ccs %in% sel_cls[i]))
    }
    ## reorder!!!
    Gs = Gs[order(sapply(Gs, vcount), decreasing = TRUE)]
    return(Gs)
}
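
## Illustrative sketch only (not executed; guarded like the testing block in
## detect_pbs below). The 8-mers and counts are invented; real input is the
## k-mer count table produced by kmer_counting.py. Plotting the result also
## needs the runOGDFlayout binary for the layout.
if (FALSE) {
    km = data.frame(name = c("ACGTACGT", "CGTACGTA", "GTACGTAC", "TACGTACG"),
                    count = c(10, 9, 8, 7), stringsAsFactors = FALSE)
    Gs = kmers2graph(km)   # list of connected components, largest first
    vcount(Gs[[1]])        # 4 - the toy 8-mers form one circular component
}
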
OGDFlayout = function(G, ncol = NULL, alg = "fmmm", OGDF = getOption("OGDF")) {
    ## is the ogdf binary available?
    if (is.null(OGDF)) {
        OGDF = Sys.getenv("OGDF")
        if ("" == OGDF) {
            options(warn = -1)
            OGDF = system("which runOGDFlayout", intern = TRUE)
            options(warn = 0)
            if (length(OGDF) == 0) {
                cat("path to runOGDFlayout not found\n")
                return(NULL)
            }
        }
    }
    if (is.null(ncol)) {
        if (is.null(E(G)$weight)) {
            el = data.frame(get.edgelist(G, names = TRUE), rep(1, ecount(G)))
        } else {
            el = data.frame(get.edgelist(G, names = TRUE), E(G)$weight)
        }
        ncol = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout", sep = "")
        write.table(el, file = ncol, row.names = FALSE, col.names = FALSE, sep = "\t",
            quote = FALSE)
    } else {
        ## copy ncol:
        ncol_tmp = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout",
            sep = "")
        file.copy(ncol, ncol_tmp)
        ncol = ncol_tmp
    }
    algopt = c("fmmm", "sm", "fme")
    if (!all(alg %in% algopt)) {
        stop("alg must be one of: ", paste(algopt, collapse = ", "), "\n")
    }

    ## output file:
    Louts = list()
    layout_file = tempfile(pattern = as.character(Sys.getpid()))
    for (i in alg) {
        cmd = paste(OGDF, "-alg", i, "-iff layout -off layout -out", layout_file,
            ncol)
        system(cmd, intern = TRUE)
        L = read.table(layout_file, skip = ecount(G))
        L = L[match(V(G)$name, L$V2), ]
        Lout = as.matrix(L[, -(1:2)])
        unlink(layout_file)
        Louts[[i]] = Lout
    }
    ## clean up
    unlink(ncol)
    return(Louts)
}

xcolor_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00")

kmers2color = function(s, position = NULL) {
    if (is.null(position)) {
        position = round(nchar(s[1])/2, 0)
        ## position = 1
        position = nchar(s[1])
    }
    color_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00")
    color_base = substring(s, position, position)
    colors = color_code[color_base]
    names(colors) = color_base
    return(colors)
}

get_sequence = function(g, v, position = NULL) {
    s = V(g)$name
    if (is.null(position)) {
        position = round(nchar(s[1])/2, 0)
        ## position = 1
        position = nchar(s[1])
    }
    nt = paste(substring(s[v], position, position), collapse = "")
    return(nt)
}


get_mimimal_cc = function(km, thr = 20, min_coverage = 0.45, step = 2, start = NULL) {
    if (is.null(start)) {
        i = sum(cumsum(km$freq) < 0.5)
    } else {
        i = sum(cumsum(km$freq) < start)
    }
    continue = TRUE
    while (continue) {
        if (i > nrow(km)) {
            i = nrow(km)
            continue = FALSE
            step = 1
        }
        GG = kmers2graph(km[1:i, ])
        if (length(GG) > 0) {
            if (vcount(GG[[1]]) > thr) {
                if (sum(V(GG[[1]])$size) >= min_coverage) {
                    GG[[1]]$input_coverage = sum(km$freq[1:i])
                    GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
                    return(GG[[1]])
                }
            }
        }
        i = round(i * step)
    }
    if (length(GG) == 0 | is.null(GG)) {
        return(NULL)
    }

    GG[[1]]$input_coverage = sum(km$freq[1:i])
    GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
    return(GG[[1]])
}



paths2string = function(paths) {
    pathstring = sapply(lapply(lapply(paths, as_ids), substring, 1, 1), paste, collapse = "")
    return(pathstring)
}
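
## Illustrative sketch only: paths2string spells a monomer sequence by taking
## the first base of each k-mer along a vertex path. The 3-mers are invented.
if (FALSE) {
    g = graph.data.frame(data.frame(V1 = c("ACG", "CGT", "GTA"),
                                    V2 = c("CGT", "GTA", "ACG")))
    p = list(V(g)[c("ACG", "CGT", "GTA")])
    paths2string(p)  # "ACG": first letters of the path vertices
}
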

align_paths = function(paths, G) {
    shift = rep(NA, length(paths))
    thr = 0 # minimal length
    tr_paths = list()
    Seqs = list()
    centre_node = as.numeric(names(sort(table(unlist(paths)), decreasing = TRUE)))[[1]]

    for (i in seq_along(paths)) {
        if (centre_node %in% paths[[i]]) {
            S = which(paths[[i]] %in% centre_node)
            shift[i] = S
            if (S == 1) {
                tr_paths[[i]] = paths[[i]]
            } else {
                tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S -
                    1)])
            }
            Seqs[[i]] = get_sequence(G, tr_paths[[i]])
        } else {
            shift[i] = NA
        }
    }
    paths_n = lapply(paths, as.numeric)
    tr_paths_n = do.call(cbind, lapply(tr_paths, as.numeric))
    new_shift = shift
    for (i in which(is.na(shift))) {
        score = numeric(length(paths_n))
        for (S in seq_along(paths_n[[i]])) {
            if (S == 1) {
                path_tmp_n = paths_n[[i]]
            } else {
                path_tmp_n = c(paths_n[[i]][S:length(paths_n[[i]])], paths_n[[i]][1:(S -
                    1)])
            }
            score[S] = sum(tr_paths_n == path_tmp_n)
        }
        if (sum(score) != 0) {
            S = which.max(score)
            new_shift[i] = S
            if (S == 1) {
                tr_paths[[i]] = paths[[i]]
            } else {
                tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S -
                    1)])
            }
            Seqs[[i]] = get_sequence(G, tr_paths[[i]])
        }
    }
    shift = new_shift
    ## try to shift based on the sequences themselves
    return(list(Seqs = Seqs[!is.na(shift)], tr_paths = tr_paths[!is.na(shift)], shift = shift[!is.na(shift)]))
}

make_consensus = function(paths_info, G) {
    include = !is.na(paths_info$shift)
    ## make alignments using mafft
    aln = mafft(unlist(paths_info$Seqs[include]))
    CM = calculate_consensus_matrix(aln = aln, tr_paths = paths_info$tr_paths[include],
        G = G)
    gaps = get_gaps_from_alignment(aln)
    CMnorm = CM/rowSums(CM)
    bases = colnames(CM)
    consensus = sapply(apply(CMnorm, 1, function(x) bases[which(x > 0.2)][order(x[x >
        0.2], decreasing = TRUE)]), paste, collapse = "")
    consensus2 = gsub("-", "", paste0(substring(consensus, 1, 1), collapse = ""),
        fixed = TRUE)
    number_of_SNP = sum(rowSums(CM > 0) > 1)
    SNP_positions = which(rowSums(CM > 0) > 1)
    number_of_position_with_indels = sum(colSums(do.call(rbind, strsplit(as.character(aln),
        "")) == "-") > 0)
    indel_positions = which(colSums(do.call(rbind, strsplit(as.character(aln), "")) ==
        "-") > 0)
    if (length(SNP_positions) > 0) {
        variable_sites = unlist(c(c(mapply(paste, strsplit((sapply(apply(CMnorm,
            1, function(x) bases[which(x > 0.2)]), paste, collapse = ""))[SNP_positions],
            ""), SNP_positions, sep = "_")), paste("-", indel_positions, sep = "_")))
    } else {
        variable_sites = NULL
    }
    variable_positions = unique(SNP_positions, indel_positions)
    return(list(aln = aln, CM = CM, CMnorm = CMnorm, consensus = consensus, consensus2 = consensus2,
        number_of_SNP = number_of_SNP, SNP_positions = SNP_positions, number_of_position_with_indels = number_of_position_with_indels,
        indel_positions = indel_positions, variable_positions = variable_positions,
        variable_sites = variable_sites, gaps = gaps))
}


estimate_monomer = function(G, weights = NULL, limit = NULL) {
    if (is.null(G)) {
        return(NULL)
    }
    ## estimate monomer from kmer based graph
    V(G)$id = 1:vcount(G)
    GS = induced_subgraph(G, vids = which(degree(G) == 2)) ## keep only vertices without branching
    cls = clusters(GS)$membership

    ids = mapply(FUN = function(x, y) x[which.max(y)], split(V(GS)$id, cls), split(V(GS)$size,
        cls)) ## from each branch use only the one vertex with the largest size!

    ids = ids[order(V(G)$size[ids], decreasing = TRUE)]
    ids_size = V(G)$size[ids]
    N50 = sum(cumsum(ids_size)/sum(ids_size) < 0.5)
    if (length(ids) > 10000) {
        ids = ids[1:N50]
    }
    ## use only large vertices in the search! how many?
    el = get.edgelist(G, names = FALSE)
    node_use = numeric(vcount(G))
    LL = numeric()
    ## W = numeric()
    i = 0
    paths = list()
    if (is.null(weights)) {
        weights = (max(E(G)$weight) - (E(G)$weight) + 1)
        weights = E(G)$weight^(-3)
    }
    included = rep(FALSE, vcount(G))
    W_total = sum(V(G)$size)

    coverage = numeric()
    t0 = c()
    i = 0
    j = 0
    for (n in ids) {
        j = j + 1
        t0[j] = Sys.time()
        if (included[n]) {
            next
        }
        m = which(el[, 1] %in% n)
        i = i + 1
        s = get.shortest.paths(G, el[m, 2], el[m, 1], weights = weights, output = "vpath")
        included[as.numeric(s$vpath[[1]])] = TRUE
        paths[[i]] = s$vpath[[1]]
        LL[i] = (length(s$vpath[[1]]))
    }

    ## evaluate if paths should be divided into variants - by length and path weight
    paths_clusters = split(paths, LL)
    paths_clusters_tr = mclapply(paths_clusters, FUN = align_paths, G = G, mc.cores = getOption("CPU"))

    ## paths_clusters_tr = lapply(paths_clusters, FUN = align_paths, G = G) consensus
    paths_consensus = mclapply(paths_clusters_tr, make_consensus, G = G, mc.cores = getOption("CPU"))

    ## evaluate weight for individual paths:
    for (v in seq_along(paths_consensus)) {
        p = paths_clusters_tr[[v]]$tr_paths
        ## clean
        p = p[!sapply(p, function(x) anyNA(x) | is.null(x))]
        L = sapply(p, length)
        p_groups = split(p, L)
        w_groups = sapply(p_groups, function(x) sum(V(G)$size[unique(c(sapply(x,
            as.numeric)))]))
        total_score = sum(V(G)$size[unique(c(unlist(sapply(p, as.numeric))))])
        LW = data.frame(`Length estimate` = unique(L), weight = w_groups, stringsAsFactors = FALSE)
        LW = LW[order(LW$weight, decreasing = TRUE), ]
        rownames(LW) = NULL
        paths_consensus[[v]]$total_score = total_score
        paths_consensus[[v]]$length_variant_score = LW
    }
    return(list(estimates = paths_consensus, paths = paths_clusters_tr))
}



detect_pbs = function(dimers_file, tRNA_database_path, reads_file, output) {
    ## reads_file contains oriented reads!
    min_offset = 10
    max_end_dist = 2
    min_aln_length = 30
    max_pbs_search = 30
    thr_length = 12
    end_position = 23
    insertion_proportion_threshold <- 0.15
    ## for testing
    if (FALSE) {
        library(Biostrings)
        thr_length = 12
        end_position = 23
        dimers_file = "consensus_dimer.fasta"
        reads_file = "reads_oriented.fas"
        tRNA_database_path = "/mnt/raid/users/petr/workspace/repex_tarean/databases/tRNA_database2.fasta"
    }
    ## find reads which are half-aligned to the reference dimer
    dimers_seq = readDNAStringSet(dimers_file)
    cmd = paste("makeblastdb -in", dimers_file, "-dbtype nucl")
    system(cmd, ignore.stdout = TRUE)
    output = paste0(reads_file, "blast_out.csv")
    columns = c("qseqid", "qstart", "qend", "qlen", "sseqid", "sstart", "send", "sstrand",
        "slen", "pident", "length")
    cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "), "' -perc_identity 90 -query ",
        reads_file, "-db", dimers_file, "-word_size", 7, "-dust no -num_alignments 99999 -strand plus ",
        "-out", output)
    system(cmd)
    blastdf = read.table(output, as.is = TRUE, col.names = columns, comment.char = "")
    unlink(output)
    blastdf = blastdf[blastdf$length >= min_aln_length, ]

    ## expand to the whole read to see unaligned read parts
    blastdf_expand = blastdf
    blastdf_expand$qstart = blastdf$qstart - (blastdf$qstart - 1)
    blastdf_expand$sstart = blastdf$sstart - (blastdf$qstart - 1)
    blastdf_expand$qend = blastdf$qend + (blastdf$qlen - blastdf$qend)
    blastdf_expand$send = blastdf$send + (blastdf$qlen - blastdf$qend)
    pS = blastdf$sstart
    pE = blastdf$send
    pSF = blastdf_expand$sstart
    pEF = blastdf_expand$send
    cond1 = pS - pSF >= min_offset & blastdf$qend >= (blastdf$qlen - max_end_dist) &
        blastdf$sstart >= max_pbs_search
    cond2 = pEF - pE >= min_offset & blastdf$qstart <= max_end_dist & blastdf$send <=
        blastdf$slen - max_pbs_search

    ## coverage of alignments: evaluate coverage at sites with breaks; it is necessary
    ## that there are at least 20% interrupted alignments at a given position for it
    ## to be considered in the additional search
    coverage_profiles = subject_coverage(blastdf)

    ## extract flanking sequences - ca. 50 nt, they should contain the tRNA sequence
    ## search Left
    fin = tempfile()
    fout = tempfile()
    scoreL = scoreR = 0

    ## left side unique insertion sites
    if (any(cond1)) {
        insertion_sites = ddply(blastdf[cond1, ], .(sseqid, sstart), nrow)
        ## check coverage
        insertion_sites$coverage = 0
        for (i in 1:nrow(insertion_sites)) {
            insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$sstart[[i]]]
        }
        insertion_OK_left <- insertion_sites[with(insertion_sites, V1/coverage >
            insertion_proportion_threshold), ]

        if (nrow(insertion_OK_left) > 0) {
            s = ifelse(insertion_OK_left[, "sstart"] - max_pbs_search < 1, 1, insertion_OK_left[,
                "sstart"] - max_pbs_search)
            Lpart = subseq(dimers_seq[match(insertion_OK_left$sseqid, names(dimers_seq))],
                s, insertion_OK_left$sstart)

            ## names are CONSENSUSID__READID_position
            names(Lpart) = paste0(names(Lpart), "__", insertion_OK_left$V1, "__",
                insertion_OK_left$sstart)
            ## check presence of the TG dinucleotide
            TG = vcountPattern("TG", subseq(dimers_seq[match(insertion_OK_left$sseqid,
                names(dimers_seq))], insertion_OK_left$sstart - 2, insertion_OK_left$sstart +
                2))
            if (any(TG > 0)) {
                writeXStringSet(Lpart[TG > 0], filepath = fin)
                cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "),
                    "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path,
                    "-word_size", 7, "-strand plus -dust no -max_target_seqs 10000 -evalue 100",
                    "-out", fout)

                system(cmd, ignore.stdout = TRUE)
                df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
                filter1 = df$length >= thr_length
                filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
                df_pass = df[filter1 & filter2, , drop = FALSE]
                df_pass_L = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
                scoreL = get_score(df_pass_L)
                write.table(df_pass_L, file = paste0(output, "_L.csv"), row.names = FALSE,
                    sep = "\t")
            }
        }
    }
    if (any(cond2)) {
        ## search Right
        insertion_sites = ddply(blastdf[cond2, ], .(sseqid, send, slen), nrow)
        ## check coverage
        insertion_sites$coverage = 0
        for (i in 1:nrow(insertion_sites)) {
            insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$send[[i]]]
        }
        insertion_OK_right <- insertion_sites[with(insertion_sites, V1/coverage >
            insertion_proportion_threshold), ]

        if (nrow(insertion_OK_right) > 0) {
            s = ifelse(insertion_OK_right$send + max_pbs_search > insertion_OK_right$slen,
                insertion_OK_right$slen, insertion_OK_right$send + max_pbs_search)
            Rpart = subseq(dimers_seq[match(insertion_OK_right$sseqid, names(dimers_seq))],
                insertion_OK_right$send, s)
            names(Rpart) = paste0(names(Rpart), "__", insertion_OK_right$V1, "__",
                insertion_OK_right$send)

            ## check presence of the CA dinucleotide
            CA = vcountPattern("CA", subseq(dimers_seq[match(insertion_OK_right$sseqid,
                names(dimers_seq))], insertion_OK_right$send - 2, insertion_OK_right$send +
                2))
            if (any(CA > 0)) {
                writeXStringSet(Rpart[CA > 0], filepath = fin)
                cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "),
                    "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path,
                    "-word_size", 7, "-strand minus -dust no -max_target_seqs 10000 -evalue 100",
                    "-out", fout)

                system(cmd, ignore.stdout = TRUE)
                df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
                filter1 = df$length >= thr_length
                filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
                df_pass = df[filter1 & filter2, , drop = FALSE]
                df_pass_R = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
                write.table(df_pass_R, file = paste0(output, "_R.csv"), row.names = FALSE,
                    sep = "\t")
                scoreR = get_score(df_pass_R)
            }
        }
    }
    unlink(fin)
    unlink(fout)
    return(max(scoreL, scoreR))
}


subject_coverage = function(blastdf) {
    ## calculate coverage for all blast subjects
    coverage_profiles <- by(blastdf, INDICES = blastdf$sseqid, FUN = function(x) {
        as.numeric(coverage(IRanges(start = x$sstart, end = x$send)))
    })
    return(coverage_profiles)
}
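
## Illustrative sketch only: per-position coverage from a toy blast table;
## only the sseqid/sstart/send columns are used and all values are invented.
if (FALSE) {
    toy = data.frame(sseqid = c("c1", "c1", "c2"),
                     sstart = c(1, 5, 2), send = c(10, 15, 8),
                     stringsAsFactors = FALSE)
    cp = subject_coverage(toy)
    cp[["c1"]]  # coverage at positions 1..15: 1,1,1,1,2,2,2,2,2,2,1,1,1,1,1
}
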
params = " -windowsize 30 -threshold 45 " + } + library(Biostrings) + if (class(seq1) != "DNAStringSet") { + seq1 = DNAStringSet(seq1) + } + if (class(seq2) != "DNAStringSet") { + seq2 = DNAStringSet(seq2) + } + L1 = nchar(seq1) + L2 = nchar(seq2) + + tmpdat1 = tempfile() + + dir.create(tmpdat1) + tmpdat2 = tempfile() + dir.create(tmpdat2) + oridir = getwd() + + + seq1_merged = DNAStringSet(paste(seq1, collapse = "")) + seq2_merged = DNAStringSet(paste(seq2, collapse = "")) + seq2rc_merged = reverseComplement(DNAStringSet(paste(seq2, collapse = ""))) + + + sf1 = tempfile("seq1") + writeXStringSet(seq1_merged, filepath = sf1) + sf2 = tempfile("seq2") + sf2rc = tempfile("seq2rc") + writeXStringSet(seq2_merged, filepath = sf2) + writeXStringSet(seq2rc_merged, filepath = sf2rc) + + cmd1 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2, params) + cmd2 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2rc, + params) + setwd(tmpdat1) + output1 = system(cmd1, intern = TRUE) + setwd(tmpdat2) + output2 = system(cmd2, intern = TRUE) + setwd(oridir) + + + + fout1 = strsplit(tail(output1, n = 1), split = " ")[[1]][2] + rawdat1 = readLines(paste(tmpdat1, "/", fout1, sep = "")) + rawdat1 = rawdat1[1:min(grep("^Rectangle", rawdat1))] + + if (length(rawdat1[grep("^Line", rawdat1)]) == 0) { + coord1 = NULL + } else { + coord1 = apply(sapply(strsplit(rawdat1[grep("^Line", rawdat1)], " "), "[", + c(3, 5, 7, 9, 11)), 1, as.numeric) + coord1 = matrix(coord1, ncol = 5) + } + + fout2 = strsplit(tail(output2, n = 1), split = " ")[[1]][2] + rawdat2 = readLines(paste(tmpdat2, "/", fout2, sep = "")) + rawdat2 = rawdat2[1:min(grep("^Rectangle", rawdat2))] + + if (length(rawdat2[grep("^Line", rawdat2)]) == 0) { + coord2 = NULL + } else { + coord2 = apply(sapply(strsplit(rawdat2[grep("^Line", rawdat2)], " "), "[", + c(3, 5, 7, 9, 11)), 1, as.numeric) + coord2 = matrix(coord2, ncol = 5) + } + unlink(sf1) + unlink(sf2) + unlink(sf2rc) + unlink(tmpdat1, recursive = TRUE) + unlink(tmpdat2, recursive = TRUE) + + N1 = sum(nchar(seq1)) + N2 = sum(nchar(seq2)) + op = par(xaxs = "i", yaxs = "i", mar = c(5, 2, 6, 10), las = 1) + on.exit(par(op)) + plot(c(1, N1), c(1, N2), type = "n", xlab = "", ylab = "", axes = FALSE) + if (!is.null(coord1)) { + segments(x0 = coord1[, 1], y0 = coord1[, 2], x1 = coord1[, 3], y1 = coord1[, + 4]) + } + if (!is.null(coord2)) { + segments(x0 = coord2[, 1], y0 = N2 - coord2[, 2], x1 = coord2[, 3], y1 = N2 - + coord2[, 4]) + } + abline(v = c(0, cumsum(L1)), col = "green") + abline(h = c(0, cumsum(L2)), col = "green") + box() + axis(1, at = c(1, cumsum(L1))[-(length(L1) + 1)], labels = names(seq1), hadj = 0, + cex.axis = 0.7) + axis(4, at = c(1, cumsum(L2))[-(length(L2) + 1)], labels = names(seq2), hadj = 0, + cex.axis = 0.7) + invisible(list(coord1, coord2)) +} + + +CM2sequence = function(x) { + bases = c(colnames(x), "N") + x = cbind(x, 0) + colnames(x) = bases + seqs = paste(bases[apply(x, 1, which.max)], collapse = "") + + return(seqs) +} + +## in alingment calculate significance from kmers +calculate_consensus_matrix = function(aln, tr_paths, G) { + bases = c("A", "C", "G", "T", "-") + positions = lapply(strsplit(as.character(aln), split = ""), function(x) which(!x %in% + "-")) + base_matrix = do.call(rbind, strsplit(as.character(aln), split = "")) + weights = matrix(0, nrow = length(aln), ncol = nchar(aln)) + kmer_rel_proportions = (V(G)$size/table(factor(unlist(tr_paths), levels = 1:vcount(G)))) + for (i in seq_along(positions)) { + weights[i, positions[[i]]] 
## in the alignment, calculate significance from kmers
calculate_consensus_matrix = function(aln, tr_paths, G) {
    bases = c("A", "C", "G", "T", "-")
    positions = lapply(strsplit(as.character(aln), split = ""), function(x) which(!x %in%
        "-"))
    base_matrix = do.call(rbind, strsplit(as.character(aln), split = ""))
    weights = matrix(0, nrow = length(aln), ncol = nchar(aln))
    kmer_rel_proportions = (V(G)$size/table(factor(unlist(tr_paths), levels = 1:vcount(G))))
    for (i in seq_along(positions)) {
        weights[i, positions[[i]]] = kmer_rel_proportions[tr_paths[[i]]]
    }
    names(kmer_rel_proportions) = V(G)$name
    ## get weights for gaps by approximation
    fitgaps = function(y) {
        if (sum(y == 0) == 0) {
            return(y)
        } else {
            y0 = rep(y[y != 0], 3)
            x0 = which(rep(y, 3) != 0)
            fitted = approx(x0, y0, xout = seq_along(rep(y, 3)), rule = 1)
        }
        return(fitted$y[-seq_along(y)][seq_along(y)])
    }
    weights_with_gaps = t(apply(weights, 1, fitgaps))
    ## weights_with_gaps = get_gap_weights(weights, aln, G) ## this step takes way too
    ## long!!! - not so important; TODO: handle gaps differently - more efficiently

    ## get the consensus matrix
    CM = sapply(1:nchar(aln[[1]]), function(i) {
        sapply(split(weights_with_gaps[, i], factor(base_matrix[, i], levels = bases)),
            sum)
    })
    return(t(CM)[, 1:5])
}


get_gaps_from_alignment = function(aln) {
    gaps_positions = unique(do.call(rbind, str_locate_all(as.character(aln), "-+")))
    return(gaps_positions)
}

plot_kmer_graph = function(G, L = NULL, vertex.size = NULL, ord = NULL, upto = NULL,
    highlight = NULL) {
    if (!is.null(G$L) & is.null(L)) {
        L = G$L
    }
    if (is.null(L)) {
        ## L = layout.kamada.kawai(G)
        L = OGDFlayout(G)[[1]]
    }
    clr = kmers2color(V(G)$name)
    if (!is.null(highlight)) {
        clr[highlight] = "#00000080"
    }
    if (!is.null(ord)) {
        clr[ord[1:upto]] = paste(clr[ord[1:upto]], "30", sep = "")
    }
    if (is.null(vertex.size)) {
        vertex.size = rescale(V(G)$size, to = c(0.5, 6))
    }
    plot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE,
        vertex.color = clr, vertex.frame.color = "#00000020", edge.width = 2, edge.arrow.mode = 1,
        edge.arrow.size = 0.2)
}

rglplot_kmer_graph = function(G, L = NULL, vertex.size = 4) {
    if (is.null(L)) {
        L = layout.kamada.kawai(G)
    }
    rglplot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE,
        vertex.color = kmers2color(V(G)$name), edge.width = 2, edge.arrow.mode = 1,
        edge.arrow.size = 0.5)
}




mafft = function(seqs, params = "--auto --thread 1 ") {
    if (length(seqs) < 2) {
        return(seqs)
    }
    infile = tempfile()
    if (class(seqs) == "character") {
        seqs = DNAStringSet(seqs)
    }
    writeXStringSet(seqs, filepath = infile)
    outfile = tempfile()
    cmd = paste("mafft --quiet --nuc ", params, infile, "2> /dev/null > ", outfile)
    system(cmd, intern = TRUE, ignore.stderr = FALSE)
    aln = readDNAStringSet(outfile)
    unlink(c(outfile, infile))
    return(aln)
}
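
## Illustrative sketch only: aligning a few invented sequences; requires the
## mafft binary on the PATH, as the pipeline itself does.
if (FALSE) {
    aln = mafft(DNAStringSet(c(s1 = "ACGTACGT", s2 = "ACGTTACGT", s3 = "ACGACGT")))
    as.character(aln)
}
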
mgblast = function(databaseSeq, querySeq) {
    params = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -C 30 -D 30 "
    ## no dust filtering:
    paramsDF = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -F F"

    database = tempfile()
    query = tempfile()
    output = tempfile()
    if (class(databaseSeq) == "character") {
        database = databaseSeq
        do_not_delete_database = TRUE
    } else {
        writeXStringSet(databaseSeq, filepath = database, format = "fasta")
        do_not_delete_database = FALSE
    }
    if (class(querySeq) == "character") {
        query = querySeq
        do_not_delete_query = TRUE
    } else {
        writeXStringSet(querySeq, filepath = query, format = "fasta")
        do_not_delete_query = FALSE
    }
    ## create database:
    cmd = paste("formatdb -i", database, "-p F")
    system(cmd)
    cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, params)
    system(cmd)
    if (file.info(output)$size == 0) {
        ## no hits, try without the dust masker
        cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, paramsDF)
        system(cmd)
    }
    blastOut = read.table(output, sep = "\t", header = FALSE, as.is = TRUE, comment.char = "")
    unlink(output)
    if (!do_not_delete_query) {
        unlink(query)
    }
    if (!do_not_delete_database) {
        unlink(paste(database, "*", sep = ""))
    }
    colnames(blastOut) = c("query", "q.length", "q.start", "q.end", "subject", "s.length",
        "s.start", "s.end", "pid", "weight", "e.value", "strand")
    blastOut
}


estimate_sample_size = function(NV, NE, maxv, maxe) {
    ## graph density
    d = (2 * NE)/(NV * (NV - 1))
    eEst = (maxv * (maxv - 1) * d)/2
    nEst = (d + sqrt(d^2 + 8 * d * maxe))/(2 * d)
    if (eEst >= maxe) {
        N = round(nEst)
        E = round((N * (N - 1) * d)/2)
    }
    if (nEst >= maxv) {
        N = maxv
        E = round((N * (N - 1) * d)/2)
    }
    return(N)
}
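
## Worked example (invented numbers): for 1e5 reads with 4e9 similarity hits,
## the density is d = 2*NE/(NV*(NV-1)) ~ 0.8; the expected edge count at
## maxv = 40000 vertices exceeds maxe = 5e8, so the sample size is the N
## solving d*N*(N-1)/2 = maxe, about 35000 reads.
if (FALSE) {
    estimate_sample_size(NV = 1e+05, NE = 4e+09, maxv = 40000, maxe = 5e+08)  # ~35356
}
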
mgblast2graph = function(blastfile, seqfile, graph_destination, directed_graph_destination,
    oriented_sequences, paired = TRUE, repex = FALSE, image_file = NULL, image_file_tmb = NULL,
    include_layout = TRUE, pair_completeness = NULL, satellite_model_path = NULL,
    maxv = 40000, maxe = 5e+08, seqfile_full = seqfile) {
    cat("loading blast results\n")
    if (repex) {
        cln = c("query", "subject", "weight", "q.length", "q.start", "q.end", "s.length", "s.start",
            "s.end", "sign")
        colcls = c("character", "character", "numeric", "numeric", "numeric", "numeric",
            "numeric", "numeric", "numeric", "NULL", "NULL", "character")
    } else {
        cln = c("query", "q.length", "q.start", "q.end", "subject", "s.length", "s.start",
            "s.end", "weight", "sign")
        colcls = c("character", "numeric", "numeric", "numeric", "character", "numeric",
            "numeric", "numeric", "NULL", "numeric", "NULL", "character")
    }
    if (class(blastfile) == "data.frame") {
        blastTable = blastfile
        colnames(blastTable)[12] = "sign"
    } else {
        blastTable = read.table(blastfile, sep = "\t", as.is = TRUE, header = FALSE,
            colClasses = colcls, comment.char = "")
        colnames(blastTable) = cln
    }
    ## check for duplicates!
    key = with(blastTable, ifelse(query > subject, paste(query, subject), paste(subject,
        query)))
    if (any(duplicated(key))) {
        blastTable = blastTable[!duplicated(key), ]
    }
    seqs = readDNAStringSet(seqfile)
    ## calculate pair completeness for reads before sampling
    if (is.null(pair_completeness)) {
        if (paired) {
            if (seqfile_full != seqfile) {
                seqs_full = readDNAStringSet(seqfile_full)
                pair_counts = tabulate(table(gsub(".$", "", names(seqs_full))))
                rm(seqs_full)
            } else {
                pair_counts = tabulate(table(gsub(".$", "", names(seqs))))
            }
            pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
        } else {
            pair_completeness = 0
        }
    }
    NV = length(seqs)      # vertices
    NE = nrow(blastTable)  # edges
    if (maxv < NV | maxe < NE) {
        ## sample if the graph is large
        V_sample_size = estimate_sample_size(NV, NE, maxv, maxe)
        seqs = sample(seqs, V_sample_size)
        blastTable = blastTable[blastTable$query %in% names(seqs) & blastTable$subject %in%
            names(seqs), ]
    }

    blastTable$sign = ifelse(blastTable$sign == "+", 1, -1)
    vnames = unique(c(blastTable$query, blastTable$subject))
    vindex = seq_along(vnames)
    names(vindex) = vnames
    ## correct orientation
    cat("creating graph\n")
    G = graph.data.frame(blastTable[, c("query", "subject", "weight", "sign")], directed = FALSE,
        vertices = vnames)
    if (include_layout) {
        ## save the temporarily modified blastTable if large
        tmpB = tempfile()
        save(blastTable, file = tmpB)
        rm(blastTable)
        cat("graph layout calculation ")
        if (ecount(G) > 2e+06) {
            cat("using fruchterman reingold\n")
            L = layout.fruchterman.reingold(G, dim = 3)
        } else {
            cat("using OGDF & fruchterman reingold\n")
            Ltmp = OGDFlayout(G, alg = c("fmmm"))
            L = cbind(Ltmp[[1]][, 1:2], layout.fruchterman.reingold(G, dim = 2))
        }

        GL = list(G = G, L = L)
        save(GL, file = graph_destination)
        if (!is.null(image_file)) {
            cat("exporting graph figure")
            png(image_file, width = 900, height = 900, pointsize = 20)
            plot(GL$G, layout = GL$L[, 1:2], vertex.size = 2, vertex.color = "#000000A0",
                edge.color = "#00000040", vertex.shape = "circle", edge.curved = FALSE,
                vertex.label = NA, vertex.frame.color = NA, edge.width = 1)
            dev.off()
            ## create thumbnails:
            system(paste("convert ", image_file, " -resize 100x100 ", image_file_tmb))
        }
        rm(GL)
        gc()
        load(tmpB)
        unlink(tmpB)
    }

    if (!is.connected(G)) {
        cat("Warning - graph is not connected\n")
        cc = clusters(G, "weak")
        mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)),
            decreasing = TRUE)))))
        selvi = which(mb == 1)  # largest component
        cat("using largest component: \nsize =", round(length(selvi)/vcount(G) *
            100, 1), "% ;", length(selvi), "reads", "\n")
        G = induced.subgraph(G, vids = selvi)
        blastTable = blastTable[blastTable$query %in% V(G)$name & blastTable$subject %in%
            V(G)$name, ]
    }

    Gmst = minimum.spanning.tree(G)
    ## create alternative trees - for the case that only a suboptimal solution is found
    set.seed(123)
    Gmst_alt = list()
    Gmst_alt[[1]] = Gmst
    for (i in 2:6) {
        E(G)$weight = runif(ecount(G), 0.1, 1)
        Gmst_alt[[i]] = minimum.spanning.tree(G)
    }

    rm(G)
    gc()
    ## six attempts to reorient reads
    flip_names_all = list()
    prop_of_notfit = numeric()
    thr_fit = 0.001
    for (ii in 1:6) {
        Gmst = Gmst_alt[[ii]]

        blastTable_mst = as.data.frame(get.edgelist(Gmst, names = TRUE))
        colnames(blastTable_mst) = c("query", "subject")
        blastTable_mst$sign = E(Gmst)$sign

        d = dfs(Gmst, root = 1, father = TRUE, order.out = TRUE)
        esign = E(Gmst)$sign
        rc = rep(FALSE, vcount(Gmst))
        j = 0
        p = 0
        for (v in d$order[-1]) {
            j = j + 1
            f = as.numeric(d$father)[v]
            if (is.na(f)) {
                next
            }
            eid = get.edge.ids(Gmst, c(v, f))
            if (esign[eid] == -1) {
                ie = unique(c(which(blastTable_mst$query %in% V(Gmst)$name[v]), which(blastTable_mst$subject %in%
                    V(Gmst)$name[v])))  # incident edges
                esign[ie] = esign[ie] * -1
                rc[v] = TRUE
                p = p + 1
                if (p > 50) {
                    p = 0
                    cat("\r")
                    cat(j, " : ", sum(esign)/length(esign))
                }
            }
        }
        cat("\t")
        flip_names_all[[ii]] = flip_names = V(Gmst)$name[rc]

        if (!exists("blastTable")) {
            load(tmpB)
            unlink(tmpB)
        }
        ## add nofit later!!
        s.mean = with(blastTable, (s.start + s.end)/2)
        s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length -
            s.mean, s.mean)
        q.mean = with(blastTable, (q.start + q.end)/2)
        q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length -
            q.mean, q.mean)
        V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
        V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
        sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1,
            1) * blastTable$sign
        nofit = unique(c(V1[sign_final == -1], V2[sign_final == -1]))
        prop_of_notfit[ii] = length(nofit)/vcount(Gmst)
        ## check if correctly oriented
        cat("prop notfit", prop_of_notfit[[ii]], "\n")
        if (prop_of_notfit[[ii]] < thr_fit) {
            ## OK
            break
        }
    }
    if (!prop_of_notfit[[ii]] < thr_fit) {
        ## get the best solution
        ii_best = which.min(prop_of_notfit)
        if (ii != ii_best) {  # if the last solution was not the best, take the best
            flip_names = flip_names_all[[ii_best]]
            s.mean = with(blastTable, (s.start + s.end)/2)
            s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length -
                s.mean, s.mean)
            q.mean = with(blastTable, (q.start + q.end)/2)
            q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length -
                q.mean, q.mean)
            V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
            V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
            sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1,
                1) * blastTable$sign
            nofit = unique(c(V1[sign_final == -1], V2[sign_final == -1]))
        }
    }

    ## exclude all nofit
    df2 = data.frame(V1, V2, sign = blastTable$sign, sign_final = sign_final, stringsAsFactors = FALSE)
    rm(blastTable)
    gc()
    vertices = data.frame(name = vnames, reverse_complement = vnames %in% flip_names)
    G = graph.data.frame(df2, vertices = vertices)
    vcount_ori = vcount(G)
    G = induced.subgraph(G, vids = which(!V(G)$name %in% nofit))

    G$escore_mst = sum(esign)/length(esign)
    G$escore = sum(sign_final == 1)/length(sign_final)
    cc = clusters(G, "strong")
    mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)),
        decreasing = TRUE)))))
    names(mb) = V(G)$name
    G$loop_index = max(cc$csize)/vcount(G)
    G$coverage = vcount(G)/vcount_ori
    ## check sign of all edges
    V(G)$membership = mb
    save(G, file = directed_graph_destination)
    ## remove nofit
    seqs = seqs[!names(seqs) %in% nofit]
    seqs[names(seqs) %in% flip_names] = reverseComplement(seqs[names(seqs) %in% flip_names])
    names(seqs) = paste(names(seqs), mb[names(seqs)])
    writeXStringSet(seqs, filepath = oriented_sequences)
    ## calculate satellite probability
    if (is.null(satellite_model_path)) {
        pSAT = isSAT = NULL
    } else {
        satellite_model = readRDS(satellite_model_path)
        pSAT = get_prob(G$loop_index, pair_completeness, model = satellite_model)
        isSAT = isSatellite(G$loop_index, pair_completeness, model = satellite_model)
    }

    ## get the largest cc
    output = list(escore = G$escore, coverage = G$coverage, escore_mts = G$escore_mst,
        loop_index = G$loop_index, pair_completeness = pair_completeness, graph_file = graph_destination,
        oriented_sequences = oriented_sequences, vcount = vcount(G), ecount = ecount(G),
        satellite_probability = pSAT, satellite = isSAT)
    ## clean up
    all_objects = ls()
    do_not_remove = "output"
    rm(list = all_objects[!(all_objects %in% do_not_remove)])
    gc(verbose = FALSE, reset = TRUE)
    return(list2dictionary(output))
}

list2dictionary = function(l) {
    dict = "{"
    q = '"'
    for (i in 1:length(l)) {
        if (class(l[[i]]) == "character" | is.null(l[[i]])) {
            q2 = "'''"
        } else {
            q2 = ''
        }
        dict = paste0(
            dict,
            q, names(l)[i], q, ":", q2, l[[i]], q2, ", "
        )
    }
    dict = paste0(dict, "}")
    return(dict)
}

wrap = Vectorize(function(s, width = 80) {
    i1 = seq(1, nchar(s), width)
    i2 = seq(width, by = width, length.out = length(i1))
    return(paste(substring(s, i1, i2), collapse = "\n"))
})
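
## Illustrative sketch only: list2dictionary serializes a result list into a
## python-like dict string for the calling pipeline, and wrap breaks a long
## consensus string for HTML display.
if (FALSE) {
    list2dictionary(list(score = 0.95, name = "CL1"))
    ## -> {"score":0.95, "name":'''CL1''', }
    cat(wrap("ACGTACGT", width = 4))  # two lines of 4 bases
}
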

tarean = function(input_sequences, output_dir, min_kmer_length = 11, max_kmer_length = 27,
    CPU = 2, sample_size = 10000, reorient_reads = TRUE, tRNA_database_path = NULL,
    include_layout = TRUE, paired = TRUE, lock_file = NULL) {
    options(CPU = CPU)
    time0 = Sys.time()
    dir.create(output_dir)
    input_sequences_copy = paste(output_dir, "/", basename(input_sequences), sep = "")

    if (!file.copy(input_sequences, input_sequences_copy, overwrite = TRUE)) {
        cat(paste("cannot copy", input_sequences, "to", output_dir), "\n")
        stop()
    }

    lock_file = waitForRAM(lock_file = lock_file)
    pair_completeness = NULL
    ## sampling
    if (sample_size != 0) {
        s = readDNAStringSet(input_sequences_copy)
        N = length(s)
        ## pair completeness must be calculated before sampling!
        if (N > sample_size) {
            writeXStringSet(sample(s, sample_size), filepath = input_sequences_copy)
            if (paired) {
                pair_counts = tabulate(table(gsub(".$", "", names(s))))
                pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
            }
        }
        rm(s)
    }

    if (reorient_reads) {
        input_sequences_oriented = paste0(input_sequences_copy, "oriented.fasta")
        graph_file = paste0(input_sequences_copy, "_graph.RData")
        GLfile = paste0(input_sequences_copy, "_graph.GL")
        cat("reorienting sequences\n")

        blastTable = mgblast(input_sequences_copy, input_sequences_copy)

        graph_info = mgblast2graph(blastTable, input_sequences_copy, GLfile, graph_file,
            input_sequences_oriented, include_layout = include_layout, paired = paired,
            pair_completeness = pair_completeness)

        ## interrupt if it does not look like a tandem repeat at all # soft threshold!
        if (is.null(graph_info$pair_completeness)) {
            if (graph_info$loop_index <= 0.4) {
                cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                return(list2dictionary(list(graph_info = graph_info)))
            }
        } else {
            ## for paired reads:
            if (graph_info$loop_index < 0.7 | graph_info$pair_completeness < 0.4) {
                cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                return(list2dictionary(list(graph_info = graph_info)))
            }
        }
    } else {
        input_sequences_oriented = input_sequences_copy
        graph_info = NULL
    }

    kmer_counts = list()
    kmer_lengths = seq(min_kmer_length, max_kmer_length, by = 4)
    for (i in kmer_lengths) {
        ## use the python function - faster
        cmd = paste(script.dir, "/kmer_counting.py ", input_sequences_oriented, " ",
            i, sep = "")
        cat("calculating kmers of length ", i, "\n")
        f = system(cmd, intern = TRUE)
        x = read.table(f, as.is = TRUE, sep = "\t")
        kmer_counts[[as.character(i)]] = data.frame(x, freq = x$V2/sum(x$V2))
    }

    ## number of kmers:
    N50 = sapply(kmer_counts, function(x) {
        sum(cumsum(x$freq) < 0.5)
    })

    N70 = sapply(kmer_counts, function(x) {
        sum(cumsum(x$freq) < 0.7)
    })

    time1 = Sys.time()
    ggmin = mclapply(kmer_counts, get_mimimal_cc, start = 0.5, mc.cores = CPU)
    time2 = Sys.time()
    cat("kmer graphs created ")
    print(time2 - time1)
    names(ggmin) = names(kmer_counts)

    ## estimate the monomer
    monomers = mclapply(ggmin, estimate_monomer, mc.cores = CPU)

    names(monomers) = names(kmer_counts)
    monomers = monomers[!sapply(monomers, is.null)]
    ## error handling:
    error = sapply(monomers, class) == "try-error"
    if (any(error)) {
        cat("\nError in monomer estimation: ")
        cat("calculation failed for monomer length ", names(monomers)[error], "\n")
        print(monomers[error])
        if (any(!error)) {
            monomers = monomers[!error]
        } else {
            stop("monomer estimation failed")
        }
    }

    ## summary - make this a function!!
    total_score = list()
    k = 0
    length_best = numeric()
    score_bn = numeric()
    consensus = character()

    for (i in names(monomers)) {
        for (v in seq_along(monomers[[i]]$estimates)) {
            k = k + 1
            total_score[[k]] = c(kmer = as.numeric(i), variant = v, total_score = monomers[[i]]$estimates[[v]]$total_score)
            score_bn[k] = min(rowSums(monomers[[i]]$estimates[[v]]$CM))
            length_best[k] = monomers[[i]]$estimates[[v]]$length_variant_score[1,
                1]
            consensus[[k]] = monomers[[i]]$estimates[[v]]$consensus2
        }
    }
    summary_table = as.data.frame(do.call(rbind, total_score))
    summary_table$monomer_length = length_best
    summary_table$consensus_length = nchar(consensus)
    summary_table$score_bn = score_bn
    summary_table$consensus = paste("<pre>", wrap(consensus, width = 80), "</pre>",
        sep = "")
    consensus_DS = DNAStringSet(consensus)
    names(consensus_DS) = with(summary_table, paste0(kmer, "_", variant, "_sc_",
        signif(total_score), "_l_", monomer_length))

    ## filter by score - keep

    ## reorder by score
    consensus_DS = consensus_DS[order(summary_table$total_score, decreasing = TRUE)]
    summary_table = summary_table[order(summary_table$total_score, decreasing = TRUE),
        ]
    rownames(summary_table) = NULL
    N = nrow(summary_table)
    ## concatenate the consensus (i.e. make a head-to-tail dimer) for pbs detection and
    ## other searches; make something like a 'pseudo contig' multimer for mapping -
    ## min length 200 bp
    consensus_DS_dimer = DNAStringSet(paste0(consensus_DS, consensus_DS))
    tarean_contigs = DNAStringSet(sapply(consensus_DS, function(x)
        ifelse(nchar(x) < 200,
               paste(rep(as.character(x), round(300/nchar(as.character(x)) + 1)), collapse = ''),
               as.character(x)))
    )

    names(consensus_DS_dimer) = names(consensus_DS)
    ## save sequences:
    consensus_DS_dimer_file = paste0(output_dir, "/consensus_dimer.fasta")
    consensus_DS_file = paste0(output_dir, "/consensus.fasta")
    tarean_contig_file = paste0(output_dir, "/tarean_contigs.fasta")
    writeXStringSet(consensus_DS, consensus_DS_file)
    writeXStringSet(tarean_contigs, tarean_contig_file)
    writeXStringSet(consensus_DS_dimer, consensus_DS_dimer_file)
    if (is.null(tRNA_database_path)) {
        pbs_score = -1
    } else {
        pbs_blast_table = paste0(output_dir, "/pbs_detection")
        pbs_score = detect_pbs(dimers_file = consensus_DS_dimer_file, tRNA_database_path = tRNA_database_path,
            reads_file = input_sequences_oriented, output = pbs_blast_table)
    }
    ## search for open reading frames; get the length of the longest
    orf_l = max_ORF_length(consensus_DS_dimer)

    dir.create(paste0(output_dir, "/img"), recursive = TRUE)
    summary_table$monomer_length_graph = numeric(N)
    summary_table$graph_image = character(N)
    summary_table$monomer_length_logo = numeric(N)
    summary_table$logo_image = character(N)
    ## export the graph and consensus estimate to the cluster directory; the type of
    ## output may change in the future
    save(ggmin, file = paste0(output_dir, "/ggmin.RData"))
    save(monomers, file = paste0(output_dir, "/monomers.RData"))

    cat("generating HTML output\n")
    for (i in 1:N) {
        kmer = as.character(summary_table$kmer[i])
        variant = summary_table$variant[i]
        ## export graph
        fout_link = paste0("img/graph_", kmer, "mer_", variant, ".png")
        fout = paste0(output_dir, "/", fout_link)
        ## summary_table$monomer_length_graph[i] = summary_table$monomer_length[i]
        ## summary_table$monomer_length_logo[[i]] = nrow(monomers[[kmer]]$estimates[[variant]]$CM)
        summary_table$monomer_length[[i]] = length(monomers[[kmer]]$estimates[[variant]]$consensus)

        if (i <= 10) {
            png(fout, width = 800, height = 800)
            plot_kmer_graph(ggmin[[kmer]], highlight = unlist(monomers[[kmer]]$paths[[variant]]$tr_paths))
            dev.off()
            summary_table$graph_image[i] = hwriteImage(fout_link, link = fout_link,
                table = FALSE, width = 100, height = 100)
            ## export logo
            png_link = paste0("img/logo_", kmer, "mer_", variant, ".png")
            fout = paste0(output_dir, "/", png_link)
            png(fout, width = 1200, height = round(summary_table$monomer_length[i] *
                1) + 550)
            try(plot_multiline_logo(monomers[[kmer]]$estimates[[variant]]$CM, W = 100))
            dev.off()
            ## export the corresponding position probability matrices
            ppm_file = paste0(output_dir, '/ppm_', kmer, "mer_", variant, ".csv")
            ppm_link = paste0('ppm_', kmer, "mer_", variant, ".csv")
            write.table(monomers[[kmer]]$estimates[[variant]]$CM,
                file = ppm_file,
                col.names = TRUE, quote = FALSE,
                row.names = FALSE, sep = "\t")
            summary_table$logo_image[i] = hwriteImage(png_link, link = ppm_link,
                table = FALSE, width = 200, height = 100)
        }
    }

    ## html_report = HTMLReport()

    htmlfile = paste0(output_dir, "/report.html")
    cat(htmlheader, file = htmlfile)
    included_columns = c('kmer', 'variant', 'total_score', 'consensus_length', 'consensus',
        'graph_image', 'logo_image')
    summary_table_clean = summary_table[, included_columns]
    colnames(summary_table_clean) = c('k-mer length',
                                      'Variant index',
                                      'k-mer coverage score',
                                      'Consensus length',
                                      'Consensus sequence',
                                      'k-mer based graph',
                                      'Sequence logo')
    HTML(summary_table_clean, file = htmlfile, sortableDF = TRUE)
    HTMLEndFile(file = htmlfile)
    time4 = Sys.time()
    print(time4 - time0)
    if (!is.null(lock_file)) {
        print("------removing-lock--------")
        removelock(lock_file)
    }

    print(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
        TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
        TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
        orf_l = orf_l, tarean_contig_file = tarean_contig_file))
    return(list2dictionary(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
        TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
        TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
        orf_l = orf_l, tarean_contig_file = tarean_contig_file)))
}


## graph loop index stability
loop_index_instability = function(G) {
    N = 50
    s = seq(vcount(G), vcount(G)/10, length.out = N)
    p = seq(1, 0.1, length.out = N)
    li = numeric()
    for (i in seq_along(s)) {
        print(i)
        gs = induced_subgraph(G, sample(1:vcount(G), s[i]))
        li[i] = max(clusters(gs, "strong")$csize)/vcount(gs)
    }
    instability = lm(li ~ p)$coefficient[2]
    return(instability)
}

isSatellite = function(x, y, model) {
    p = get_prob(x, y, model)
    if (p > model$cutoff) {
        return("Putative Satellite")
    } else {
        return("")
    }
}

get_prob = function(x, y, model) {
    pm = model$prob_matrix
    N = ncol(pm)
    i = round(x * (N - 1)) + 1
    j = round(y * (N - 1)) + 1
    p = pm[i, j]
    return(p)
}


detectMemUsage = function() {
    con = textConnection(gsub(" +", " ", readLines("/proc/meminfo")))
    memInfo = read.table(con, fill = TRUE, row.names = 1)
    close(con)
    memUsage = 1 - (memInfo["MemFree", 1] + memInfo["Cached", 1])/memInfo["MemTotal",
        1]
    return(memUsage)
}


makelock <- function(lockfile, lockmsg, CreateDirectories = TRUE) {
    lockdir = dirname(lockfile)
    if (!file.exists(lockdir)) {
        if (CreateDirectories) dir.create(lockdir, recursive = TRUE)
        else stop("Lock Directory for lockfile ", lockfile, " does not exist")
    }
    if (missing(lockmsg)) lockmsg = paste(system('hostname', intern = TRUE), Sys.getenv("R_SESSION_TMPDIR"))
    if (file.exists(lockfile)) return(FALSE)
    ## note: the use of paste makes the message writing atomic
    cat(paste(lockmsg, "\n", sep = ""), file = lockfile, append = TRUE, sep = "")
    firstline = readLines(lockfile, n = 1)
    if (firstline != lockmsg) {
        ## somebody else got there first
        return(FALSE)
    } else return(TRUE)
}


removelock <- function(lockfile) {
    if (unlink(lockfile) != 0) {
        warning("Unable to remove ", lockfile)
        return(FALSE)
    }
    return(TRUE)
}


waitForRAM = function(p = 0.5, lock_file = NULL) {
    if (detectMemUsage() < p) {
        return(NULL)
    } else {
        ## check the lock file:
        cat("waiting for RAM \n")
        free_count = 0
        while (TRUE) {
            if (makelock(lock_file)) {
                print("---------locking--------")
                return(lock_file)
            }
            if (detectMemUsage() < p) {
                cat("RAM freed \n")
                return(NULL)
            }
            Sys.sleep(5)
            if (evaluate_user_cpu_usage() == 'free') {
                free_count = free_count + 1
            } else {
                free_count = 0
            }
            if (detectMemUsage() < 0.8 & free_count > 100) {
                cat("RAM not free but nothing else is running \n")
                return(NULL)
            }
        }
    }
}
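
## Illustrative sketch only: cooperative locking around a memory-heavy step;
## the lock file path is arbitrary here.
if (FALSE) {
    lf = "/tmp/tarean.lock"
    if (makelock(lf)) {
        ## ... memory-heavy work ...
        removelock(lf)
    }
}
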
lsmem = function() {
    g = globalenv()
    out_all = envs = list()
    envs = append(envs, g)
    total_size = numeric()
    while (environmentName(g) != "R_EmptyEnv") {
        g <- parent.env(g)
        envs = append(envs, g)
    }
    for (e in envs) {
        obj = ls(envir = e)
        if (length(obj) == 0) {
            break
        }
        obj.size = list()
        for (i in obj) {
            obj.size[[i]] = object.size(get(i, envir = e))
        }
        out = data.frame(object = obj, size = unlist(obj.size), stringsAsFactors = FALSE)
        out = out[order(out$size, decreasing = TRUE), ]
        out_all = append(out_all, out)
        total_size = append(total_size, sum(out$size))
    }
    return(list(objects = out_all, total_size = total_size))
}

evaluate_user_cpu_usage = function() {
    user = Sys.info()["user"]
    a = sum(as.numeric(system(paste("ps -e -o %cpu -u", user), intern = TRUE)[-1]))
    s = substring(system(paste("ps -e -o stat -u", user), intern = TRUE)[-1], 1, 1)
    if (a < 5 & sum(s %in% 'D') == 0 & sum(s %in% 'R') < 2) {
        status = 'free'
    } else {
        status = 'full'
    }
    return(status)
}
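
## Illustrative sketch only (Linux-specific): both helpers parse /proc/meminfo
## and ps output, so they work only on Linux hosts.
if (FALSE) {
    detectMemUsage()            # fraction of RAM in use, 0..1
    evaluate_user_cpu_usage()   # "free" or "full"
}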