#!/usr/bin/env Rscript
## lib/tarean/methods.R @ 1d1b9e1b2e2f
## author: petr-novak (Thu, 19 Dec 2019 10:24:45 -0500)

suppressPackageStartupMessages(library(igraph))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(Biostrings))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(hwriter))
suppressPackageStartupMessages(library(R2HTML))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(dplyr))

max_ORF_length = function(s) {
  ## check all six reading frames (three forward, three on the reverse
  ## complement); return the length, in amino acids, of the longest
  ## stop-codon-free stretch found in any frame
  L = 0
  for (i in 1:3) {
    L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(s, i))), "*", 
                                     fixed = TRUE))))
    L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(reverseComplement(s), 
                                                                   i))), "*", fixed = TRUE))))
  }
  return(L)
}
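
## Usage sketch (illustrative input; Biostrings is loaded above):
if (FALSE) {
  s = DNAStringSet("ATGAAATTTGGGTAAATGCCC")
  max_ORF_length(s)  # longest stop-free stretch across all six frames, in amino acids
}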

kmers2graph = function(kmers, mode = "strong", prop = NULL) {
  kmerLength = nchar(kmers[1, 1])
  if (ncol(kmers) == 2) {
    kmers$size = kmers[, 2]/sum(kmers[, 2])
  }
  colnames(kmers) = c("name", "count", "size")
  if (!is.null(prop)) {  # NOTE: not used (prop is always NULL), and probably wrong - it would filter before sorting!!
    p = cumsum(kmers$size)
    kmers = kmers[p < prop, ]
  }
  N = dim(kmers)[1]
  kmers = kmers[order(kmers$size), ]
  ## build the edge list from (k-1)-mer overlaps (suffix of one kmer == prefix of another)
  kms = data.frame(kmer = substring(kmers$name, 1, kmerLength - 1), ids = 1:nrow(kmers),stringsAsFactors = FALSE)
  kme = data.frame(kmer = substring(kmers$name, 2), ide = 1:nrow(kmers), stringsAsFactors = FALSE)

  ## df = merge(kms,kme, by = 'kmer',all=FALSE)[,c(2,3,1)]
  df = inner_join(kme,kms, by = 'kmer')[,c(2,3)]

  ## (an earlier implementation detected the same overlaps with an external
  ## blastn run; the inner_join above is equivalent and avoids the dependency)
  gm_mean = function(x, na.rm = TRUE) {
    exp(sum(log(x[x > 0]), na.rm = na.rm)/length(x))
  }

  ## edge weight = geometric mean of the counts of the two incident kmers
  whg = apply(cbind(kmers[df[, 1], 2], V2 = kmers[df[, 2], 2]), 1, gm_mean)
  G = graph.data.frame(data.frame(V1 = kmers$name[df[, 1]],
                                  V2 = kmers$name[df[, 2]],
                                  weight = whg),
                       vertices = kmers[, 1:3])
  ## separate into connected components:
  ccs = clusters(G, mode = mode)$membership
  sel_cls = which(tabulate(ccs) > 1)
  Gs = list()
  for (i in seq_along(sel_cls)) {
    Gs[[i]] = induced.subgraph(G, vids = which(ccs %in% sel_cls[i]))
  }
  ## reorder components by size, largest first
  Gs = Gs[order(sapply(Gs, vcount), decreasing = TRUE)]
  return(Gs)
}
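
## Minimal usage sketch (hypothetical counts): kmers2graph expects a data frame
## with kmer strings in column 1 and counts in column 2; kmers overlapping by
## k-1 bases become connected vertices, and a list of connected components
## (largest first) is returned.
if (FALSE) {
  km = data.frame(kmer  = c("ACGTA", "CGTAC", "GTACG", "TACGT"),
                  count = c(10, 8, 7, 5), stringsAsFactors = FALSE)
  Gs = kmers2graph(km)
  vcount(Gs[[1]])  # 4 - the kmers form a single circular component
}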


OGDFlayout = function(G, ncol = NULL, alg = "fmmm", OGDF = getOption("OGDF")) {
  ## is ogdf binary available?
  if (is.null(OGDF)) {
    OGDF = Sys.getenv("OGDF")
    if ("" == OGDF) {
      options(warn = -1)
      OGDF = system("which runOGDFlayout", intern = TRUE)
      options(warn = 0)
      if (length(OGDF) == 0) {
        cat("path to runOGDFlayout not found\n")
        return(NULL)
      }
      
    }
  }
  if (is.null(ncol)) {
    if (is.null(E(G)$weight)) {
      el = data.frame(get.edgelist(G, names = TRUE), rep(1, ecount(G)))
    } else {
      el = data.frame(get.edgelist(G, names = TRUE), E(G)$weight)
    }
    ncol = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout", sep = "")
    write.table(el, file = ncol, row.names = FALSE, col.names = FALSE, sep = "\t", 
                quote = FALSE)
  } else {
    ## copy ncol:
    ncol_tmp = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout", 
                     sep = "")
    file.copy(ncol, ncol_tmp)
    ncol = ncol_tmp
  }
  algopt = c("fmmm", "sm", "fme")
  if (!(alg %in% c("fmmm", "sm", "fme") && TRUE)) {
    stop("alg must by :", algopt, "\n")
    
  }
  
  ## output file:
  Louts = list()
  layout_file = tempfile(pattern = as.character(Sys.getpid()))
  for (i in alg) {
    cmd = paste(OGDF, "-alg", i, "-iff layout -off layout -out", layout_file, 
                ncol)
    system(cmd, intern = TRUE)
    L = read.table(layout_file, skip = ecount(G))
    L = L[match(V(G)$name, L$V2), ]
    Lout = as.matrix(L[, -(1:2)])
    unlink(layout_file)
    Louts[[i]] = Lout
    
  }
  ## clean up
  unlink(ncol)
  return(Louts)
}

xcolor_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00")

kmers2color = function(s, position = NULL) {
  if (is.null(position)) {
    ## colour by the last base of each kmer
    position = nchar(s[1])
  }
  color_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00")
  color_base = substring(s, position, position)
  colors = color_code[color_base]
  names(colors) = color_base
  return(colors)
}
get_sequence = function(g, v, position = NULL) {
  s = V(g)$name
  if (is.null(position)) {
    ## use the last base of each kmer
    position = nchar(s[1])
  }
  nt = paste(substring(s[v], position, position), collapse = "")
  return(nt)
}


get_mimimal_cc = function(km, thr = 20, min_coverage = 0.45, step = 2, start = NULL) {
  ## grow the kmer set (in decreasing frequency order) until the largest
  ## connected component has more than thr vertices and covers at least
  ## min_coverage of the total kmer frequency
  if (is.null(start)) {
    i = sum(cumsum(km$freq) < 0.5)
  } else {
    i = sum(cumsum(km$freq) < start)
  }
  continue = TRUE
  while (continue) {
    if (i > nrow(km)) {
      i = nrow(km)
      continue = FALSE
      step = 1
    }
    GG = kmers2graph(km[1:i, ])
    if (length(GG) > 0) {
      if (vcount(GG[[1]]) > thr) {
        if (sum(V(GG[[1]])$size) >= min_coverage) {
          GG[[1]]$input_coverage = sum(km$freq[1:i])
          GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
          return(GG[[1]])
        }
      }
    }
    i = round(i * step)
    
  }
  if (length(GG) == 0 | is.null(GG)) {
    return(NULL)
  }
  
  GG[[1]]$input_coverage = sum(km$freq[1:i])
  GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
  return(GG[[1]])
}
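
## Usage sketch (hypothetical kmer table, in the format produced in tarean()
## below); note that the layout step needs the external runOGDFlayout binary.
if (FALSE) {
  km = data.frame(V1 = c("ACGTA", "CGTAC", "GTACG", "TACGT"), V2 = c(10, 8, 7, 5))
  km$freq = km$V2/sum(km$V2)
  G = get_mimimal_cc(km, thr = 3, min_coverage = 0.4)
}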



paths2string = function(paths) {
  pathstring = sapply(lapply(lapply(paths, as_ids), substring, 1, 1), paste, collapse = "")
  return(pathstring)
}


align_paths = function(paths, G) {
  ## rotate the circular paths so that they all start at a common centre node;
  ## paths that do not contain that node are aligned in a second pass by
  ## maximising positional agreement with the already rotated paths
  shift = rep(NA, length(paths))
  thr = 0  # minimal length
  tr_paths = list()
  Seqs = list()
  centre_node = as.numeric(names(sort(table(unlist(paths)), decreasing = TRUE)))[[1]]
  
  for (i in seq_along(paths)) {
    if (centre_node %in% paths[[i]]) {
      S = which(paths[[i]] %in% centre_node)
      shift[i] = S
      if (S == 1) {
        tr_paths[[i]] = paths[[i]]
      } else {
        tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S - 1)])
      }
      Seqs[[i]] = get_sequence(G, tr_paths[[i]])
    } else {
      shift[i] = NA
    }
  }
  paths_n = lapply(paths, as.numeric)
  tr_paths_n = do.call(cbind, lapply(tr_paths, as.numeric))
  new_shift = shift
  for (i in which(is.na(shift))) {
    score = numeric(length(paths_n))
    for (S in seq_along(paths_n[[i]])) {
      if (S == 1) {
        path_tmp_n = paths_n[[i]]
      } else {
        path_tmp_n = c(paths_n[[i]][S:length(paths_n[[i]])], paths_n[[i]][1:(S - 1)])
      }
      score[S] = sum(tr_paths_n == path_tmp_n)
    }
    if (sum(score) != 0) {
      S = which.max(score)
      new_shift[i] = S
      if (S == 1) {
        tr_paths[[i]] = paths[[i]]
      } else {
        tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S - 1)])
      }
      Seqs[[i]] = get_sequence(G, tr_paths[[i]])
    }
  }
  shift = new_shift
  ## try to shift based on the sequences themselves
  return(list(Seqs = Seqs[!is.na(shift)], tr_paths = tr_paths[!is.na(shift)], shift = shift[!is.na(shift)]))
}

make_consensus = function(paths_info, G) {
  include = !is.na(paths_info$shift)
  ## make alignments using mafft
  aln = mafft(unlist(paths_info$Seqs[include]))
  CM = calculate_consensus_matrix(aln = aln, tr_paths = paths_info$tr_paths[include], 
                                  G = G)
  gaps = get_gaps_from_alignment(aln)
  CMnorm = CM/rowSums(CM)
  bases = colnames(CM)
  consensus = sapply(apply(CMnorm, 1, function(x)
    bases[which(x > 0.2)][order(x[x > 0.2], decreasing = TRUE)]),
    paste, collapse = "")
  consensus2 = gsub("-", "", paste0(substring(consensus, 1, 1), collapse = ""), fixed = TRUE)
  number_of_SNP = sum(rowSums(CM > 0) > 1)
  SNP_positions = which(rowSums(CM > 0) > 1)
  number_of_position_with_indels = sum(colSums(do.call(rbind, strsplit(as.character(aln), "")) == "-") > 0)
  indel_positions = which(colSums(do.call(rbind, strsplit(as.character(aln), "")) == "-") > 0)
  if (length(SNP_positions) > 0) {
    variable_sites = unlist(c(
      mapply(paste,
             strsplit(sapply(apply(CMnorm, 1, function(x) bases[which(x > 0.2)]),
                             paste, collapse = "")[SNP_positions], ""),
             SNP_positions, sep = "_"),
      paste("-", indel_positions, sep = "_")))
  } else {
    variable_sites = NULL
  }
  ## union of SNP and indel positions (unique(x, y) would treat y as 'incomparables')
  variable_positions = unique(c(SNP_positions, indel_positions))
  return(list(aln = aln, CM = CM, CMnorm = CMnorm, consensus = consensus, consensus2 = consensus2, 
              number_of_SNP = number_of_SNP, SNP_positions = SNP_positions, number_of_position_with_indels = number_of_position_with_indels, 
              indel_positions = indel_positions, variable_positions = variable_positions, 
              variable_sites = variable_sites, gaps = gaps))
}
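
## Illustration (toy numbers) of the 0.2 threshold used above: bases whose
## normalised weight exceeds 0.2 are reported at a position, most frequent first.
if (FALSE) {
  CMnorm = rbind(c(A = 0.90, C = 0.05, G = 0.05, T = 0, `-` = 0),
                 c(A = 0.50, C = 0.45, G = 0.05, T = 0, `-` = 0))
  bases = colnames(CMnorm)
  apply(CMnorm, 1, function(x)
    paste(bases[which(x > 0.2)][order(x[x > 0.2], decreasing = TRUE)], collapse = ""))
  ## "A" "AC" - the second position is reported as a SNP
}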


estimate_monomer = function(G, weights = NULL, limit = NULL) {
  if (is.null(G)) {
    return(NULL)
  }
  ## estimate monomer from kmer based graph
  V(G)$id = 1:vcount(G)
  GS = induced_subgraph(G, vids = which(degree(G) == 2))  ## keep only vertices without branching
  cls = clusters(GS)$membership
  
  
  ids = mapply(FUN = function(x, y) x[which.max(y)],
               split(V(GS)$id, cls), split(V(GS)$size, cls))  ## from each branch keep only the vertex with the largest size!
  
  
  ids = ids[order(V(G)$size[ids], decreasing = TRUE)]
  ids_size = V(G)$size[ids]
  N50 = sum(cumsum(ids_size)/sum(ids_size) < 0.5)
  ## use only large vertices in the search - if there are too many, keep just
  ## the N50 most frequent ones
  if (length(ids) > 10000) {
    ids = ids[1:N50]
  }
  el = get.edgelist(G, names = FALSE)
  node_use = numeric(vcount(G))
  LL = numeric()
  i = 0
  paths = list()
  if (is.null(weights)) {
    ## favour edges between high-count kmers in the shortest path search
    weights = E(G)$weight^(-3)
  }
  included = rep(FALSE, vcount(G))
  W_total = sum(V(G)$size)
  
  coverage = numeric()
  t0 = c()
  i = 0
  j = 0
  for (n in ids) {
    j = j + 1
    t0[j] = Sys.time()
    if (included[n]) {
      next
    }
    m = which(el[, 1] %in% n)
    i = i + 1
    s = get.shortest.paths(G, el[m, 2], el[m, 1], weights = weights, output = "vpath")
    included[as.numeric(s$vpath[[1]])] = TRUE
    paths[[i]] = s$vpath[[1]]
    LL[i] = (length(s$vpath[[1]]))
  }
  
  ## evaluate whether paths should be divided into variants - by length and path weight
  paths_clusters = split(paths, LL)
  paths_clusters_tr = mclapply(paths_clusters, FUN = align_paths, G = G, mc.cores = getOption("CPU"))
  
  ## paths_clusters_tr = lapply(paths_clusters, FUN = align_paths, G = G)
  ## consensus:
  paths_consensus = mclapply(paths_clusters_tr, make_consensus, G = G, mc.cores = getOption("CPU"))
  
  ## evaluate weight for individual paths:
  for (v in seq_along(paths_consensus)) {
    p = paths_clusters_tr[[v]]$tr_paths
    ## clean
    p = p[!sapply(p, function(x) anyNA(x) | is.null(x))]
    L = sapply(p, length)
    p_groups = split(p, L)
    w_groups = sapply(p_groups, function(x)
      sum(V(G)$size[unique(c(sapply(x, as.numeric)))]))
    total_score = sum(V(G)$size[unique(c(unlist(sapply(p, as.numeric))))])
    ## lengths are taken from names(w_groups) to stay aligned with the split() order
    LW = data.frame(`Length estimate` = as.numeric(names(w_groups)), weight = w_groups,
                    stringsAsFactors = FALSE)
    LW = LW[order(LW$weight, decreasing = TRUE), ]
    rownames(LW) = NULL
    paths_consensus[[v]]$total_score = total_score
    paths_consensus[[v]]$length_variant_score = LW
    
  }
  return(list(estimates = paths_consensus, paths = paths_clusters_tr))
}



detect_pbs = function(dimers_file, tRNA_database_path, reads_file, output) {
  ## reads_file must contain oriented reads!
  min_offset = 10
  max_end_dist = 2
  min_aln_length = 30
  max_pbs_search = 30
  thr_length = 12
  end_position = 23
  insertion_proportion_threshold <- 0.15
  ## for testing:
  if (FALSE) {
    library(Biostrings)
    thr_length = 12
    end_position = 23
    dimers_file = "consensus_dimer.fasta"
    reads_file = "reads_oriented.fas"
    tRNA_database_path = "/mnt/raid/users/petr/workspace/repex_tarean/databases/tRNA_database2.fasta"
    
  }
  ## find reads which are aligned to the reference dimer on one half only
  dimers_seq = readDNAStringSet(dimers_file)
  cmd = paste("makeblastdb -in", dimers_file, "-dbtype nucl")
  system(cmd, ignore.stdout = TRUE)
  ## do not shadow the `output` parameter - it is used below for the result tables
  blast_out = paste0(reads_file, "blast_out.csv")
  columns = c("qseqid", "qstart", "qend", "qlen", "sseqid", "sstart", "send", "sstrand", 
              "slen", "pident", "length")
  cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "), "' -perc_identity 90 -query ", 
              reads_file, "-db", dimers_file, "-word_size", 7, "-dust no -num_alignments 99999 -strand plus ", 
              "-out", blast_out)
  system(cmd)
  blastdf = read.table(blast_out, as.is = TRUE, col.names = columns, comment.char = "")
  unlink(blast_out)
  blastdf = blastdf[blastdf$length >= min_aln_length, ]
  
  ## expand alignments to the whole read to reveal unaligned read parts
  blastdf_expand = blastdf
  blastdf_expand$qstart = blastdf$qstart - (blastdf$qstart - 1)
  blastdf_expand$sstart = blastdf$sstart - (blastdf$qstart - 1)
  blastdf_expand$qend = blastdf$qend + (blastdf$qlen - blastdf$qend)
  blastdf_expand$send = blastdf$send + (blastdf$qlen - blastdf$qend)
  pS = blastdf$sstart
  pE = blastdf$send
  pSF = blastdf_expand$sstart
  pEF = blastdf_expand$send
  cond1 = pS - pSF >= min_offset & blastdf$qend >= (blastdf$qlen - max_end_dist) & 
    blastdf$sstart >= max_pbs_search
  cond2 = pEF - pE >= min_offset & blastdf$qstart <= max_end_dist & blastdf$send <= 
    blastdf$slen - max_pbs_search
  
  ## coverage of alignments: evaluate coverage at sites with breaks; at least
  ## 20% of the alignments at a given position must be interrupted for the
  ## position to be considered in the additional search
  coverage_profiles = subject_coverage(blastdf)
  
  
  ## extract flanking sequences - ca. 50 nt; they should contain the tRNA sequence
  ## search the left side
  fin = tempfile()
  fout = tempfile()
  scoreL = scoreR = 0
  
  
  ## left side unique insertion sites
  if (any(cond1)) {
    insertion_sites = ddply(blastdf[cond1, ], .(sseqid, sstart), nrow)
    ## check coverage
    insertion_sites$coverage = 0
    for (i in 1:nrow(insertion_sites)) {
      insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$sstart[[i]]]
    }
    insertion_OK_left <- insertion_sites[with(insertion_sites,
                                              V1/coverage > insertion_proportion_threshold), ]
    
    if (nrow(insertion_OK_left) > 0) {
      s = ifelse(insertion_OK_left[, "sstart"] - max_pbs_search < 1, 1,
                 insertion_OK_left[, "sstart"] - max_pbs_search)
      Lpart = subseq(dimers_seq[match(insertion_OK_left$sseqid, names(dimers_seq))], 
                     s, insertion_OK_left$sstart)
      
      ## names are CONSENSUSID__COUNT__POSITION
      names(Lpart) = paste0(names(Lpart), "__", insertion_OK_left$V1, "__", 
                            insertion_OK_left$sstart)
      ## check presence of the TG dinucleotide
      TG = vcountPattern("TG", subseq(dimers_seq[match(insertion_OK_left$sseqid, names(dimers_seq))],
                                      insertion_OK_left$sstart - 2,
                                      insertion_OK_left$sstart + 2))
      if (any(TG > 0)) {
        writeXStringSet(Lpart[TG > 0], filepath = fin)
        cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "), 
                    "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path, 
                    "-word_size", 7, "-strand plus -dust no -max_target_seqs 10000 -evalue 100", 
                    "-out", fout)
        
        system(cmd, ignore.stdout = TRUE)
        df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
        filter1 = df$length >= thr_length
        filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
        df_pass = df[filter1 & filter2, , drop = FALSE]
        df_pass_L = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
        scoreL = get_score(df_pass_L)
        write.table(df_pass_L, file = paste0(output, "_L.csv"), row.names = FALSE, 
                    sep = "\t")
      }
    }
  }
  if (any(cond2)) {
    ## search Right
    insertion_sites = ddply(blastdf[cond2, ], .(sseqid, send, slen), nrow)
    ## check coverage
    insertion_sites$coverage = 0
    for (i in 1:nrow(insertion_sites)) {
      insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$send[[i]]]
    }
    insertion_OK_right <- insertion_sites[with(insertion_sites,
                                               V1/coverage > insertion_proportion_threshold), ]
    
    if (nrow(insertion_OK_right) > 0) {
      s = ifelse(insertion_OK_right$send + max_pbs_search > insertion_OK_right$slen, 
                 insertion_OK_right$slen, insertion_OK_right$send + max_pbs_search)
      Rpart = subseq(dimers_seq[match(insertion_OK_right$sseqid, names(dimers_seq))], 
                     insertion_OK_right$send, s)
      names(Rpart) = paste0(names(Rpart), "__", insertion_OK_right$V1, "__", 
                            insertion_OK_right$send)
      
      ## check presence of the CA dinucleotide
      CA = vcountPattern("CA", subseq(dimers_seq[match(insertion_OK_right$sseqid, names(dimers_seq))],
                                      insertion_OK_right$send - 2,
                                      insertion_OK_right$send + 2))
      if (any(CA > 0)) {
        writeXStringSet(Rpart[CA > 0], filepath = fin)
        cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "), 
                    "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path, 
                    "-word_size", 7, "-strand minus -dust no -max_target_seqs 10000 -evalue 100", 
                    "-out", fout)
        
        system(cmd, ignore.stdout = TRUE)
        df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
        filter1 = df$length >= thr_length
        filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
        df_pass = df[filter1 & filter2, , drop = FALSE]
        df_pass_R = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
        write.table(df_pass_R, file = paste0(output, "_R.csv"), row.names = FALSE, 
                    sep = "\t")
        scoreR = get_score(df_pass_R)
      }
    }
  }
  unlink(fin)
  unlink(fout)
  return(max(scoreL, scoreR))
}


subject_coverage = function(blastdf) {
  ## calculate coverage for all blast subjects
  coverage_profiles <- by(blastdf, INDICES = blastdf$sseqid, FUN = function(x) {
    as.numeric(coverage(IRanges(start = x$sstart, end = x$send)))
  })
  return(coverage_profiles)
  
}
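
## Usage sketch (toy alignment table): per-position read depth along each subject.
if (FALSE) {
  bl = data.frame(sseqid = c("c1", "c1"), sstart = c(1, 5), send = c(10, 8))
  subject_coverage(bl)$c1  # 1 1 1 1 2 2 2 2 1 1
}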

get_score = function(x) {
  ## keep best tRNA
  if (nrow(x) == 0) {
    return(0)
  }
  xm = data.frame(do.call(rbind, strsplit(x$qseqid, "__")), stringsAsFactors = FALSE)
  xm$score = as.numeric(sapply(strsplit(xm[, 1], "_"), "[[", 4))
  xm$AA = gsub("-.+$", "", gsub("^.+__", "", x$sseqid))
  best_score = max(by(xm$score, INDICES = xm$AA, FUN = sum))
  return(best_score)
}
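
## Illustration (hypothetical ids): qseqid encodes CONSENSUSID__COUNT__POSITION,
## where CONSENSUSID itself is "<kmer>_<variant>_sc_<score>_l_<length>", so the
## 4th "_"-separated field is the consensus score; sseqid is assumed to follow
## the tRNA database naming with the amino acid after "__".
if (FALSE) {
  x = data.frame(qseqid = "27_1_sc_120_l_180__15__42",
                 sseqid = "tRNA__Met-CAT", stringsAsFactors = FALSE)
  get_score(x)  # 120
}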



dotter = function(seq1, seq2 = NULL, params = "") {
  if (is.null(seq2)) {
    seq2 = seq1
  }
  library(Biostrings)
  if (class(seq1) != "DNAStringSet") {
    seq1 = BStringSet(seq1)
  }
  if (class(seq2) != "DNAStringSet") {
    seq2 = BStringSet(seq2)
  }
  sf1 = tempfile("seq1")
  writeXStringSet(seq1, file = sf1)
  sf2 = tempfile("seq2")
  writeXStringSet(seq2, file = sf2)
  system(paste("dotter", params, sf1, sf2), wait = FALSE)
  Sys.sleep(2)
  unlink(c(sf1, sf2))
  return(NULL)
}



dotter2 = function(seq1, seq2 = NULL, params = NULL) {
  if (is.null(seq2)) {
    seq2 = seq1
  }
  if (is.null(params)) {
    params = " -windowsize 30 -threshold 45 "
  }
  library(Biostrings)
  if (class(seq1) != "DNAStringSet") {
    seq1 = DNAStringSet(seq1)
  }
  if (class(seq2) != "DNAStringSet") {
    seq2 = DNAStringSet(seq2)
  }
  L1 = nchar(seq1)
  L2 = nchar(seq2)
  
  tmpdat1 = tempfile()
  
  dir.create(tmpdat1)
  tmpdat2 = tempfile()
  dir.create(tmpdat2)
  oridir = getwd()
  
  
  seq1_merged = DNAStringSet(paste(seq1, collapse = ""))
  seq2_merged = DNAStringSet(paste(seq2, collapse = ""))
  seq2rc_merged = reverseComplement(DNAStringSet(paste(seq2, collapse = "")))
  
  
  sf1 = tempfile("seq1")
  writeXStringSet(seq1_merged, filepath = sf1)
  sf2 = tempfile("seq2")
  sf2rc = tempfile("seq2rc")
  writeXStringSet(seq2_merged, filepath = sf2)
  writeXStringSet(seq2rc_merged, filepath = sf2rc)
  
  cmd1 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2, params)
  cmd2 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2rc, 
               params)
  setwd(tmpdat1)
  output1 = system(cmd1, intern = TRUE)
  setwd(tmpdat2)
  output2 = system(cmd2, intern = TRUE)
  setwd(oridir)
  
  
  
  fout1 = strsplit(tail(output1, n = 1), split = " ")[[1]][2]
  rawdat1 = readLines(paste(tmpdat1, "/", fout1, sep = ""))
  rawdat1 = rawdat1[1:min(grep("^Rectangle", rawdat1))]
  
  if (length(rawdat1[grep("^Line", rawdat1)]) == 0) {
    coord1 = NULL
  } else {
    coord1 = apply(sapply(strsplit(rawdat1[grep("^Line", rawdat1)], " "), "[", 
                          c(3, 5, 7, 9, 11)), 1, as.numeric)
    coord1 = matrix(coord1, ncol = 5)
  }
  
  fout2 = strsplit(tail(output2, n = 1), split = " ")[[1]][2]
  rawdat2 = readLines(paste(tmpdat2, "/", fout2, sep = ""))
  rawdat2 = rawdat2[1:min(grep("^Rectangle", rawdat2))]
  
  if (length(rawdat2[grep("^Line", rawdat2)]) == 0) {
    coord2 = NULL
  } else {
    coord2 = apply(sapply(strsplit(rawdat2[grep("^Line", rawdat2)], " "), "[", 
                          c(3, 5, 7, 9, 11)), 1, as.numeric)
    coord2 = matrix(coord2, ncol = 5)
  }
  unlink(sf1)
  unlink(sf2)
  unlink(sf2rc)
  unlink(tmpdat1, recursive = TRUE)
  unlink(tmpdat2, recursive = TRUE)
  
  N1 = sum(nchar(seq1))
  N2 = sum(nchar(seq2))
  op = par(xaxs = "i", yaxs = "i", mar = c(5, 2, 6, 10), las = 1)
  on.exit(par(op))
  plot(c(1, N1), c(1, N2), type = "n", xlab = "", ylab = "", axes = FALSE)
  if (!is.null(coord1)) {
    segments(x0 = coord1[, 1], y0 = coord1[, 2], x1 = coord1[, 3], y1 = coord1[, 4])
  }
  if (!is.null(coord2)) {
    segments(x0 = coord2[, 1], y0 = N2 - coord2[, 2], x1 = coord2[, 3],
             y1 = N2 - coord2[, 4])
  }
  abline(v = c(0, cumsum(L1)), col = "green")
  abline(h = c(0, cumsum(L2)), col = "green")
  box()
  axis(1, at = c(1, cumsum(L1))[-(length(L1) + 1)], labels = names(seq1), hadj = 0, 
       cex.axis = 0.7)
  axis(4, at = c(1, cumsum(L2))[-(length(L2) + 1)], labels = names(seq2), hadj = 0, 
       cex.axis = 0.7)
  invisible(list(coord1, coord2))
}


CM2sequence = function(x) {
  bases = c(colnames(x), "N")
  x = cbind(x, 0)
  colnames(x) = bases
  seqs = paste(bases[apply(x, 1, which.max)], collapse = "")
  
  return(seqs)
}
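
## Toy example: the most supported base is taken at each position.
if (FALSE) {
  CM = rbind(c(A = 5, C = 1, G = 0, T = 0, `-` = 0),
             c(A = 0, C = 0, G = 7, T = 2, `-` = 0))
  CM2sequence(CM)  # "AG"
}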

## for an alignment, calculate a consensus matrix with base weights derived from kmer frequencies
calculate_consensus_matrix = function(aln, tr_paths, G) {
  bases = c("A", "C", "G", "T", "-")
  positions = lapply(strsplit(as.character(aln), split = ""), function(x) which(!x %in% "-"))
  base_matrix = do.call(rbind, strsplit(as.character(aln), split = ""))
  weights = matrix(0, nrow = length(aln), ncol = nchar(aln))
  kmer_rel_proportions = (V(G)$size/table(factor(unlist(tr_paths), levels = 1:vcount(G))))
  for (i in seq_along(positions)) {
    weights[i, positions[[i]]] = kmer_rel_proportions[tr_paths[[i]]]
  }
  names(kmer_rel_proportions) = V(G)$name
  ## get weights for gaps by approximation
  fitgaps = function(y) {
    if (sum(y == 0) == 0) {
      return(y)
    } else {
      y0 = rep(y[y != 0], 3)
      x0 = which(rep(y, 3) != 0)
      fitted = approx(x0, y0, xout = seq_along(rep(y, 3)), rule = 1)
    }
    return(fitted$y[-seq_along(y)][seq_along(y)])
  }
  weights_with_gaps = t(apply(weights, 1, fitgaps))
  ## weights_with_gaps = get_gap_weights(weights, aln, G)  # that step took way
  ## too long - and is not so important; TODO: handle gaps differently / more effectively
  
  ## get consensus matrix
  CM = sapply(1:nchar(aln[[1]]), function(i) {
    sapply(split(weights_with_gaps[, i], factor(base_matrix[, i], levels = bases)), 
           sum)
  })
  return(t(CM)[, 1:5])
}
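
## Illustration of the circular gap interpolation used in fitgaps above: the
## weight profile is tripled so that approx() can fill gaps near the ends from
## the opposite end of the (circular) monomer; the middle copy is then kept.
if (FALSE) {
  y = c(2, 0, 4, 0)
  y0 = rep(y[y != 0], 3)
  x0 = which(rep(y, 3) != 0)
  approx(x0, y0, xout = 5:8, rule = 1)$y  # 2 3 4 3 - gaps filled by interpolation
}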


get_gaps_from_alignment = function(aln) {
  gaps_positions = unique(do.call(rbind, str_locate_all(as.character(aln), "-+")))
  return(gaps_positions)
}

plot_kmer_graph = function(G, L = NULL, vertex.size = NULL, ord = NULL, upto = NULL, 
                           highlight = NULL) {
  if (!is.null(G$L) & is.null(L)) {
    L = G$L
  }
  if (is.null(L)) {
    ## L=layout.kamada.kawai(G)
    L = OGDFlayout(G)[[1]]
  }
  clr = kmers2color(V(G)$name)
  if (!is.null(highlight)) {
    clr[highlight] = "#00000080"
  }
  if (!is.null(ord)) {
    clr[ord[1:upto]] = paste(clr[ord[1:upto]], "30", sep = "")
  }
  if (is.null(vertex.size)) {
    vertex.size = rescale(V(G)$size, to = c(0.5, 6))
    
  }
  plot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE, 
       vertex.color = clr, vertex.frame.color = "#00000020", edge.width = 2, edge.arrow.mode = 1, 
       edge.arrow.size = 0.2)
  
}

rglplot_kmer_graph = function(G, L = NULL, vertex.size = 4) {
  if (is.null(L)) {
    L = layout.kamada.kawai(G)
  }
  
  rglplot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE, 
          vertex.color = kmers2color(V(G)$name), edge.width = 2, edge.arrow.mode = 1, 
          edge.arrow.size = 0.5)
}




mafft = function(seqs, params = "--auto --thread 1 ") {
  if (length(seqs) < 2) {
    return(seqs)
  }
  infile = tempfile()
  if (class(seqs) == "character") {
    seqs = DNAStringSet(seqs)
  }
  writeXStringSet(seqs, file = infile)
  outfile = tempfile()
  cmd = paste("mafft --quiet --nuc ", params, infile, "2> /dev/null > ", outfile)
  system(cmd, intern = TRUE, ignore.stderr = FALSE)
  aln = readDNAStringSet(outfile)
  unlink(c(outfile, infile))
  return(aln)
}
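
## Usage sketch (requires the mafft binary on PATH):
if (FALSE) {
  aln = mafft(DNAStringSet(c(a = "ACGTACGTTT", b = "ACGTACGTAT", c = "ACGTTCGTAT")))
  as.character(aln)
}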

mgblast = function(databaseSeq, querySeq) {
  params = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -C 30 -D 30 "
  ## no dust filtering:
  paramsDF = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -F F"
  
  database = tempfile()
  query = tempfile()
  output = tempfile()
  if (class(databaseSeq) == "character") {
    database = databaseSeq
    do_not_delete_database = TRUE
  } else {
    writeXStringSet(databaseSeq, filepath = database, format = "fasta")
    do_not_delete_database = FALSE
  }
  if (class(querySeq) == "character") {
    query = querySeq
    do_not_delete_query = TRUE
  } else {
    writeXStringSet(querySeq, filepath = query, format = "fasta")
    do_not_delete_query = FALSE
  }
  ## create database:
  cmd = paste("formatdb -i", database, "-p F")
  system(cmd)
  cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, params)
  system(cmd)
  if (file.info(output)$size == 0) {
    ## no hits - try again without the dust masker
    cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, paramsDF)
    system(cmd)
  }
  blastOut = read.table(output, sep = "\t", header = FALSE, as.is = TRUE, comment.char = "")
  unlink(output)
  if (!do_not_delete_query) {
    unlink(query)
  }
  if (!do_not_delete_database) {
    unlink(paste(database, "*", sep = ""))
  }
  colnames(blastOut) = c("query", "q.length", "q.start", "q.end", "subject", "s.length", 
                         "s.start", "s.end", "pid", "weight", "e.value", "strand")
  blastOut
}


estimate_sample_size = function(NV, NE, maxv, maxe) {
  ## graph density
  d = (2 * NE)/(NV * (NV - 1))
  ## expected number of edges when sampling maxv vertices
  eEst = (maxv * (maxv - 1) * d)/2
  ## number of vertices that yields maxe edges at density d (solving N*(N-1)*d/2 = maxe)
  nEst = (d + sqrt(d^2 + 8 * d * maxe))/(2 * d)
  if (eEst >= maxe) {
    N = round(nEst)
    E = round((N * (N - 1) * d)/2)
    
  }
  if (nEst >= maxv) {
    N = maxv
    E = round((N * (N - 1) * d)/2)
    
  }
  return(N)
}
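
## Worked example (hypothetical numbers): NV = 1e5 reads with NE = 1e9 edges
## gives density d ~ 0.2; sampling maxv = 4e4 vertices is expected to yield
## only ~1.6e8 edges (< maxe = 5e8), so the vertex cap is the binding
## constraint and maxv is returned.
if (FALSE) {
  estimate_sample_size(NV = 1e5, NE = 1e9, maxv = 4e4, maxe = 5e8)  # 40000
}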




mgblast2graph = function(blastfile, seqfile, graph_destination, directed_graph_destination, 
                         oriented_sequences, paired = TRUE, repex = FALSE, image_file = NULL, image_file_tmb = NULL, 
                         include_layout = TRUE, pair_completeness = NULL, satellite_model_path = NULL, 
                         maxv = 40000, maxe = 5e+08, seqfile_full = seqfile) {
  cat("loading blast results\n")
  if (repex) {
    cln = c("query", "subject", "weight", "q.length", "q.start", "q.end", "s.length", "s.start", 
            "s.end", "sign")
    colcls = c("character", "character", "numeric", "numeric", "numeric", "numeric", 
               "numeric", "numeric", "numeric", "NULL", "NULL", "character")
    
  } else {
    cln = c("query", "q.length", "q.start", "q.end", "subject", "s.length", "s.start", 
            "s.end","weight", "sign")
    colcls = c("character", "numeric", "numeric", "numeric", "character", "numeric", 
               "numeric", "numeric", "NULL", "numeric", "NULL", "character")
  }
  if (class(blastfile) == "data.frame") {
    blastTable = blastfile
    colnames(blastTable)[12] = "sign"
  } else {
    blastTable = read.table(blastfile, sep = "\t", as.is = TRUE, header = FALSE, 
                            colClasses = colcls, comment.char = "")
    colnames(blastTable) = cln
  }
  ## check for duplicates!
  key = with(blastTable, ifelse(query > subject, paste(query, subject), paste(subject, query)))
  if (any(duplicated(key))) {
    blastTable = blastTable[!duplicated(key), ]
  }
  seqs = readDNAStringSet(seqfile)
  ## calculate pair completeness for reads before sampling
  if (is.null(pair_completeness)) {
    if (paired) {
      if (seqfile_full != seqfile) {
        seqs_full = readDNAStringSet(seqfile_full)
        pair_counts = tabulate(table(gsub(".$", "", names(seqs_full))))
        rm(seqs_full)
      } else {
        pair_counts = tabulate(table(gsub(".$", "", names(seqs))))
        
      }
      pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
    }else{
      pair_completeness = 0
    }
  }
  NV = length(seqs)  # vertices
  NE = nrow(blastTable)  # edges
  if (maxv < NV | maxe < NE) {
    ## Sample if graph is large
    V_sample_size = estimate_sample_size(NV, NE, maxv, maxe)
    seqs = sample(seqs, V_sample_size)
    blastTable = blastTable[blastTable$query %in% names(seqs) &
                            blastTable$subject %in% names(seqs), ]
  }
  
  blastTable$sign = ifelse(blastTable$sign == "+", 1, -1)
  vnames = unique(c(blastTable$query, blastTable$subject))
  vindex = seq_along(vnames)
  names(vindex) = vnames
  ## correct orientation
  cat("creating graph\n")
  G = graph.data.frame(blastTable[, c("query", "subject", "weight", "sign")], directed = FALSE, 
                       vertices = vnames)
  if (include_layout) {
    ## save temporarily modified blastTable if large
    tmpB = tempfile()
    save(blastTable, file = tmpB)
    rm(blastTable)
    ## 
    cat("graph layout calculation ")
    if (ecount(G) > 2e+06) {
      cat("using fruchterman reingold\n")
      L = layout.fruchterman.reingold(G, dim = 3)
    } else {
      cat("using OGDF & frucherman reingold\n")
      Ltmp = OGDFlayout(G, alg = c("fmmm"))
      L = cbind(Ltmp[[1]][, 1:2], layout.fruchterman.reingold(G, dim = 2))
      
    }
    
    GL = list(G = G, L = L)
    save(GL, file = graph_destination)
    if (!is.null(image_file)) {
      cat("exporting graph figure")
      png(image_file, width = 900, height = 900, pointsize = 20)
      plot(GL$G, layout = GL$L[, 1:2], vertex.size = 2, vertex.color = "#000000A0", 
           edge.color = "#00000040", vertex.shape = "circle", edge.curved = FALSE, 
           vertex.label = NA, vertex.frame.color = NA, edge.width = 1)
      dev.off()
      ## create thumbnails:
      system(paste("convert ", image_file, " -resize 100x100 ", image_file_tmb))
      
    }
    rm(GL)
    gc()
    load(tmpB)
    unlink(tmpB)
  }
  
  if (!is.connected(G)) {
    cat("Warning - graph is not connected\n")
    cc = clusters(G, "weak")
    mb = as.numeric(factor(membership(cc),
                           levels = as.numeric(names(sort(table(membership(cc)), decreasing = TRUE)))))
    selvi = which(mb == 1)  # largest component
    cat("using largest component: \nsize =",
        round(length(selvi)/vcount(G) * 100, 1), "% ;", length(selvi), "reads", "\n")
    G = induced.subgraph(G, vids = selvi)
    blastTable = blastTable[blastTable$query %in% V(G)$name &
                            blastTable$subject %in% V(G)$name, ]
  }
  
  Gmst = minimum.spanning.tree(G)
  ## create alternative trees - in case only a suboptimal solution is found
  set.seed(123)
  Gmst_alt = list()
  Gmst_alt[[1]] = Gmst
  for (i in 2:6){
    E(G)$weight = runif(ecount(G), 0.1,1)
    Gmst_alt[[i]] = minimum.spanning.tree(G)
  }

  rm(G)
  gc()
  ## six attempts to reorient reads
  flip_names_all = list()
  prop_of_notfit=numeric()
  thr_fit=0.001
  for (ii in 1:6){
    Gmst = Gmst_alt[[ii]]


    blastTable_mst = as.data.frame(get.edgelist(Gmst, names = TRUE))
    colnames(blastTable_mst) = c("query", "subject")
    blastTable_mst$sign = E(Gmst)$sign

    d = dfs(Gmst, root = 1, father = TRUE, order.out = TRUE)
    esign = E(Gmst)$sign
    rc = rep(FALSE, vcount(Gmst))
    j = 0
    p = 0
    for (v in d$order[-1]) {
      j = j + 1
      f = as.numeric(d$father)[v]
      if (is.na(f)) {
        next
      }
      eid = get.edge.ids(Gmst, c(v, f))
      if (esign[eid] == -1) {
        ie = unique(c(which(blastTable_mst$query %in% V(Gmst)$name[v]),
                      which(blastTable_mst$subject %in% V(Gmst)$name[v])))  # incident edges
        esign[ie] = esign[ie] * -1
        rc[v] = TRUE
        p = p + 1
        if (p > 50) {
          p = 0
          cat("\r")
          cat(j, " : ", sum(esign)/length(esign))
        }
      }
    }
    cat("\t")
    flip_names_all[[ii]] = flip_names = V(Gmst)$name[rc]

    if (!exists("blastTable")) {
      load(tmpB)
      unlink(tmpB)
    }
    ## add nofit later!!
    s.mean = with(blastTable, (s.start + s.end)/2)
    s.mean_corrected = ifelse(blastTable$subject %in% flip_names,
                              blastTable$s.length - s.mean, s.mean)
    q.mean = with(blastTable, (q.start + q.end)/2)
    q.mean_corrected = ifelse(blastTable$query %in% flip_names,
                              blastTable$q.length - q.mean, q.mean)
    V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
    V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
    sign_final = ifelse(V1 %in% flip_names, -1, 1) *
      ifelse(V2 %in% flip_names, -1, 1) * blastTable$sign
    nofit = unique(c(V1[sign_final == -1], V2[sign_final == -1]))
    prop_of_notfit[ii] = length(nofit)/vcount(Gmst)
    ## check if correctly oriented
    cat("prop notfit", prop_of_notfit[[ii]],"\n")
    if (prop_of_notfit[[ii]]<thr_fit){
      ## OK
      break
    }
  }
  if (!prop_of_notfit[[ii]]<thr_fit){
    ## get best solution
    ii_best = which.min(prop_of_notfit)
    if (ii != ii_best){   # if the last solution was not best, get the best
      flip_names = flip_names_all[[ii_best]]
      s.mean = with(blastTable, (s.start + s.end)/2)
      s.mean_corrected = ifelse(blastTable$subject %in% flip_names,
                                blastTable$s.length - s.mean, s.mean)
      q.mean = with(blastTable, (q.start + q.end)/2)
      q.mean_corrected = ifelse(blastTable$query %in% flip_names,
                                blastTable$q.length - q.mean, q.mean)
      V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
      V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
      sign_final = ifelse(V1 %in% flip_names, -1, 1) *
        ifelse(V2 %in% flip_names, -1, 1) * blastTable$sign
      nofit = unique(c(V1[sign_final == -1], V2[sign_final == -1]))
      
    }
  }

  ## exclude all nofit
  
  df2 = data.frame(V1, V2, sign = blastTable$sign, sign_final = sign_final, stringsAsFactors = FALSE)
  rm(blastTable)
  gc()
  vertices = data.frame(name = vnames, reverse_complement = vnames %in% flip_names)
  G = graph.data.frame(df2, vertices = vertices)
  vcount_ori = vcount(G)
  G = induced.subgraph(G, vids = which(!V(G)$name %in% nofit))
  
  G$escore_mst = sum(esign)/length(esign)
  G$escore = sum(sign_final == 1)/length(sign_final)
  cc = clusters(G, "strong")
  mb = as.numeric(factor(membership(cc),
                         levels = as.numeric(names(sort(table(membership(cc)), decreasing = TRUE)))))
  names(mb) = V(G)$name
  G$loop_index = max(cc$csize)/vcount(G)
  G$coverage = vcount(G)/vcount_ori
  ## check sign in all edges
  V(G)$membership = mb
  save(G, file = directed_graph_destination)
  ## remove nofit
  seqs = seqs[!names(seqs) %in% nofit]
  seqs[names(seqs) %in% flip_names] = reverseComplement(seqs[names(seqs) %in% flip_names])
  names(seqs) = paste(names(seqs), mb[names(seqs)])
  writeXStringSet(seqs, filepath = oriented_sequences)
  ## calculate satellite probability
  if (is.null(satellite_model_path)) {
    pSAT = isSAT = NULL
  } else {
    satellite_model = readRDS(satellite_model_path)
    pSAT = get_prob(G$loop_index, pair_completeness, model = satellite_model)
    isSAT = isSatellite(G$loop_index, pair_completeness, model = satellite_model)
  }
  
  ## assemble the output list
  output = list(escore = G$escore, coverage = G$coverage, escore_mts = G$escore_mst, 
                loop_index = G$loop_index, pair_completeness = pair_completeness, graph_file = graph_destination, 
                oriented_sequences = oriented_sequences, vcount = vcount(G), ecount = ecount(G), 
                satellite_probability = pSAT, satellite = isSAT)
  ## clean up
  all_objects = ls()
  do_not_remove = "output"
  rm(list = all_objects[!(all_objects %in% do_not_remove)])
  gc(verbose = FALSE, reset = TRUE)
  return(list2dictionary(output))
}

list2dictionary = function(l) {
  ## serialise a flat R list into a Python dictionary literal; character (and
  ## NULL) values are wrapped in triple quotes
  dict = "{"
  q = '"'
  for (i in 1:length(l)) {
    if (class(l[[i]]) == "character" | is.null(l[[i]])) {
      q2 = "'''"
    } else {
      q2 = ''
    }
    dict = paste0(dict, q, names(l)[i], q, ":", q2, l[[i]], q2, ", ")
  }
  dict = paste0(dict, "}")
  return(dict)
}
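
## Example of the resulting Python literal (the trailing comma is valid in Python):
if (FALSE) {
  list2dictionary(list(score = 0.95, name = "CL1"))
  ## {"score":0.95, "name":'''CL1''', }
}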

wrap = Vectorize(function(s, width = 80) {
  i1 = seq(1, nchar(s), width)
  i2 = seq(width, by = width, length.out = length(i1))
  return(paste(substring(s, i1, i2), collapse = "\n"))
})
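
## Example: fixed-width wrapping as used for the HTML consensus display below.
if (FALSE) {
  cat(wrap("ACGTACGTACGTACGTACGT", width = 8))
  ## ACGTACGT
  ## ACGTACGT
  ## ACGT
}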


tarean = function(input_sequences, output_dir, min_kmer_length = 11, max_kmer_length = 27, 
                  CPU = 2, sample_size = 10000, reorient_reads = TRUE, tRNA_database_path = NULL, 
                  include_layout = TRUE, paired = TRUE, lock_file=NULL) {
  options(CPU = CPU)
  time0 = Sys.time()
  dir.create(output_dir)
  input_sequences_copy = paste(output_dir, "/", basename(input_sequences), sep = "")
  
  if (!file.copy(input_sequences, input_sequences_copy, overwrite = TRUE)) {
    cat(paste("cannot copy", input_sequences, " to", output_dir), "\n")
    stop()
  }
  
  lock_file = waitForRAM(lock_file = lock_file)
  pair_completeness = NULL
  ## sampling
  if (sample_size != 0) {
    s = readDNAStringSet(input_sequences_copy)
    N = length(s)
    ## pair completeness must be calculated before sampling!
    if (N > sample_size) {
      writeXStringSet(sample(s, sample_size), filepath = input_sequences_copy)
      if (paired) {
        pair_counts = tabulate(table(gsub(".$", "", names(s))))
        pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
      }
    }
    rm(s)
  }
  
  if (reorient_reads) {
    input_sequences_oriented = paste0(input_sequences_copy, "oriented.fasta")
    graph_file = paste0(input_sequences_copy, "_graph.RData")
    GLfile = paste0(input_sequences_copy, "_graph.GL")
    cat("reorientig sequences\n")
    
    blastTable = mgblast(input_sequences_copy, input_sequences_copy)
    
    graph_info = mgblast2graph(blastTable, input_sequences_copy, GLfile, graph_file, 
                               input_sequences_oriented, include_layout = include_layout, paired = paired, 
                               pair_completeness = pair_completeness)
    
    ## interrupt if it does not look like a tandem repeat at all - soft threshold!
    if (is.null(graph_info$pair_completeness)) {
      if (graph_info$loop_index <= 0.4) {
        cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        return(list2dictionary(list(graph_info = graph_info)))
      }
    } else {
      ## for paired reads:
      if (graph_info$loop_index < 0.7 | graph_info$pair_completeness < 0.4) {
        cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        return(list2dictionary(list(graph_info = graph_info)))
      }
    }
    
  } else {
    input_sequences_oriented = input_sequences_copy
    graph_info = NULL
  }
  
  
  
  kmer_counts = list()
  kmer_lengths = seq(min_kmer_length, max_kmer_length, by = 4)
  for (i in kmer_lengths) {
    ## use a python function - faster
    cmd = paste(script.dir, "/kmer_counting.py ", input_sequences_oriented, " ", 
                i, sep = "")
    cat("calculating kmers of length ", i, "\n")
    f = system(cmd, intern = TRUE)
    x = read.table(f, as.is = TRUE, sep = "\t")
    kmer_counts[[as.character(i)]] = data.frame(x, freq = x$V2/sum(x$V2))
  }
  
  ## number of kmers:
  N50 = sapply(kmer_counts, function(x) {
    sum(cumsum(x$freq) < 0.5)
  })
  
  N70 = sapply(kmer_counts, function(x) {
    sum(cumsum(x$freq) < 0.7)
  })
  
  
  time1 = Sys.time()
  ggmin = mclapply(kmer_counts, get_mimimal_cc, start = 0.5, mc.cores = CPU)
  time2 = Sys.time()
  cat("kmers graphs created ")
  print(time2 - time1)
  names(ggmin) = names(kmer_counts)
  
  
  ## estimate monomer
  monomers = mclapply(ggmin, estimate_monomer, mc.cores = CPU)
  
  names(monomers) = names(kmer_counts)
  monomers = monomers[!sapply(monomers, is.null)]
  ## error handling:
  error = sapply(monomers, class) == "try-error"
  if (any(error)) {
    cat("\nError in monomers estimation: ")
    cat("calculation failed for monomers length ", names(monomers)[error], "\n")
    print(monomers[error])
    if (any(!error)) {
      monomers = monomers[!error]
    } else {
      stop("monomer estimation failed")
    }
  }
  
  ## summary - make function!!
  total_score = list()
  k = 0
  length_best = numeric()
  score_bn = numeric()
  consensus = character()
  
  for (i in names(monomers)) {
    for (v in seq_along(monomers[[i]]$estimates)) {
      k = k + 1
      total_score[[k]] = c(kmer = as.numeric(i), variant = v, total_score = monomers[[i]]$estimates[[v]]$total_score)
      score_bn[k] = min(rowSums(monomers[[i]]$estimates[[v]]$CM))
      length_best[k] = monomers[[i]]$estimates[[v]]$length_variant_score[1, 
                                                                         1]
      consensus[[k]] = monomers[[i]]$estimates[[v]]$consensus2
    }
  }
  summary_table = as.data.frame(do.call(rbind, total_score))
  summary_table$monomer_length = length_best
  summary_table$consensus_length = nchar(consensus)
  summary_table$score_bn = score_bn
  summary_table$consensus = paste("<pre>", wrap(consensus, width = 80), "</pre>", 
                                  sep = "")
  consensus_DS = DNAStringSet(consensus)
  names(consensus_DS) = with(summary_table, paste0(kmer, "_", variant, "_sc_", 
                                                   signif(total_score), "_l_", monomer_length))
  
  ## filter by score - keep
  
  ## reorder by score
  consensus_DS = consensus_DS[order(summary_table$total_score, decreasing = TRUE)]
  summary_table = summary_table[order(summary_table$total_score, decreasing = TRUE), 
                                ]
  rownames(summary_table) = NULL
  N = nrow(summary_table)
  ## concatenate consensus (i.e. make a head-to-tail dimer) for PBS detection
  ## and other searches; also make a 'pseudo contig' multimer (min length 200 bp)
  ## for read mapping
  consensus_DS_dimer = DNAStringSet(paste0(consensus_DS, consensus_DS))
  tarean_contigs = DNAStringSet(sapply(consensus_DS, function(x)
    ifelse(nchar(x) < 200,
           paste(rep(as.character(x), round(300/nchar(as.character(x)) + 1)), collapse = ""),
           as.character(x))))

  names(consensus_DS_dimer) = names(consensus_DS)
  ## save sequences:
  consensus_DS_dimer_file = paste0(output_dir, "/consensus_dimer.fasta")
  consensus_DS_file = paste0(output_dir, "/consensus.fasta")
  tarean_contig_file = paste0(output_dir, "/tarean_contigs.fasta")
  writeXStringSet(consensus_DS, consensus_DS_file)
  writeXStringSet(tarean_contigs, tarean_contig_file)
  writeXStringSet(consensus_DS_dimer, consensus_DS_dimer_file)
  if (is.null(tRNA_database_path)) {
    pbs_score = -1
  } else {
    pbs_blast_table = paste0(output_dir, "/pbs_detection")
    pbs_score = detect_pbs(dimers_file = consensus_DS_dimer_file, tRNA_database_path = tRNA_database_path, 
                           reads_file = input_sequences_oriented, output = pbs_blast_table)
  }
  ## search open reading frames; get the length of the longest one
  orf_l = max_ORF_length(consensus_DS_dimer)
  
  
  dir.create(paste0(output_dir, "/img"), recursive = TRUE)
  summary_table$monomer_length_graph = numeric(N)
  summary_table$graph_image = character(N)
  summary_table$monomer_length_logo = numeric(N)
  summary_table$logo_image = character(N)
  ## export graph and consensus estimate to the cluster directory; the type of
  ## output may change in the future
  save(ggmin, file = paste0(output_dir, "/ggmin.RData"))
  save(monomers, file = paste0(output_dir, "/monomers.RData"))
  
  cat("generating HTML output\n")
  for (i in 1:N) {
    kmer = as.character(summary_table$kmer[i])
    variant = summary_table$variant[i]
    ## export graph
    fout_link = paste0("img/graph_", kmer, "mer_", variant, ".png")
    fout = paste0(output_dir, "/", fout_link)
    ## summary_table$monomer_length_graph[i] = summary_table$monomer_length[i]
    ## summary_table$monomer_length_logo[[i]] = nrow(monomers[[kmer]]$estimates[[variant]]$CM)
    summary_table$monomer_length[[i]] = length(monomers[[kmer]]$estimates[[variant]]$consensus)

    if (i <= 10) {
      png(fout, width = 800, height = 800)
      plot_kmer_graph(ggmin[[kmer]], highlight = unlist(monomers[[kmer]]$paths[[variant]]$tr_paths))
      dev.off()
      summary_table$graph_image[i] = hwriteImage(fout_link, link = fout_link, 
                                                 table = FALSE, width = 100, height = 100)
      ## export logo
      png_link = paste0("img/logo_", kmer, "mer_", variant, ".png")
      fout = paste0(output_dir, "/", png_link)
      png(fout, width = 1200, height = round(summary_table$monomer_length[i] * 1) + 550)
      try(plot_multiline_logo(monomers[[kmer]]$estimates[[variant]]$CM, W = 100))
      dev.off()
      ## export corresponding position probability matrices
      ppm_file = paste0(output_dir, '/ppm_', kmer, "mer_", variant, ".csv")
      ppm_link = paste0('ppm_', kmer, "mer_", variant, ".csv")
      write.table(monomers[[kmer]]$estimates[[variant]]$CM,
                  file = ppm_file,
                  col.names = TRUE, quote = FALSE,
                  row.names = FALSE, sep="\t")
      summary_table$logo_image[i] = hwriteImage(png_link, link = ppm_link, 
                                                table = FALSE, width = 200, height = 100)
    }
    
  }
  
  ## html_report = HTMLReport()
  
  htmlfile = paste0(output_dir, "/report.html")
  cat(htmlheader, file = htmlfile)
  included_columns = c('kmer', 'variant', 'total_score', 'consensus_length','consensus', 'graph_image', 'logo_image')
  summary_table_clean = summary_table[,included_columns]
  colnames(summary_table_clean) = c('k-mer length',
                                    'Variant index',
                                    'k-mer coverage score',
                                    'Consensus length',
                                    'Consensus sequence',
                                    'k-mer based graph',
                                    'Sequence logo')
  HTML(summary_table_clean, file = htmlfile, sortableDF = TRUE)
  HTMLEndFile(file = htmlfile)
  time4 = Sys.time()
  print(time4 - time0)
  if (!is.null(lock_file)){
    print("------removing-lock--------")
    removelock(lock_file)
  }

  print(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
             TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
             TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
             orf_l = orf_l, tarean_contig_file = tarean_contig_file))
  return(list2dictionary(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
              TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
              TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
              orf_l = orf_l, tarean_contig_file = tarean_contig_file)))
}


## graph loop index stability
loop_index_instability = function(G) {
  N = 50
  s = seq(vcount(G), vcount(G)/10, length.out = N)
  p = seq(1, 0.1, length.out = N)
  li = numeric()
  for (i in seq_along(s)) {
    print(i)
    gs = induced_subgraph(G, sample(1:vcount(G), s[i]))
    li[i] = max(clusters(gs, "strong")$csize)/vcount(gs)
  }
  instability = lm(li ~ p)$coefficient[2]
  return(instability)
}

isSatellite = function(x, y, model) {
  p = get_prob(x, y, model)
  if (p > model$cutoff) {
    return("Putative Satellite")
  } else {
    return("")
  }
}

get_prob = function(x, y, model) {
  ## map (x, y) in [0,1] x [0,1] onto the N x N probability grid of the model
  pm = model$prob_matrix
  N = ncol(pm)
  i = round(x * (N - 1)) + 1
  j = round(y * (N - 1)) + 1
  p = pm[i, j]
  return(p)
}
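
## Usage sketch (hypothetical model object): the satellite model is an N x N
## probability grid over loop index (x) and pair completeness (y), both in [0, 1].
if (FALSE) {
  model = list(prob_matrix = matrix(runif(101 * 101), nrow = 101), cutoff = 0.5)
  get_prob(0.85, 0.40, model)   # grid cell [86, 41]
  isSatellite(0.85, 0.40, model)
}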


detectMemUsage = function() {
  ## Linux only - parses /proc/meminfo
  con = textConnection(gsub(" +", " ", readLines("/proc/meminfo")))
  memInfo = read.table(con, fill = TRUE, row.names = 1)
  close(con)
  memUsage = 1 - (memInfo["MemFree", 1] + memInfo["Cached", 1])/memInfo["MemTotal", 1]
  return(memUsage)
}


makelock<-function(lockfile,lockmsg,CreateDirectories=TRUE){
    lockdir=dirname(lockfile)
    if(!file.exists(lockdir)){
        if(CreateDirectories) dir.create(lockdir,recursive=TRUE)
        else stop("Lock Directory for lockfile ",lockfile," does not exist")
    } 
    if(missing(lockmsg)) lockmsg=paste(system('hostname',intern=TRUE),Sys.getenv("R_SESSION_TMPDIR"))
    if (file.exists(lockfile)) return (FALSE)
                                        # note the use of paste makes the message writing atomic
    cat(paste(lockmsg,"\n",sep=""),file=lockfile,append=TRUE,sep="")
    firstline=readLines(lockfile,n=1)
    if(firstline!=lockmsg){
                                        # somebody else got there first
        return(FALSE)
    } else return(TRUE)
}


removelock<-function(lockfile){
  if(unlink(lockfile)!=0) {
    warning("Unable to remove ",lockfile)
    return (FALSE)
  }
  return (TRUE)
}
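
## Usage sketch: cooperating processes race to create the lock file; only the
## winner proceeds, and it must remove the lock afterwards.
if (FALSE) {
  lf = file.path(tempdir(), "locks", "tarean.lock")
  if (makelock(lf)) {
    ## ... critical section ...
    removelock(lf)
  }
}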


waitForRAM = function(p = 0.5, lock_file = NULL) {
  if (detectMemUsage() < p) {
    return(NULL)
  } else {
    cat("waiting for RAM \n")
    free_count = 0
    while (TRUE) {
      ## check the lock file:
      if (makelock(lock_file)) {
        print("---------locking--------")
        return(lock_file)
      }
      if (detectMemUsage() < p) {
        cat("RAM freed \n")
        return(NULL)
      }
      Sys.sleep(5)
      if (evaluate_user_cpu_usage() == 'free') {
        free_count = free_count + 1
      } else {
        free_count = 0
      }
      if (detectMemUsage() < 0.8 & free_count > 100) {
        cat("RAM not free but nothing else is running \n")
        return(NULL)
      }
    }
  }
}

lsmem = function() {
  g = globalenv()
  out_all = envs = list()
  envs = append(envs, g)
  total_size = numeric()
  while (environmentName(g) != "R_EmptyEnv") {
    g <- parent.env(g)
    envs = append(envs, g)
  }
  for (e in envs) {
    
    obj = ls(envir = e)
    if (length(obj) == 0) {
      break
    }
    obj.size = list()
    for (i in obj) {
      obj.size[[i]] = object.size(get(i, envir = e))
    }
    out = data.frame(object = obj, size = unlist(obj.size), stringsAsFactors = FALSE)
    out = out[order(out$size, decreasing = TRUE), ]
    out_all = append(out_all, out)
    total_size = append(total_size, sum(out$size))
  }
  return(list(objects = out_all, total_size = total_size))
} 

evaluate_user_cpu_usage = function() {
  user = Sys.info()["user"]
  a = sum(as.numeric(system(paste("ps -e -o %cpu -u", user), intern = TRUE)[-1]))
  s = substring(system(paste("ps -e -o stat -u", user), intern = TRUE)[-1], 1, 1)
  if (a < 5 & sum(s %in% 'D') == 0 & sum(s %in% 'R') < 2) {
    status = 'free'
  } else {
    status = 'full'
  }
  return(status)
}