diff lib/tarean/methods.R @ 0:1d1b9e1b2e2f draft
Uploaded
author   | petr-novak
date     | Thu, 19 Dec 2019 10:24:45 -0500
parents  |
children |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/tarean/methods.R	Thu Dec 19 10:24:45 2019 -0500
@@ -0,0 +1,1536 @@
#!/usr/bin/env Rscript

suppressPackageStartupMessages(library(igraph))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(Biostrings))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(hwriter))
suppressPackageStartupMessages(library(R2HTML))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(dplyr))

max_ORF_length = function(s) {
    ## check all frames
    L = 0
    for (i in 1:3) {
        L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(s, i))), "*",
            fixed = TRUE))))
        L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(reverseComplement(s),
            i))), "*", fixed = TRUE))))
    }
    return(L)
}

kmers2graph = function(kmers, mode = "strong", prop = NULL) {
    kmerLength = nchar(kmers[1, 1])
    if (ncol(kmers) == 2) {
        kmers$size = kmers[, 2]/sum(kmers[, 2])
    }
    colnames(kmers) = c("name", "count", "size")
    if (!is.null(prop)) { # this is not used (prop is NULL), and it is probably wrong - filtering is done before sorting!!
        p = cumsum(kmers$size)
        kmers = kmers[p < prop, ]
    }
    N = dim(kmers)[1]
    kmers = kmers[order(kmers$size), ]
    ## convert kmers to fasta file
    kms = data.frame(kmer = substring(kmers$name, 1, kmerLength - 1), ids = 1:nrow(kmers), stringsAsFactors = FALSE)
    kme = data.frame(kmer = substring(kmers$name, 2), ide = 1:nrow(kmers), stringsAsFactors = FALSE)

    ## df = merge(kms, kme, by = 'kmer', all = FALSE)[, c(2, 3, 1)]
    df = inner_join(kme, kms, by = 'kmer')[, c(2, 3)]

    ## names(kms) = seq_along(kms)
    ## kme = substring(kmers$name, 2)
    ## names(kme) = seq_along(kme)
    ## ## use new blast!
    ## database = tempfile()
    ## query = tempfile()
    ## output = tempfile()
    ## writeXStringSet(DNAStringSet(kms), filepath = database, format = "fasta")
    ## writeXStringSet(DNAStringSet(kme), filepath = query, format = "fasta")
    ## cmd = paste("makeblastdb -in", database, "-dbtype nucl")
    ## system(cmd, ignore.stdout = TRUE)
    ## cmd = paste("blastn -outfmt '6 qseqid sseqid pident' -strand plus -dust no -perc_identity 100 -query ",
    ##     query, "-db", database, "-word_size", kmerLength - 1, "-out", output)
    ## system(cmd)
    ## df = try({
    ##     read.table(output, as.is = TRUE)
    ## })
    ## if (class(df) == "try-error") {
    ##     print("creation of kmer graph failed")
    ##     print(query)
    ##     print(output)
    ##     print(database)
    ##     return(NULL)
    ## }
    ## unlink(query)
    ## unlink(paste(database, "*", sep = ""))
    ## unlist(output)
    gm_mean = function(x, na.rm = TRUE) {
        exp(sum(log(x[x > 0]), na.rm = na.rm)/length(x))
    }

    whg = apply(cbind(kmers[df[, 1], 2], V2 = kmers[df[, 2], 2]), 1, gm_mean)
    G = graph.data.frame(data.frame(V1 = kmers$name[df[, 1]], V2 = kmers$name[df[,
        2]], weight = whg), vertices = kmers[, 1:3])
    ## separate into connected components:
    ccs = clusters(G, mode = mode)$membership
    sel_cls = which(tabulate(ccs) > 1)
    Gs = list()
    for (i in seq_along(sel_cls)) {
        Gs[[i]] = induced.subgraph(G, vids = which(ccs %in% sel_cls[i]))
    }
    ## reorder!!!
    Gs = Gs[order(sapply(Gs, vcount), decreasing = TRUE)]
    return(Gs)
}
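
## Illustrative sketch only (not executed; guarded like the testing block in
## detect_pbs below). The 8-mers and counts are invented; real input is the
## k-mer count table produced by kmer_counting.py. Plotting the result also
## needs the runOGDFlayout binary for the layout.
if (FALSE) {
    km = data.frame(name = c("ACGTACGT", "CGTACGTA", "GTACGTAC", "TACGTACG"),
                    count = c(10, 9, 8, 7), stringsAsFactors = FALSE)
    Gs = kmers2graph(km)   # list of connected components, largest first
    vcount(Gs[[1]])        # 4 - the toy 8-mers form one circular component
}
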
OGDFlayout = function(G, ncol = NULL, alg = "fmmm", OGDF = getOption("OGDF")) {
    ## is the ogdf binary available?
    if (is.null(OGDF)) {
        OGDF = Sys.getenv("OGDF")
        if ("" == OGDF) {
            options(warn = -1)
            OGDF = system("which runOGDFlayout", intern = TRUE)
            options(warn = 0)
            if (length(OGDF) == 0) {
                cat("path to runOGDFlayout not found\n")
                return(NULL)
            }
        }
    }
    if (is.null(ncol)) {
        if (is.null(E(G)$weight)) {
            el = data.frame(get.edgelist(G, names = TRUE), rep(1, ecount(G)))
        } else {
            el = data.frame(get.edgelist(G, names = TRUE), E(G)$weight)
        }
        ncol = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout", sep = "")
        write.table(el, file = ncol, row.names = FALSE, col.names = FALSE, sep = "\t",
            quote = FALSE)
    } else {
        ## copy ncol:
        ncol_tmp = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout",
            sep = "")
        file.copy(ncol, ncol_tmp)
        ncol = ncol_tmp
    }
    algopt = c("fmmm", "sm", "fme")
    if (!all(alg %in% algopt)) {
        stop("alg must be one of: ", paste(algopt, collapse = ", "), "\n")
    }

    ## output file:
    Louts = list()
    layout_file = tempfile(pattern = as.character(Sys.getpid()))
    for (i in alg) {
        cmd = paste(OGDF, "-alg", i, "-iff layout -off layout -out", layout_file,
            ncol)
        system(cmd, intern = TRUE)
        L = read.table(layout_file, skip = ecount(G))
        L = L[match(V(G)$name, L$V2), ]
        Lout = as.matrix(L[, -(1:2)])
        unlink(layout_file)
        Louts[[i]] = Lout
    }
    ## clean up
    unlink(ncol)
    return(Louts)
}

xcolor_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00")

kmers2color = function(s, position = NULL) {
    if (is.null(position)) {
        position = round(nchar(s[1])/2, 0)
        ## position = 1
        position = nchar(s[1])
    }
    color_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00")
    color_base = substring(s, position, position)
    colors = color_code[color_base]
    names(colors) = color_base
    return(colors)
}

get_sequence = function(g, v, position = NULL) {
    s = V(g)$name
    if (is.null(position)) {
        position = round(nchar(s[1])/2, 0)
        ## position = 1
        position = nchar(s[1])
    }
    nt = paste(substring(s[v], position, position), collapse = "")
    return(nt)
}


get_mimimal_cc = function(km, thr = 20, min_coverage = 0.45, step = 2, start = NULL) {
    if (is.null(start)) {
        i = sum(cumsum(km$freq) < 0.5)
    } else {
        i = sum(cumsum(km$freq) < start)
    }
    continue = TRUE
    while (continue) {
        if (i > nrow(km)) {
            i = nrow(km)
            continue = FALSE
            step = 1
        }
        GG = kmers2graph(km[1:i, ])
        if (length(GG) > 0) {
            if (vcount(GG[[1]]) > thr) {
                if (sum(V(GG[[1]])$size) >= min_coverage) {
                    GG[[1]]$input_coverage = sum(km$freq[1:i])
                    GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
                    return(GG[[1]])
                }
            }
        }
        i = round(i * step)
    }
    if (length(GG) == 0 | is.null(GG)) {
        return(NULL)
    }

    GG[[1]]$input_coverage = sum(km$freq[1:i])
    GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
    return(GG[[1]])
}



paths2string = function(paths) {
    pathstring = sapply(lapply(lapply(paths, as_ids), substring, 1, 1), paste, collapse = "")
    return(pathstring)
}
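
## Illustrative sketch only: paths2string spells a monomer sequence by taking
## the first base of each k-mer along a vertex path. The 3-mers are invented.
if (FALSE) {
    g = graph.data.frame(data.frame(V1 = c("ACG", "CGT", "GTA"),
                                    V2 = c("CGT", "GTA", "ACG")))
    p = list(V(g)[c("ACG", "CGT", "GTA")])
    paths2string(p)  # "ACG": first letters of the path vertices
}
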

align_paths = function(paths, G) {
    shift = rep(NA, length(paths))
    thr = 0 # minimal length
    tr_paths = list()
    Seqs = list()
    centre_node = as.numeric(names(sort(table(unlist(paths)), decreasing = TRUE)))[[1]]

    for (i in seq_along(paths)) {
        if (centre_node %in% paths[[i]]) {
            S = which(paths[[i]] %in% centre_node)
            shift[i] = S
            if (S == 1) {
                tr_paths[[i]] = paths[[i]]
            } else {
                tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S -
                    1)])
            }
            Seqs[[i]] = get_sequence(G, tr_paths[[i]])
        } else {
            shift[i] = NA
        }
    }
    paths_n = lapply(paths, as.numeric)
    tr_paths_n = do.call(cbind, lapply(tr_paths, as.numeric))
    new_shift = shift
    for (i in which(is.na(shift))) {
        score = numeric(length(paths_n))
        for (S in seq_along(paths_n[[i]])) {
            if (S == 1) {
                path_tmp_n = paths_n[[i]]
            } else {
                path_tmp_n = c(paths_n[[i]][S:length(paths_n[[i]])], paths_n[[i]][1:(S -
                    1)])
            }
            score[S] = sum(tr_paths_n == path_tmp_n)
        }
        if (sum(score) != 0) {
            S = which.max(score)
            new_shift[i] = S
            if (S == 1) {
                tr_paths[[i]] = paths[[i]]
            } else {
                tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S -
                    1)])
            }
            Seqs[[i]] = get_sequence(G, tr_paths[[i]])
        }
    }
    shift = new_shift
    ## try to shift based on the sequences themselves
    return(list(Seqs = Seqs[!is.na(shift)], tr_paths = tr_paths[!is.na(shift)], shift = shift[!is.na(shift)]))
}

make_consensus = function(paths_info, G) {
    include = !is.na(paths_info$shift)
    ## make alignments using mafft
    aln = mafft(unlist(paths_info$Seqs[include]))
    CM = calculate_consensus_matrix(aln = aln, tr_paths = paths_info$tr_paths[include],
        G = G)
    gaps = get_gaps_from_alignment(aln)
    CMnorm = CM/rowSums(CM)
    bases = colnames(CM)
    consensus = sapply(apply(CMnorm, 1, function(x) bases[which(x > 0.2)][order(x[x >
        0.2], decreasing = TRUE)]), paste, collapse = "")
    consensus2 = gsub("-", "", paste0(substring(consensus, 1, 1), collapse = ""),
        fixed = TRUE)
    number_of_SNP = sum(rowSums(CM > 0) > 1)
    SNP_positions = which(rowSums(CM > 0) > 1)
    number_of_position_with_indels = sum(colSums(do.call(rbind, strsplit(as.character(aln),
        "")) == "-") > 0)
    indel_positions = which(colSums(do.call(rbind, strsplit(as.character(aln), "")) ==
        "-") > 0)
    if (length(SNP_positions) > 0) {
        variable_sites = unlist(c(c(mapply(paste, strsplit((sapply(apply(CMnorm,
            1, function(x) bases[which(x > 0.2)]), paste, collapse = ""))[SNP_positions],
            ""), SNP_positions, sep = "_")), paste("-", indel_positions, sep = "_")))
    } else {
        variable_sites = NULL
    }
    variable_positions = unique(SNP_positions, indel_positions)
    return(list(aln = aln, CM = CM, CMnorm = CMnorm, consensus = consensus, consensus2 = consensus2,
        number_of_SNP = number_of_SNP, SNP_positions = SNP_positions, number_of_position_with_indels = number_of_position_with_indels,
        indel_positions = indel_positions, variable_positions = variable_positions,
        variable_sites = variable_sites, gaps = gaps))
}


estimate_monomer = function(G, weights = NULL, limit = NULL) {
    if (is.null(G)) {
        return(NULL)
    }
    ## estimate monomer from kmer based graph
    V(G)$id = 1:vcount(G)
    GS = induced_subgraph(G, vids = which(degree(G) == 2)) ## keep only vertices without branching
    cls = clusters(GS)$membership

    ids = mapply(FUN = function(x, y) x[which.max(y)], split(V(GS)$id, cls), split(V(GS)$size,
        cls)) ## from each branch use only the one vertex with the largest size!

    ids = ids[order(V(G)$size[ids], decreasing = TRUE)]
    ids_size = V(G)$size[ids]
    N50 = sum(cumsum(ids_size)/sum(ids_size) < 0.5)
    if (length(ids) > 10000) {
        ids = ids[1:N50]
    }
    ## use only large vertices in the search! how many?
    el = get.edgelist(G, names = FALSE)
    node_use = numeric(vcount(G))
    LL = numeric()
    ## W = numeric()
    i = 0
    paths = list()
    if (is.null(weights)) {
        weights = (max(E(G)$weight) - (E(G)$weight) + 1)
        weights = E(G)$weight^(-3)
    }
    included = rep(FALSE, vcount(G))
    W_total = sum(V(G)$size)

    coverage = numeric()
    t0 = c()
    i = 0
    j = 0
    for (n in ids) {
        j = j + 1
        t0[j] = Sys.time()
        if (included[n]) {
            next
        }
        m = which(el[, 1] %in% n)
        i = i + 1
        s = get.shortest.paths(G, el[m, 2], el[m, 1], weights = weights, output = "vpath")
        included[as.numeric(s$vpath[[1]])] = TRUE
        paths[[i]] = s$vpath[[1]]
        LL[i] = (length(s$vpath[[1]]))
    }

    ## evaluate if paths should be divided into variants - by length and path weight
    paths_clusters = split(paths, LL)
    paths_clusters_tr = mclapply(paths_clusters, FUN = align_paths, G = G, mc.cores = getOption("CPU"))

    ## paths_clusters_tr = lapply(paths_clusters, FUN = align_paths, G = G) consensus
    paths_consensus = mclapply(paths_clusters_tr, make_consensus, G = G, mc.cores = getOption("CPU"))

    ## evaluate weight for individual paths:
    for (v in seq_along(paths_consensus)) {
        p = paths_clusters_tr[[v]]$tr_paths
        ## clean
        p = p[!sapply(p, function(x) anyNA(x) | is.null(x))]
        L = sapply(p, length)
        p_groups = split(p, L)
        w_groups = sapply(p_groups, function(x) sum(V(G)$size[unique(c(sapply(x,
            as.numeric)))]))
        total_score = sum(V(G)$size[unique(c(unlist(sapply(p, as.numeric))))])
        LW = data.frame(`Length estimate` = unique(L), weight = w_groups, stringsAsFactors = FALSE)
        LW = LW[order(LW$weight, decreasing = TRUE), ]
        rownames(LW) = NULL
        paths_consensus[[v]]$total_score = total_score
        paths_consensus[[v]]$length_variant_score = LW
    }
    return(list(estimates = paths_consensus, paths = paths_clusters_tr))
}



detect_pbs = function(dimers_file, tRNA_database_path, reads_file, output) {
    ## reads_file contains oriented reads!
    min_offset = 10
    max_end_dist = 2
    min_aln_length = 30
    max_pbs_search = 30
    thr_length = 12
    end_position = 23
    insertion_proportion_threshold <- 0.15
    ## for testing
    if (FALSE) {
        library(Biostrings)
        thr_length = 12
        end_position = 23
        dimers_file = "consensus_dimer.fasta"
        reads_file = "reads_oriented.fas"
        tRNA_database_path = "/mnt/raid/users/petr/workspace/repex_tarean/databases/tRNA_database2.fasta"
    }
    ## find reads which are half-aligned to the reference dimer
    dimers_seq = readDNAStringSet(dimers_file)
    cmd = paste("makeblastdb -in", dimers_file, "-dbtype nucl")
    system(cmd, ignore.stdout = TRUE)
    output = paste0(reads_file, "blast_out.csv")
    columns = c("qseqid", "qstart", "qend", "qlen", "sseqid", "sstart", "send", "sstrand",
        "slen", "pident", "length")
    cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "), "' -perc_identity 90 -query ",
        reads_file, "-db", dimers_file, "-word_size", 7, "-dust no -num_alignments 99999 -strand plus ",
        "-out", output)
    system(cmd)
    blastdf = read.table(output, as.is = TRUE, col.names = columns, comment.char = "")
    unlink(output)
    blastdf = blastdf[blastdf$length >= min_aln_length, ]

    ## expand to the whole read to see unaligned read parts
    blastdf_expand = blastdf
    blastdf_expand$qstart = blastdf$qstart - (blastdf$qstart - 1)
    blastdf_expand$sstart = blastdf$sstart - (blastdf$qstart - 1)
    blastdf_expand$qend = blastdf$qend + (blastdf$qlen - blastdf$qend)
    blastdf_expand$send = blastdf$send + (blastdf$qlen - blastdf$qend)
    pS = blastdf$sstart
    pE = blastdf$send
    pSF = blastdf_expand$sstart
    pEF = blastdf_expand$send
    cond1 = pS - pSF >= min_offset & blastdf$qend >= (blastdf$qlen - max_end_dist) &
        blastdf$sstart >= max_pbs_search
    cond2 = pEF - pE >= min_offset & blastdf$qstart <= max_end_dist & blastdf$send <=
        blastdf$slen - max_pbs_search

    ## coverage of alignments: evaluate coverage at sites with breaks; it is necessary
    ## that there are at least 20% interrupted alignments at a given position for it
    ## to be considered in the additional search
    coverage_profiles = subject_coverage(blastdf)

    ## extract flanking sequences - ca. 50 nt, they should contain the tRNA sequence
    ## search Left
    fin = tempfile()
    fout = tempfile()
    scoreL = scoreR = 0

    ## left side unique insertion sites
    if (any(cond1)) {
        insertion_sites = ddply(blastdf[cond1, ], .(sseqid, sstart), nrow)
        ## check coverage
        insertion_sites$coverage = 0
        for (i in 1:nrow(insertion_sites)) {
            insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$sstart[[i]]]
        }
        insertion_OK_left <- insertion_sites[with(insertion_sites, V1/coverage >
            insertion_proportion_threshold), ]

        if (nrow(insertion_OK_left) > 0) {
            s = ifelse(insertion_OK_left[, "sstart"] - max_pbs_search < 1, 1, insertion_OK_left[,
                "sstart"] - max_pbs_search)
            Lpart = subseq(dimers_seq[match(insertion_OK_left$sseqid, names(dimers_seq))],
                s, insertion_OK_left$sstart)

            ## names are CONSENSUSID__READID_position
            names(Lpart) = paste0(names(Lpart), "__", insertion_OK_left$V1, "__",
                insertion_OK_left$sstart)
            ## check presence of the TG dinucleotide
            TG = vcountPattern("TG", subseq(dimers_seq[match(insertion_OK_left$sseqid,
                names(dimers_seq))], insertion_OK_left$sstart - 2, insertion_OK_left$sstart +
                2))
            if (any(TG > 0)) {
                writeXStringSet(Lpart[TG > 0], filepath = fin)
                cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "),
                    "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path,
                    "-word_size", 7, "-strand plus -dust no -max_target_seqs 10000 -evalue 100",
                    "-out", fout)

                system(cmd, ignore.stdout = TRUE)
                df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
                filter1 = df$length >= thr_length
                filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
                df_pass = df[filter1 & filter2, , drop = FALSE]
                df_pass_L = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
                scoreL = get_score(df_pass_L)
                write.table(df_pass_L, file = paste0(output, "_L.csv"), row.names = FALSE,
                    sep = "\t")
            }
        }
    }
    if (any(cond2)) {
        ## search Right
        insertion_sites = ddply(blastdf[cond2, ], .(sseqid, send, slen), nrow)
        ## check coverage
        insertion_sites$coverage = 0
        for (i in 1:nrow(insertion_sites)) {
            insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$send[[i]]]
        }
        insertion_OK_right <- insertion_sites[with(insertion_sites, V1/coverage >
            insertion_proportion_threshold), ]

        if (nrow(insertion_OK_right) > 0) {
            s = ifelse(insertion_OK_right$send + max_pbs_search > insertion_OK_right$slen,
                insertion_OK_right$slen, insertion_OK_right$send + max_pbs_search)
            Rpart = subseq(dimers_seq[match(insertion_OK_right$sseqid, names(dimers_seq))],
                insertion_OK_right$send, s)
            names(Rpart) = paste0(names(Rpart), "__", insertion_OK_right$V1, "__",
                insertion_OK_right$send)

            ## check presence of the CA dinucleotide
            CA = vcountPattern("CA", subseq(dimers_seq[match(insertion_OK_right$sseqid,
                names(dimers_seq))], insertion_OK_right$send - 2, insertion_OK_right$send +
                2))
            if (any(CA > 0)) {
                writeXStringSet(Rpart[CA > 0], filepath = fin)
                cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "),
                    "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path,
                    "-word_size", 7, "-strand minus -dust no -max_target_seqs 10000 -evalue 100",
                    "-out", fout)

                system(cmd, ignore.stdout = TRUE)
                df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
                filter1 = df$length >= thr_length
                filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
                df_pass = df[filter1 & filter2, , drop = FALSE]
                df_pass_R = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
                write.table(df_pass_R, file = paste0(output, "_R.csv"), row.names = FALSE,
                    sep = "\t")
                scoreR = get_score(df_pass_R)
            }
        }
    }
    unlink(fin)
    unlink(fout)
    return(max(scoreL, scoreR))
}


subject_coverage = function(blastdf) {
    ## calculate coverage for all blast subjects
    coverage_profiles <- by(blastdf, INDICES = blastdf$sseqid, FUN = function(x) {
        as.numeric(coverage(IRanges(start = x$sstart, end = x$send)))
    })
    return(coverage_profiles)
}
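
## Illustrative sketch only: per-position coverage from a toy blast table;
## only the sseqid/sstart/send columns are used and all values are invented.
if (FALSE) {
    toy = data.frame(sseqid = c("c1", "c1", "c2"),
                     sstart = c(1, 5, 2), send = c(10, 15, 8),
                     stringsAsFactors = FALSE)
    cp = subject_coverage(toy)
    cp[["c1"]]  # coverage at positions 1..15: 1,1,1,1,2,2,2,2,2,2,1,1,1,1,1
}
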
params = " -windowsize 30 -threshold 45 " + } + library(Biostrings) + if (class(seq1) != "DNAStringSet") { + seq1 = DNAStringSet(seq1) + } + if (class(seq2) != "DNAStringSet") { + seq2 = DNAStringSet(seq2) + } + L1 = nchar(seq1) + L2 = nchar(seq2) + + tmpdat1 = tempfile() + + dir.create(tmpdat1) + tmpdat2 = tempfile() + dir.create(tmpdat2) + oridir = getwd() + + + seq1_merged = DNAStringSet(paste(seq1, collapse = "")) + seq2_merged = DNAStringSet(paste(seq2, collapse = "")) + seq2rc_merged = reverseComplement(DNAStringSet(paste(seq2, collapse = ""))) + + + sf1 = tempfile("seq1") + writeXStringSet(seq1_merged, filepath = sf1) + sf2 = tempfile("seq2") + sf2rc = tempfile("seq2rc") + writeXStringSet(seq2_merged, filepath = sf2) + writeXStringSet(seq2rc_merged, filepath = sf2rc) + + cmd1 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2, params) + cmd2 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2rc, + params) + setwd(tmpdat1) + output1 = system(cmd1, intern = TRUE) + setwd(tmpdat2) + output2 = system(cmd2, intern = TRUE) + setwd(oridir) + + + + fout1 = strsplit(tail(output1, n = 1), split = " ")[[1]][2] + rawdat1 = readLines(paste(tmpdat1, "/", fout1, sep = "")) + rawdat1 = rawdat1[1:min(grep("^Rectangle", rawdat1))] + + if (length(rawdat1[grep("^Line", rawdat1)]) == 0) { + coord1 = NULL + } else { + coord1 = apply(sapply(strsplit(rawdat1[grep("^Line", rawdat1)], " "), "[", + c(3, 5, 7, 9, 11)), 1, as.numeric) + coord1 = matrix(coord1, ncol = 5) + } + + fout2 = strsplit(tail(output2, n = 1), split = " ")[[1]][2] + rawdat2 = readLines(paste(tmpdat2, "/", fout2, sep = "")) + rawdat2 = rawdat2[1:min(grep("^Rectangle", rawdat2))] + + if (length(rawdat2[grep("^Line", rawdat2)]) == 0) { + coord2 = NULL + } else { + coord2 = apply(sapply(strsplit(rawdat2[grep("^Line", rawdat2)], " "), "[", + c(3, 5, 7, 9, 11)), 1, as.numeric) + coord2 = matrix(coord2, ncol = 5) + } + unlink(sf1) + unlink(sf2) + unlink(sf2rc) + unlink(tmpdat1, recursive = TRUE) + unlink(tmpdat2, recursive = TRUE) + + N1 = sum(nchar(seq1)) + N2 = sum(nchar(seq2)) + op = par(xaxs = "i", yaxs = "i", mar = c(5, 2, 6, 10), las = 1) + on.exit(par(op)) + plot(c(1, N1), c(1, N2), type = "n", xlab = "", ylab = "", axes = FALSE) + if (!is.null(coord1)) { + segments(x0 = coord1[, 1], y0 = coord1[, 2], x1 = coord1[, 3], y1 = coord1[, + 4]) + } + if (!is.null(coord2)) { + segments(x0 = coord2[, 1], y0 = N2 - coord2[, 2], x1 = coord2[, 3], y1 = N2 - + coord2[, 4]) + } + abline(v = c(0, cumsum(L1)), col = "green") + abline(h = c(0, cumsum(L2)), col = "green") + box() + axis(1, at = c(1, cumsum(L1))[-(length(L1) + 1)], labels = names(seq1), hadj = 0, + cex.axis = 0.7) + axis(4, at = c(1, cumsum(L2))[-(length(L2) + 1)], labels = names(seq2), hadj = 0, + cex.axis = 0.7) + invisible(list(coord1, coord2)) +} + + +CM2sequence = function(x) { + bases = c(colnames(x), "N") + x = cbind(x, 0) + colnames(x) = bases + seqs = paste(bases[apply(x, 1, which.max)], collapse = "") + + return(seqs) +} + +## in alingment calculate significance from kmers +calculate_consensus_matrix = function(aln, tr_paths, G) { + bases = c("A", "C", "G", "T", "-") + positions = lapply(strsplit(as.character(aln), split = ""), function(x) which(!x %in% + "-")) + base_matrix = do.call(rbind, strsplit(as.character(aln), split = "")) + weights = matrix(0, nrow = length(aln), ncol = nchar(aln)) + kmer_rel_proportions = (V(G)$size/table(factor(unlist(tr_paths), levels = 1:vcount(G)))) + for (i in seq_along(positions)) { + weights[i, positions[[i]]] 
## in the alignment, calculate significance from kmers
calculate_consensus_matrix = function(aln, tr_paths, G) {
    bases = c("A", "C", "G", "T", "-")
    positions = lapply(strsplit(as.character(aln), split = ""), function(x) which(!x %in%
        "-"))
    base_matrix = do.call(rbind, strsplit(as.character(aln), split = ""))
    weights = matrix(0, nrow = length(aln), ncol = nchar(aln))
    kmer_rel_proportions = (V(G)$size/table(factor(unlist(tr_paths), levels = 1:vcount(G))))
    for (i in seq_along(positions)) {
        weights[i, positions[[i]]] = kmer_rel_proportions[tr_paths[[i]]]
    }
    names(kmer_rel_proportions) = V(G)$name
    ## get weights for gaps by approximation
    fitgaps = function(y) {
        if (sum(y == 0) == 0) {
            return(y)
        } else {
            y0 = rep(y[y != 0], 3)
            x0 = which(rep(y, 3) != 0)
            fitted = approx(x0, y0, xout = seq_along(rep(y, 3)), rule = 1)
        }
        return(fitted$y[-seq_along(y)][seq_along(y)])
    }
    weights_with_gaps = t(apply(weights, 1, fitgaps))
    ## weights_with_gaps = get_gap_weights(weights, aln, G) ## this step takes way too
    ## long!!! - not so important; TODO: handle gaps differently - more efficiently

    ## get the consensus matrix
    CM = sapply(1:nchar(aln[[1]]), function(i) {
        sapply(split(weights_with_gaps[, i], factor(base_matrix[, i], levels = bases)),
            sum)
    })
    return(t(CM)[, 1:5])
}


get_gaps_from_alignment = function(aln) {
    gaps_positions = unique(do.call(rbind, str_locate_all(as.character(aln), "-+")))
    return(gaps_positions)
}

plot_kmer_graph = function(G, L = NULL, vertex.size = NULL, ord = NULL, upto = NULL,
    highlight = NULL) {
    if (!is.null(G$L) & is.null(L)) {
        L = G$L
    }
    if (is.null(L)) {
        ## L = layout.kamada.kawai(G)
        L = OGDFlayout(G)[[1]]
    }
    clr = kmers2color(V(G)$name)
    if (!is.null(highlight)) {
        clr[highlight] = "#00000080"
    }
    if (!is.null(ord)) {
        clr[ord[1:upto]] = paste(clr[ord[1:upto]], "30", sep = "")
    }
    if (is.null(vertex.size)) {
        vertex.size = rescale(V(G)$size, to = c(0.5, 6))
    }
    plot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE,
        vertex.color = clr, vertex.frame.color = "#00000020", edge.width = 2, edge.arrow.mode = 1,
        edge.arrow.size = 0.2)
}

rglplot_kmer_graph = function(G, L = NULL, vertex.size = 4) {
    if (is.null(L)) {
        L = layout.kamada.kawai(G)
    }
    rglplot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE,
        vertex.color = kmers2color(V(G)$name), edge.width = 2, edge.arrow.mode = 1,
        edge.arrow.size = 0.5)
}




mafft = function(seqs, params = "--auto --thread 1 ") {
    if (length(seqs) < 2) {
        return(seqs)
    }
    infile = tempfile()
    if (class(seqs) == "character") {
        seqs = DNAStringSet(seqs)
    }
    writeXStringSet(seqs, filepath = infile)
    outfile = tempfile()
    cmd = paste("mafft --quiet --nuc ", params, infile, "2> /dev/null > ", outfile)
    system(cmd, intern = TRUE, ignore.stderr = FALSE)
    aln = readDNAStringSet(outfile)
    unlink(c(outfile, infile))
    return(aln)
}
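
## Illustrative sketch only: aligning a few invented sequences; requires the
## mafft binary on the PATH, as the pipeline itself does.
if (FALSE) {
    aln = mafft(DNAStringSet(c(s1 = "ACGTACGT", s2 = "ACGTTACGT", s3 = "ACGACGT")))
    as.character(aln)
}
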
mgblast = function(databaseSeq, querySeq) {
    params = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -C 30 -D 30 "
    ## no dust filtering:
    paramsDF = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -F F"

    database = tempfile()
    query = tempfile()
    output = tempfile()
    if (class(databaseSeq) == "character") {
        database = databaseSeq
        do_not_delete_database = TRUE
    } else {
        writeXStringSet(databaseSeq, filepath = database, format = "fasta")
        do_not_delete_database = FALSE
    }
    if (class(querySeq) == "character") {
        query = querySeq
        do_not_delete_query = TRUE
    } else {
        writeXStringSet(querySeq, filepath = query, format = "fasta")
        do_not_delete_query = FALSE
    }
    ## create database:
    cmd = paste("formatdb -i", database, "-p F")
    system(cmd)
    cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, params)
    system(cmd)
    if (file.info(output)$size == 0) {
        ## no hits, try without the dust masker
        cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, paramsDF)
        system(cmd)
    }
    blastOut = read.table(output, sep = "\t", header = FALSE, as.is = TRUE, comment.char = "")
    unlink(output)
    if (!do_not_delete_query) {
        unlink(query)
    }
    if (!do_not_delete_database) {
        unlink(paste(database, "*", sep = ""))
    }
    colnames(blastOut) = c("query", "q.length", "q.start", "q.end", "subject", "s.length",
        "s.start", "s.end", "pid", "weight", "e.value", "strand")
    blastOut
}


estimate_sample_size = function(NV, NE, maxv, maxe) {
    ## graph density
    d = (2 * NE)/(NV * (NV - 1))
    eEst = (maxv * (maxv - 1) * d)/2
    nEst = (d + sqrt(d^2 + 8 * d * maxe))/(2 * d)
    if (eEst >= maxe) {
        N = round(nEst)
        E = round((N * (N - 1) * d)/2)
    }
    if (nEst >= maxv) {
        N = maxv
        E = round((N * (N - 1) * d)/2)
    }
    return(N)
}
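
## Worked example (invented numbers): for 1e5 reads with 4e9 similarity hits,
## the density is d = 2*NE/(NV*(NV-1)) ~ 0.8; the expected edge count at
## maxv = 40000 vertices exceeds maxe = 5e8, so the sample size is the N
## solving d*N*(N-1)/2 = maxe, about 35000 reads.
if (FALSE) {
    estimate_sample_size(NV = 1e+05, NE = 4e+09, maxv = 40000, maxe = 5e+08)  # ~35356
}
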
mgblast2graph = function(blastfile, seqfile, graph_destination, directed_graph_destination,
    oriented_sequences, paired = TRUE, repex = FALSE, image_file = NULL, image_file_tmb = NULL,
    include_layout = TRUE, pair_completeness = NULL, satellite_model_path = NULL,
    maxv = 40000, maxe = 5e+08, seqfile_full = seqfile) {
    cat("loading blast results\n")
    if (repex) {
        cln = c("query", "subject", "weight", "q.length", "q.start", "q.end", "s.length", "s.start",
            "s.end", "sign")
        colcls = c("character", "character", "numeric", "numeric", "numeric", "numeric",
            "numeric", "numeric", "numeric", "NULL", "NULL", "character")
    } else {
        cln = c("query", "q.length", "q.start", "q.end", "subject", "s.length", "s.start",
            "s.end", "weight", "sign")
        colcls = c("character", "numeric", "numeric", "numeric", "character", "numeric",
            "numeric", "numeric", "NULL", "numeric", "NULL", "character")
    }
    if (class(blastfile) == "data.frame") {
        blastTable = blastfile
        colnames(blastTable)[12] = "sign"
    } else {
        blastTable = read.table(blastfile, sep = "\t", as.is = TRUE, header = FALSE,
            colClasses = colcls, comment.char = "")
        colnames(blastTable) = cln
    }
    ## check for duplicates!
    key = with(blastTable, ifelse(query > subject, paste(query, subject), paste(subject,
        query)))
    if (any(duplicated(key))) {
        blastTable = blastTable[!duplicated(key), ]
    }
    seqs = readDNAStringSet(seqfile)
    ## calculate pair completeness for reads before sampling
    if (is.null(pair_completeness)) {
        if (paired) {
            if (seqfile_full != seqfile) {
                seqs_full = readDNAStringSet(seqfile_full)
                pair_counts = tabulate(table(gsub(".$", "", names(seqs_full))))
                rm(seqs_full)
            } else {
                pair_counts = tabulate(table(gsub(".$", "", names(seqs))))
            }
            pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
        } else {
            pair_completeness = 0
        }
    }
    NV = length(seqs)      # vertices
    NE = nrow(blastTable)  # edges
    if (maxv < NV | maxe < NE) {
        ## sample if the graph is large
        V_sample_size = estimate_sample_size(NV, NE, maxv, maxe)
        seqs = sample(seqs, V_sample_size)
        blastTable = blastTable[blastTable$query %in% names(seqs) & blastTable$subject %in%
            names(seqs), ]
    }

    blastTable$sign = ifelse(blastTable$sign == "+", 1, -1)
    vnames = unique(c(blastTable$query, blastTable$subject))
    vindex = seq_along(vnames)
    names(vindex) = vnames
    ## correct orientation
    cat("creating graph\n")
    G = graph.data.frame(blastTable[, c("query", "subject", "weight", "sign")], directed = FALSE,
        vertices = vnames)
    if (include_layout) {
        ## save the temporarily modified blastTable if large
        tmpB = tempfile()
        save(blastTable, file = tmpB)
        rm(blastTable)
        cat("graph layout calculation ")
        if (ecount(G) > 2e+06) {
            cat("using fruchterman reingold\n")
            L = layout.fruchterman.reingold(G, dim = 3)
        } else {
            cat("using OGDF & fruchterman reingold\n")
            Ltmp = OGDFlayout(G, alg = c("fmmm"))
            L = cbind(Ltmp[[1]][, 1:2], layout.fruchterman.reingold(G, dim = 2))
        }

        GL = list(G = G, L = L)
        save(GL, file = graph_destination)
        if (!is.null(image_file)) {
            cat("exporting graph figure")
            png(image_file, width = 900, height = 900, pointsize = 20)
            plot(GL$G, layout = GL$L[, 1:2], vertex.size = 2, vertex.color = "#000000A0",
                edge.color = "#00000040", vertex.shape = "circle", edge.curved = FALSE,
                vertex.label = NA, vertex.frame.color = NA, edge.width = 1)
            dev.off()
            ## create thumbnails:
            system(paste("convert ", image_file, " -resize 100x100 ", image_file_tmb))
        }
        rm(GL)
        gc()
        load(tmpB)
        unlink(tmpB)
    }

    if (!is.connected(G)) {
        cat("Warning - graph is not connected\n")
        cc = clusters(G, "weak")
        mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)),
            decreasing = TRUE)))))
        selvi = which(mb == 1)  # largest component
        cat("using largest component: \nsize =", round(length(selvi)/vcount(G) *
            100, 1), "% ;", length(selvi), "reads", "\n")
        G = induced.subgraph(G, vids = selvi)
        blastTable = blastTable[blastTable$query %in% V(G)$name & blastTable$subject %in%
            V(G)$name, ]
    }

    Gmst = minimum.spanning.tree(G)
    ## create alternative trees - for the case that only a suboptimal solution is found
    set.seed(123)
    Gmst_alt = list()
    Gmst_alt[[1]] = Gmst
    for (i in 2:6) {
        E(G)$weight = runif(ecount(G), 0.1, 1)
        Gmst_alt[[i]] = minimum.spanning.tree(G)
    }

    rm(G)
    gc()
    ## six attempts to reorient reads
    flip_names_all = list()
    prop_of_notfit = numeric()
    thr_fit = 0.001
    for (ii in 1:6) {
        Gmst = Gmst_alt[[ii]]

        blastTable_mst = as.data.frame(get.edgelist(Gmst, names = TRUE))
        colnames(blastTable_mst) = c("query", "subject")
        blastTable_mst$sign = E(Gmst)$sign

        d = dfs(Gmst, root = 1, father = TRUE, order.out = TRUE)
        esign = E(Gmst)$sign
        rc = rep(FALSE, vcount(Gmst))
        j = 0
        p = 0
        for (v in d$order[-1]) {
            j = j + 1
            f = as.numeric(d$father)[v]
            if (is.na(f)) {
                next
            }
            eid = get.edge.ids(Gmst, c(v, f))
            if (esign[eid] == -1) {
                ie = unique(c(which(blastTable_mst$query %in% V(Gmst)$name[v]), which(blastTable_mst$subject %in%
                    V(Gmst)$name[v])))  # incident edges
                esign[ie] = esign[ie] * -1
                rc[v] = TRUE
                p = p + 1
                if (p > 50) {
                    p = 0
                    cat("\r")
                    cat(j, " : ", sum(esign)/length(esign))
                }
            }
        }
        cat("\t")
        flip_names_all[[ii]] = flip_names = V(Gmst)$name[rc]

        if (!exists("blastTable")) {
            load(tmpB)
            unlink(tmpB)
        }
        ## add nofit later!!
        s.mean = with(blastTable, (s.start + s.end)/2)
        s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length -
            s.mean, s.mean)
        q.mean = with(blastTable, (q.start + q.end)/2)
        q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length -
            q.mean, q.mean)
        V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
        V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
        sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1,
            1) * blastTable$sign
        nofit = unique(c(V1[sign_final == -1], V2[sign_final == -1]))
        prop_of_notfit[ii] = length(nofit)/vcount(Gmst)
        ## check if correctly oriented
        cat("prop notfit", prop_of_notfit[[ii]], "\n")
        if (prop_of_notfit[[ii]] < thr_fit) {
            ## OK
            break
        }
    }
    if (!prop_of_notfit[[ii]] < thr_fit) {
        ## get the best solution
        ii_best = which.min(prop_of_notfit)
        if (ii != ii_best) {  # if the last solution was not the best, take the best
            flip_names = flip_names_all[[ii_best]]
            s.mean = with(blastTable, (s.start + s.end)/2)
            s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length -
                s.mean, s.mean)
            q.mean = with(blastTable, (q.start + q.end)/2)
            q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length -
                q.mean, q.mean)
            V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
            V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
            sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1,
                1) * blastTable$sign
            nofit = unique(c(V1[sign_final == -1], V2[sign_final == -1]))
        }
    }

    ## exclude all nofit
    df2 = data.frame(V1, V2, sign = blastTable$sign, sign_final = sign_final, stringsAsFactors = FALSE)
    rm(blastTable)
    gc()
    vertices = data.frame(name = vnames, reverse_complement = vnames %in% flip_names)
    G = graph.data.frame(df2, vertices = vertices)
    vcount_ori = vcount(G)
    G = induced.subgraph(G, vids = which(!V(G)$name %in% nofit))

    G$escore_mst = sum(esign)/length(esign)
    G$escore = sum(sign_final == 1)/length(sign_final)
    cc = clusters(G, "strong")
    mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)),
        decreasing = TRUE)))))
    names(mb) = V(G)$name
    G$loop_index = max(cc$csize)/vcount(G)
    G$coverage = vcount(G)/vcount_ori
    ## check sign of all edges
    V(G)$membership = mb
    save(G, file = directed_graph_destination)
    ## remove nofit
    seqs = seqs[!names(seqs) %in% nofit]
    seqs[names(seqs) %in% flip_names] = reverseComplement(seqs[names(seqs) %in% flip_names])
    names(seqs) = paste(names(seqs), mb[names(seqs)])
    writeXStringSet(seqs, filepath = oriented_sequences)
    ## calculate satellite probability
    if (is.null(satellite_model_path)) {
        pSAT = isSAT = NULL
    } else {
        satellite_model = readRDS(satellite_model_path)
        pSAT = get_prob(G$loop_index, pair_completeness, model = satellite_model)
        isSAT = isSatellite(G$loop_index, pair_completeness, model = satellite_model)
    }

    ## get the largest cc
    output = list(escore = G$escore, coverage = G$coverage, escore_mts = G$escore_mst,
        loop_index = G$loop_index, pair_completeness = pair_completeness, graph_file = graph_destination,
        oriented_sequences = oriented_sequences, vcount = vcount(G), ecount = ecount(G),
        satellite_probability = pSAT, satellite = isSAT)
    ## clean up
    all_objects = ls()
    do_not_remove = "output"
    rm(list = all_objects[!(all_objects %in% do_not_remove)])
    gc(verbose = FALSE, reset = TRUE)
    return(list2dictionary(output))
}

list2dictionary = function(l) {
    dict = "{"
    q = '"'
    for (i in 1:length(l)) {
        if (class(l[[i]]) == "character" | is.null(l[[i]])) {
            q2 = "'''"
        } else {
            q2 = ''
        }
        dict = paste0(
            dict,
            q, names(l)[i], q, ":", q2, l[[i]], q2, ", "
        )
    }
    dict = paste0(dict, "}")
    return(dict)
}

wrap = Vectorize(function(s, width = 80) {
    i1 = seq(1, nchar(s), width)
    i2 = seq(width, by = width, length.out = length(i1))
    return(paste(substring(s, i1, i2), collapse = "\n"))
})
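
## Illustrative sketch only: list2dictionary serializes a result list into a
## python-like dict string for the calling pipeline, and wrap breaks a long
## consensus string for HTML display.
if (FALSE) {
    list2dictionary(list(score = 0.95, name = "CL1"))
    ## -> {"score":0.95, "name":'''CL1''', }
    cat(wrap("ACGTACGT", width = 4))  # two lines of 4 bases
}
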

tarean = function(input_sequences, output_dir, min_kmer_length = 11, max_kmer_length = 27,
    CPU = 2, sample_size = 10000, reorient_reads = TRUE, tRNA_database_path = NULL,
    include_layout = TRUE, paired = TRUE, lock_file = NULL) {
    options(CPU = CPU)
    time0 = Sys.time()
    dir.create(output_dir)
    input_sequences_copy = paste(output_dir, "/", basename(input_sequences), sep = "")

    if (!file.copy(input_sequences, input_sequences_copy, overwrite = TRUE)) {
        cat(paste("cannot copy", input_sequences, "to", output_dir), "\n")
        stop()
    }

    lock_file = waitForRAM(lock_file = lock_file)
    pair_completeness = NULL
    ## sampling
    if (sample_size != 0) {
        s = readDNAStringSet(input_sequences_copy)
        N = length(s)
        ## pair completeness must be calculated before sampling!
        if (N > sample_size) {
            writeXStringSet(sample(s, sample_size), filepath = input_sequences_copy)
            if (paired) {
                pair_counts = tabulate(table(gsub(".$", "", names(s))))
                pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
            }
        }
        rm(s)
    }

    if (reorient_reads) {
        input_sequences_oriented = paste0(input_sequences_copy, "oriented.fasta")
        graph_file = paste0(input_sequences_copy, "_graph.RData")
        GLfile = paste0(input_sequences_copy, "_graph.GL")
        cat("reorienting sequences\n")

        blastTable = mgblast(input_sequences_copy, input_sequences_copy)

        graph_info = mgblast2graph(blastTable, input_sequences_copy, GLfile, graph_file,
            input_sequences_oriented, include_layout = include_layout, paired = paired,
            pair_completeness = pair_completeness)

        ## interrupt if it does not look like a tandem repeat at all # soft threshold!
        if (is.null(graph_info$pair_completeness)) {
            if (graph_info$loop_index <= 0.4) {
                cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                return(list2dictionary(list(graph_info = graph_info)))
            }
        } else {
            ## for paired reads:
            if (graph_info$loop_index < 0.7 | graph_info$pair_completeness < 0.4) {
                cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                return(list2dictionary(list(graph_info = graph_info)))
            }
        }
    } else {
        input_sequences_oriented = input_sequences_copy
        graph_info = NULL
    }

    kmer_counts = list()
    kmer_lengths = seq(min_kmer_length, max_kmer_length, by = 4)
    for (i in kmer_lengths) {
        ## use the python function - faster
        cmd = paste(script.dir, "/kmer_counting.py ", input_sequences_oriented, " ",
            i, sep = "")
        cat("calculating kmers of length ", i, "\n")
        f = system(cmd, intern = TRUE)
        x = read.table(f, as.is = TRUE, sep = "\t")
        kmer_counts[[as.character(i)]] = data.frame(x, freq = x$V2/sum(x$V2))
    }

    ## number of kmers:
    N50 = sapply(kmer_counts, function(x) {
        sum(cumsum(x$freq) < 0.5)
    })

    N70 = sapply(kmer_counts, function(x) {
        sum(cumsum(x$freq) < 0.7)
    })

    time1 = Sys.time()
    ggmin = mclapply(kmer_counts, get_mimimal_cc, start = 0.5, mc.cores = CPU)
    time2 = Sys.time()
    cat("kmer graphs created ")
    print(time2 - time1)
    names(ggmin) = names(kmer_counts)

    ## estimate the monomer
    monomers = mclapply(ggmin, estimate_monomer, mc.cores = CPU)

    names(monomers) = names(kmer_counts)
    monomers = monomers[!sapply(monomers, is.null)]
    ## error handling:
    error = sapply(monomers, class) == "try-error"
    if (any(error)) {
        cat("\nError in monomer estimation: ")
        cat("calculation failed for monomer length ", names(monomers)[error], "\n")
        print(monomers[error])
        if (any(!error)) {
            monomers = monomers[!error]
        } else {
            stop("monomer estimation failed")
        }
    }

    ## summary - make this a function!!
    total_score = list()
    k = 0
    length_best = numeric()
    score_bn = numeric()
    consensus = character()

    for (i in names(monomers)) {
        for (v in seq_along(monomers[[i]]$estimates)) {
            k = k + 1
            total_score[[k]] = c(kmer = as.numeric(i), variant = v, total_score = monomers[[i]]$estimates[[v]]$total_score)
            score_bn[k] = min(rowSums(monomers[[i]]$estimates[[v]]$CM))
            length_best[k] = monomers[[i]]$estimates[[v]]$length_variant_score[1,
                1]
            consensus[[k]] = monomers[[i]]$estimates[[v]]$consensus2
        }
    }
    summary_table = as.data.frame(do.call(rbind, total_score))
    summary_table$monomer_length = length_best
    summary_table$consensus_length = nchar(consensus)
    summary_table$score_bn = score_bn
    summary_table$consensus = paste("<pre>", wrap(consensus, width = 80), "</pre>",
        sep = "")
    consensus_DS = DNAStringSet(consensus)
    names(consensus_DS) = with(summary_table, paste0(kmer, "_", variant, "_sc_",
        signif(total_score), "_l_", monomer_length))

    ## filter by score - keep

    ## reorder by score
    consensus_DS = consensus_DS[order(summary_table$total_score, decreasing = TRUE)]
    summary_table = summary_table[order(summary_table$total_score, decreasing = TRUE),
        ]
    rownames(summary_table) = NULL
    N = nrow(summary_table)
    ## concatenate the consensus (i.e. make a head-to-tail dimer) for pbs detection and
    ## other searches; make something like a 'pseudo contig' multimer for mapping -
    ## min length 200 bp
    consensus_DS_dimer = DNAStringSet(paste0(consensus_DS, consensus_DS))
    tarean_contigs = DNAStringSet(sapply(consensus_DS, function(x)
        ifelse(nchar(x) < 200,
               paste(rep(as.character(x), round(300/nchar(as.character(x)) + 1)), collapse = ''),
               as.character(x)))
    )

    names(consensus_DS_dimer) = names(consensus_DS)
    ## save sequences:
    consensus_DS_dimer_file = paste0(output_dir, "/consensus_dimer.fasta")
    consensus_DS_file = paste0(output_dir, "/consensus.fasta")
    tarean_contig_file = paste0(output_dir, "/tarean_contigs.fasta")
    writeXStringSet(consensus_DS, consensus_DS_file)
    writeXStringSet(tarean_contigs, tarean_contig_file)
    writeXStringSet(consensus_DS_dimer, consensus_DS_dimer_file)
    if (is.null(tRNA_database_path)) {
        pbs_score = -1
    } else {
        pbs_blast_table = paste0(output_dir, "/pbs_detection")
        pbs_score = detect_pbs(dimers_file = consensus_DS_dimer_file, tRNA_database_path = tRNA_database_path,
            reads_file = input_sequences_oriented, output = pbs_blast_table)
    }
    ## search for open reading frames; get the length of the longest
    orf_l = max_ORF_length(consensus_DS_dimer)

    dir.create(paste0(output_dir, "/img"), recursive = TRUE)
    summary_table$monomer_length_graph = numeric(N)
    summary_table$graph_image = character(N)
    summary_table$monomer_length_logo = numeric(N)
    summary_table$logo_image = character(N)
    ## export the graph and consensus estimate to the cluster directory; the type of
    ## output may change in the future
    save(ggmin, file = paste0(output_dir, "/ggmin.RData"))
    save(monomers, file = paste0(output_dir, "/monomers.RData"))

    cat("generating HTML output\n")
    for (i in 1:N) {
        kmer = as.character(summary_table$kmer[i])
        variant = summary_table$variant[i]
        ## export graph
        fout_link = paste0("img/graph_", kmer, "mer_", variant, ".png")
        fout = paste0(output_dir, "/", fout_link)
        ## summary_table$monomer_length_graph[i] = summary_table$monomer_length[i]
        ## summary_table$monomer_length_logo[[i]] = nrow(monomers[[kmer]]$estimates[[variant]]$CM)
        summary_table$monomer_length[[i]] = length(monomers[[kmer]]$estimates[[variant]]$consensus)

        if (i <= 10) {
            png(fout, width = 800, height = 800)
            plot_kmer_graph(ggmin[[kmer]], highlight = unlist(monomers[[kmer]]$paths[[variant]]$tr_paths))
            dev.off()
            summary_table$graph_image[i] = hwriteImage(fout_link, link = fout_link,
                table = FALSE, width = 100, height = 100)
            ## export logo
            png_link = paste0("img/logo_", kmer, "mer_", variant, ".png")
            fout = paste0(output_dir, "/", png_link)
            png(fout, width = 1200, height = round(summary_table$monomer_length[i] *
                1) + 550)
            try(plot_multiline_logo(monomers[[kmer]]$estimates[[variant]]$CM, W = 100))
            dev.off()
            ## export the corresponding position probability matrices
            ppm_file = paste0(output_dir, '/ppm_', kmer, "mer_", variant, ".csv")
            ppm_link = paste0('ppm_', kmer, "mer_", variant, ".csv")
            write.table(monomers[[kmer]]$estimates[[variant]]$CM,
                file = ppm_file,
                col.names = TRUE, quote = FALSE,
                row.names = FALSE, sep = "\t")
            summary_table$logo_image[i] = hwriteImage(png_link, link = ppm_link,
                table = FALSE, width = 200, height = 100)
        }
    }

    ## html_report = HTMLReport()

    htmlfile = paste0(output_dir, "/report.html")
    cat(htmlheader, file = htmlfile)
    included_columns = c('kmer', 'variant', 'total_score', 'consensus_length', 'consensus',
        'graph_image', 'logo_image')
    summary_table_clean = summary_table[, included_columns]
    colnames(summary_table_clean) = c('k-mer length',
                                      'Variant index',
                                      'k-mer coverage score',
                                      'Consensus length',
                                      'Consensus sequence',
                                      'k-mer based graph',
                                      'Sequence logo')
    HTML(summary_table_clean, file = htmlfile, sortableDF = TRUE)
    HTMLEndFile(file = htmlfile)
    time4 = Sys.time()
    print(time4 - time0)
    if (!is.null(lock_file)) {
        print("------removing-lock--------")
        removelock(lock_file)
    }

    print(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
        TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
        TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
        orf_l = orf_l, tarean_contig_file = tarean_contig_file))
    return(list2dictionary(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
        TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
        TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
        orf_l = orf_l, tarean_contig_file = tarean_contig_file)))
}


## graph loop index stability
loop_index_instability = function(G) {
    N = 50
    s = seq(vcount(G), vcount(G)/10, length.out = N)
    p = seq(1, 0.1, length.out = N)
    li = numeric()
    for (i in seq_along(s)) {
        print(i)
        gs = induced_subgraph(G, sample(1:vcount(G), s[i]))
        li[i] = max(clusters(gs, "strong")$csize)/vcount(gs)
    }
    instability = lm(li ~ p)$coefficient[2]
    return(instability)
}

isSatellite = function(x, y, model) {
    p = get_prob(x, y, model)
    if (p > model$cutoff) {
        return("Putative Satellite")
    } else {
        return("")
    }
}

get_prob = function(x, y, model) {
    pm = model$prob_matrix
    N = ncol(pm)
    i = round(x * (N - 1)) + 1
    j = round(y * (N - 1)) + 1
    p = pm[i, j]
    return(p)
}


detectMemUsage = function() {
    con = textConnection(gsub(" +", " ", readLines("/proc/meminfo")))
    memInfo = read.table(con, fill = TRUE, row.names = 1)
    close(con)
    memUsage = 1 - (memInfo["MemFree", 1] + memInfo["Cached", 1])/memInfo["MemTotal",
        1]
    return(memUsage)
}


makelock <- function(lockfile, lockmsg, CreateDirectories = TRUE) {
    lockdir = dirname(lockfile)
    if (!file.exists(lockdir)) {
        if (CreateDirectories) dir.create(lockdir, recursive = TRUE)
        else stop("Lock Directory for lockfile ", lockfile, " does not exist")
    }
    if (missing(lockmsg)) lockmsg = paste(system('hostname', intern = TRUE), Sys.getenv("R_SESSION_TMPDIR"))
    if (file.exists(lockfile)) return(FALSE)
    ## note: the use of paste makes the message writing atomic
    cat(paste(lockmsg, "\n", sep = ""), file = lockfile, append = TRUE, sep = "")
    firstline = readLines(lockfile, n = 1)
    if (firstline != lockmsg) {
        ## somebody else got there first
        return(FALSE)
    } else return(TRUE)
}


removelock <- function(lockfile) {
    if (unlink(lockfile) != 0) {
        warning("Unable to remove ", lockfile)
        return(FALSE)
    }
    return(TRUE)
}


waitForRAM = function(p = 0.5, lock_file = NULL) {
    if (detectMemUsage() < p) {
        return(NULL)
    } else {
        ## check the lock file:
        cat("waiting for RAM \n")
        free_count = 0
        while (TRUE) {
            if (makelock(lock_file)) {
                print("---------locking--------")
                return(lock_file)
            }
            if (detectMemUsage() < p) {
                cat("RAM freed \n")
                return(NULL)
            }
            Sys.sleep(5)
            if (evaluate_user_cpu_usage() == 'free') {
                free_count = free_count + 1
            } else {
                free_count = 0
            }
            if (detectMemUsage() < 0.8 & free_count > 100) {
                cat("RAM not free but nothing else is running \n")
                return(NULL)
            }
        }
    }
}
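
## Illustrative sketch only: cooperative locking around a memory-heavy step;
## the lock file path is arbitrary here.
if (FALSE) {
    lf = "/tmp/tarean.lock"
    if (makelock(lf)) {
        ## ... memory-heavy work ...
        removelock(lf)
    }
}
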
lsmem = function() {
    g = globalenv()
    out_all = envs = list()
    envs = append(envs, g)
    total_size = numeric()
    while (environmentName(g) != "R_EmptyEnv") {
        g <- parent.env(g)
        envs = append(envs, g)
    }
    for (e in envs) {
        obj = ls(envir = e)
        if (length(obj) == 0) {
            break
        }
        obj.size = list()
        for (i in obj) {
            obj.size[[i]] = object.size(get(i, envir = e))
        }
        out = data.frame(object = obj, size = unlist(obj.size), stringsAsFactors = FALSE)
        out = out[order(out$size, decreasing = TRUE), ]
        out_all = append(out_all, out)
        total_size = append(total_size, sum(out$size))
    }
    return(list(objects = out_all, total_size = total_size))
}

evaluate_user_cpu_usage = function() {
    user = Sys.info()["user"]
    a = sum(as.numeric(system(paste("ps -e -o %cpu -u", user), intern = TRUE)[-1]))
    s = substring(system(paste("ps -e -o stat -u", user), intern = TRUE)[-1], 1, 1)
    if (a < 5 & sum(s %in% 'D') == 0 & sum(s %in% 'R') < 2) {
        status = 'free'
    } else {
        status = 'full'
    }
    return(status)
}
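
## Illustrative sketch only (Linux-specific): both helpers parse /proc/meminfo
## and ps output, so they work only on Linux hosts.
if (FALSE) {
    detectMemUsage()            # fraction of RAM in use, 0..1
    evaluate_user_cpu_usage()   # "free" or "full"
}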