comparison lib/tarean/methods.R @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1d1b9e1b2e2f
1 #!/user/bin/env Rscript
2
3 suppressPackageStartupMessages(library(igraph))
4 suppressPackageStartupMessages(library(parallel))
5 suppressPackageStartupMessages(library(Biostrings))
6 suppressPackageStartupMessages(library(scales))
7 suppressPackageStartupMessages(library(stringr))
8 suppressPackageStartupMessages(library(hwriter))
9 suppressPackageStartupMessages(library(R2HTML))
10 suppressPackageStartupMessages(library(plyr))
11 suppressPackageStartupMessages(library(dplyr))
12
13 max_ORF_length = function(s) {
14 ## check all frames
15 L = 0
16 for (i in 1:3) {
17 L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(s, i))), "*",
18 fixed = TRUE))))
19 L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(reverseComplement(s),
20 i))), "*", fixed = TRUE))))
21 }
22 return(L)
23 }
24
25 kmers2graph = function(kmers, mode = "strong", prop = NULL) {
26 kmerLength = nchar(kmers[1, 1])
27 if (ncol(kmers) == 2) {
28 kmers$size = kmers[, 2]/sum(kmers[, 2])
29 }
30 colnames(kmers) = c("name", "count", "size")
31 if (!is.null(prop)) { # tohle se nepouziva(prop je null), a je to asi spatne - filtuje se to pred tridenim!!
32 p = cumsum(kmers$size)
33 kmers = kmers[p < prop, ]
34 }
35 N = dim(kmers)[1]
36 kmers = kmers[order(kmers$size), ]
37 ## convert kmers to fasta file
38 kms = data.frame(kmer = substring(kmers$name, 1, kmerLength - 1), ids = 1:nrow(kmers),stringsAsFactors = FALSE)
39 kme = data.frame(kmer = substring(kmers$name, 2), ide = 1:nrow(kmers), stringsAsFactors = FALSE)
40
41 ## df = merge(kms,kme, by = 'kmer',all=FALSE)[,c(2,3,1)]
42 df = inner_join(kme,kms, by = 'kmer')[,c(2,3)]
43
44 ## names(kms) = seq_along(kms)
45 ## kme = substring(kmers$name, 2)
46 ## names(kme) = seq_along(kme)
47 ## ## use new blast!
48 ## database = tempfile()
49 ## query = tempfile()
50 ## output = tempfile()
51 ## writeXStringSet(DNAStringSet(kms), filepath = database, format = "fasta")
52 ## writeXStringSet(DNAStringSet(kme), filepath = query, format = "fasta")
53 ## cmd = paste("makeblastdb -in", database, "-dbtype nucl")
54 ## system(cmd, ignore.stdout = TRUE)
55 ## cmd = paste("blastn -outfmt '6 qseqid sseqid pident' -strand plus -dust no -perc_identity 100 -query ",
56 ## query, "-db", database, "-word_size", kmerLength - 1, "-out", output)
57 ## system(cmd)
58 ## df = try({
59 ## read.table(output, as.is = TRUE)
60 ## })
61 ## if (class(df) == "try-error"){
62 ## print("creation of kmer graph failed")
63 ## print(query)
64 ## print(output)
65 ## print(database)
66 ## return(NULL)
67 ## }
68 ## unlink(query)
69 ## unlink(paste(database, "*", sep = ""))
70 ## unlist(output)
71 gm_mean = function(x, na.rm = TRUE) {
72 exp(sum(log(x[x > 0]), na.rm = na.rm)/length(x))
73 }
74
75 whg = apply(cbind(kmers[df[, 1], 2], V2 = kmers[df[, 2], 2]), 1, gm_mean)
76 G = graph.data.frame(data.frame(V1 = kmers$name[df[, 1]], V2 = kmers$name[df[,
77 2]], weight = whg), vertices = kmers[, 1:3])
78 # separate to connected components:
79 ccs = clusters(G, mode = mode)$membership
80 sel_cls = which(tabulate(ccs) > 1)
81 Gs = list()
82 for (i in seq_along(sel_cls)) {
83 Gs[[i]] = induced.subgraph(G, vids = which(ccs %in% sel_cls[i]))
84 }
85 ## reorder!!!
86 Gs = Gs[order(sapply(Gs, vcount), decreasing = TRUE)]
87 return(Gs)
88 }
89
90
91 OGDFlayout = function(G, ncol = NULL, alg = "fmmm", OGDF = getOption("OGDF")) {
92 ## is ogdf binary available?
93 if (is.null(OGDF)) {
94 OGDF = Sys.getenv("OGDF")
95 if ("" == OGDF) {
96 options(warn = -1)
97 OGDF = system("which runOGDFlayout", intern = TRUE)
98 options(warn = 0)
99 if (length(OGDF) == 0) {
100 cat("path to runOGDFlayout not found\n")
101 return(NULL)
102 }
103
104 }
105 }
106 if (is.null(ncol)) {
107 if (is.null(E(G)$weight)) {
108 el = data.frame(get.edgelist(G, names = TRUE), rep(1, ecount(G)))
109 } else {
110 el = data.frame(get.edgelist(G, names = TRUE), E(G)$weight)
111 }
112 ncol = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout", sep = "")
113 write.table(el, file = ncol, row.names = FALSE, col.names = FALSE, sep = "\t",
114 quote = FALSE)
115 } else {
116 # copy ncol:
117 ncol_tmp = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout",
118 sep = "")
119 file.copy(ncol, ncol_tmp)
120 ncol = ncol_tmp
121 }
122 algopt = c("fmmm", "sm", "fme")
123 if (!(alg %in% c("fmmm", "sm", "fme") && TRUE)) {
124 stop("alg must by :", algopt, "\n")
125
126 }
127
128 # output file:
129 Louts = list()
130 layout_file = tempfile(pattern = as.character(Sys.getpid()))
131 for (i in alg) {
132 cmd = paste(OGDF, "-alg", i, "-iff layout -off layout -out", layout_file,
133 ncol)
134 system(cmd, intern = TRUE)
135 L = read.table(layout_file, skip = ecount(G))
136 L = L[match(V(G)$name, L$V2), ]
137 Lout = as.matrix(L[, -(1:2)])
138 unlink(layout_file)
139 Louts[[i]] = Lout
140
141 }
142 # clean up
143 unlink(ncol)
144 return(Louts)
145 }
146
147 xcolor_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00")
148
149 kmers2color = function(s, position = NULL) {
150 if (is.null(position)) {
151 position = round(nchar(s[1])/2, 0)
152 ## position = 1
153 position = nchar(s[1])
154 }
155 color_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00")
156 color_base = substring(s, position, position)
157 colors = color_code[color_base]
158 names(colors) = color_base
159 return(colors)
160 }
161 get_sequence = function(g, v, position = NULL) {
162 s = V(g)$name
163 if (is.null(position)) {
164 position = round(nchar(s[1])/2, 0)
165 ## position = 1
166 position = nchar(s[1])
167 }
168 nt = paste(substring(s[v], position, position), collapse = "")
169 return(nt)
170 }
171
172
173 get_mimimal_cc = function(km, thr = 20, min_coverage = 0.45, step = 2, start = NULL) {
174 if (is.null(start)) {
175 i = sum(cumsum(km$freq) < 0.5)
176 } else {
177 i = sum(cumsum(km$freq) < start)
178 }
179 continue = TRUE
180 while (continue) {
181 if (i > nrow(km)) {
182 i = nrow(km)
183 continue = FALSE
184 step = 1
185 }
186 GG = kmers2graph(km[1:i, ])
187 if (length(GG) > 0) {
188 if (vcount(GG[[1]]) > thr) {
189 if (sum(V(GG[[1]])$size) >= min_coverage) {
190 GG[[1]]$input_coverage = sum(km$freq[1:i])
191 GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
192 return(GG[[1]])
193 }
194 }
195 }
196 i = round(i * step)
197
198 }
199 if (length(GG) == 0 | is.null(GG)) {
200 return(NULL)
201 }
202
203 GG[[1]]$input_coverage = sum(km$freq[1:i])
204 GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
205 return(GG[[1]])
206 }
207
208
209
210 paths2string = function(paths) {
211 pathstring = sapply(lapply(lapply(paths, as_ids), substring, 1, 1), paste, collapse = "")
212 return(pathstring)
213 }
214
215
216 align_paths = function(paths, G) {
217 shift = rep(NA, length(paths))
218 thr = 0 # minimal length
219 tr_paths = list()
220 Seqs = list()
221 centre_node = as.numeric(names(sort(table(unlist(paths)), decreasing = TRUE)))[[1]]
222
223 for (i in seq_along(paths)) {
224 if (centre_node %in% paths[[i]]) {
225 S = which(paths[[i]] %in% centre_node)
226 shift[i] = S
227 if (S == 1) {
228 tr_paths[[i]] = paths[[i]]
229 } else {
230 tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S -
231 1)])
232 }
233 Seqs[[i]] = get_sequence(G, tr_paths[[i]])
234 } else {
235 shift[i] = NA
236 }
237 }
238 paths_n = lapply(paths, as.numeric)
239 tr_paths_n = do.call(cbind, lapply(tr_paths, as.numeric))
240 new_shift = shift
241 for (i in which(is.na(shift))) {
242 score = numeric(length(paths_n))
243 for (S in seq_along(paths_n[[i]])) {
244 if (S == 1) {
245 path_tmp_n = paths_n[[i]]
246 } else {
247 path_tmp_n = c(paths_n[[i]][S:length(paths_n[[i]])], paths_n[[i]][1:(S -
248 1)])
249 }
250 score[S] = sum(tr_paths_n == path_tmp_n)
251 }
252 if (sum(score) != 0) {
253 S = which.max(score)
254 new_shift[i] = S
255 if (S == 1) {
256 tr_paths[[i]] = paths[[i]]
257 } else {
258 tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S -
259 1)])
260 }
261 Seqs[[i]] = get_sequence(G, tr_paths[[i]])
262 }
263 }
264 shift = new_shift
265 # try to shift based on the sequences itself
266 return(list(Seqs = Seqs[!is.na(shift)], tr_paths = tr_paths[!is.na(shift)], shift = shift[!is.na(shift)]))
267 }
268
269 make_consensus = function(paths_info, G) {
270 include = !is.na(paths_info$shift)
271 ## make alignments using mafft
272 aln = mafft(unlist(paths_info$Seqs[include]))
273 CM = calculate_consensus_matrix(aln = aln, tr_paths = paths_info$tr_paths[include],
274 G = G)
275 gaps = get_gaps_from_alignment(aln)
276 CMnorm = CM/rowSums(CM)
277 bases = colnames(CM)
278 consensus = sapply(apply(CMnorm, 1, function(x) bases[which(x > 0.2)][order(x[x >
279 0.2], decreasing = TRUE)]), paste, collapse = "")
280 consensus2 = gsub("-", "", paste0(substring(consensus, 1, 1), collapse = ""),
281 fixed = TRUE)
282 number_of_SNP = sum(rowSums(CM > 0) > 1)
283 SNP_positions = which(rowSums(CM > 0) > 1)
284 number_of_position_with_indels = sum(colSums(do.call(rbind, strsplit(as.character(aln),
285 "")) == "-") > 0)
286 indel_positions = which(colSums(do.call(rbind, strsplit(as.character(aln), "")) ==
287 "-") > 0)
288 if (length(SNP_positions) > 0) {
289 variable_sites = unlist(c(c(mapply(paste, strsplit((sapply(apply(CMnorm,
290 1, function(x) bases[which(x > 0.2)]), paste, collapse = ""))[SNP_positions],
291 ""), SNP_positions, sep = "_")), paste("-", indel_positions, sep = "_")))
292 } else {
293 variable_sites = NULL
294 }
295 variable_positions = unique(SNP_positions, indel_positions)
296 return(list(aln = aln, CM = CM, CMnorm = CMnorm, consensus = consensus, consensus2 = consensus2,
297 number_of_SNP = number_of_SNP, SNP_positions = SNP_positions, number_of_position_with_indels = number_of_position_with_indels,
298 indel_positions = indel_positions, variable_positions = variable_positions,
299 variable_sites = variable_sites, gaps = gaps))
300 }
301
302
303 estimate_monomer = function(G, weights = NULL, limit = NULL) {
304 if (is.null(G)) {
305 return(NULL)
306 }
307 ## estimate monomer from kmer based graph
308 V(G)$id = 1:vcount(G)
309 GS = induced_subgraph(G, vids = which(degree(G) == 2)) ## keep only vertices without branching
310 cls = clusters(GS)$membership
311
312
313 ids = mapply(FUN = function(x, y) x[which.max(y)], split(V(GS)$id, cls), split(V(GS)$size,
314 cls)) ## from each branch use only one vertex with larges size!
315
316
317 ids = ids[order(V(G)$size[ids], decreasing = TRUE)]
318 ids_size = V(G)$size[ids]
319 N50 = sum(cumsum(ids_size)/sum(ids_size) < 0.5)
320 if (length(ids) > 10000) {
321 ids = ids[1:N50]
322 }
323 ## use only large vertices in search! how many?
324 el = get.edgelist(G, names = FALSE)
325 node_use = numeric(vcount(G))
326 LL = numeric()
327 ## W=numeric()
328 i = 0
329 paths = list()
330 if (is.null(weights)) {
331 weights = (max(E(G)$weight) - (E(G)$weight) + 1)
332 weights = E(G)$weight^(-3)
333 }
334 included = rep(FALSE, vcount(G))
335 W_total = sum(V(G)$size)
336
337 coverage = numeric()
338 t0 = c()
339 i = 0
340 j = 0
341 for (n in ids) {
342 j = j + 1
343 t0[j] = Sys.time()
344 if (included[n]) {
345 next
346 }
347 m = which(el[, 1] %in% n)
348 i = i + 1
349 s = get.shortest.paths(G, el[m, 2], el[m, 1], weights = weights, output = "vpath")
350 included[as.numeric(s$vpath[[1]])] = TRUE
351 paths[[i]] = s$vpath[[1]]
352 LL[i] = (length(s$vpath[[1]]))
353 }
354
355 ## evaluate if paths should be divided to variants - by length and path weight
356 paths_clusters = split(paths, LL)
357 paths_clusters_tr = mclapply(paths_clusters, FUN = align_paths, G = G, mc.cores = getOption("CPU"))
358
359 ## paths_clusters_tr = lapply(paths_clusters, FUN = align_paths, G = G) consensus
360 paths_consensus = mclapply(paths_clusters_tr, make_consensus, G = G, mc.cores = getOption("CPU"))
361
362 ## evaluate weight for individual paths:
363 for (v in seq_along(paths_consensus)) {
364 p = paths_clusters_tr[[v]]$tr_paths
365 ## clean
366 p = p[!sapply(p, function(x) anyNA(x) | is.null(x))]
367 L = sapply(p, length)
368 p_groups = split(p, L)
369 w_groups = sapply(p_groups, function(x) sum(V(G)$size[unique(c(sapply(x,
370 as.numeric)))]))
371 total_score = sum(V(G)$size[unique(c(unlist(sapply(p, as.numeric))))])
372 LW = data.frame(`Length estimate` = unique(L), weight = w_groups, stringsAsFactors = FALSE)
373 LW = LW[order(LW$weight, decreasing = TRUE), ]
374 rownames(LW) = NULL
375 paths_consensus[[v]]$total_score = total_score
376 paths_consensus[[v]]$length_variant_score = LW
377
378 }
379 return(list(estimates = paths_consensus, paths = paths_clusters_tr))
380 }
381
382
383
384 detect_pbs = function(dimers_file, tRNA_database_path, reads_file, output) {
385 ## read_file contain oriented reads!
386 min_offset = 10
387 max_end_dist = 2
388 min_aln_length = 30
389 max_pbs_search = 30
390 thr_length = 12
391 end_position = 23
392 insertion_proportion_threshold <- 0.15
393 ## read_file contain oriented reads! for testing
394 if (FALSE) {
395 library(Biostrings)
396 thr_length = 12
397 end_position = 23
398 dimers_file = "consensus_dimer.fasta"
399 reads_file = "reads_oriented.fas"
400 tRNA_database_path = "/mnt/raid/users/petr/workspace/repex_tarean/databases/tRNA_database2.fasta"
401
402 }
403 ## find read which are half aligned do reference dimer
404 dimers_seq = readDNAStringSet(dimers_file)
405 cmd = paste("makeblastdb -in", dimers_file, "-dbtype nucl")
406 system(cmd, ignore.stdout = TRUE)
407 output = paste0(reads_file, "blast_out.cvs")
408 columns = c("qseqid", "qstart", "qend", "qlen", "sseqid", "sstart", "send", "sstrand",
409 "slen", "pident", "length")
410 cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "), "' -perc_identity 90 -query ",
411 reads_file, "-db", dimers_file, "-word_size", 7, "-dust no -num_alignments 99999 -strand plus ",
412 "-out", output)
413 system(cmd)
414 blastdf = read.table(output, as.is = TRUE, col.names = columns, comment.char = "")
415 unlink(output)
416 blastdf = blastdf[blastdf$length >= min_aln_length, ]
417
418 ## expand two whole read to see unligned reads parts
419 blastdf_expand = blastdf
420 blastdf_expand$qstart = blastdf$qstart - (blastdf$qstart - 1)
421 blastdf_expand$sstart = blastdf$sstart - (blastdf$qstart - 1)
422 blastdf_expand$qend = blastdf$qend + (blastdf$qlen - blastdf$qend)
423 blastdf_expand$send = blastdf$send + (blastdf$qlen - blastdf$qend)
424 pS = blastdf$sstart
425 pE = blastdf$send
426 pSF = blastdf_expand$sstart
427 pEF = blastdf_expand$send
428 cond1 = pS - pSF >= min_offset & blastdf$qend >= (blastdf$qlen - max_end_dist) &
429 blastdf$sstart >= max_pbs_search
430 cond2 = pEF - pE >= min_offset & blastdf$qstart <= max_end_dist & blastdf$send <=
431 blastdf$slen - max_pbs_search
432
433 ## coverage of alignments: evaluate coverage at site with breaks, it is neccessary
434 ## that there must be at least 20% of interupted alignments at given position to
435 ## be considered for additional search
436 coverage_profiles = subject_coverage(blastdf)
437
438
439 ## extract flanking sequences - cca 50nt, it should contain tRNA sequence search
440 ## Left
441 fin = tempfile()
442 fout = tempfile()
443 scoreL = scoreR = 0
444
445
446 ## left side unique insertion sites
447 if (any(cond1)) {
448 insertion_sites = ddply(blastdf[cond1, ], .(sseqid, sstart), nrow)
449 ## check coverage
450 insertion_sites$coverage = 0
451 for (i in 1:nrow(insertion_sites)) {
452 insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$sstart[[i]]]
453 }
454 insertion_OK_left <- insertion_sites[with(insertion_sites, V1/coverage >
455 insertion_proportion_threshold), ]
456
457 if (nrow(insertion_OK_left) > 0) {
458 s = ifelse(insertion_OK_left[, "sstart"] - max_pbs_search < 1, 1, insertion_OK_left[,
459 "sstart"] - max_pbs_search)
460 Lpart = subseq(dimers_seq[match(insertion_OK_left$sseqid, names(dimers_seq))],
461 s, insertion_OK_left$sstart)
462
463 ## names are CONSENSUSID__READID_position
464 names(Lpart) = paste0(names(Lpart), "__", insertion_OK_left$V1, "__",
465 insertion_OK_left$sstart)
466 ## check presence TG dinucleotide
467 TG = vcountPattern("TG", subseq(dimers_seq[match(insertion_OK_left$sseqid,
468 names(dimers_seq))], insertion_OK_left$sstart - 2, insertion_OK_left$sstart +
469 2))
470 if (any(TG > 0)) {
471 writeXStringSet(Lpart[TG > 0], filepath = fin)
472 cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "),
473 "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path,
474 "-word_size", 7, "-strand plus -dust no -max_target_seqs 10000 -evalue 100",
475 "-out", fout)
476
477 system(cmd, ignore.stdout = TRUE)
478 df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
479 filter1 = df$length >= thr_length
480 filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
481 df_pass = df[filter1 & filter2, , drop = FALSE]
482 df_pass_L = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
483 scoreL = get_score(df_pass_L)
484 write.table(df_pass_L, file = paste0(output, "_L.csv"), row.names = FALSE,
485 sep = "\t")
486 }
487 }
488 }
489 if (any(cond2)) {
490 ## search Right
491 insertion_sites = ddply(blastdf[cond2, ], .(sseqid, send, slen), nrow)
492 ## check coverage
493 insertion_sites$coverage = 0
494 for (i in 1:nrow(insertion_sites)) {
495 insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$send[[i]]]
496 }
497 insertion_OK_right <- insertion_sites[with(insertion_sites, V1/coverage >
498 insertion_proportion_threshold), ]
499
500 if (nrow(insertion_OK_right) > 0) {
501 s = ifelse(insertion_OK_right$send + max_pbs_search > insertion_OK_right$slen,
502 insertion_OK_right$slen, insertion_OK_right$send + max_pbs_search)
503 Rpart = subseq(dimers_seq[match(insertion_OK_right$sseqid, names(dimers_seq))],
504 insertion_OK_right$send, s)
505 names(Rpart) = paste0(names(Rpart), "__", insertion_OK_right$V1, "__",
506 insertion_OK_right$send)
507
508 ## check presence CA dinucleotide
509 CA = vcountPattern("CA", subseq(dimers_seq[match(insertion_OK_right$sseqid,
510 names(dimers_seq))], insertion_OK_right$send - 2, insertion_OK_right$send +
511 2, ))
512 if (any(CA > 0)) {
513 writeXStringSet(Rpart[CA > 0], filepath = fin)
514 cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "),
515 "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path,
516 "-word_size", 7, "-strand minus -dust no -max_target_seqs 10000 -evalue 100",
517 "-out", fout)
518
519 system(cmd, ignore.stdout = TRUE)
520 df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
521 filter1 = df$length >= thr_length
522 filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
523 df_pass = df[filter1 & filter2, , drop = FALSE]
524 df_pass_R = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
525 write.table(df_pass_R, file = paste0(output, "_R.csv"), row.names = FALSE,
526 sep = "\t")
527 scoreR = get_score(df_pass_R)
528 }
529 }
530 }
531 unlink(fin)
532 unlink(fout)
533 return(max(scoreL, scoreR))
534 }
535
536
537 subject_coverage = function(blastdf) {
538 ## calculate coverage for all blast subjects
539 coverage_profiles <- by(blastdf, INDICES = blastdf$sseqid, FUN = function(x) {
540 as.numeric(coverage(IRanges(start = x$sstart, end = x$send)))
541 })
542 return(coverage_profiles)
543
544 }
545
546 get_score = function(x) {
547 ## keep best tRNA
548 if (nrow(x) == 0) {
549 return(0)
550 }
551 xm = data.frame(do.call(rbind, strsplit(x$qseqid, "__")), stringsAsFactors = FALSE)
552 xm$score = as.numeric(sapply(strsplit(xm[, 1], "_"), "[[", 4))
553 xm$AA = gsub("-.+$", "", gsub("^.+__", "", x$sseqid))
554 best_score = max(by(xm$score, INDICES = xm$AA, FUN = sum))
555 return(best_score)
556 }
557
558
559
560 dotter = function(seq1, seq2 = NULL, params = "") {
561 if (is.null(seq2)) {
562 seq2 = seq1
563 }
564 library(Biostrings)
565 if (class(seq1) != "DNAStringSet") {
566 seq1 = BStringSet(seq1)
567 }
568 if (class(seq2) != "DNAStringSet") {
569 seq2 = BStringSet(seq2)
570 }
571 sf1 = tempfile("seq1")
572 writeXStringSet(seq1, file = sf1)
573 sf2 = tempfile("seq2")
574 writeXStringSet(seq2, file = sf2)
575 system(paste("dotter", params, sf1, sf2), wait = FALSE)
576 Sys.sleep(2)
577 unlink(c(sf1, sf2))
578 return(NULL)
579 }
580
581
582
583 dotter2 = function(seq1, seq2 = NULL, params = NULL) {
584 if (is.null(seq2)) {
585 seq2 = seq1
586 }
587 if (is.null(params)) {
588 params = " -windowsize 30 -threshold 45 "
589 }
590 library(Biostrings)
591 if (class(seq1) != "DNAStringSet") {
592 seq1 = DNAStringSet(seq1)
593 }
594 if (class(seq2) != "DNAStringSet") {
595 seq2 = DNAStringSet(seq2)
596 }
597 L1 = nchar(seq1)
598 L2 = nchar(seq2)
599
600 tmpdat1 = tempfile()
601
602 dir.create(tmpdat1)
603 tmpdat2 = tempfile()
604 dir.create(tmpdat2)
605 oridir = getwd()
606
607
608 seq1_merged = DNAStringSet(paste(seq1, collapse = ""))
609 seq2_merged = DNAStringSet(paste(seq2, collapse = ""))
610 seq2rc_merged = reverseComplement(DNAStringSet(paste(seq2, collapse = "")))
611
612
613 sf1 = tempfile("seq1")
614 writeXStringSet(seq1_merged, filepath = sf1)
615 sf2 = tempfile("seq2")
616 sf2rc = tempfile("seq2rc")
617 writeXStringSet(seq2_merged, filepath = sf2)
618 writeXStringSet(seq2rc_merged, filepath = sf2rc)
619
620 cmd1 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2, params)
621 cmd2 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2rc,
622 params)
623 setwd(tmpdat1)
624 output1 = system(cmd1, intern = TRUE)
625 setwd(tmpdat2)
626 output2 = system(cmd2, intern = TRUE)
627 setwd(oridir)
628
629
630
631 fout1 = strsplit(tail(output1, n = 1), split = " ")[[1]][2]
632 rawdat1 = readLines(paste(tmpdat1, "/", fout1, sep = ""))
633 rawdat1 = rawdat1[1:min(grep("^Rectangle", rawdat1))]
634
635 if (length(rawdat1[grep("^Line", rawdat1)]) == 0) {
636 coord1 = NULL
637 } else {
638 coord1 = apply(sapply(strsplit(rawdat1[grep("^Line", rawdat1)], " "), "[",
639 c(3, 5, 7, 9, 11)), 1, as.numeric)
640 coord1 = matrix(coord1, ncol = 5)
641 }
642
643 fout2 = strsplit(tail(output2, n = 1), split = " ")[[1]][2]
644 rawdat2 = readLines(paste(tmpdat2, "/", fout2, sep = ""))
645 rawdat2 = rawdat2[1:min(grep("^Rectangle", rawdat2))]
646
647 if (length(rawdat2[grep("^Line", rawdat2)]) == 0) {
648 coord2 = NULL
649 } else {
650 coord2 = apply(sapply(strsplit(rawdat2[grep("^Line", rawdat2)], " "), "[",
651 c(3, 5, 7, 9, 11)), 1, as.numeric)
652 coord2 = matrix(coord2, ncol = 5)
653 }
654 unlink(sf1)
655 unlink(sf2)
656 unlink(sf2rc)
657 unlink(tmpdat1, recursive = TRUE)
658 unlink(tmpdat2, recursive = TRUE)
659
660 N1 = sum(nchar(seq1))
661 N2 = sum(nchar(seq2))
662 op = par(xaxs = "i", yaxs = "i", mar = c(5, 2, 6, 10), las = 1)
663 on.exit(par(op))
664 plot(c(1, N1), c(1, N2), type = "n", xlab = "", ylab = "", axes = FALSE)
665 if (!is.null(coord1)) {
666 segments(x0 = coord1[, 1], y0 = coord1[, 2], x1 = coord1[, 3], y1 = coord1[,
667 4])
668 }
669 if (!is.null(coord2)) {
670 segments(x0 = coord2[, 1], y0 = N2 - coord2[, 2], x1 = coord2[, 3], y1 = N2 -
671 coord2[, 4])
672 }
673 abline(v = c(0, cumsum(L1)), col = "green")
674 abline(h = c(0, cumsum(L2)), col = "green")
675 box()
676 axis(1, at = c(1, cumsum(L1))[-(length(L1) + 1)], labels = names(seq1), hadj = 0,
677 cex.axis = 0.7)
678 axis(4, at = c(1, cumsum(L2))[-(length(L2) + 1)], labels = names(seq2), hadj = 0,
679 cex.axis = 0.7)
680 invisible(list(coord1, coord2))
681 }
682
683
684 CM2sequence = function(x) {
685 bases = c(colnames(x), "N")
686 x = cbind(x, 0)
687 colnames(x) = bases
688 seqs = paste(bases[apply(x, 1, which.max)], collapse = "")
689
690 return(seqs)
691 }
692
693 ## in alingment calculate significance from kmers
694 calculate_consensus_matrix = function(aln, tr_paths, G) {
695 bases = c("A", "C", "G", "T", "-")
696 positions = lapply(strsplit(as.character(aln), split = ""), function(x) which(!x %in%
697 "-"))
698 base_matrix = do.call(rbind, strsplit(as.character(aln), split = ""))
699 weights = matrix(0, nrow = length(aln), ncol = nchar(aln))
700 kmer_rel_proportions = (V(G)$size/table(factor(unlist(tr_paths), levels = 1:vcount(G))))
701 for (i in seq_along(positions)) {
702 weights[i, positions[[i]]] = kmer_rel_proportions[tr_paths[[i]]]
703 }
704 names(kmer_rel_proportions) = V(G)$name
705 ## get weights for gaps by approximation
706 fitgaps = function(y) {
707 if (sum(y == 0) == 0) {
708 return(y)
709 } else {
710 y0 = rep(y[y != 0], 3)
711 x0 = which(rep(y, 3) != 0)
712 fitted = approx(x0, y0, xout = seq_along(rep(y, 3)), rule = 1)
713 }
714 return(fitted$y[-seq_along(y)][seq_along(y)])
715 }
716 weights_with_gaps = t(apply(weights, 1, fitgaps))
717 ## weights_with_gaps = get_gap_weights(weights, aln,G) ## this step take wey too
718 ## long time!!!-not so important TODO handle gaps differently - more effectively
719
720 ## get consensus matrix
721 CM = sapply(1:nchar(aln[[1]]), function(i) {
722 sapply(split(weights_with_gaps[, i], factor(base_matrix[, i], levels = bases)),
723 sum)
724 })
725 return(t(CM)[, 1:5])
726 }
727
728
729 get_gaps_from_alignment = function(aln) {
730 as.character(aln)
731 gaps_positions = unique(do.call(rbind, str_locate_all(as.character(aln), "-+")))
732 return(gaps_positions)
733 }
734
735 plot_kmer_graph = function(G, L = NULL, vertex.size = NULL, ord = NULL, upto = NULL,
736 highlight = NULL) {
737 if (!is.null(G$L) & is.null(L)) {
738 L = G$L
739 }
740 if (is.null(L)) {
741 ## L=layout.kamada.kawai(G)
742 L = OGDFlayout(G)[[1]]
743 }
744 clr = kmers2color(V(G)$name)
745 if (!is.null(highlight)) {
746 clr[highlight] = "#00000080"
747 }
748 if (!is.null(ord)) {
749 clr[ord[1:upto]] = paste(clr[ord[1:upto]], "30", sep = "")
750 }
751 if (is.null(vertex.size)) {
752 vertex.size = rescale(V(G)$size, to = c(0.5, 6))
753
754 }
755 plot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE,
756 vertex.color = clr, vertex.frame.color = "#00000020", edge.width = 2, edge.arrow.mode = 1,
757 edge.arrow.size = 0.2)
758
759 }
760
761 rglplot_kmer_graph = function(G, L = NULL, vertex.size = 4) {
762 if (is.null(L)) {
763 L = layout.kamada.kawai(G)
764 }
765
766 rglplot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE,
767 vertex.color = kmers2color(V(G)$name), edge.width = 2, edge.arrow.mode = 1,
768 edge.arrow.size = 0.5)
769 }
770
771
772
773
774 mafft = function(seqs, params = "--auto --thread 1 ") {
775 if (length(seqs) < 2) {
776 return(seqs)
777 }
778 infile = tempfile()
779 if (class(seqs) == "character") {
780 seqs = DNAStringSet(seqs)
781 }
782 writeXStringSet(seqs, file = infile)
783 outfile = tempfile()
784 cmd = paste("mafft --quiet --nuc ", params, infile, "2> /dev/null > ", outfile)
785 system(cmd, intern = TRUE, ignore.stderr = FALSE)
786 aln = readDNAStringSet(outfile)
787 unlink(c(outfile, infile))
788 return(aln)
789 }
790
791 mgblast = function(databaseSeq, querySeq) {
792 params = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -C 30 -D 30 "
793 # no dust filtering:
794 paramsDF = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -F F"
795
796 database = tempfile()
797 query = tempfile()
798 output = tempfile()
799 if (class(databaseSeq) == "character") {
800 database = databaseSeq
801 do_not_delete_database = TRUE
802 } else {
803 writeXStringSet(databaseSeq, filepath = database, format = "fasta")
804 do_not_delete_database = FALSE
805 }
806 if (class(querySeq) == "character") {
807 query = querySeq
808 do_not_delete_query = TRUE
809 } else {
810 writeXStringSet(querySeq, filepath = query, format = "fasta")
811 do_not_delete_query = FALSE
812 }
813 ## create database:
814 cmd = paste("formatdb -i", database, "-p F")
815 system(cmd)
816 cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, params)
817 system(cmd)
818 if (file.info(output)$size == 0) {
819 # no hist, try wthou dust masker
820 cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, paramsDF)
821 system(cmd)
822 }
823 blastOut = read.table(output, sep = "\t", header = FALSE, as.is = TRUE, comment.char = "")
824 unlink(output)
825 if (!do_not_delete_query) {
826 unlink(query)
827 }
828 if (!do_not_delete_database) {
829 unlink(paste(database, "*", sep = ""))
830 }
831 colnames(blastOut) = c("query", "q.length", "q.start", "q.end", "subject", "s.length",
832 "s.start", "s.end", "pid", "weight", "e.value", "strand")
833 blastOut
834 }
835
836
837 estimate_sample_size = function(NV, NE, maxv, maxe) {
838 ## density
839 d = (2 * NE)/(NV * (NV - 1))
840 eEst = (maxv * (maxv - 1) * d)/2
841 nEst = (d + sqrt(d^2 + 8 * d * maxe))/(2 * d)
842 if (eEst >= maxe) {
843 N = round(nEst)
844 E = round((N * (N - 1) * d)/2)
845
846 }
847 if (nEst >= maxv) {
848 N = maxv
849 E = round((N * (N - 1) * d)/2)
850
851 }
852 return(N)
853 }
854
855
856
857
858 mgblast2graph = function(blastfile, seqfile, graph_destination, directed_graph_destination,
859 oriented_sequences, paired = TRUE, repex = FALSE, image_file = NULL, image_file_tmb = NULL,
860 include_layout = TRUE, pair_completeness = NULL, satellite_model_path = NULL,
861 maxv = 40000, maxe = 5e+08, seqfile_full = seqfile) {
862 cat("loading blast results\n")
863 if (repex) {
864 cln = c("query", "subject", "weight", "q.length", "q.start", "q.end", "s.length", "s.start",
865 "s.end", "sign")
866 colcls = c("character", "character", "numeric", "numeric", "numeric", "numeric",
867 "numeric", "numeric", "numeric", "NULL", "NULL", "character")
868
869 } else {
870 cln = c("query", "q.length", "q.start", "q.end", "subject", "s.length", "s.start",
871 "s.end","weight", "sign")
872 colcls = c("character", "numeric", "numeric", "numeric", "character", "numeric",
873 "numeric", "numeric", "NULL", "numeric", "NULL", "character")
874 }
875 if (class(blastfile) == "data.frame") {
876 blastTable = blastfile
877 colnames(blastTable)[12] = "sign"
878 } else {
879 blastTable = read.table(blastfile, sep = "\t", as.is = TRUE, header = FALSE,
880 colClasses = colcls, comment.char = "")
881 colnames(blastTable) = cln
882 }
883 ## check for duplicates!
884 key = with(blastTable, ifelse(query > subject, paste(query, subject), paste(subject,
885 query)))
886 if (any(duplicated(key))) {
887 blastTable = blastTable[!duplicated(key), ]
888 }
889 seqs = readDNAStringSet(seqfile)
890 ## calculate pair completeness for reads before sampling
891 if (is.null(pair_completeness)) {
892 if (paired) {
893 if (seqfile_full != seqfile) {
894 seqs_full = readDNAStringSet(seqfile_full)
895 pair_counts = tabulate(table(gsub(".$", "", names(seqs_full))))
896 rm(seqs_full)
897 } else {
898 pair_counts = tabulate(table(gsub(".$", "", names(seqs))))
899
900 }
901 pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
902 }else{
903 pair_completeness = 0
904 }
905 }
906 NV = length(seqs) # vertices
907 NE = nrow(blastTable) # nodes
908 if (maxv < NV | maxe < NE) {
909 ## Sample if graph is large
910 V_sample_size = estimate_sample_size(NV, NE, maxv, maxe)
911 seqs = sample(seqs, V_sample_size)
912 blastTable = blastTable[blastTable$query %in% names(seqs) & blastTable$subject %in%
913 names(seqs), ]
914 }
915
916 blastTable$sign = ifelse(blastTable$sign == "+", 1, -1)
917 vnames = unique(c(blastTable$query, blastTable$subject))
918 vindex = seq_along(vnames)
919 names(vindex) = vnames
920 ## correct orientation
921 cat("creating graph\n")
922 G = graph.data.frame(blastTable[, c("query", "subject", "weight", "sign")], directed = FALSE,
923 vertices = vnames)
924 if (include_layout) {
925 ## save temporarily modified blastTable if large
926 tmpB = tempfile()
927 save(blastTable, file = tmpB)
928 rm(blastTable)
929 ##
930 cat("graph layout calculation ")
931 if (ecount(G) > 2e+06) {
932 cat("using fruchterman reingold\n")
933 L = layout.fruchterman.reingold(G, dim = 3)
934 } else {
935 cat("using OGDF & frucherman reingold\n")
936 Ltmp = OGDFlayout(G, alg = c("fmmm"))
937 L = cbind(Ltmp[[1]][, 1:2], layout.fruchterman.reingold(G, dim = 2))
938
939 }
940
941 GL = list(G = G, L = L)
942 save(GL, file = graph_destination)
943 if (!is.null(image_file)) {
944 cat("exporting graph figure")
945 png(image_file, width = 900, height = 900, pointsize = 20)
946 plot(GL$G, layout = GL$L[, 1:2], vertex.size = 2, vertex.color = "#000000A0",
947 edge.color = "#00000040", vertex.shape = "circle", edge.curved = FALSE,
948 vertex.label = NA, vertex.frame.color = NA, edge.width = 1)
949 dev.off()
950 ## create thunmbs:
951 system(paste("convert ", image_file, " -resize 100x100 ", image_file_tmb))
952
953 }
954 rm(GL)
955 gc()
956 load(tmpB)
957 unlink(tmpB)
958 }
959
960 if (!is.connected(G)) {
961 cat("Warning - graph is not connected\n")
962 cc = clusters(G, "weak")
963 mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)),
964 decreasing = TRUE)))))
965 selvi = which(mb == 1) # largest component
966 cat("using largest component: \nsize =", round(length(selvi)/vcount(G) *
967 100, 1), "% ;", length(selvi), "reads", "\n")
968 G = induced.subgraph(G, vids = selvi)
969 blastTable = blastTable[blastTable$query %in% V(G)$name & blastTable$subject %in%
970 V(G)$name, ]
971 }
972
973 Gmst = minimum.spanning.tree(G)
974 # create alternative trees - for case that unly suboptima solution is found
975 set.seed(123)
976 Gmst_alt = list()
977 Gmst_alt[[1]] = Gmst
978 for (i in 2:6){
979 E(G)$weight = runif(ecount(G), 0.1,1)
980 Gmst_alt[[i]] = minimum.spanning.tree(G)
981 }
982
983 rm(G)
984 gc()
985 ## six attempts to reorient reads
986 flip_names_all = list()
987 prop_of_notfit=numeric()
988 thr_fit=0.001
989 for (ii in 1:6){
990 Gmst = Gmst_alt[[ii]]
991
992
993 blastTable_mst = as.data.frame(get.edgelist(Gmst, names = TRUE))
994 colnames(blastTable_mst) = c("query", "subject")
995 blastTable_mst$sign = E(Gmst)$sign
996
997 d = dfs(Gmst, root = 1, father = TRUE, order.out = TRUE)
998 esign = E(Gmst)$sign
999 rc = rep(FALSE, vcount(Gmst))
1000 j = 0
1001 p = 0
1002 for (v in d$order[-1]) {
1003 j = j + 1
1004 f = as.numeric(d$father)[v]
1005 if (is.na(f)) {
1006 next
1007 }
1008 eid = get.edge.ids(Gmst, c(v, f))
1009 if (esign[eid] == -1) {
1010 ie = unique(c(which(blastTable_mst$query %in% V(Gmst)$name[v]), which(blastTable_mst$subject %in%
1011 V(Gmst)$name[v]))) # incident edges
1012 esign[ie] = esign[ie] * -1
1013 rc[v] = TRUE
1014 p = p + 1
1015 if (p > 50) {
1016 p = 0
1017 cat("\r")
1018 cat(j, " : ", sum(esign)/length(esign))
1019 }
1020 }
1021 }
1022 cat("\t")
1023 flip_names_all[[ii]] = flip_names = V(Gmst)$name[rc]
1024
1025 if (!exists("blastTable")) {
1026 load(tmpB)
1027 unlink(tmpB)
1028 }
1029 ## add nofit later!!
1030 s.mean = with(blastTable, (s.start + s.end)/2)
1031 s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length -
1032 s.mean, s.mean)
1033 q.mean = with(blastTable, (q.start + q.end)/2)
1034 q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length -
1035 q.mean, q.mean)
1036 V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
1037 V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
1038 sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1,
1039 1) * blastTable$sign
1040 nofit = unique(c(V1[sign_final == "-1"], V2[sign_final == "-1"]))
1041 prop_of_notfit[ii] = length(nofit)/vcount(Gmst)
1042 ## check if correctly oriented
1043 cat("prop notfit", prop_of_notfit[[ii]],"\n")
1044 if (prop_of_notfit[[ii]]<thr_fit){
1045 ## OK
1046 break
1047 }
1048 }
1049 if (!prop_of_notfit[[ii]]<thr_fit){
1050 ## get best solution
1051 ii_best = which.min(prop_of_notfit)
1052 if (ii != ii_best){ # if the last solution was not best, get the best
1053 flip_names = flip_names_all[[ii_best]]
1054 s.mean = with(blastTable, (s.start + s.end)/2)
1055 s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length -
1056 s.mean, s.mean)
1057 q.mean = with(blastTable, (q.start + q.end)/2)
1058 q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length -
1059 q.mean, q.mean)
1060 V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
1061 V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
1062 sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1,
1063 1) * blastTable$sign
1064 nofit = unique(c(V1[sign_final == "-1"], V2[sign_final == "-1"]))
1065
1066 }
1067 }
1068
1069 ## exclude all nofit
1070
1071 df2 = data.frame(V1, V2, sign = blastTable$sign, sign_final = sign_final, stringsAsFactors = FALSE)
1072 rm(blastTable)
1073 gc()
1074 vertices = data.frame(name = vnames, reverse_complement = vnames %in% flip_names)
1075 G = graph.data.frame(df2, vertices = vertices)
1076 vcount_ori = vcount(G)
1077 G = induced.subgraph(G, vids = which(!V(G)$name %in% nofit))
1078
1079 G$escore_mst = sum(esign)/length(esign)
1080 G$escore = sum(sign_final == 1)/length(sign_final)
1081 cc = clusters(G, "strong")
1082 mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)),
1083 decreasing = TRUE)))))
1084 names(mb) = V(G)$name
1085 G$loop_index = max(cc$csize)/vcount(G)
1086 G$coverage = vcount(G)/vcount_ori
1087 ## check sign in all edges
1088 V(G)$membership = mb
1089 save(G, file = directed_graph_destination)
1090 ## remove nofit
1091 seqs = seqs[!names(seqs) %in% nofit]
1092 seqs[names(seqs) %in% flip_names] = reverseComplement(seqs[names(seqs) %in% flip_names])
1093 names(seqs) = paste(names(seqs), mb[names(seqs)])
1094 writeXStringSet(seqs, filepath = oriented_sequences)
1095 ## calculate satellite probability
1096 if (is.null(satellite_model_path)) {
1097 pSAT = isSAT = NULL
1098 } else {
1099 satellite_model = readRDS(satellite_model_path)
1100 pSAT = get_prob(G$loop_index, pair_completeness, model = satellite_model)
1101 isSAT = isSatellite(G$loop_index, pair_completeness, model = satellite_model)
1102 }
1103
1104 ## get larges cc
1105 output = list(escore = G$escore, coverage = G$coverage, escore_mts = G$escore_mst,
1106 loop_index = G$loop_index, pair_completeness = pair_completeness, graph_file = graph_destination,
1107 oriented_sequences = oriented_sequences, vcount = vcount(G), ecount = ecount(G),
1108 satellite_probability = pSAT, satellite = isSAT)
1109 ## clean up
1110 all_objects = ls()
1111 do_not_remove = "output"
1112 rm(list = all_objects[!(all_objects %in% do_not_remove)])
1113 gc(verbose = FALSE, reset = TRUE)
1114 return(list2dictionary(output))
1115 }
1116
1117 list2dictionary = function(l){
1118 dict = "{"
1119 q='"'
1120 for (i in 1:length(l)){
1121 if (class(l[[i]])=="character" | is.null(l[[i]])){
1122 q2 = "'''"
1123 }else{
1124 q2 = ''
1125 }
1126 dict = paste0(
1127 dict,
1128 q,names(l)[i],q,":",q2, l[[i]], q2,", "
1129 )
1130 }
1131 dict = paste0(dict, "}")
1132 return(dict)
1133 }
1134
1135 wrap = Vectorize(function(s, width = 80) {
1136 i1 = seq(1, nchar(s), width)
1137 i2 = seq(width, by = width, length.out = length(i1))
1138 return(paste(substring(s, i1, i2), collapse = "\n"))
1139 })
1140
1141
1142 tarean = function(input_sequences, output_dir, min_kmer_length = 11, max_kmer_length = 27,
1143 CPU = 2, sample_size = 10000, reorient_reads = TRUE, tRNA_database_path = NULL,
1144 include_layout = TRUE, paired = TRUE, lock_file=NULL) {
1145 options(CPU = CPU)
1146 time0 = Sys.time()
1147 dir.create(output_dir)
1148 input_sequences_copy = paste(output_dir, "/", basename(input_sequences), sep = "")
1149
1150 if (!file.copy(input_sequences, input_sequences_copy, overwrite = TRUE)) {
1151 cat(paste("cannot copy", input_sequences, " to", output_dir), "\n")
1152 stop()
1153 }
1154
1155 lock_file = waitForRAM(lock_file = lock_file)
1156 pair_completeness = NULL
1157 ## sampling
1158 if (sample_size != 0) {
1159 s = readDNAStringSet(input_sequences_copy)
1160 N = length(s)
1161 ## pair completness must be calculated before sampling!
1162 if (N > sample_size) {
1163 writeXStringSet(sample(s, sample_size), filepath = input_sequences_copy)
1164 if (paired) {
1165 pair_counts = tabulate(table(gsub(".$", "", names(s))))
1166 pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
1167 }
1168 }
1169 rm(s)
1170 }
1171
1172 if (reorient_reads) {
1173 input_sequences_oriented = paste0(input_sequences_copy, "oriented.fasta")
1174 graph_file = paste0(input_sequences_copy, "_graph.RData")
1175 GLfile = paste0(input_sequences_copy, "_graph.GL")
1176 cat("reorientig sequences\n")
1177
1178 blastTable = mgblast(input_sequences_copy, input_sequences_copy)
1179
1180 graph_info = mgblast2graph(blastTable, input_sequences_copy, GLfile, graph_file,
1181 input_sequences_oriented, include_layout = include_layout, paired = paired,
1182 pair_completeness = pair_completeness)
1183
1184 ## interupt if it does not look like tandem repeat at all # soft threshold!
1185 if (is.null(graph_info$pair_completeness)) {
1186 if (graph_info$loop_index <= 0.4) {
1187 cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
1188 return(list2dictionary(list(graph_info = graph_info)))
1189 }
1190 } else {
1191 ## for paired reads:
1192 if (graph_info$loop_index < 0.7 | graph_info$pair_completeness < 0.4) {
1193 cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
1194 return(list2dictionary(list(graph_info = graph_info)))
1195 }
1196 }
1197
1198 } else {
1199 input_sequences_oriented = input_sequences_copy
1200 graph_info = NULL
1201 }
1202
1203
1204
1205 kmer_counts = list()
1206 kmer_lengths = seq(min_kmer_length, max_kmer_length, by = 4)
1207 for (i in kmer_lengths) {
1208 ## use pythonis.null(l[[i]])) function - faster
1209 cmd = paste(script.dir, "/kmer_counting.py ", input_sequences_oriented, " ",
1210 i, sep = "")
1211 cat("calculation kmers of length ", i, "\n")
1212 f = system(cmd, intern = TRUE)
1213 x = read.table(f, as.is = TRUE, sep = "\t")
1214 kmer_counts[[as.character(i)]] = data.frame(x, freq = x$V2/sum(x$V2))
1215 }
1216
1217 ## number of kmers:
1218 N50 = sapply(kmer_counts, function(x) {
1219 sum(cumsum(x$freq) < 0.5)
1220 })
1221
1222 N70 = sapply(kmer_counts, function(x) {
1223 sum(cumsum(x$freq) < 0.7)
1224 })
1225
1226
1227 time1 = Sys.time()
1228 ggmin = mclapply(kmer_counts, get_mimimal_cc, start = 0.5, mc.cores = CPU)
1229 time2 = Sys.time()
1230 cat("kmers graphs created ")
1231 print(time2 - time1)
1232 names(ggmin) = names(kmer_counts)
1233
1234
1235 ## estimate monomer
1236 monomers = mclapply(ggmin, estimate_monomer, mc.cores = CPU)
1237
1238 names(monomers) = names(kmer_counts)
1239 monomers = monomers[!sapply(monomers, is.null)]
1240 ## error handling:
1241 error = sapply(monomers, class) == "try-error"
1242 if (any(error)) {
1243 cat("\nError in monomers estimation: ")
1244 cat("calculation failed for monomers length ", names(monomers)[error], "\n")
1245 print(monomers[error])
1246 if (any(!error)) {
1247 monomers = monomers[!error]
1248 } else {
1249 stop("monomer estimation failed")
1250 }
1251 }
1252
1253 ## summary - make function!!
1254 total_score = list()
1255 k = 0
1256 length_best = numeric()
1257 score_bn = numeric()
1258 consensus = character()
1259
1260 for (i in names(monomers)) {
1261 for (v in seq_along(monomers[[i]]$estimates)) {
1262 k = k + 1
1263 total_score[[k]] = c(kmer = as.numeric(i), variant = v, total_score = monomers[[i]]$estimates[[v]]$total_score)
1264 score_bn[k] = min(rowSums(monomers[[i]]$estimates[[v]]$CM))
1265 length_best[k] = monomers[[i]]$estimates[[v]]$length_variant_score[1,
1266 1]
1267 consensus[[k]] = monomers[[i]]$estimates[[v]]$consensus2
1268 }
1269 }
1270 summary_table = as.data.frame(do.call(rbind, total_score))
1271 summary_table$monomer_length = length_best
1272 summary_table$consensus_length = nchar(consensus)
1273 summary_table$score_bn = score_bn
1274 summary_table$consensus = paste("<pre>", wrap(consensus, width = 80), "<pre>",
1275 sep = "")
1276 consensus_DS = DNAStringSet(consensus)
1277 names(consensus_DS) = with(summary_table, paste0(kmer, "_", variant, "_sc_",
1278 signif(total_score), "_l_", monomer_length))
1279
1280 ## filter by score - keep
1281
1282 ## reorder by score
1283 consensus_DS = consensus_DS[order(summary_table$total_score, decreasing = TRUE)]
1284 summary_table = summary_table[order(summary_table$total_score, decreasing = TRUE),
1285 ]
1286 rownames(summary_table) = NULL
1287 N = nrow(summary_table)
1288 ## concatenate concensus(ie. make dimer head 2 tail) for pbs detection and other
1289 ## make something like 'pseudo contig' multimer for mapping - min length 200 bp
1290
1291 ## searches
1292 consensus_DS_dimer = DNAStringSet(paste0(consensus_DS, consensus_DS))
1293 tarean_contigs = DNAStringSet(sapply(consensus_DS,function(x)
1294 ifelse(nchar(x)<200,
1295 paste(rep(as.character(x),round(300/nchar(as.character(x))+1)),collapse=''),
1296 as.character(x)))
1297 )
1298
1299 names(consensus_DS_dimer) = names(consensus_DS)
1300 # save sequences:
1301 consensus_DS_dimer_file = paste0(output_dir, "/consensus_dimer.fasta")
1302 consensus_DS_file = paste0(output_dir, "/consensus.fasta")
1303 tarean_contig_file = paste0(output_dir, "/tarean_contigs.fasta")
1304 writeXStringSet(consensus_DS, consensus_DS_file)
1305 writeXStringSet(tarean_contigs, tarean_contig_file)
1306 writeXStringSet(consensus_DS_dimer, consensus_DS_dimer_file)
1307 if (is.null(tRNA_database_path)) {
1308 pbs_score = -1
1309 } else {
1310 pbs_blast_table = paste0(output_dir, "/pbs_detection")
1311 pbs_score = detect_pbs(dimers_file = consensus_DS_dimer_file, tRNA_database_path = tRNA_database_path,
1312 reads = input_sequences_oriented, output = pbs_blast_table)
1313 }
1314 ## search of open reading frames get the length of the longest
1315 orf_l = max_ORF_length(consensus_DS_dimer)
1316
1317
1318 dir.create(paste0(output_dir, "/img"), recursive = TRUE)
1319 summary_table$monomer_length_graph = numeric(N)
1320 summary_table$graph_image = character(N)
1321 summary_table$monomer_length_logo = numeric(N)
1322 summary_table$logo_image = character(N)
1323 ## export graph nd consensus estimate to cluster directory type of output may
1324 ## change in future
1325 save(ggmin, file = paste0(output_dir, "/ggmin.RData"))
1326 save(monomers, file = paste0(output_dir, "/monomers.RData"))
1327
1328 cat("generating HTML output\n")
1329 for (i in 1:N) {
1330 kmer = as.character(summary_table$kmer[i])
1331 variant = summary_table$variant[i]
1332 ## export graph
1333 fout_link = paste0("img/graph_", kmer, "mer_", variant, ".png")
1334 fout = paste0(output_dir, "/", fout_link)
1335 ## summary_table$monomer_length_graph[i] = summary_table$monomer_length[i]
1336 ## summary_table$monomer_length_logo[[i]] = nrow(monomers[[kmer]]$estimates[[variant]]$CM)
1337 summary_table$monomer_length[[i]] = length(monomers[[kmer]]$estimates[[variant]]$consensus)
1338
1339 if (i <= 10) {
1340 png(fout, width = 800, height = 800)
1341 plot_kmer_graph(ggmin[[kmer]], highlight = unlist(monomers[[kmer]]$paths[[variant]]$tr_paths))
1342 dev.off()
1343 summary_table$graph_image[i] = hwriteImage(fout_link, link = fout_link,
1344 table = FALSE, width = 100, height = 100)
1345 ## export logo
1346 png_link = paste0("img/logo_", kmer, "mer_", variant, ".png")
1347 fout = paste0(output_dir, "/", png_link)
1348 png(fout, width = 1200, height = round(summary_table$monomer_length[i] *
1349 1) + 550)
1350 try(plot_multiline_logo(monomers[[kmer]]$estimates[[variant]]$CM, W = 100))
1351 dev.off()
1352 ## export corresponding position probability matrices
1353 ppm_file = paste0(output_dir, '/ppm_', kmer, "mer_", variant, ".csv")
1354 ppm_link = paste0('ppm_', kmer, "mer_", variant, ".csv")
1355 write.table(monomers[[kmer]]$estimates[[variant]]$CM,
1356 file = ppm_file,
1357 col.names = TRUE, quote = FALSE,
1358 row.names = FALSE, sep="\t")
1359 summary_table$logo_image[i] = hwriteImage(png_link, link = ppm_link,
1360 table = FALSE, width = 200, height = 100)
1361 }
1362
1363 }
1364
1365 ## html_report = HTMLReport()
1366
1367 htmlfile = paste0(output_dir, "/report.html")
1368 cat(htmlheader, file = htmlfile)
1369 included_columns = c('kmer', 'variant', 'total_score', 'consensus_length','consensus', 'graph_image', 'logo_image')
1370 summary_table_clean = summary_table[,included_columns]
1371 colnames(summary_table_clean) = c('k-mer length',
1372 'Variant index',
1373 'k-mer coverage score',
1374 'Consensus length',
1375 'Consensus sequence',
1376 'k-mer based graph',
1377 'Sequence logo')
1378 HTML(summary_table_clean, file = htmlfile, sortableDF = TRUE)
1379 HTMLEndFile(file = htmlfile)
1380 time4 = Sys.time()
1381 print(time4 - time0)
1382 if (!is.null(lock_file)){
1383 print("------removing-lock--------")
1384 removelock(lock_file)
1385 }
1386
1387 print(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
1388 TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
1389 TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
1390 orf_l = orf_l, tarean_contig_file = tarean_contig_file))
1391 return(list2dictionary(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
1392 TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
1393 TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
1394 orf_l = orf_l, tarean_contig_file = tarean_contig_file)))
1395 }
1396
1397
1398 ## graph loop index stability
1399 loop_index_instability = function(G) {
1400 N = 50
1401 s = seq(vcount(G), vcount(G)/10, length.out = N)
1402 p = seq(1, 0.1, length.out = N)
1403 li = numeric()
1404 for (i in seq_along(s)) {
1405 print(i)
1406 gs = induced_subgraph(G, sample(1:vcount(G), s[i]))
1407 li[i] = max(clusters(gs, "strong")$csize)/vcount(gs)
1408 }
1409 instability = lm(li ~ p)$coefficient[2]
1410 return(instability)
1411 }
1412
1413 isSatellite = function(x, y, model) {
1414 p = get_prob(x, y, model)
1415 if (p > model$cutoff) {
1416 return("Putative Satellite")
1417 } else {
1418 return("")
1419 }
1420 }
1421
1422 get_prob = function(x, y, model) {
1423 pm = model$prob_matrix
1424 N = ncol(pm)
1425 i = round(x * (N - 1)) + 1
1426 j = round(y * (N - 1)) + 1
1427 p = pm[i, j]
1428 return(p)
1429 }
1430
1431
1432 detectMemUsage = function() {
1433 con = textConnection(gsub(" +", " ", readLines("/proc/meminfo")))
1434 memInfo = read.table(con, fill = TRUE, row.names = 1)
1435 close(con)
1436 memUsage = 1 - (memInfo["MemFree", 1] + memInfo["Cached", 1])/memInfo["MemTotal",
1437 1]
1438 return(memUsage)
1439 }
1440
1441
1442 makelock<-function(lockfile,lockmsg,CreateDirectories=TRUE){
1443 lockdir=dirname(lockfile)
1444 if(!file.exists(lockdir)){
1445 if(CreateDirectories) dir.create(lockdir,recursive=TRUE)
1446 else stop("Lock Directory for lockfile ",lockfile," does not exist")
1447 }
1448 if(missing(lockmsg)) lockmsg=paste(system('hostname',intern=TRUE),Sys.getenv("R_SESSION_TMPDIR"))
1449 if (file.exists(lockfile)) return (FALSE)
1450 # note the use of paste makes the message writing atomic
1451 cat(paste(lockmsg,"\n",sep=""),file=lockfile,append=TRUE,sep="")
1452 firstline=readLines(lockfile,n=1)
1453 if(firstline!=lockmsg){
1454 # somebody else got there first
1455 return(FALSE)
1456 } else return(TRUE)
1457 }
1458
1459
1460 removelock<-function(lockfile){
1461 if(unlink(lockfile)!=0) {
1462 warning("Unable to remove ",lockfile)
1463 return (FALSE)
1464 }
1465 return (TRUE)
1466 }
1467
1468
1469 waitForRAM = function(p = 0.5,lock_file=NULL) {
1470 if (detectMemUsage() < p) {
1471 return(NULL)
1472 ## check lock file:
1473 } else {
1474 cat("waiting for RAM \n")
1475 free_count = 0
1476 while (TRUE) {
1477 if (makelock(lock_file)){
1478 print("---------locking--------")
1479 return(lock_file)
1480 }
1481 if (detectMemUsage() < p) {
1482 cat("RAM freed \n")
1483 return(NULL)
1484 }
1485 Sys.sleep(5)
1486 if (evaluate_user_cpu_usage() == 'free'){
1487 free_count = free_count + 1
1488 }else{
1489 free_count = 0
1490 }
1491 if (detectMemUsage() < 0.8 & free_count > 100){
1492 cat("RAM not free but nothing else is running \n")
1493 return(NULL)
1494 }
1495 }
1496 }
1497 }
1498
1499 lsmem = function() {
1500 g = globalenv()
1501 out_all = envs = list()
1502 envs = append(envs, g)
1503 total_size = numeric()
1504 while (environmentName(g) != "R_EmptyEnv") {
1505 g <- parent.env(g)
1506 envs = append(envs, g)
1507 }
1508 for (e in envs) {
1509
1510 obj = ls(envir = e)
1511 if (length(obj) == 0) {
1512 break
1513 }
1514 obj.size = list()
1515 for (i in obj) {
1516 obj.size[[i]] = object.size(get(i, envir = e))
1517 }
1518 out = data.frame(object = obj, size = unlist(obj.size), stringsAsFactors = FALSE)
1519 out = out[order(out$size, decreasing = TRUE), ]
1520 out_all = append(out_all, out)
1521 total_size = append(total_size, sum(out$size))
1522 }
1523 return(list(objects = out_all, total_size = total_size))
1524 }
1525
1526 evaluate_user_cpu_usage = function(){
1527 user = Sys.info()["user"]
1528 a = sum(as.numeric (system(paste ("ps -e -o %cpu -u", user), intern = TRUE)[-1]))
1529 s = substring (system(paste ("ps -e -o stat -u", user), intern = TRUE)[-1],1,1)
1530 if (a<5 & sum(s %in% 'D')==0 & sum(s%in% 'R')<2){
1531 status = 'free'
1532 }else{
1533 status = 'full'
1534 }
1535 return(status)
1536 }