Mercurial > repos > petr-novak > repeatrxplorer
comparison lib/tarean/methods.R @ 0:1d1b9e1b2e2f draft
Uploaded
author | petr-novak |
---|---|
date | Thu, 19 Dec 2019 10:24:45 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1d1b9e1b2e2f |
---|---|
#!/usr/bin/env Rscript
# (fix: shebang was "#!/user/bin/env", a path that does not exist on standard systems)

suppressPackageStartupMessages(library(igraph))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(Biostrings))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(hwriter))
suppressPackageStartupMessages(library(R2HTML))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(dplyr))
max_ORF_length = function(s) {
    ## Longest open reading frame (in amino acids) found in any of the six
    ## reading frames of 's': three forward frames and three frames of the
    ## reverse complement. Stops ("*") delimit candidate ORFs.
    longest = 0
    for (frame in 1:3) {
        for (template in list(s, reverseComplement(s))) {
            peptides = unlist(strsplit(as.character(translate(subseq(template,
                frame))), "*", fixed = TRUE))
            longest = max(longest, nchar(peptides))
        }
    }
    return(longest)
}
24 | |
kmers2graph = function(kmers, mode = "strong", prop = NULL) {
    ## Build a de Bruijn-like graph from a kmer count table: vertices are
    ## kmers, edges join kmers whose (k-1)-suffix equals another kmer's
    ## (k-1)-prefix.
    ## kmers: data.frame, column 1 = kmer string, column 2 = count
    ##        (a third 'size' column with count proportions is added if absent)
    ## mode:  component mode passed to igraph::clusters ("strong"/"weak")
    ## prop:  optional cumulative-proportion cutoff (unused in practice)
    ## Returns a list of connected-component subgraphs (>1 vertex each),
    ## ordered by decreasing vertex count.
    kmerLength = nchar(kmers[1, 1])
    if (ncol(kmers) == 2) {
        # add relative frequency of each kmer as 'size'
        kmers$size = kmers[, 2]/sum(kmers[, 2])
    }
    colnames(kmers) = c("name", "count", "size")
    if (!is.null(prop)) { # NOTE(review): not used (prop is NULL) and probably wrong - it filters before sorting!!
        p = cumsum(kmers$size)
        kmers = kmers[p < prop, ]
    }
    N = dim(kmers)[1]
    kmers = kmers[order(kmers$size), ]
    ## (k-1)-prefix table (kms) and (k-1)-suffix table (kme) for overlap join
    kms = data.frame(kmer = substring(kmers$name, 1, kmerLength - 1), ids = 1:nrow(kmers),stringsAsFactors = FALSE)
    kme = data.frame(kmer = substring(kmers$name, 2), ide = 1:nrow(kmers), stringsAsFactors = FALSE)

    ## df = merge(kms,kme, by = 'kmer',all=FALSE)[,c(2,3,1)]
    ## edge list: suffix of kmer df[,1] overlaps the prefix of kmer df[,2]
    df = inner_join(kme,kms, by = 'kmer')[,c(2,3)]

    ## legacy implementation kept for reference (overlap search via blastn):
    ## names(kms) = seq_along(kms)
    ## kme = substring(kmers$name, 2)
    ## names(kme) = seq_along(kme)
    ## ## use new blast!
    ## database = tempfile()
    ## query = tempfile()
    ## output = tempfile()
    ## writeXStringSet(DNAStringSet(kms), filepath = database, format = "fasta")
    ## writeXStringSet(DNAStringSet(kme), filepath = query, format = "fasta")
    ## cmd = paste("makeblastdb -in", database, "-dbtype nucl")
    ## system(cmd, ignore.stdout = TRUE)
    ## cmd = paste("blastn -outfmt '6 qseqid sseqid pident' -strand plus -dust no -perc_identity 100 -query ",
    ## query, "-db", database, "-word_size", kmerLength - 1, "-out", output)
    ## system(cmd)
    ## df = try({
    ## read.table(output, as.is = TRUE)
    ## })
    ## if (class(df) == "try-error"){
    ## print("creation of kmer graph failed")
    ## print(query)
    ## print(output)
    ## print(database)
    ## return(NULL)
    ## }
    ## unlink(query)
    ## unlink(paste(database, "*", sep = ""))
    ## unlist(output)

    # geometric mean of the positive entries (note: divides by the full
    # length(x), not by the number of positive entries)
    gm_mean = function(x, na.rm = TRUE) {
        exp(sum(log(x[x > 0]), na.rm = na.rm)/length(x))
    }

    # edge weight = geometric mean of the two incident kmer counts
    whg = apply(cbind(kmers[df[, 1], 2], V2 = kmers[df[, 2], 2]), 1, gm_mean)
    G = graph.data.frame(data.frame(V1 = kmers$name[df[, 1]], V2 = kmers$name[df[,
        2]], weight = whg), vertices = kmers[, 1:3])
    # separate to connected components:
    ccs = clusters(G, mode = mode)$membership
    # keep only components with more than one vertex
    sel_cls = which(tabulate(ccs) > 1)
    Gs = list()
    for (i in seq_along(sel_cls)) {
        Gs[[i]] = induced.subgraph(G, vids = which(ccs %in% sel_cls[i]))
    }
    ## reorder components by decreasing vertex count
    Gs = Gs[order(sapply(Gs, vcount), decreasing = TRUE)]
    return(Gs)
}
89 | |
90 | |
OGDFlayout = function(G, ncol = NULL, alg = "fmmm", OGDF = getOption("OGDF")) {
    ## Compute graph layout(s) by calling the external 'runOGDFlayout' binary.
    ## G:    igraph object to lay out
    ## ncol: optional path to an existing edge-list layout file; when NULL the
    ##       edge list (with weights, if any) is written to a temp file
    ## alg:  one or more of "fmmm", "sm", "fme" - one layout per algorithm
    ## OGDF: path to the binary; falls back to $OGDF, then `which`
    ## Returns a named list of layout matrices (one per algorithm), rows
    ## ordered to match V(G)$name, or NULL when the binary is not found.
    ## is ogdf binary available?
    if (is.null(OGDF)) {
        OGDF = Sys.getenv("OGDF")
        if ("" == OGDF) {
            options(warn = -1)
            OGDF = system("which runOGDFlayout", intern = TRUE)
            options(warn = 0)
            if (length(OGDF) == 0) {
                cat("path to runOGDFlayout not found\n")
                return(NULL)
            }

        }
    }
    if (is.null(ncol)) {
        # export the edge list (weight column defaults to 1)
        if (is.null(E(G)$weight)) {
            el = data.frame(get.edgelist(G, names = TRUE), rep(1, ecount(G)))
        } else {
            el = data.frame(get.edgelist(G, names = TRUE), E(G)$weight)
        }
        ncol = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout", sep = "")
        write.table(el, file = ncol, row.names = FALSE, col.names = FALSE, sep = "\t",
            quote = FALSE)
    } else {
        # copy ncol so the caller's file is never consumed by the cleanup below
        ncol_tmp = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout",
            sep = "")
        file.copy(ncol, ncol_tmp)
        ncol = ncol_tmp
    }
    algopt = c("fmmm", "sm", "fme")
    ## fix: the original tested `alg %in% algopt && TRUE`, which is a no-op
    ## "&& TRUE" and errors on vector 'alg' ('&&' requires length-1 operands
    ## in modern R); all() handles any number of requested algorithms
    if (!all(alg %in% algopt)) {
        stop("alg must be one of: ", paste(algopt, collapse = ", "), "\n")

    }

    # output file:
    Louts = list()
    layout_file = tempfile(pattern = as.character(Sys.getpid()))
    for (i in alg) {
        cmd = paste(OGDF, "-alg", i, "-iff layout -off layout -out", layout_file,
            ncol)
        system(cmd, intern = TRUE)
        # skip the echoed edge list, then re-order rows to vertex order
        L = read.table(layout_file, skip = ecount(G))
        L = L[match(V(G)$name, L$V2), ]
        Lout = as.matrix(L[, -(1:2)])
        unlink(layout_file)
        Louts[[i]] = Lout

    }
    # clean up
    unlink(ncol)
    return(Louts)
}
146 | |
147 xcolor_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00") | |
148 | |
kmers2color = function(s, position = NULL) {
    ## Map each kmer to a color according to the nucleotide found at
    ## 'position' within the kmer.
    ## s:        character vector of kmers (all assumed the same length)
    ## position: 1-based index into the kmer; defaults to the last base
    ## Returns a named color vector (names = the selected bases).
    if (is.null(position)) {
        ## default: last base of the kmer
        ## (fix: the original first computed the middle position and then
        ## immediately overwrote it - that dead assignment is removed;
        ## earlier variants used the middle or first base)
        position = nchar(s[1])
    }
    color_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00")
    color_base = substring(s, position, position)
    colors = color_code[color_base]
    names(colors) = color_base
    return(colors)
}
get_sequence = function(g, v, position = NULL) {
    ## Extract one base per vertex along a path and concatenate them.
    ## g:        igraph whose vertex names are kmer strings
    ## v:        vertex indices (typically a path through g)
    ## position: 1-based index into the kmer; defaults to the last base
    ## Returns a single character string (one base per vertex in v).
    s = V(g)$name
    if (is.null(position)) {
        ## default: last base of the kmer
        ## (fix: removed the dead middle-position assignment that was
        ## immediately overwritten in the original)
        position = nchar(s[1])
    }
    nt = paste(substring(s[v], position, position), collapse = "")
    return(nt)
}
171 | |
172 | |
get_mimimal_cc = function(km, thr = 20, min_coverage = 0.45, step = 2, start = NULL) {
    ## Find the smallest top fraction of kmers whose graph has a leading
    ## connected component with more than 'thr' vertices and cumulative kmer
    ## coverage >= min_coverage. The number of kmers used grows geometrically
    ## by factor 'step' each round until the whole table is used.
    ## km:    data.frame of kmers sorted by frequency, with a $freq column
    ## start: optional initial cumulative-frequency cutoff (default 0.5)
    ## Returns the component graph (with $input_coverage and layout $L set),
    ## or NULL when no graph could be built at all.
    if (is.null(start)) {
        # begin with the kmers that cover 50% of the total frequency
        i = sum(cumsum(km$freq) < 0.5)
    } else {
        i = sum(cumsum(km$freq) < start)
    }
    continue = TRUE
    while (continue) {
        if (i > nrow(km)) {
            # final round: use the entire kmer table, then stop looping
            i = nrow(km)
            continue = FALSE
            step = 1
        }
        GG = kmers2graph(km[1:i, ])
        if (length(GG) > 0) {
            if (vcount(GG[[1]]) > thr) {
                if (sum(V(GG[[1]])$size) >= min_coverage) {
                    # success: record how much input frequency went in,
                    # attach a layout, and return the largest component
                    GG[[1]]$input_coverage = sum(km$freq[1:i])
                    GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
                    return(GG[[1]])
                }
            }
        }
        # grow the kmer window geometrically for the next round
        i = round(i * step)

    }
    # NOTE(review): scalar '||' (with the is.null test first) would be the
    # idiomatic form here; behavior is the same for these inputs
    if (length(GG) == 0 | is.null(GG)) {
        return(NULL)
    }

    # fall-through: thresholds never met - return the largest component of
    # the full-table graph anyway
    GG[[1]]$input_coverage = sum(km$freq[1:i])
    GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
    return(GG[[1]])
}
207 | |
208 | |
209 | |
paths2string = function(paths) {
    ## Convert a list of igraph vertex paths into character strings: each
    ## vertex id contributes its first character, concatenated per path.
    first_chars = lapply(paths, function(p) substring(as_ids(p), 1, 1))
    sapply(first_chars, paste, collapse = "")
}
214 | |
215 | |
align_paths = function(paths, G) {
    ## Circularly rotate each path so all paths begin at a shared anchor
    ## node, making them comparable for multiple alignment.
    ## paths: list of igraph vertex sequences (cyclic paths through G)
    ## G:     kmer graph (used to extract a base sequence per rotated path)
    ## Returns list(Seqs, tr_paths, shift); paths that could not be anchored
    ## (shift stays NA) are dropped from all three components.
    shift = rep(NA, length(paths))
    thr = 0 # minimal length
    tr_paths = list()
    Seqs = list()
    ## anchor = the node occurring most often across all paths
    centre_node = as.numeric(names(sort(table(unlist(paths)), decreasing = TRUE)))[[1]]

    ## pass 1: rotate every path containing the anchor so it comes first
    for (i in seq_along(paths)) {
        if (centre_node %in% paths[[i]]) {
            S = which(paths[[i]] %in% centre_node)
            shift[i] = S
            if (S == 1) {
                tr_paths[[i]] = paths[[i]]
            } else {
                # circular rotation: move elements S..end to the front
                tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S -
                    1)])
            }
            Seqs[[i]] = get_sequence(G, tr_paths[[i]])
        } else {
            shift[i] = NA
        }
    }
    ## pass 2: for paths lacking the anchor, try every rotation and keep the
    ## one that agrees best, position-wise, with the already-rotated paths
    paths_n = lapply(paths, as.numeric)
    tr_paths_n = do.call(cbind, lapply(tr_paths, as.numeric))
    new_shift = shift
    for (i in which(is.na(shift))) {
        # NOTE(review): allocated with length(paths_n) but indexed by
        # rotation S up to the path length; relies on R growing the vector
        # when the path is longer than the number of paths - verify
        score = numeric(length(paths_n))
        for (S in seq_along(paths_n[[i]])) {
            if (S == 1) {
                path_tmp_n = paths_n[[i]]
            } else {
                path_tmp_n = c(paths_n[[i]][S:length(paths_n[[i]])], paths_n[[i]][1:(S -
                    1)])
            }
            # agreement count; recycles path_tmp_n against the matrix
            # columns (assumes equal path lengths - TODO confirm)
            score[S] = sum(tr_paths_n == path_tmp_n)
        }
        if (sum(score) != 0) {
            S = which.max(score)
            new_shift[i] = S
            if (S == 1) {
                tr_paths[[i]] = paths[[i]]
            } else {
                tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S -
                    1)])
            }
            Seqs[[i]] = get_sequence(G, tr_paths[[i]])
        }
    }
    shift = new_shift
    # try to shift based on the sequences itself
    return(list(Seqs = Seqs[!is.na(shift)], tr_paths = tr_paths[!is.na(shift)], shift = shift[!is.na(shift)]))
}
268 | |
make_consensus = function(paths_info, G) {
    ## Build a consensus sequence from the aligned path sequences.
    ## paths_info: output of align_paths (Seqs, tr_paths, shift)
    ## G:          kmer graph supplying per-kmer weights for the consensus
    ## Returns a list: alignment, raw/normalized consensus matrices, the
    ## multi-base and single-base consensus strings, plus SNP/indel summaries.
    include = !is.na(paths_info$shift)
    ## make alignments using mafft
    aln = mafft(unlist(paths_info$Seqs[include]))
    CM = calculate_consensus_matrix(aln = aln, tr_paths = paths_info$tr_paths[include],
        G = G)
    gaps = get_gaps_from_alignment(aln)
    CMnorm = CM/rowSums(CM)
    bases = colnames(CM)
    ## per position: all bases above 20% of weight, most frequent first
    consensus = sapply(apply(CMnorm, 1, function(x) bases[which(x > 0.2)][order(x[x >
        0.2], decreasing = TRUE)]), paste, collapse = "")
    ## single-base consensus string with gap characters removed
    consensus2 = gsub("-", "", paste0(substring(consensus, 1, 1), collapse = ""),
        fixed = TRUE)
    # SNP = alignment column where more than one base carries weight
    number_of_SNP = sum(rowSums(CM > 0) > 1)
    SNP_positions = which(rowSums(CM > 0) > 1)
    # indel = alignment column containing at least one gap character
    number_of_position_with_indels = sum(colSums(do.call(rbind, strsplit(as.character(aln),
        "")) == "-") > 0)
    indel_positions = which(colSums(do.call(rbind, strsplit(as.character(aln), "")) ==
        "-") > 0)
    if (length(SNP_positions) > 0) {
        ## labels like "A_12" for variant bases and "-_5" for indel positions
        variable_sites = unlist(c(c(mapply(paste, strsplit((sapply(apply(CMnorm,
            1, function(x) bases[which(x > 0.2)]), paste, collapse = ""))[SNP_positions],
            ""), SNP_positions, sep = "_")), paste("-", indel_positions, sep = "_")))
    } else {
        variable_sites = NULL
    }
    # NOTE(review): unique()'s second argument is 'incomparables', so
    # indel_positions is effectively ignored here - union(SNP_positions,
    # indel_positions) was probably intended; verify before changing
    variable_positions = unique(SNP_positions, indel_positions)
    return(list(aln = aln, CM = CM, CMnorm = CMnorm, consensus = consensus, consensus2 = consensus2,
        number_of_SNP = number_of_SNP, SNP_positions = SNP_positions, number_of_position_with_indels = number_of_position_with_indels,
        indel_positions = indel_positions, variable_positions = variable_positions,
        variable_sites = variable_sites, gaps = gaps))
}
301 | |
302 | |
estimate_monomer = function(G, weights = NULL, limit = NULL) {
    ## Estimate tandem-repeat monomer sequence(s) from a kmer graph by
    ## tracing shortest circular paths and building a consensus for each
    ## path-length class.
    ## G:       kmer graph (V(G)$size = kmer proportions); NULL passes through
    ## weights: optional edge weights for the shortest-path search; defaults
    ##          to E(G)$weight^(-3) so heavy (frequent) edges are preferred
    ## limit:   accepted but not used in this body - TODO confirm intent
    ## Returns list(estimates = per-class consensus info, paths = rotated paths).
    if (is.null(G)) {
        return(NULL)
    }
    ## estimate monomer from kmer based graph
    V(G)$id = 1:vcount(G)
    GS = induced_subgraph(G, vids = which(degree(G) == 2)) ## keep only vertices without branching
    cls = clusters(GS)$membership


    ids = mapply(FUN = function(x, y) x[which.max(y)], split(V(GS)$id, cls), split(V(GS)$size,
        cls)) ## from each branch use only one vertex with larges size!


    ## rank candidate start vertices by size; N50 = number of vertices
    ## covering half the candidate size mass
    ids = ids[order(V(G)$size[ids], decreasing = TRUE)]
    ids_size = V(G)$size[ids]
    N50 = sum(cumsum(ids_size)/sum(ids_size) < 0.5)
    if (length(ids) > 10000) {
        # too many candidates - truncate to the N50 point
        ids = ids[1:N50]
    }
    ## use only large vertices in search! how many?
    el = get.edgelist(G, names = FALSE)
    node_use = numeric(vcount(G))
    LL = numeric()
    ## W=numeric()
    i = 0
    paths = list()
    if (is.null(weights)) {
        # NOTE(review): the first assignment is dead - it is immediately
        # overwritten by the power-law weighting on the next line
        weights = (max(E(G)$weight) - (E(G)$weight) + 1)
        weights = E(G)$weight^(-3)
    }
    included = rep(FALSE, vcount(G))
    W_total = sum(V(G)$size)

    coverage = numeric()
    t0 = c()
    i = 0
    j = 0
    for (n in ids) {
        j = j + 1
        t0[j] = Sys.time()
        if (included[n]) {
            # vertex already covered by an earlier path - skip
            next
        }
        m = which(el[, 1] %in% n)
        i = i + 1
        ## circular path through n: shortest path from n's successor back to n
        s = get.shortest.paths(G, el[m, 2], el[m, 1], weights = weights, output = "vpath")
        included[as.numeric(s$vpath[[1]])] = TRUE
        paths[[i]] = s$vpath[[1]]
        LL[i] = (length(s$vpath[[1]]))
    }

    ## evaluate if paths should be divided to variants - by length and path weight
    paths_clusters = split(paths, LL)
    paths_clusters_tr = mclapply(paths_clusters, FUN = align_paths, G = G, mc.cores = getOption("CPU"))

    ## paths_clusters_tr = lapply(paths_clusters, FUN = align_paths, G = G) consensus
    paths_consensus = mclapply(paths_clusters_tr, make_consensus, G = G, mc.cores = getOption("CPU"))

    ## evaluate weight for individual paths:
    for (v in seq_along(paths_consensus)) {
        p = paths_clusters_tr[[v]]$tr_paths
        ## clean: drop paths containing NA or NULL entries
        p = p[!sapply(p, function(x) anyNA(x) | is.null(x))]
        L = sapply(p, length)
        p_groups = split(p, L)
        ## weight of a length-group = total size of its distinct vertices
        w_groups = sapply(p_groups, function(x) sum(V(G)$size[unique(c(sapply(x,
            as.numeric)))]))
        total_score = sum(V(G)$size[unique(c(unlist(sapply(p, as.numeric))))])
        LW = data.frame(`Length estimate` = unique(L), weight = w_groups, stringsAsFactors = FALSE)
        LW = LW[order(LW$weight, decreasing = TRUE), ]
        rownames(LW) = NULL
        paths_consensus[[v]]$total_score = total_score
        paths_consensus[[v]]$length_variant_score = LW

    }
    return(list(estimates = paths_consensus, paths = paths_clusters_tr))
}
381 | |
382 | |
383 | |
detect_pbs = function(dimers_file, tRNA_database_path, reads_file, output) {
    ## Detect primer binding sites (PBS): blast oriented reads against the
    ## consensus dimers, locate positions where a sufficient fraction of
    ## alignments break (potential insertion boundaries), extract the
    ## flanking consensus sequence and blast it against a tRNA database.
    ## Returns the best tRNA match score: max of left/right side searches.
    ## NOTE(review): the 'output' argument is overwritten below, so the
    ## *_L.csv / *_R.csv result files land next to reads_file, not at the
    ## caller-supplied path - verify intent.
    ## read_file contain oriented reads!
    min_offset = 10        # min unaligned read tail flanking the break
    max_end_dist = 2       # read must align to within this of its end
    min_aln_length = 30    # min read-to-dimer alignment length kept
    max_pbs_search = 30    # distance from the break to search for the PBS
    thr_length = 12        # min tRNA alignment length
    end_position = 23      # tRNA alignment must reach this subject position
    insertion_proportion_threshold <- 0.15  # min fraction of broken alignments
    ## read_file contain oriented reads! for testing
    if (FALSE) {
        # hard-coded inputs for interactive debugging only (never executed)
        library(Biostrings)
        thr_length = 12
        end_position = 23
        dimers_file = "consensus_dimer.fasta"
        reads_file = "reads_oriented.fas"
        tRNA_database_path = "/mnt/raid/users/petr/workspace/repex_tarean/databases/tRNA_database2.fasta"

    }
    ## find reads which are half aligned to the reference dimer
    dimers_seq = readDNAStringSet(dimers_file)
    cmd = paste("makeblastdb -in", dimers_file, "-dbtype nucl")
    system(cmd, ignore.stdout = TRUE)
    output = paste0(reads_file, "blast_out.cvs")
    columns = c("qseqid", "qstart", "qend", "qlen", "sseqid", "sstart", "send", "sstrand",
        "slen", "pident", "length")
    cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "), "' -perc_identity 90 -query ",
        reads_file, "-db", dimers_file, "-word_size", 7, "-dust no -num_alignments 99999 -strand plus ",
        "-out", output)
    system(cmd)
    blastdf = read.table(output, as.is = TRUE, col.names = columns, comment.char = "")
    unlink(output)
    blastdf = blastdf[blastdf$length >= min_aln_length, ]

    ## expand to whole reads to see the unaligned read parts
    blastdf_expand = blastdf
    blastdf_expand$qstart = blastdf$qstart - (blastdf$qstart - 1)
    blastdf_expand$sstart = blastdf$sstart - (blastdf$qstart - 1)
    blastdf_expand$qend = blastdf$qend + (blastdf$qlen - blastdf$qend)
    blastdf_expand$send = blastdf$send + (blastdf$qlen - blastdf$qend)
    pS = blastdf$sstart
    pE = blastdf$send
    pSF = blastdf_expand$sstart
    pEF = blastdf_expand$send
    ## cond1: read aligned at its 3' end while its 5' part hangs off ->
    ## candidate insertion boundary on the left side of the subject position
    cond1 = pS - pSF >= min_offset & blastdf$qend >= (blastdf$qlen - max_end_dist) &
        blastdf$sstart >= max_pbs_search
    ## cond2: mirror case - 5' end aligned, 3' part unaligned (right side)
    cond2 = pEF - pE >= min_offset & blastdf$qstart <= max_end_dist & blastdf$send <=
        blastdf$slen - max_pbs_search

    ## coverage of alignments: evaluate coverage at sites with breaks; it is
    ## necessary that a sufficient fraction of interrupted alignments exists
    ## at a given position for it to be considered for additional search
    coverage_profiles = subject_coverage(blastdf)


    ## extract flanking sequences - cca 50nt, it should contain tRNA sequence search
    ## Left
    fin = tempfile()
    fout = tempfile()
    scoreL = scoreR = 0


    ## left side unique insertion sites
    if (any(cond1)) {
        # count breaks per (subject, start) position
        insertion_sites = ddply(blastdf[cond1, ], .(sseqid, sstart), nrow)
        ## check coverage
        insertion_sites$coverage = 0
        for (i in 1:nrow(insertion_sites)) {
            insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$sstart[[i]]]
        }
        # keep sites where the break fraction exceeds the threshold
        insertion_OK_left <- insertion_sites[with(insertion_sites, V1/coverage >
            insertion_proportion_threshold), ]

        if (nrow(insertion_OK_left) > 0) {
            # clamp the search window to the start of the subject sequence
            s = ifelse(insertion_OK_left[, "sstart"] - max_pbs_search < 1, 1, insertion_OK_left[,
                "sstart"] - max_pbs_search)
            Lpart = subseq(dimers_seq[match(insertion_OK_left$sseqid, names(dimers_seq))],
                s, insertion_OK_left$sstart)

            ## names are CONSENSUSID__READID_position
            names(Lpart) = paste0(names(Lpart), "__", insertion_OK_left$V1, "__",
                insertion_OK_left$sstart)
            ## check presence of TG dinucleotide around the insertion site
            TG = vcountPattern("TG", subseq(dimers_seq[match(insertion_OK_left$sseqid,
                names(dimers_seq))], insertion_OK_left$sstart - 2, insertion_OK_left$sstart +
                2))
            if (any(TG > 0)) {
                writeXStringSet(Lpart[TG > 0], filepath = fin)
                cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "),
                    "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path,
                    "-word_size", 7, "-strand plus -dust no -max_target_seqs 10000 -evalue 100",
                    "-out", fout)

                system(cmd, ignore.stdout = TRUE)
                df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
                # keep sufficiently long hits reaching the tRNA 3' end,
                # one (first) hit per query
                filter1 = df$length >= thr_length
                filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
                df_pass = df[filter1 & filter2, , drop = FALSE]
                df_pass_L = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
                scoreL = get_score(df_pass_L)
                write.table(df_pass_L, file = paste0(output, "_L.csv"), row.names = FALSE,
                    sep = "\t")
            }
        }
    }
    if (any(cond2)) {
        ## search Right (same procedure mirrored to the 3' side)
        insertion_sites = ddply(blastdf[cond2, ], .(sseqid, send, slen), nrow)
        ## check coverage
        insertion_sites$coverage = 0
        for (i in 1:nrow(insertion_sites)) {
            insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$send[[i]]]
        }
        insertion_OK_right <- insertion_sites[with(insertion_sites, V1/coverage >
            insertion_proportion_threshold), ]

        if (nrow(insertion_OK_right) > 0) {
            # clamp the search window to the end of the subject sequence
            s = ifelse(insertion_OK_right$send + max_pbs_search > insertion_OK_right$slen,
                insertion_OK_right$slen, insertion_OK_right$send + max_pbs_search)
            Rpart = subseq(dimers_seq[match(insertion_OK_right$sseqid, names(dimers_seq))],
                insertion_OK_right$send, s)
            names(Rpart) = paste0(names(Rpart), "__", insertion_OK_right$V1, "__",
                insertion_OK_right$send)

            ## check presence of CA dinucleotide around the insertion site
            CA = vcountPattern("CA", subseq(dimers_seq[match(insertion_OK_right$sseqid,
                names(dimers_seq))], insertion_OK_right$send - 2, insertion_OK_right$send +
                2, ))
            if (any(CA > 0)) {
                writeXStringSet(Rpart[CA > 0], filepath = fin)
                cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "),
                    "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path,
                    "-word_size", 7, "-strand minus -dust no -max_target_seqs 10000 -evalue 100",
                    "-out", fout)

                system(cmd, ignore.stdout = TRUE)
                df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
                filter1 = df$length >= thr_length
                filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
                df_pass = df[filter1 & filter2, , drop = FALSE]
                df_pass_R = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
                write.table(df_pass_R, file = paste0(output, "_R.csv"), row.names = FALSE,
                    sep = "\t")
                scoreR = get_score(df_pass_R)
            }
        }
    }
    unlink(fin)
    unlink(fout)
    return(max(scoreL, scoreR))
}
535 | |
536 | |
subject_coverage = function(blastdf) {
    ## Per-subject coverage profiles from a blast hit table: for each subject
    ## id, a numeric vector of alignment depth computed from sstart..send.
    profile_of = function(hits) {
        as.numeric(coverage(IRanges(start = hits$sstart, end = hits$send)))
    }
    coverage_profiles <- by(blastdf, INDICES = blastdf$sseqid, FUN = profile_of)
    return(coverage_profiles)
}
545 | |
get_score = function(x) {
    ## Best tRNA score from a blast hit table: hit scores are summed per
    ## amino-acid type and the largest per-AA total is returned (0 for an
    ## empty table). qseqid is expected as "CONSENSUSID__READID__position"
    ## where the consensus id carries the score as its 4th "_" field;
    ## sseqid ends in "__<AA>-...".
    if (nrow(x) == 0) {
        return(0)
    }
    id_fields = data.frame(do.call(rbind, strsplit(x$qseqid, "__")),
        stringsAsFactors = FALSE)
    ## the 4th "_"-separated token of the consensus id is its score
    hit_score = as.numeric(sapply(strsplit(id_fields[, 1], "_"), "[[", 4))
    ## amino acid: text between the final "__" and the first "-" of sseqid
    amino_acid = gsub("-.+$", "", gsub("^.+__", "", x$sseqid))
    per_aa_total = by(hit_score, INDICES = amino_acid, FUN = sum)
    return(max(per_aa_total))
}
557 | |
558 | |
559 | |
dotter = function(seq1, seq2 = NULL, params = "") {
    ## Launch an interactive dotplot of two sequences with the external
    ## 'dotter' program. seq2 defaults to seq1 (self-comparison).
    ## params: extra command-line options passed through to dotter.
    ## Returns NULL; the plot is shown by the external tool.
    if (is.null(seq2)) {
        seq2 = seq1
    }
    library(Biostrings)
    ## coerce plain inputs to string sets for writeXStringSet()
    ## (fix: class(x) != "DNAStringSet" fails for subclasses / objects with
    ## multiple classes; inherits() is the robust test)
    if (!inherits(seq1, "DNAStringSet")) {
        seq1 = BStringSet(seq1)
    }
    if (!inherits(seq2, "DNAStringSet")) {
        seq2 = BStringSet(seq2)
    }
    sf1 = tempfile("seq1")
    writeXStringSet(seq1, file = sf1)
    sf2 = tempfile("seq2")
    writeXStringSet(seq2, file = sf2)
    ## run asynchronously; give dotter time to read its inputs before the
    ## temp files are removed
    system(paste("dotter", params, sf1, sf2), wait = FALSE)
    Sys.sleep(2)
    unlink(c(sf1, sf2))
    return(NULL)
}
580 | |
581 | |
582 | |
dotter2 = function(seq1, seq2 = NULL, params = NULL) {
    ## Static dotplot of two sequence sets using EMBOSS dotmatcher: sequences
    ## in each set are concatenated, compared forward and reverse-complement,
    ## and the match segments are drawn with green lines marking the original
    ## sequence boundaries.
    ## seq2 defaults to seq1 (self-comparison); params are dotmatcher options.
    ## Invisibly returns list(coord1, coord2) of forward/reverse segments.
    if (is.null(seq2)) {
        seq2 = seq1
    }
    if (is.null(params)) {
        params = " -windowsize 30 -threshold 45 "
    }
    library(Biostrings)
    if (class(seq1) != "DNAStringSet") {
        seq1 = DNAStringSet(seq1)
    }
    if (class(seq2) != "DNAStringSet") {
        seq2 = DNAStringSet(seq2)
    }
    # per-sequence lengths, used later to draw boundary lines and axes
    L1 = nchar(seq1)
    L2 = nchar(seq2)

    # dotmatcher writes its data file into the current directory, so each
    # run gets its own temp directory
    tmpdat1 = tempfile()

    dir.create(tmpdat1)
    tmpdat2 = tempfile()
    dir.create(tmpdat2)
    oridir = getwd()


    seq1_merged = DNAStringSet(paste(seq1, collapse = ""))
    seq2_merged = DNAStringSet(paste(seq2, collapse = ""))
    seq2rc_merged = reverseComplement(DNAStringSet(paste(seq2, collapse = "")))


    sf1 = tempfile("seq1")
    writeXStringSet(seq1_merged, filepath = sf1)
    sf2 = tempfile("seq2")
    sf2rc = tempfile("seq2rc")
    writeXStringSet(seq2_merged, filepath = sf2)
    writeXStringSet(seq2rc_merged, filepath = sf2rc)

    # forward and reverse-complement comparisons
    cmd1 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2, params)
    cmd2 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2rc,
        params)
    setwd(tmpdat1)
    output1 = system(cmd1, intern = TRUE)
    setwd(tmpdat2)
    output2 = system(cmd2, intern = TRUE)
    setwd(oridir)



    # dotmatcher's last output line names the data file it wrote; parse the
    # "Line" records (match segments) that precede the first "Rectangle"
    fout1 = strsplit(tail(output1, n = 1), split = " ")[[1]][2]
    rawdat1 = readLines(paste(tmpdat1, "/", fout1, sep = ""))
    rawdat1 = rawdat1[1:min(grep("^Rectangle", rawdat1))]

    if (length(rawdat1[grep("^Line", rawdat1)]) == 0) {
        coord1 = NULL
    } else {
        # fields 3,5,7,9,11 of each "Line" record are the segment coordinates
        coord1 = apply(sapply(strsplit(rawdat1[grep("^Line", rawdat1)], " "), "[",
            c(3, 5, 7, 9, 11)), 1, as.numeric)
        coord1 = matrix(coord1, ncol = 5)
    }

    fout2 = strsplit(tail(output2, n = 1), split = " ")[[1]][2]
    rawdat2 = readLines(paste(tmpdat2, "/", fout2, sep = ""))
    rawdat2 = rawdat2[1:min(grep("^Rectangle", rawdat2))]

    if (length(rawdat2[grep("^Line", rawdat2)]) == 0) {
        coord2 = NULL
    } else {
        coord2 = apply(sapply(strsplit(rawdat2[grep("^Line", rawdat2)], " "), "[",
            c(3, 5, 7, 9, 11)), 1, as.numeric)
        coord2 = matrix(coord2, ncol = 5)
    }
    unlink(sf1)
    unlink(sf2)
    unlink(sf2rc)
    unlink(tmpdat1, recursive = TRUE)
    unlink(tmpdat2, recursive = TRUE)

    # draw: forward segments as-is, reverse segments flipped vertically
    N1 = sum(nchar(seq1))
    N2 = sum(nchar(seq2))
    op = par(xaxs = "i", yaxs = "i", mar = c(5, 2, 6, 10), las = 1)
    on.exit(par(op))
    plot(c(1, N1), c(1, N2), type = "n", xlab = "", ylab = "", axes = FALSE)
    if (!is.null(coord1)) {
        segments(x0 = coord1[, 1], y0 = coord1[, 2], x1 = coord1[, 3], y1 = coord1[,
            4])
    }
    if (!is.null(coord2)) {
        segments(x0 = coord2[, 1], y0 = N2 - coord2[, 2], x1 = coord2[, 3], y1 = N2 -
            coord2[, 4])
    }
    # green lines mark the boundaries between concatenated input sequences
    abline(v = c(0, cumsum(L1)), col = "green")
    abline(h = c(0, cumsum(L2)), col = "green")
    box()
    axis(1, at = c(1, cumsum(L1))[-(length(L1) + 1)], labels = names(seq1), hadj = 0,
        cex.axis = 0.7)
    axis(4, at = c(1, cumsum(L2))[-(length(L2) + 1)], labels = names(seq2), hadj = 0,
        cex.axis = 0.7)
    invisible(list(coord1, coord2))
}
682 | |
683 | |
CM2sequence = function(x) {
    ## Collapse a consensus matrix (rows = alignment positions, columns =
    ## bases) into a sequence string by taking the per-row argmax base.
    ## A zero-filled "N" column is appended before the argmax is taken.
    base_labels = c(colnames(x), "N")
    augmented = cbind(x, 0)
    colnames(augmented) = base_labels
    top = apply(augmented, 1, which.max)
    paste(base_labels[top], collapse = "")
}
692 | |
## In an alignment, weight each base by the significance (relative
## proportion) of the kmer it came from, yielding a consensus matrix.
calculate_consensus_matrix = function(aln, tr_paths, G) {
    ## aln:      alignment (all sequences the same width)
    ## tr_paths: list of vertex paths - tr_paths[[i]] produced aln[[i]]
    ## G:        kmer graph; V(G)$size gives kmer proportions
    ## Returns a matrix: rows = alignment columns, cols = c(A, C, G, T, -).
    bases = c("A", "C", "G", "T", "-")
    ## positions of non-gap characters in each aligned sequence
    positions = lapply(strsplit(as.character(aln), split = ""), function(x) which(!x %in%
        "-"))
    base_matrix = do.call(rbind, strsplit(as.character(aln), split = ""))
    weights = matrix(0, nrow = length(aln), ncol = nchar(aln))
    ## kmer weight = its size divided by how often it is used across paths
    kmer_rel_proportions = (V(G)$size/table(factor(unlist(tr_paths), levels = 1:vcount(G))))
    for (i in seq_along(positions)) {
        weights[i, positions[[i]]] = kmer_rel_proportions[tr_paths[[i]]]
    }
    names(kmer_rel_proportions) = V(G)$name
    ## get weights for gaps by approximation
    fitgaps = function(y) {
        # fill zero (gap) weights by linear interpolation; the series is
        # replicated 3x so the interpolation wraps around the sequence,
        # and the middle copy is extracted afterwards
        if (sum(y == 0) == 0) {
            return(y)
        } else {
            y0 = rep(y[y != 0], 3)
            x0 = which(rep(y, 3) != 0)
            fitted = approx(x0, y0, xout = seq_along(rep(y, 3)), rule = 1)
        }
        return(fitted$y[-seq_along(y)][seq_along(y)])
    }
    weights_with_gaps = t(apply(weights, 1, fitgaps))
    ## weights_with_gaps = get_gap_weights(weights, aln,G) ## this step takes way too
    ## long!!! - not so important. TODO handle gaps differently - more effectively

    ## consensus matrix: per alignment column, sum weights by base identity
    CM = sapply(1:nchar(aln[[1]]), function(i) {
        sapply(split(weights_with_gaps[, i], factor(base_matrix[, i], levels = bases)),
            sum)
    })
    return(t(CM)[, 1:5])
}
727 | |
728 | |
get_gaps_from_alignment = function(aln) {
    ## Unique start/end positions of gap runs ("-+") across all aligned
    ## sequences; returns a matrix with 'start' and 'end' columns
    ## (stringr::str_locate_all format).
    ## (fix: removed a dead `as.character(aln)` statement whose result was
    ## discarded in the original)
    gaps_positions = unique(do.call(rbind, str_locate_all(as.character(aln), "-+")))
    return(gaps_positions)
}
734 | |
plot_kmer_graph = function(G, L = NULL, vertex.size = NULL, ord = NULL, upto = NULL,
    highlight = NULL) {
    ## Plot a kmer graph with vertices colored by kmers2color().
    ## L:          optional layout matrix (defaults to G$L, then OGDF layout)
    ## vertex.size: defaults to V(G)$size rescaled to [0.5, 6]
    ## ord/upto:   fade out vertices ord[1:upto] by appending alpha "30"
    ## highlight:  vertex indices drawn in semi-transparent black
    if (!is.null(G$L) & is.null(L)) {
        L = G$L
    }
    if (is.null(L)) {
        ## layout.kamada.kawai(G) is a slower built-in alternative
        L = OGDFlayout(G)[[1]]
    }
    clr = kmers2color(V(G)$name)
    if (!is.null(highlight)) {
        clr[highlight] = "#00000080"
    }
    if (!is.null(ord)) {
        ## fix: default 'upto' to all of 'ord' - the original errored on
        ## `1:NULL` when 'ord' was supplied without 'upto'
        if (is.null(upto)) {
            upto = length(ord)
        }
        clr[ord[1:upto]] = paste(clr[ord[1:upto]], "30", sep = "")
    }
    if (is.null(vertex.size)) {
        vertex.size = rescale(V(G)$size, to = c(0.5, 6))

    }
    plot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE,
        vertex.color = clr, vertex.frame.color = "#00000020", edge.width = 2, edge.arrow.mode = 1,
        edge.arrow.size = 0.2)

}
760 | |
rglplot_kmer_graph = function(G, L = NULL, vertex.size = 4) {
    ## Interactive 3D (rgl) rendering of a kmer graph; vertex colors come
    ## from kmers2color(), as in plot_kmer_graph().
    ## L: optional layout matrix (kamada-kawai layout by default).
    if (is.null(L)) {
        L = layout.kamada.kawai(G)
    }

    rglplot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE,
        vertex.color = kmers2color(V(G)$name), edge.width = 2, edge.arrow.mode = 1,
        edge.arrow.size = 0.5)
}
770 | |
771 | |
772 | |
773 | |
mafft = function(seqs, params = "--auto --thread 1 ") {
    ## Align sequences with the external 'mafft' program.
    ## seqs:   DNAStringSet or character vector of sequences; returned
    ##         unchanged when there is nothing to align (fewer than 2)
    ## params: extra mafft command-line options
    ## Returns the alignment as a DNAStringSet.
    if (length(seqs) < 2) {
        return(seqs)
    }
    infile = tempfile()
    ## accept plain character vectors
    ## (fix: is.character() replaces class(seqs) == "character", which fails
    ## for objects carrying additional classes)
    if (is.character(seqs)) {
        seqs = DNAStringSet(seqs)
    }
    writeXStringSet(seqs, file = infile)
    outfile = tempfile()
    cmd = paste("mafft --quiet --nuc ", params, infile, "2> /dev/null > ", outfile)
    system(cmd, intern = TRUE, ignore.stderr = FALSE)
    aln = readDNAStringSet(outfile)
    unlink(c(outfile, infile))
    return(aln)
}
790 | |
mgblast = function(databaseSeq, querySeq) {
    ## All-against-all megablast via the external 'mgblast' binary.
    ## databaseSeq / querySeq: either a file path (character) or an
    ## XStringSet (written to a temp fasta first).
    ## Returns the parsed tabular blast output as a data.frame.
    params = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -C 30 -D 30 "
    # no dust filtering:
    paramsDF = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -F F"

    database = tempfile()
    query = tempfile()
    output = tempfile()
    # character input = an existing fasta file; do not delete it afterwards
    if (class(databaseSeq) == "character") {
        database = databaseSeq
        do_not_delete_database = TRUE
    } else {
        writeXStringSet(databaseSeq, filepath = database, format = "fasta")
        do_not_delete_database = FALSE
    }
    if (class(querySeq) == "character") {
        query = querySeq
        do_not_delete_query = TRUE
    } else {
        writeXStringSet(querySeq, filepath = query, format = "fasta")
        do_not_delete_query = FALSE
    }
    ## create database:
    cmd = paste("formatdb -i", database, "-p F")
    system(cmd)
    cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, params)
    system(cmd)
    if (file.info(output)$size == 0) {
        # no hits - retry without the dust masker
        cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, paramsDF)
        system(cmd)
    }
    blastOut = read.table(output, sep = "\t", header = FALSE, as.is = TRUE, comment.char = "")
    unlink(output)
    if (!do_not_delete_query) {
        unlink(query)
    }
    if (!do_not_delete_database) {
        # also remove the formatdb index files (database.*)
        unlink(paste(database, "*", sep = ""))
    }
    colnames(blastOut) = c("query", "q.length", "q.start", "q.end", "subject", "s.length",
        "s.start", "s.end", "pid", "weight", "e.value", "strand")
    blastOut
}
835 | |
836 | |
estimate_sample_size = function(NV, NE, maxv, maxe) {
    ## Given a graph with NV vertices and NE edges, return how many vertices
    ## to sample so that - assuming edge density is preserved under sampling -
    ## the sampled graph respects both limits: maxv vertices and maxe edges.
    dens = (2 * NE)/(NV * (NV - 1))
    ## expected number of edges when sampling exactly maxv vertices
    eEst = (maxv * (maxv - 1) * dens)/2
    ## vertex count at which the expected edge count reaches maxe
    ## (positive root of n * (n - 1) * dens / 2 = maxe)
    nEst = (dens + sqrt(dens^2 + 8 * dens * maxe))/(2 * dens)
    if (eEst >= maxe) {
        ## the edge limit binds first - sample down to nEst vertices
        N = round(nEst)
    }
    if (nEst >= maxv) {
        ## the vertex limit binds first - sample down to maxv vertices
        N = maxv
    }
    return(N)
}
854 | |
855 | |
856 | |
857 | |
## Build an undirected similarity graph from an all-to-all mgblast search,
## reorient the reads so that overlapping reads share the same strand, and
## write out the graph, its layout and the oriented sequences.
##
## blastfile  - path to mgblast tabular output, or an already-parsed data.frame
## seqfile    - fasta file with the reads used in the blast search
## graph_destination / directed_graph_destination - .RData output paths
## oriented_sequences - fasta output path for the reoriented reads
## repex      - TRUE if the blast table uses the alternative (repex) column order
## maxv, maxe - sampling limits on graph size (see estimate_sample_size)
## Returns a python-dictionary string (list2dictionary) with graph statistics:
## escore, coverage, loop_index, pair_completeness, satellite probability, ...
mgblast2graph = function(blastfile, seqfile, graph_destination, directed_graph_destination,
    oriented_sequences, paired = TRUE, repex = FALSE, image_file = NULL, image_file_tmb = NULL,
    include_layout = TRUE, pair_completeness = NULL, satellite_model_path = NULL,
    maxv = 40000, maxe = 5e+08, seqfile_full = seqfile) {
    cat("loading blast results\n")
    ## column names / classes differ between the two supported table layouts
    if (repex) {
        cln = c("query", "subject", "weight", "q.length", "q.start", "q.end", "s.length", "s.start",
            "s.end", "sign")
        colcls = c("character", "character", "numeric", "numeric", "numeric", "numeric",
            "numeric", "numeric", "numeric", "NULL", "NULL", "character")

    } else {
        cln = c("query", "q.length", "q.start", "q.end", "subject", "s.length", "s.start",
            "s.end","weight", "sign")
        colcls = c("character", "numeric", "numeric", "numeric", "character", "numeric",
            "numeric", "numeric", "NULL", "numeric", "NULL", "character")
    }
    if (class(blastfile) == "data.frame") {
        blastTable = blastfile
        colnames(blastTable)[12] = "sign"
    } else {
        blastTable = read.table(blastfile, sep = "\t", as.is = TRUE, header = FALSE,
            colClasses = colcls, comment.char = "")
        colnames(blastTable) = cln
    }
    ## drop duplicated hits - an unordered pair key makes A-B and B-A collide
    key = with(blastTable, ifelse(query > subject, paste(query, subject), paste(subject,
        query)))
    if (any(duplicated(key))) {
        blastTable = blastTable[!duplicated(key), ]
    }
    seqs = readDNAStringSet(seqfile)
    ## calculate pair completeness for reads BEFORE sampling; read names are
    ## assumed to encode the mate by their last character (stripped by gsub)
    if (is.null(pair_completeness)) {
        if (paired) {
            if (seqfile_full != seqfile) {
                seqs_full = readDNAStringSet(seqfile_full)
                pair_counts = tabulate(table(gsub(".$", "", names(seqs_full))))
                rm(seqs_full)
            } else {
                pair_counts = tabulate(table(gsub(".$", "", names(seqs))))

            }
            ## proportion of templates with more than one read present
            pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
        }else{
            pair_completeness = 0
        }
    }
    NV = length(seqs) # vertices
    NE = nrow(blastTable) # edges
    if (maxv < NV | maxe < NE) {
        ## sample vertices if the graph exceeds the size limits
        V_sample_size = estimate_sample_size(NV, NE, maxv, maxe)
        seqs = sample(seqs, V_sample_size)
        blastTable = blastTable[blastTable$query %in% names(seqs) & blastTable$subject %in%
            names(seqs), ]
    }

    ## strand sign as +1/-1 for later orientation bookkeeping
    blastTable$sign = ifelse(blastTable$sign == "+", 1, -1)
    vnames = unique(c(blastTable$query, blastTable$subject))
    vindex = seq_along(vnames)
    names(vindex) = vnames
    ## correct orientation
    cat("creating graph\n")
    G = graph.data.frame(blastTable[, c("query", "subject", "weight", "sign")], directed = FALSE,
        vertices = vnames)
    if (include_layout) {
        ## spill blastTable to disk to free RAM during the layout computation;
        ## it is reloaded via load(tmpB) below
        tmpB = tempfile()
        save(blastTable, file = tmpB)
        rm(blastTable)
        ##
        cat("graph layout calculation ")
        if (ecount(G) > 2e+06) {
            ## large graph - skip the more expensive OGDF step
            cat("using fruchterman reingold\n")
            L = layout.fruchterman.reingold(G, dim = 3)
        } else {
            cat("using OGDF & frucherman reingold\n")
            Ltmp = OGDFlayout(G, alg = c("fmmm"))
            L = cbind(Ltmp[[1]][, 1:2], layout.fruchterman.reingold(G, dim = 2))

        }

        GL = list(G = G, L = L)
        save(GL, file = graph_destination)
        if (!is.null(image_file)) {
            cat("exporting graph figure")
            png(image_file, width = 900, height = 900, pointsize = 20)
            plot(GL$G, layout = GL$L[, 1:2], vertex.size = 2, vertex.color = "#000000A0",
                edge.color = "#00000040", vertex.shape = "circle", edge.curved = FALSE,
                vertex.label = NA, vertex.frame.color = NA, edge.width = 1)
            dev.off()
            ## create thumbnail via ImageMagick:
            system(paste("convert ", image_file, " -resize 100x100 ", image_file_tmb))

        }
        rm(GL)
        gc()
        load(tmpB)
        unlink(tmpB)
    }

    ## keep only the largest weak component if the graph is disconnected
    if (!is.connected(G)) {
        cat("Warning - graph is not connected\n")
        cc = clusters(G, "weak")
        ## relabel components so that component 1 is the largest
        mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)),
            decreasing = TRUE)))))
        selvi = which(mb == 1) # largest component
        cat("using largest component: \nsize =", round(length(selvi)/vcount(G) *
            100, 1), "% ;", length(selvi), "reads", "\n")
        G = induced.subgraph(G, vids = selvi)
        blastTable = blastTable[blastTable$query %in% V(G)$name & blastTable$subject %in%
            V(G)$name, ]
    }

    Gmst = minimum.spanning.tree(G)
    ## create alternative spanning trees with randomized edge weights - used
    ## in case the first tree yields only a suboptimal orientation
    set.seed(123)
    Gmst_alt = list()
    Gmst_alt[[1]] = Gmst
    for (i in 2:6){
        E(G)$weight = runif(ecount(G), 0.1,1)
        Gmst_alt[[i]] = minimum.spanning.tree(G)
    }

    rm(G)
    gc()
    ## up to six attempts to reorient reads, one per spanning tree
    flip_names_all = list()
    prop_of_notfit=numeric()
    thr_fit=0.001
    for (ii in 1:6){
        Gmst = Gmst_alt[[ii]]


        blastTable_mst = as.data.frame(get.edgelist(Gmst, names = TRUE))
        colnames(blastTable_mst) = c("query", "subject")
        blastTable_mst$sign = E(Gmst)$sign

        ## walk the tree depth-first; whenever the edge to a vertex's father
        ## has negative sign, flip that vertex (rc[v] = TRUE) and negate the
        ## signs of all its incident tree edges
        d = dfs(Gmst, root = 1, father = TRUE, order.out = TRUE)
        esign = E(Gmst)$sign
        rc = rep(FALSE, vcount(Gmst))
        j = 0
        p = 0
        for (v in d$order[-1]) {
            j = j + 1
            f = as.numeric(d$father)[v]
            if (is.na(f)) {
                next
            }
            eid = get.edge.ids(Gmst, c(v, f))
            if (esign[eid] == -1) {
                ie = unique(c(which(blastTable_mst$query %in% V(Gmst)$name[v]), which(blastTable_mst$subject %in%
                    V(Gmst)$name[v]))) # incident edges
                esign[ie] = esign[ie] * -1
                rc[v] = TRUE
                p = p + 1
                ## progress report every 50 flips
                if (p > 50) {
                    p = 0
                    cat("\r")
                    cat(j, " : ", sum(esign)/length(esign))
                }
            }
        }
        cat("\t")
        flip_names_all[[ii]] = flip_names = V(Gmst)$name[rc]

        ## blastTable may still be on disk from the layout step above
        if (!exists("blastTable")) {
            load(tmpB)
            unlink(tmpB)
        }
        ## evaluate the orientation on ALL blast hits: after flipping, every
        ## hit should have positive sign; reads involved in hits that still
        ## disagree are collected in 'nofit'
        s.mean = with(blastTable, (s.start + s.end)/2)
        s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length -
            s.mean, s.mean)
        q.mean = with(blastTable, (q.start + q.end)/2)
        q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length -
            q.mean, q.mean)
        V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
        V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
        sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1,
            1) * blastTable$sign
        nofit = unique(c(V1[sign_final == "-1"], V2[sign_final == "-1"]))
        prop_of_notfit[ii] = length(nofit)/vcount(Gmst)
        ## stop early if the orientation is good enough
        cat("prop notfit", prop_of_notfit[[ii]],"\n")
        if (prop_of_notfit[[ii]]<thr_fit){
            ## OK
            break
        }
    }
    if (!prop_of_notfit[[ii]]<thr_fit){
        ## none of the attempts passed the threshold - recompute with the
        ## best (lowest prop_of_notfit) solution found
        ii_best = which.min(prop_of_notfit)
        if (ii != ii_best){ # if the last solution was not best, get the best
            flip_names = flip_names_all[[ii_best]]
            s.mean = with(blastTable, (s.start + s.end)/2)
            s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length -
                s.mean, s.mean)
            q.mean = with(blastTable, (q.start + q.end)/2)
            q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length -
                q.mean, q.mean)
            V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
            V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
            sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1,
                1) * blastTable$sign
            nofit = unique(c(V1[sign_final == "-1"], V2[sign_final == "-1"]))

        }
    }

    ## build the final directed graph, excluding all nofit reads

    df2 = data.frame(V1, V2, sign = blastTable$sign, sign_final = sign_final, stringsAsFactors = FALSE)
    rm(blastTable)
    gc()
    vertices = data.frame(name = vnames, reverse_complement = vnames %in% flip_names)
    G = graph.data.frame(df2, vertices = vertices)
    vcount_ori = vcount(G)
    G = induced.subgraph(G, vids = which(!V(G)$name %in% nofit))

    ## orientation quality scores: fraction of consistent edges on the MST
    ## and on the full graph
    G$escore_mst = sum(esign)/length(esign)
    G$escore = sum(sign_final == 1)/length(sign_final)
    cc = clusters(G, "strong")
    mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)),
        decreasing = TRUE)))))
    names(mb) = V(G)$name
    ## loop index = relative size of the largest strongly connected component
    G$loop_index = max(cc$csize)/vcount(G)
    G$coverage = vcount(G)/vcount_ori
    ## check sign in all edges
    V(G)$membership = mb
    save(G, file = directed_graph_destination)
    ## drop nofit reads, reverse-complement the flipped ones, and append the
    ## component membership to each read name
    seqs = seqs[!names(seqs) %in% nofit]
    seqs[names(seqs) %in% flip_names] = reverseComplement(seqs[names(seqs) %in% flip_names])
    names(seqs) = paste(names(seqs), mb[names(seqs)])
    writeXStringSet(seqs, filepath = oriented_sequences)
    ## calculate satellite probability from the pretrained model, if given
    if (is.null(satellite_model_path)) {
        pSAT = isSAT = NULL
    } else {
        satellite_model = readRDS(satellite_model_path)
        pSAT = get_prob(G$loop_index, pair_completeness, model = satellite_model)
        isSAT = isSatellite(G$loop_index, pair_completeness, model = satellite_model)
    }

    ## assemble the summary (note: 'escore_mts' key name is a historical typo
    ## kept for compatibility with downstream consumers)
    output = list(escore = G$escore, coverage = G$coverage, escore_mts = G$escore_mst,
        loop_index = G$loop_index, pair_completeness = pair_completeness, graph_file = graph_destination,
        oriented_sequences = oriented_sequences, vcount = vcount(G), ecount = ecount(G),
        satellite_probability = pSAT, satellite = isSAT)
    ## clean up everything except the return value to release memory
    all_objects = ls()
    do_not_remove = "output"
    rm(list = all_objects[!(all_objects %in% do_not_remove)])
    gc(verbose = FALSE, reset = TRUE)
    return(list2dictionary(output))
}
1116 | |
list2dictionary = function(l){
  ## Serialize a named R list into a python-style dictionary literal string.
  ## Character (and NULL) values are wrapped in triple quotes, everything
  ## else (numeric, logical) is emitted bare via paste0 coercion. The
  ## trailing ", " before the closing brace is kept intentionally - the
  ## downstream python consumer tolerates it.
  dict = "{"
  q='"'
  ## seq_along() is safe for empty lists; the original 1:length(l) evaluated
  ## to c(1, 0) and crashed on l[[1]]
  for (i in seq_along(l)){
    if (is.character(l[[i]]) || is.null(l[[i]])){
      ## NULL values paste0 to nothing, producing an empty '''''' string
      q2 = "'''"
    }else{
      q2 = ''
    }
    dict = paste0(
      dict,
      q,names(l)[i],q,":",q2, l[[i]], q2,", "
    )
  }
  dict = paste0(dict, "}")
  return(dict)
}
1134 | |
wrap = Vectorize(function(s, width = 80) {
    ## Hard-wrap the string s by inserting a newline after every `width`
    ## characters. Vectorize() lifts this to character vectors; note that
    ## mapply()'s USE.NAMES then names the result by the input strings.
    starts = seq(1, nchar(s), by = width)
    ends = starts + width - 1
    paste(substring(s, starts, ends), collapse = "\n")
})
1140 | |
1141 | |
## TAREAN main entry point: tandem repeat analysis of a read cluster.
## Copies/samples the input reads, optionally reorients them via an
## all-to-all mgblast graph, counts k-mers of several lengths, reconstructs
## monomer consensus variants from k-mer graphs and writes an HTML report.
##
## input_sequences  - fasta file with cluster reads
## output_dir       - created; all outputs (report, fasta, RData) go here
## min/max_kmer_length - k-mer sizes tried, stepping by 4
## CPU              - cores for mclapply
## sample_size      - subsample reads to this count (0 = no sampling)
## reorient_reads   - run the mgblast/mgblast2graph orientation step
## tRNA_database_path - if given, run PBS detection against it
## lock_file        - cooperative RAM lock (see waitForRAM/makelock)
## Returns a python-dictionary string (list2dictionary) with report path,
## TR score, monomer length/consensus, pbs_score, graph_info, orf length.
## NOTE(review): relies on globals defined elsewhere in the package
## (script.dir, htmlheader, get_mimimal_cc, estimate_monomer, detect_pbs,
## plot_kmer_graph, plot_multiline_logo) - confirm they are loaded first.
tarean = function(input_sequences, output_dir, min_kmer_length = 11, max_kmer_length = 27,
    CPU = 2, sample_size = 10000, reorient_reads = TRUE, tRNA_database_path = NULL,
    include_layout = TRUE, paired = TRUE, lock_file=NULL) {
    options(CPU = CPU)
    time0 = Sys.time()
    dir.create(output_dir)
    ## work on a copy so the original input is never modified by sampling
    input_sequences_copy = paste(output_dir, "/", basename(input_sequences), sep = "")

    if (!file.copy(input_sequences, input_sequences_copy, overwrite = TRUE)) {
        cat(paste("cannot copy", input_sequences, " to", output_dir), "\n")
        stop()
    }

    ## block until enough RAM is available (may acquire a lock file)
    lock_file = waitForRAM(lock_file = lock_file)
    pair_completeness = NULL
    ## sampling
    if (sample_size != 0) {
        s = readDNAStringSet(input_sequences_copy)
        N = length(s)
        ## pair completeness must be calculated before sampling!
        if (N > sample_size) {
            writeXStringSet(sample(s, sample_size), filepath = input_sequences_copy)
            if (paired) {
                ## mate id assumed to be the last character of the read name
                pair_counts = tabulate(table(gsub(".$", "", names(s))))
                pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
            }
        }
        rm(s)
    }

    if (reorient_reads) {
        input_sequences_oriented = paste0(input_sequences_copy, "oriented.fasta")
        graph_file = paste0(input_sequences_copy, "_graph.RData")
        GLfile = paste0(input_sequences_copy, "_graph.GL")
        cat("reorienting sequences\n")

        ## all-to-all comparison of the cluster reads against themselves
        blastTable = mgblast(input_sequences_copy, input_sequences_copy)

        graph_info = mgblast2graph(blastTable, input_sequences_copy, GLfile, graph_file,
            input_sequences_oriented, include_layout = include_layout, paired = paired,
            pair_completeness = pair_completeness)

        ## interrupt if it does not look like a tandem repeat at all (soft threshold)
        if (is.null(graph_info$pair_completeness)) {
            if (graph_info$loop_index <= 0.4) {
                cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                return(list2dictionary(list(graph_info = graph_info)))
            }
        } else {
            ## for paired reads also require sufficient pair completeness:
            if (graph_info$loop_index < 0.7 | graph_info$pair_completeness < 0.4) {
                cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                return(list2dictionary(list(graph_info = graph_info)))
            }
        }

    } else {
        input_sequences_oriented = input_sequences_copy
        graph_info = NULL
    }



    ## count k-mers for each tried k-mer length using the external python
    ## helper (faster than doing it in R); helper prints its output file path
    kmer_counts = list()
    kmer_lengths = seq(min_kmer_length, max_kmer_length, by = 4)
    for (i in kmer_lengths) {
        cmd = paste(script.dir, "/kmer_counting.py ", input_sequences_oriented, " ",
            i, sep = "")
        cat("calculation kmers of length ", i, "\n")
        f = system(cmd, intern = TRUE)
        x = read.table(f, as.is = TRUE, sep = "\t")
        kmer_counts[[as.character(i)]] = data.frame(x, freq = x$V2/sum(x$V2))
    }

    ## number of k-mers covering 50% / 70% of the total k-mer frequency
    ## (NOTE(review): computed but not used further in this function)
    N50 = sapply(kmer_counts, function(x) {
        sum(cumsum(x$freq) < 0.5)
    })

    N70 = sapply(kmer_counts, function(x) {
        sum(cumsum(x$freq) < 0.7)
    })


    time1 = Sys.time()
    ## build the minimal connected k-mer graphs in parallel
    ggmin = mclapply(kmer_counts, get_mimimal_cc, start = 0.5, mc.cores = CPU)
    time2 = Sys.time()
    cat("kmers graphs created ")
    print(time2 - time1)
    names(ggmin) = names(kmer_counts)


    ## estimate monomer consensus variants from each k-mer graph
    monomers = mclapply(ggmin, estimate_monomer, mc.cores = CPU)

    names(monomers) = names(kmer_counts)
    monomers = monomers[!sapply(monomers, is.null)]
    ## error handling: drop failed estimates, stop only if all failed
    error = sapply(monomers, class) == "try-error"
    if (any(error)) {
        cat("\nError in monomers estimation: ")
        cat("calculation failed for monomers length ", names(monomers)[error], "\n")
        print(monomers[error])
        if (any(!error)) {
            monomers = monomers[!error]
        } else {
            stop("monomer estimation failed")
        }
    }

    ## collect one summary row per (k-mer length, variant) combination
    total_score = list()
    k = 0
    length_best = numeric()
    score_bn = numeric()
    consensus = character()

    for (i in names(monomers)) {
        for (v in seq_along(monomers[[i]]$estimates)) {
            k = k + 1
            total_score[[k]] = c(kmer = as.numeric(i), variant = v, total_score = monomers[[i]]$estimates[[v]]$total_score)
            score_bn[k] = min(rowSums(monomers[[i]]$estimates[[v]]$CM))
            length_best[k] = monomers[[i]]$estimates[[v]]$length_variant_score[1,
                1]
            consensus[[k]] = monomers[[i]]$estimates[[v]]$consensus2
        }
    }
    summary_table = as.data.frame(do.call(rbind, total_score))
    summary_table$monomer_length = length_best
    summary_table$consensus_length = nchar(consensus)
    summary_table$score_bn = score_bn
    summary_table$consensus = paste("<pre>", wrap(consensus, width = 80), "<pre>",
        sep = "")
    consensus_DS = DNAStringSet(consensus)
    names(consensus_DS) = with(summary_table, paste0(kmer, "_", variant, "_sc_",
        signif(total_score), "_l_", monomer_length))

    ## reorder variants by total score, best first
    consensus_DS = consensus_DS[order(summary_table$total_score, decreasing = TRUE)]
    summary_table = summary_table[order(summary_table$total_score, decreasing = TRUE),
        ]
    rownames(summary_table) = NULL
    N = nrow(summary_table)
    ## concatenate consensus (i.e. make head-to-tail dimer) for PBS detection,
    ## and build 'pseudo contig' multimers (min length ~200 bp) for mapping
    consensus_DS_dimer = DNAStringSet(paste0(consensus_DS, consensus_DS))
    tarean_contigs = DNAStringSet(sapply(consensus_DS,function(x)
        ifelse(nchar(x)<200,
               paste(rep(as.character(x),round(300/nchar(as.character(x))+1)),collapse=''),
               as.character(x)))
        )

    names(consensus_DS_dimer) = names(consensus_DS)
    ## save sequences:
    consensus_DS_dimer_file = paste0(output_dir, "/consensus_dimer.fasta")
    consensus_DS_file = paste0(output_dir, "/consensus.fasta")
    tarean_contig_file = paste0(output_dir, "/tarean_contigs.fasta")
    writeXStringSet(consensus_DS, consensus_DS_file)
    writeXStringSet(tarean_contigs, tarean_contig_file)
    writeXStringSet(consensus_DS_dimer, consensus_DS_dimer_file)
    ## primer binding site detection against the tRNA database, if provided
    if (is.null(tRNA_database_path)) {
        pbs_score = -1
    } else {
        pbs_blast_table = paste0(output_dir, "/pbs_detection")
        pbs_score = detect_pbs(dimers_file = consensus_DS_dimer_file, tRNA_database_path = tRNA_database_path,
            reads = input_sequences_oriented, output = pbs_blast_table)
    }
    ## open reading frame search - get the length of the longest ORF
    orf_l = max_ORF_length(consensus_DS_dimer)


    dir.create(paste0(output_dir, "/img"), recursive = TRUE)
    summary_table$monomer_length_graph = numeric(N)
    summary_table$graph_image = character(N)
    summary_table$monomer_length_logo = numeric(N)
    summary_table$logo_image = character(N)
    ## export graph and consensus estimates to the cluster directory;
    ## type of output may change in the future
    save(ggmin, file = paste0(output_dir, "/ggmin.RData"))
    save(monomers, file = paste0(output_dir, "/monomers.RData"))

    cat("generating HTML output\n")
    for (i in 1:N) {
        kmer = as.character(summary_table$kmer[i])
        variant = summary_table$variant[i]
        ## export graph image
        fout_link = paste0("img/graph_", kmer, "mer_", variant, ".png")
        fout = paste0(output_dir, "/", fout_link)
        ## summary_table$monomer_length_graph[i] = summary_table$monomer_length[i]
        ## summary_table$monomer_length_logo[[i]] = nrow(monomers[[kmer]]$estimates[[variant]]$CM)
        summary_table$monomer_length[[i]] = length(monomers[[kmer]]$estimates[[variant]]$consensus)

        ## images only for the 10 best-scoring variants
        if (i <= 10) {
            png(fout, width = 800, height = 800)
            plot_kmer_graph(ggmin[[kmer]], highlight = unlist(monomers[[kmer]]$paths[[variant]]$tr_paths))
            dev.off()
            summary_table$graph_image[i] = hwriteImage(fout_link, link = fout_link,
                table = FALSE, width = 100, height = 100)
            ## export sequence logo
            png_link = paste0("img/logo_", kmer, "mer_", variant, ".png")
            fout = paste0(output_dir, "/", png_link)
            png(fout, width = 1200, height = round(summary_table$monomer_length[i] *
                1) + 550)
            try(plot_multiline_logo(monomers[[kmer]]$estimates[[variant]]$CM, W = 100))
            dev.off()
            ## export corresponding position probability matrices
            ppm_file = paste0(output_dir, '/ppm_', kmer, "mer_", variant, ".csv")
            ppm_link = paste0('ppm_', kmer, "mer_", variant, ".csv")
            write.table(monomers[[kmer]]$estimates[[variant]]$CM,
                        file = ppm_file,
                        col.names = TRUE, quote = FALSE,
                        row.names = FALSE, sep="\t")
            summary_table$logo_image[i] = hwriteImage(png_link, link = ppm_link,
                table = FALSE, width = 200, height = 100)
        }

    }

    ## write the sortable HTML report (R2HTML; htmlheader is a package global)

    htmlfile = paste0(output_dir, "/report.html")
    cat(htmlheader, file = htmlfile)
    included_columns = c('kmer', 'variant', 'total_score', 'consensus_length','consensus', 'graph_image', 'logo_image')
    summary_table_clean = summary_table[,included_columns]
    colnames(summary_table_clean) = c('k-mer length',
                                      'Variant index',
                                      'k-mer coverage score',
                                      'Consensus length',
                                      'Consensus sequence',
                                      'k-mer based graph',
                                      'Sequence logo')
    HTML(summary_table_clean, file = htmlfile, sortableDF = TRUE)
    HTMLEndFile(file = htmlfile)
    time4 = Sys.time()
    print(time4 - time0)
    ## release the RAM lock if one was acquired by waitForRAM above
    if (!is.null(lock_file)){
        print("------removing-lock--------")
        removelock(lock_file)
    }

    print(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
               TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
               TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
               orf_l = orf_l, tarean_contig_file = tarean_contig_file))
    return(list2dictionary(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
        TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
        TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
        orf_l = orf_l, tarean_contig_file = tarean_contig_file)))
}
1396 | |
1397 | |
1398 ## graph loop index stability | |
1399 loop_index_instability = function(G) { | |
1400 N = 50 | |
1401 s = seq(vcount(G), vcount(G)/10, length.out = N) | |
1402 p = seq(1, 0.1, length.out = N) | |
1403 li = numeric() | |
1404 for (i in seq_along(s)) { | |
1405 print(i) | |
1406 gs = induced_subgraph(G, sample(1:vcount(G), s[i])) | |
1407 li[i] = max(clusters(gs, "strong")$csize)/vcount(gs) | |
1408 } | |
1409 instability = lm(li ~ p)$coefficient[2] | |
1410 return(instability) | |
1411 } | |
1412 | |
isSatellite = function(x, y, model) {
    ## Classify a cluster using the satellite model: returns the label
    ## "Putative Satellite" when the model probability at (x, y) exceeds
    ## model$cutoff, and an empty string otherwise.
    if (get_prob(x, y, model) > model$cutoff) {
        "Putative Satellite"
    } else {
        ""
    }
}
1421 | |
get_prob = function(x, y, model) {
    ## Look up the model probability for coordinates (x, y), both expected
    ## in [0, 1], by mapping them onto the rows/columns of the (square)
    ## matrix model$prob_matrix: x selects the row, y the column.
    pm = model$prob_matrix
    n_bins = ncol(pm)
    row_i = round(x * (n_bins - 1)) + 1
    col_j = round(y * (n_bins - 1)) + 1
    pm[row_i, col_j]
}
1430 | |
1431 | |
detectMemUsage = function() {
    ## Return the fraction of physical RAM currently in use (Linux only).
    ## Parses /proc/meminfo, treating page-cache ("Cached") memory as free.
    meminfo_text = gsub(" +", " ", readLines("/proc/meminfo"))
    con = textConnection(meminfo_text)
    memInfo = read.table(con, fill = TRUE, row.names = 1)
    close(con)
    available = memInfo["MemFree", 1] + memInfo["Cached", 1]
    1 - available/memInfo["MemTotal", 1]
}
1440 | |
1441 | |
makelock <- function(lockfile, lockmsg, CreateDirectories = TRUE) {
  ## Try to acquire a cooperative file lock. Returns TRUE when this process
  ## created the lock file and its first line matches our message, FALSE
  ## when the lock already exists or another process won the race.
  lockdir <- dirname(lockfile)
  if (!file.exists(lockdir)) {
    if (CreateDirectories) {
      dir.create(lockdir, recursive = TRUE)
    } else {
      stop("Lock Directory for lockfile ", lockfile, " does not exist")
    }
  }
  ## default message identifies the host and R session tmpdir
  if (missing(lockmsg)) {
    lockmsg <- paste(system('hostname', intern = TRUE),
                     Sys.getenv("R_SESSION_TMPDIR"))
  }
  if (file.exists(lockfile)) {
    return(FALSE)
  }
  ## a single cat() call keeps the message writing atomic
  cat(paste(lockmsg, "\n", sep = ""), file = lockfile, append = TRUE, sep = "")
  firstline <- readLines(lockfile, n = 1)
  if (firstline != lockmsg) {
    ## somebody else got there first
    return(FALSE)
  }
  TRUE
}
1458 | |
1459 | |
removelock <- function(lockfile) {
  ## Delete a lock file created by makelock(). Returns TRUE on success,
  ## FALSE (with a warning) when unlink() reports failure.
  if (unlink(lockfile) != 0) {
    warning("Unable to remove ", lockfile)
    return(FALSE)
  }
  TRUE
}
1467 | |
1468 | |
## Block until system memory usage drops below fraction p, or until a
## cooperative lock is acquired. Returns NULL when memory is available
## (no lock held) or lock_file when the lock was acquired; the caller is
## then responsible for calling removelock().
## NOTE(review): if lock_file is NULL while memory is constrained,
## makelock(NULL) will fail inside dirname() - confirm callers always pass
## a path in that situation.
waitForRAM = function(p = 0.5,lock_file=NULL) {
    if (detectMemUsage() < p) {
        return(NULL)
        ## check lock file:
    } else {
        cat("waiting for RAM \n")
        free_count = 0
        while (TRUE) {
            ## try to take the lock - holding it entitles us to proceed
            if (makelock(lock_file)){
                print("---------locking--------")
                return(lock_file)
            }
            if (detectMemUsage() < p) {
                cat("RAM freed \n")
                return(NULL)
            }
            Sys.sleep(5)
            ## count consecutive polls during which this user's CPU is idle
            if (evaluate_user_cpu_usage() == 'free'){
                free_count = free_count + 1
            }else{
                free_count = 0
            }
            ## escape hatch: proceed anyway after ~100 idle polls if memory
            ## usage is at least below 80%
            if (detectMemUsage() < 0.8 & free_count > 100){
                cat("RAM not free but nothing else is running \n")
                return(NULL)
            }
        }
    }
}
1498 | |
## Debug helper: walk the environment chain from the global environment to
## the empty environment and report the objects in each, ordered by size.
## Returns list(objects = ..., total_size = per-environment byte totals).
## NOTE(review): two quirks are preserved as-is - the `break` stops the scan
## at the FIRST empty environment (later ones are never inspected), and
## append() on a data.frame flattens it into its columns inside out_all.
lsmem = function() {
    g = globalenv()
    out_all = envs = list()
    envs = append(envs, g)
    total_size = numeric()
    ## collect the chain of parent environments up to R_EmptyEnv
    while (environmentName(g) != "R_EmptyEnv") {
        g <- parent.env(g)
        envs = append(envs, g)
    }
    for (e in envs) {

        obj = ls(envir = e)
        if (length(obj) == 0) {
            break
        }
        ## measure each object in this environment
        obj.size = list()
        for (i in obj) {
            obj.size[[i]] = object.size(get(i, envir = e))
        }
        out = data.frame(object = obj, size = unlist(obj.size), stringsAsFactors = FALSE)
        out = out[order(out$size, decreasing = TRUE), ]
        out_all = append(out_all, out)
        total_size = append(total_size, sum(out$size))
    }
    return(list(objects = out_all, total_size = total_size))
}
1525 | |
evaluate_user_cpu_usage = function(){
    ## Classify the current user's CPU load from `ps` output. Returns 'free'
    ## when total %CPU is below 5, no process is in uninterruptible sleep
    ## ('D') and fewer than two processes are running ('R'); 'full' otherwise.
    user = Sys.info()["user"]
    total_cpu = sum(as.numeric(system(paste("ps -e -o %cpu -u", user), intern = TRUE)[-1]))
    ## first character of each STAT column entry encodes the process state
    state_flags = substring(system(paste("ps -e -o stat -u", user), intern = TRUE)[-1], 1, 1)
    if (total_cpu < 5 && sum(state_flags %in% 'D') == 0 && sum(state_flags %in% 'R') < 2) {
        return('free')
    }
    return('full')
}