Mercurial > repos > petr-novak > dante_ltr
comparison R/ltr_utils.R @ 0:7b0bbe7477c4 draft
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
author | petr-novak |
---|---|
date | Tue, 08 Mar 2022 13:24:33 +0000 |
parents | |
children | f131886ea194 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7b0bbe7477c4 |
---|---|
1 add_coordinates_of_closest_neighbor <- function(gff) { | |
2 gff <- gff[order(seqnames(gff), start(gff))] | |
3 # split to chromosomes: | |
4 gff_parts <- split(gff, seqnames(gff)) | |
5 upstreams <- c(sapply(gff_parts, function(x) c(1, head(end(x), -1)))) | |
6 downstreams <- c(sapply(gff_parts, function(x) c(start(x)[-1], seqlengths(x)[runValue(seqnames(x[1]))]))) | |
7 gff_updated <- unlist(gff_parts) | |
8 gff_updated$upstream_domain <- unlist(upstreams) | |
9 gff_updated$downstream_domain <- unlist(downstreams) | |
10 names(gff_updated) <- NULL | |
11 return(gff_updated) | |
12 } | |
13 | |
14 get_domain_clusters <- function(gff) { | |
15 # get consecutive domains from same linage | |
16 # must be sorted! | |
17 gag_plus <- as.numeric(cumsum(gff$Name == "GAG" & strand(gff) == '+')) | |
18 gag_minus <- rev(as.numeric(cumsum(rev(gff$Name == "GAG" & strand(gff) == '-')))) | |
19 # split based on classification - must be same and consecutive | |
20 x <- rle(gff$Final_Classification) | |
21 # split on strand change | |
22 s <- rep(seq_along(runLength(strand(gff))), runLength(strand(gff))) | |
23 domain_cluster <- paste0(rep(seq_along(x$lengths), x$lengths), "_", seqnames(gff), | |
24 "_", gag_plus, "_", gag_minus, "_", s) | |
25 gff_clusters <- split(as.data.frame(gff), factor(domain_cluster, levels = unique(domain_cluster))) | |
26 gff_clusters | |
27 } | |
28 | |
29 clean_domain_clusters <- function(gcl) { | |
30 ## remove clusters wich does not have enough domains or domains | |
31 ## are on different strand | |
32 N_domains <- sapply(gcl, nrow) | |
33 N_unique_domains <- sapply(gcl, function(x)length(unique(x$Name))) | |
34 S <- sapply(gcl, function(x)paste(sort(unique(x$strand)), collapse = " ")) | |
35 S_OK <- S %in% c("+", "-") | |
36 min_domains <- 5 | |
37 maxlength <- 15000 # max span between domains | |
38 span <- sapply(gcl, function(x)max(x$end) - min(x$start)) | |
39 cond1 <- S_OK & | |
40 N_unique_domains == N_domains & | |
41 N_domains >= min_domains & | |
42 span <= maxlength | |
43 return(gcl[cond1]) | |
44 } | |
45 | |
46 check_ranges <- function(gx, s, offset = OFFSET) { | |
47 # check is range is not out of sequence length | |
48 START <- sapply(gx, function(x)min(x$start)) - offset | |
49 END <- sapply(gx, function(x)max(x$end)) + offset | |
50 MAX <- seqlengths(s)[sapply(gx, function(x)as.character(x$seqnames[1]))] | |
51 good_ranges <- (START > 0) & (END <= MAX) | |
52 return(good_ranges) | |
53 } | |
54 | |
55 get_ranges <- function(gx, offset = OFFSET) { | |
56 S <- sapply(gx, function(x)min(x$start)) | |
57 E <- sapply(gx, function(x)max(x$end)) | |
58 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset, end = E + offset)) | |
59 } | |
60 | |
61 get_ranges_left <- function(gx, offset = OFFSET, offset2 = 300) { | |
62 S <- sapply(gx, function(x)min(x$start)) | |
63 max_offset <- S - sapply(gx, function(x)min(x$upstream_domain)) | |
64 offset_adjusted <- ifelse(max_offset < offset, max_offset, offset) | |
65 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset_adjusted, end = S + offset2)) | |
66 return(gr) | |
67 } | |
68 | |
69 get_ranges_right <- function(gx, offset = OFFSET, offset2 = 300) { | |
70 E <- sapply(gx, function(x)max(x$end)) | |
71 max_offset <- sapply(gx, function(x)max(x$downstream_domain)) - E | |
72 offset_adjusted <- ifelse(max_offset < offset, max_offset, offset) | |
73 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = E - offset2, end = E + offset_adjusted)) | |
74 return(gr) | |
75 } | |
76 | |
77 firstTG <- function(ss) { | |
78 x <- matchPattern("TG", ss) | |
79 if (length(x) == 0) { | |
80 return(0) | |
81 }else { | |
82 return(min(start(x))) | |
83 } | |
84 } | |
85 | |
86 lastCA <- function(ss) { | |
87 x <- matchPattern("CA", ss) | |
88 if (length(x) == 0) { | |
89 return(0) | |
90 }else { | |
91 return(max(start(x))) | |
92 } | |
93 } | |
94 | |
95 trim2TGAC <- function(bl) { | |
96 for (i in 1:nrow(bl)) { | |
97 tg_L <- firstTG(bl$qseq[i]) | |
98 tg_R <- firstTG(bl$sseq[i]) | |
99 ca_L <- lastCA(bl$qseq[i]) | |
100 ca_R <- lastCA(bl$sseq[i]) | |
101 e_dist <- bl$length[i] - ca_R | |
102 no_match <- any(tg_L == 0, tg_R == 0, ca_L == 0, ca_R == 0) | |
103 if (!no_match & | |
104 tg_L == tg_R & | |
105 ca_L == ca_R & | |
106 tg_L < 8 & | |
107 e_dist < 8) { | |
108 ## trim alignment | |
109 bl[i,] <- trim_blast_table(bl[i,], tg_L, e_dist - 1) | |
110 } | |
111 } | |
112 return(bl) | |
113 } | |
114 | |
115 trim_blast_table <- function(b, T1, T2) { | |
116 b$qstart <- b$qstart + T1 | |
117 b$sstart <- b$sstart + T1 | |
118 b$qend <- b$qend - T2 | |
119 b$send <- b$send - T2 | |
120 b$sseq <- substring(b$sseq, T1, b$length - T2) | |
121 b$qseq <- substring(b$qseq, T1, b$length - T2) | |
122 b$length <- nchar(b$sseq) | |
123 return(b) | |
124 } | |
125 | |
126 blast_all2all <- function(seqs, db=NULL, ncpus=1, word_size=20, perc_identity=90, max_target_seq = 5000, task = "megablast", additional_params= ""){ | |
127 if (ncpus == 1){ | |
128 query <- list(seqs) | |
129 }else{ | |
130 query <-split(seqs, round(seq(1,ncpus,length.out = length(seqs)))) | |
131 } | |
132 if(is.null(db)){ | |
133 # search against itself | |
134 db <- seqs | |
135 } | |
136 qf <-tempfile(fileext=paste0("_",1:ncpus,".fasta")) | |
137 outf <-tempfile(fileext=paste0("_",1:ncpus,".csv")) | |
138 dbf <- tempfile() | |
139 script <- tempfile() | |
140 writeXStringSet(db, dbf) | |
141 mapply(query, qf, FUN = writeXStringSet) | |
142 cols <- "qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen" | |
143 cmd_db <- paste("makeblastdb -dbtype nucl -in ", dbf) | |
144 cmd_blast <- paste("blastn -task ", task, " -word_size", word_size, | |
145 "-outfmt \"6 ", cols, "\" ", | |
146 "-perc_identity", perc_identity, " -min_raw_gapped_score 500", | |
147 "-max_target_seqs", max_target_seq, additional_params, | |
148 "-query", qf, "-db", dbf, "-out", outf, | |
149 "&" | |
150 ) | |
151 | |
152 # TODO - inspect only forward strand?? | |
153 system(cmd_db) | |
154 cmd_all <- paste(paste(cmd_blast,collapse="\n"),"\nwait") | |
155 cat(cmd_all, file = script) | |
156 system(paste("sh ", script)) | |
157 | |
158 bl_list <- lapply(outf, read.table, stringsAsFactors = FALSE, col.names = unlist(strsplit(cols, " ")), sep="\t", comment.char = "") | |
159 bl_table <- do.call(rbind, bl_list) | |
160 unlink(qf) | |
161 #unlink(outf) | |
162 print(outf) | |
163 unlink(dbf) | |
164 unlink(script) | |
165 return(bl_table) | |
166 } | |
167 | |
168 identify_conflicts <- function (bl){ | |
169 QL <- gsub(".+[|]", "", bl$qaccver) | |
170 SL <- gsub(".+[|]", "", bl$saccver) | |
171 id_with_conflict <- unique(c(bl$qaccver[QL != SL],bl$saccver[QL != SL])) | |
172 id_ok <- setdiff(unique(c(bl$qaccver,bl$saccver)), id_with_conflict) | |
173 single_hit <- sapply( | |
174 sapply( | |
175 split(bl$qaccver, bl$saccver), unique | |
176 ), length) == 1 | |
177 id_with_no_hits <- names(single_hit)[single_hit] # except hit to itself | |
178 return(list( | |
179 ok = id_ok, | |
180 conflict = id_with_conflict, | |
181 no_hit = id_with_no_hits) | |
182 ) | |
183 } | |
184 | |
185 | |
186 analyze_TE <- function(seqs, ncpus = 10, word_size = 20){ | |
187 blt <- blast_all2all(seqs, ncpus = ncpus, word_size = word_size) | |
188 te_conflict_info <- identify_conflicts(blt) | |
189 blt_te_ok <- blast_table_subset(blt, te_conflict_info$ok) | |
190 te_ok_lineages <- split(blt_te_ok, | |
191 gsub( | |
192 ".+[|]", | |
193 "", | |
194 blt_te_ok$qaccver)) | |
195 gr_representative <- GRangesList(mclapply(te_ok_lineages, | |
196 FUN = get_representative_ranges, | |
197 mc.cores = ncpus | |
198 )) | |
199 seqs_representative <- getSeq(seqs, Reduce(c, gr_representative)) | |
200 names(seqs_representative) <- seqnames(Reduce(c, gr_representative)) | |
201 # TODO - add swithin group verification here, ! exclude self hits!! | |
202 | |
203 return( | |
204 list( | |
205 seqs_representative = seqs_representative, | |
206 te_conflict_info = te_conflict_info, | |
207 gr_representative = gr_representative, | |
208 blast = blt | |
209 ) | |
210 ) | |
211 } | |
212 | |
213 query_coverage <- function(blt){ | |
214 blt <- blt[blt$qaccver != blt$saccver,] | |
215 Q_lengths <- blt$qlen[!duplicated(blt$qaccver)] | |
216 names(Q_lengths) <- blt$qaccver[!duplicated(blt$qaccver)] | |
217 gr <- GRanges(seqnames = blt$qaccver, seqlengths = Q_lengths, | |
218 IRanges(start = blt$qstart, end = blt$qend)) | |
219 return(coverage(gr)) | |
220 } | |
221 | |
222 multiplicity_of_te <- function(blt){ | |
223 # exclude self to self hits and calculate coverage + mean_multiplicity of TE | |
224 # assuption is that TE which are 'identical' to other TE from the same lineage are | |
225 # likely correct | |
226 blt_no_self <- blt[blt$qaccver != blt$saccver,] | |
227 cvr <- query_coverage(blt_no_self) | |
228 L <- sapply(cvr, function(x) sum(width(x))) | |
229 C1 <- sapply(cvr, function(x) sum(as.numeric(runValue(x) >= 1) * runLength(x))) | |
230 multiplicity <- sapply(cvr, function(x) sum(as.numeric(runValue(x)) * runLength(x)))/L | |
231 data.frame(L = L, C1 = C1, multiplicity = multiplicity ) | |
232 } | |
233 | |
234 verify_based_on_multiplicity <- function(TE_info, min_coverage=0.99, min_multiplicity=3){ | |
235 blt <- TE_info$blast[TE_info$blast$qaccver %in% TE_info$te_conflict_info$ok,] | |
236 mp <- multiplicity_of_te(blt) | |
237 id_ok_mp_verified <- rownames(mp)[mp$C1/mp$L > min_coverage & mp$multiplicity >= min_multiplicity] | |
238 return(list(multiplicity = mp, | |
239 id_ok_mp_verified = id_ok_mp_verified)) | |
240 | |
241 } | |
242 | |
243 compare_TE_datasets <- function(Q, S, word_size = 20, min_coverage = 0.95, ncpus=10){ | |
244 blt <- blast_all2all(seqs = Q, db = S, ncpus = ncpus, word_size = word_size) | |
245 QL <- gsub(".+[|]", "", blt$qaccver) | |
246 SL <- gsub(".+[|]", "", blt$saccver) | |
247 id_with_conflict <- unique(c(blt$qaccver[QL != SL])) | |
248 id_ok <- setdiff(unique(blt$qaccver), id_with_conflict) | |
249 # check coverage hits | |
250 blt_ok <- blt[blt$qaccver %in% id_ok,] | |
251 Q_lengths <- blt_ok$qlen[!duplicated(blt_ok$qaccver)] | |
252 names(Q_lengths) <- blt_ok$qaccver[!duplicated(blt_ok$qaccver)] | |
253 gr <- GRanges(seqnames = blt_ok$qaccver, seqlengths = Q_lengths, | |
254 IRanges(start = blt_ok$qstart, end = blt_ok$qend)) | |
255 cvr <- coverage(gr) | |
256 L <- sapply(cvr, function(x) sum(width(x))) | |
257 C1 <- sapply(cvr, function(x) sum(as.numeric(runValue(x) >= 1) * runLength(x))) | |
258 Max_uncovered <- sapply(cvr, function(x){ | |
259 if(any(runValue(x)==0)){ | |
260 return(max(runLength(x)[runValue(x) == 0])) | |
261 }else{ | |
262 return(0) | |
263 } | |
264 }) | |
265 | |
266 # verified based on hit to reference - S | |
267 C1_prop <- C1/L | |
268 pass <- (C1_prop >= min_coverage & Max_uncovered < 500) | |
269 if (any(pass)){ | |
270 id_ok_verified <- names(C1_prop) | |
271 }else { | |
272 id_ok_verified <- NULL | |
273 } | |
274 return(list(id_with_conflict = id_with_conflict, | |
275 id_ok = id_ok, | |
276 id_ok_verified = id_ok_verified | |
277 )) | |
278 } | |
279 | |
280 | |
281 | |
282 blast_table_subset <- function(bl,id){ | |
283 return(bl[bl$qaccver %in% id & bl$saccver %in% id,, drop = FALSE]) | |
284 } | |
285 | |
286 get_representative_ranges <- function(bl, min_length = 60){ | |
287 score <- sort(unlist(by(bl$bitscore, bl$qaccver, sum, simplify = FALSE)), | |
288 decreasing = TRUE) | |
289 L <- bl$qlen[!duplicated(bl$qaccver)] | |
290 names(L) <- bl$qaccver[!duplicated(bl$qaccver)] | |
291 gr <- GRanges(seqnames = bl$qaccver, | |
292 IRanges(start = bl$qstart, end = bl$qend), | |
293 seqlengths = L, | |
294 subject = bl$saccver, | |
295 sstart = ifelse(bl$send < bl$sstart, bl$send, bl$sstart), | |
296 send = ifelse(bl$send > bl$sstart, bl$send, bl$sstart)) | |
297 SN <- levels(seqnames(gr)) | |
298 inc <- rep(TRUE, length(gr)) | |
299 MSK <- GRangesList() | |
300 for (i in names(score)){ | |
301 inc[gr$subject %in% i] <- FALSE | |
302 gr_part <- gr[seqnames(gr) %in% i & inc] | |
303 MSK[[i]] <- GRanges(seqnames = factor(gr_part$subject, levels = SN), | |
304 IRanges(start = gr_part$sstart, end = gr_part$send), | |
305 seqlengths = L | |
306 ) | |
307 } | |
308 gout <- unlist(MSK) | |
309 | |
310 full_gr <- GRanges(seqnames = factor(SN, levels = SN), | |
311 IRanges(start = rep(1,length(L)), | |
312 end = L) | |
313 ) | |
314 unmasked_gr <- GenomicRanges::setdiff(full_gr, gout) | |
315 return(unmasked_gr[width(unmasked_gr) >= min_length]) | |
316 } | |
317 | |
318 expected_diversity <- function(seqs, niter=100, km = 6){ | |
319 L <- nchar(seqs) | |
320 R <- matrix(ncol = niter, nrow = length(seqs)) | |
321 for (i in 1:niter){ | |
322 print(i) | |
323 seqs_rnd <- DNAStringSet(sapply(L, function(n) paste(sample(c("A", "C", "T", "G"), n, replace=TRUE), collapse=""))) | |
324 R[,i] <- seq_diversity(seqs_rnd, km = km)$richness | |
325 } | |
326 R | |
327 | |
328 } | |
329 | |
330 seq_diversity <- function (seqs, km=6){ | |
331 K <- oligonucleotideFrequency(seqs, width=km)>0 | |
332 P <- t(K)/rowSums(K) | |
333 # shannon index | |
334 SI <- apply(P, 2, function(x) {x1 <- x[x>0]; -sum(x1*log(x1))}) | |
335 # richness | |
336 R <- rowSums(K) | |
337 list(richness=R, shannon_index=SI) | |
338 } | |
339 | |
340 | |
341 | |
342 blast <- function(s1, s2) { | |
343 tmp1 <- tempfile() | |
344 tmp2 <- tempfile() | |
345 tmp_out <- tempfile() | |
346 writeXStringSet(DNAStringSet(s1), tmp1) | |
347 writeXStringSet(DNAStringSet(s2), tmp2) | |
348 # alternative blast: | |
349 cmd <- paste("blastn -task blastn -word_size 7 -dust no -gapextend 1 -gapopen 2 -reward 1 -penalty -1", | |
350 " -query ", tmp1, ' -subject ', tmp2, ' -strand plus ', | |
351 '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq"', | |
352 ' -out', tmp_out) | |
353 | |
354 system(cmd) | |
355 out_raw <- read.table(tmp_out, as.is = TRUE, sep = "\t", | |
356 col.names = strsplit("qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq", split = ' ')[[1]]) | |
357 | |
358 if (nrow(out_raw) == 0) { | |
359 return(out_raw) | |
360 } | |
361 out <- trim2TGAC(out_raw) | |
362 # remove alingment shorted that | |
363 out <- out[out$length > 100,] | |
364 if (nrow(out) == 0) { | |
365 return(out) | |
366 } | |
367 ## filter for TGCA | |
368 TG_L <- substring(out$qseq, 1, 2) == "TG" | |
369 TG_R <- substring(out$sseq, 1, 2) == "TG" | |
370 CA_L <- substring(out$qseq, out$length - 1, out$length) == "CA" | |
371 CA_R <- substring(out$sseq, out$length - 1, out$length) == "CA" | |
372 cond <- TG_L & TG_R & CA_L & CA_R | |
373 out <- out[cond, , drop = FALSE] | |
374 unlink(tmp1) | |
375 unlink(tmp2) | |
376 unlink(tmp_out) | |
377 # TODO - detele all temp files! | |
378 # unlink(tmp1, tmp2, tmp_out) | |
379 return(out) | |
380 } | |
381 | |
382 get_correct_coordinates <- function(b) { | |
383 do.call(rbind, strsplit(b$qaccver, split = "_")) | |
384 } | |
385 | |
386 evaluate_ltr <- function(GR, GRL, GRR, blast_line, Lseq, Rseq) { | |
387 LTR_L <- makeGRangesFromDataFrame(data.frame(seqnames = seqnames(GR), | |
388 start = start(GRL) + blast_line$qstart - 2, | |
389 end = start(GRL) + blast_line$qend - 1)) | |
390 LTR_R <- makeGRangesFromDataFrame(data.frame(seqnames = seqnames(GR), | |
391 start = start(GRR) + blast_line$sstart - 2, | |
392 end = start(GRR) + blast_line$send - 1)) | |
393 | |
394 TSD_L <- makeGRangesFromDataFrame(data.frame(seqnames = seqnames(GR), | |
395 start = start(GRL) + blast_line$qstart - 3:10, | |
396 end = start(GRL) + blast_line$qstart - 3)) | |
397 TSD_R <- makeGRangesFromDataFrame(data.frame(seqnames = seqnames(GR), | |
398 start = start(GRR) + blast_line$send, | |
399 end = start(GRR) + blast_line$send + 0:7)) | |
400 | |
401 TSD_L_seq <- DNAStringSet(substring(Lseq, blast_line$qstart - 2:9, blast_line$qstart - 2)) | |
402 TSD_R_seq <- DNAStringSet(substring(Rseq, blast_line$send + 1, blast_line$send + 1:8)) | |
403 | |
404 matching_tsd <- TSD_R_seq == TSD_L_seq | |
405 matching_tsd[1:3] <- FALSE # remove short tsd | |
406 p <- which(matching_tsd) | |
407 if (length(p) > 0) { | |
408 TSD_Length <- max(p) | |
409 TSD_sequence <- TSD_L_seq[TSD_Length] | |
410 TSD_position <- append(TSD_R[TSD_Length], TSD_L[TSD_Length]) | |
411 }else { | |
412 TSD_Length <- 0 | |
413 TSD_sequence <- "" | |
414 TSD_position <- NA | |
415 } | |
416 | |
417 TE_Length <- end(LTR_R) - start(LTR_L) | |
418 LTR_Identity <- blast_line$pident | |
419 out <- list(TSD_position = TSD_position, TSD_sequence = TSD_sequence, TSD_Length = TSD_Length, | |
420 LTR_R_position = LTR_R, LTR_L_position = LTR_L, TE_Length = TE_Length, LTR_Identity = LTR_Identity) | |
421 return(out) | |
422 } | |
423 | |
424 get_best_ltr <- function(x) { | |
425 tsd_ok <- sapply(x, function(k)k$TSD_Length > 3) | |
426 te_length_ok <- sapply(x, function(k)k$TE_Length < 30000) | |
427 ltr_length_ok <- sapply(x, function(k)width(k$LTR_R_position) >= 100 & width(k$LTR_L_position) >= 100) | |
428 if (sum(tsd_ok & te_length_ok & ltr_length_ok) >= 1) { | |
429 # return the first one (best bitscore) | |
430 return(x[tsd_ok & te_length_ok][1]) | |
431 } | |
432 if (any(te_length_ok & ltr_length_ok)) { | |
433 return(x[te_length_ok & ltr_length_ok][1]) | |
434 }else { | |
435 return(NULL) | |
436 } | |
437 } | |
438 | |
439 get_te_gff3 <- function(g, ID) { | |
440 D <- makeGRangesFromDataFrame(g$domain, keep.extra.columns = TRUE) | |
441 sn <- seqnames(D)[1] | |
442 S <- strand(D)[1] | |
443 TE <- GRanges(seqnames = sn, | |
444 IRanges(start = start(g$ltr_info[[1]]$LTR_L_position), | |
445 end = end(g$ltr_info[[1]]$LTR_R_position)), strand = S) | |
446 TE$type <- "transposable_element" | |
447 TE$ID <- ID | |
448 | |
449 if (as.character(S) == "+") { | |
450 LTR_5 <- g$ltr_info[[1]]$LTR_L | |
451 LTR_3 <- g$ltr_info[[1]]$LTR_R | |
452 }else { | |
453 LTR_3 <- g$ltr_info[[1]]$LTR_L | |
454 LTR_5 <- g$ltr_info[[1]]$LTR_R | |
455 } | |
456 LTR_5$LTR <- '5LTR' | |
457 LTR_3$LTR <- '3LTR' | |
458 LTR_5$type <- "long_terminal_repeat" | |
459 LTR_3$type <- "long_terminal_repeat" | |
460 strand(LTR_3) <- S | |
461 strand(LTR_5) <- S | |
462 LTR_3$Parent <- ID | |
463 LTR_5$Parent <- ID | |
464 LTR_3$Final_Classification <- D$Final_Classification[1] | |
465 LTR_5$Final_Classification <- D$Final_Classification[1] | |
466 LTR_5$LTR_Identity <- g$ltr_info[[1]]$LTR_Identity | |
467 LTR_3$LTR_Identity <- g$ltr_info[[1]]$LTR_Identity | |
468 | |
469 TE$LTR_Identity <- g$ltr_info[[1]]$LTR_Identity | |
470 TE$LTR5_length <- width(LTR_5) | |
471 TE$LTR3_length <- width(LTR_3) | |
472 | |
473 if (is.na(g$ltr_info[[1]]$TSD_position)[1]) { | |
474 # no TSD found | |
475 TSD <- NULL | |
476 TE$TSD <- 'not_found' | |
477 }else { | |
478 TSD <- g$ltr_info[[1]]$TSD_position | |
479 TSD$type <- "target_site_duplication" | |
480 TSD$Parent <- ID | |
481 TE$TSD <- as.character(g$ltr_info[[1]]$TSD_sequence) | |
482 } | |
483 | |
484 TE$Final_Classification <- D$Final_Classification[1] | |
485 | |
486 D$Parent <- ID | |
487 out <- c(TE, LTR_3, LTR_5, D, TSD) | |
488 return(out) | |
489 } | |
490 | |
491 get_TE <- function(Lseq, Rseq, domains_gff, GR, GRL, GRR) { | |
492 xx <- blast(Lseq, Rseq) | |
493 if (nrow(xx) == 0) { | |
494 return(NULL) | |
495 }else { | |
496 ltr_tmp <- list() | |
497 for (j in 1:nrow(xx)) { | |
498 ltr_tmp[[j]] <- evaluate_ltr(GR, GRL, GRR, xx[j, , drop = FALSE], Lseq, Rseq) | |
499 } | |
500 ltr <- get_best_ltr(ltr_tmp) | |
501 if (length(ltr) == 0) { | |
502 return(NULL) | |
503 ## add good ltr | |
504 }else { | |
505 return(list(domain = domains_gff, ltr_info = ltr, blast_out = xx)) | |
506 } | |
507 } | |
508 } | |
509 | |
510 add_pbs <- function(te, s, trna_db) { | |
511 ltr5 <- te[which(te$LTR == "5LTR")] | |
512 STRAND <- as.character(strand(te)[1]) | |
513 if (STRAND == "+") { | |
514 pbs_gr <- GRanges(seqnames(ltr5), IRanges(start = end(ltr5) + 1, end = end(ltr5) + 31)) | |
515 pbs_s <- reverseComplement(getSeq(s, pbs_gr)) | |
516 }else { | |
517 pbs_gr <- GRanges(seqnames(ltr5), IRanges(end = start(ltr5) - 1, start = start(ltr5) - 30)) | |
518 pbs_s <- getSeq(s, pbs_gr) | |
519 } | |
520 | |
521 names(pbs_s) <- "pbs_region" | |
522 # find trna match | |
523 tmp <- tempfile() | |
524 tmp_out <- tempfile() | |
525 writeXStringSet(DNAStringSet(pbs_s), tmp) | |
526 # alternative blast: | |
527 cmd <- paste("blastn -task blastn -word_size 7 -dust no -perc_identity 100", | |
528 " -query ", tmp, ' -db ', trna_db, ' -strand plus ', | |
529 '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq"', | |
530 ' -out', tmp_out) | |
531 | |
532 system(cmd) | |
533 pbs_exact_gr <- NULL | |
534 out_raw <- read.table(tmp_out, as.is = TRUE, sep = "\t", | |
535 col.names = strsplit( | |
536 "qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq", | |
537 split = ' ')[[1]]) | |
538 if (nrow(out_raw) > 0) { | |
539 cca <- grepl("CCA$", out_raw$qseq) | |
540 to_end <- out_raw$send == 23 # align to end of sequence | |
541 max_dist <- out_raw$qend > 25 # max 5 bp from ltr | |
542 min_length <- out_raw$length >= 10 | |
543 out_pass <- out_raw[cca & to_end & max_dist & min_length,] | |
544 if (nrow(out_pass) > 0) { | |
545 trna_id <- out_pass$saccver[1] | |
546 if (STRAND == "+") { | |
547 S <- end(ltr5) + 32 - out_pass$qend[1] | |
548 E <- end(ltr5) + 32 - out_pass$qstart[1] | |
549 }else { | |
550 S <- start(ltr5) - 31 + out_pass$qstart[1] | |
551 E <- start(ltr5) - 31 + out_pass$qend[1] | |
552 } | |
553 pbs_exact_gr <- GRanges(seqnames(ltr5), IRanges(start = S, end = E)) | |
554 pbs_exact_gr$trna_id <- trna_id | |
555 pbs_exact_gr$Length <- out_pass$Length | |
556 strand(pbs_exact_gr) <- STRAND | |
557 pbs_exact_gr$type <- 'primer_binding_site' | |
558 pbs_exact_gr$Parent <- te[1]$ID | |
559 te$trna_id <- c(trna_id, rep(NA, length(te) - 1)) | |
560 | |
561 } | |
562 } | |
563 te <- append(te, pbs_exact_gr) | |
564 unlink(tmp) | |
565 unlink(tmp_out) | |
566 return(te) | |
567 } | |
568 | |
569 get_te_statistics <- function(gr, RT) { | |
570 DOMAINS_LTR <- gr[gr$type == "transposable_element" & | |
571 gr$TSD == "not_found" & | |
572 is.na(gr$trna_id)] | |
573 DOMAINS_LTR_TSD <- gr[gr$type == "transposable_element" & | |
574 gr$TSD != "not_found" & | |
575 is.na(gr$trna_id)] | |
576 DOMAINS_LTR_TSD_PBS <- gr[gr$type == "transposable_element" & | |
577 gr$TSD != "not_found" & | |
578 !is.na(gr$trna_id)] | |
579 DOMAINS_LTR_PBS <- gr[gr$type == "transposable_element" & | |
580 gr$TSD == "not_found" & | |
581 !is.na(gr$trna_id)] | |
582 | |
583 all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE)) | |
584 RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class))) | |
585 DL <- as.integer(table(factor(DOMAINS_LTR$Final_Classification, levels = all_class))) | |
586 DLT <- as.integer(table(factor(DOMAINS_LTR_TSD$Final_Classification, levels = all_class))) | |
587 DLTP <- as.integer(table(factor(DOMAINS_LTR_TSD_PBS$Final_Classification, levels = all_class))) | |
588 DLP <- as.integer(table(factor(DOMAINS_LTR_PBS$Final_Classification, levels = all_class))) | |
589 out <- data.frame(RT_domain = RT_domain, | |
590 DOMAINS_LTR = DL, | |
591 DOMAINS_LTR_TSD = DLT, | |
592 DOMAINS_LTR_PBS = DLP, | |
593 DOMAINS_LTR_TSD_PBS = DLTP, | |
594 row.names = all_class | |
595 ) | |
596 total <- colSums(out) | |
597 out <- rbind(out, Total = total) | |
598 return(out) | |
599 } | |
600 | |
601 getSeqNamed <- function(s, gr) { | |
602 spart <- getSeq(s, gr) | |
603 id1 <- paste0(seqnames(gr), '_', start(gr), "_", end(gr)) | |
604 id2 <- gr$Final_Classification | |
605 names(spart) <- paste0(id1, "#", id2) | |
606 spart | |
607 } | |
608 | |
609 get_TE_id <- function (gr){ | |
610 gr_te <- gr[gr$type == "transposable_element"] | |
611 ID <- gr_te$ID | |
612 A <- paste0(seqnames(gr_te), '_', start(gr_te), "_", end(gr_te)) | |
613 B <- gr_te$Final_Classification | |
614 names(ID) <- paste0(A, "#", B) | |
615 | |
616 } | |
617 | |
618 get_te_sequences <- function(gr, s) { | |
619 # return list of biostrings | |
620 DOMAINS_LTR <- gr[gr$type == "transposable_element" & | |
621 gr$TSD == "not_found" & | |
622 is.na(gr$trna_id)] | |
623 DOMAINS_LTR_TSD <- gr[gr$type == "transposable_element" & | |
624 gr$TSD != "not_found" & | |
625 is.na(gr$trna_id)] | |
626 DOMAINS_LTR_TSD_PBS <- gr[gr$type == "transposable_element" & | |
627 gr$TSD != "not_found" & | |
628 !is.na(gr$trna_id)] | |
629 DOMAINS_LTR_PBS <- gr[gr$type == "transposable_element" & | |
630 gr$TSD == "not_found" & | |
631 !is.na(gr$trna_id)] | |
632 s_DL <- getSeqNamed(s, DOMAINS_LTR) | |
633 s_DLT <- getSeqNamed(s, DOMAINS_LTR_TSD) | |
634 s_DLP <- getSeqNamed(s, DOMAINS_LTR_PBS) | |
635 s_DLTP <- getSeqNamed(s, DOMAINS_LTR_TSD_PBS) | |
636 return(DNAStringSetList( | |
637 DL = s_DL, | |
638 DLT = s_DLT, | |
639 DLP = s_DLP, | |
640 DLTP = s_DLTP | |
641 )) | |
642 | |
643 } | |
644 | |
645 cd_hit_est <- function(seqs, min_identity = 0.9, word_size = 10, ncpu = 2){ | |
646 # runs cd-hi-est and return table with cluster membership, and size and if reads was repesentative | |
647 # input sequences must be in the same orientation!!! | |
648 sfile <- tempfile() | |
649 fasta_out <- tempfile() | |
650 clstr <- paste0(fasta_out,".clstr") | |
651 # cdhit is triming names!! | |
652 ori_names <- names(seqs) | |
653 names(seqs) <- seq_along(seqs) | |
654 writeXStringSet(seqs, sfile) | |
655 cmd <- paste("cd-hit-est", | |
656 "-i", sfile, | |
657 "-o", fasta_out, | |
658 "-c", min_identity, | |
659 "-n", word_size, | |
660 "-T", ncpu, | |
661 "-r", 0) | |
662 system(cmd) | |
663 cls_raw <- grep("^>", readLines(clstr), invert = TRUE, value = TRUE) | |
664 unlink(fasta_out) | |
665 unlink(clstr) | |
666 index <- gsub("\t.+","",cls_raw) | |
667 id <- as.numeric(gsub("[.].+","", | |
668 gsub(".+>", "", cls_raw)) | |
669 ) | |
670 is_representative <- id %in% id[grep("[*]$",cls_raw)] | |
671 membership <- cumsum(index=="0") | |
672 cluster_size <- tabulate(membership)[membership] | |
673 # reorder | |
674 ord <- order(id) | |
675 cls_info <- data.frame( | |
676 seq_id = ori_names, | |
677 membership = membership[ord], | |
678 cluster_size = cluster_size[ord], | |
679 is_representative = is_representative[ord] | |
680 ) | |
681 return(cls_info) | |
682 } | |
683 |