Mercurial > repos > petr-novak > dante_ltr
diff R/ltr_utils.R @ 0:7b0bbe7477c4 draft
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
author | petr-novak |
---|---|
date | Tue, 08 Mar 2022 13:24:33 +0000 |
parents | |
children | f131886ea194 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/R/ltr_utils.R Tue Mar 08 13:24:33 2022 +0000 @@ -0,0 +1,683 @@ +add_coordinates_of_closest_neighbor <- function(gff) { + gff <- gff[order(seqnames(gff), start(gff))] + # split to chromosomes: + gff_parts <- split(gff, seqnames(gff)) + upstreams <- c(sapply(gff_parts, function(x) c(1, head(end(x), -1)))) + downstreams <- c(sapply(gff_parts, function(x) c(start(x)[-1], seqlengths(x)[runValue(seqnames(x[1]))]))) + gff_updated <- unlist(gff_parts) + gff_updated$upstream_domain <- unlist(upstreams) + gff_updated$downstream_domain <- unlist(downstreams) + names(gff_updated) <- NULL + return(gff_updated) +} + +get_domain_clusters <- function(gff) { + # get consecutive domains from same linage + # must be sorted! + gag_plus <- as.numeric(cumsum(gff$Name == "GAG" & strand(gff) == '+')) + gag_minus <- rev(as.numeric(cumsum(rev(gff$Name == "GAG" & strand(gff) == '-')))) + # split based on classification - must be same and consecutive + x <- rle(gff$Final_Classification) + # split on strand change + s <- rep(seq_along(runLength(strand(gff))), runLength(strand(gff))) + domain_cluster <- paste0(rep(seq_along(x$lengths), x$lengths), "_", seqnames(gff), + "_", gag_plus, "_", gag_minus, "_", s) + gff_clusters <- split(as.data.frame(gff), factor(domain_cluster, levels = unique(domain_cluster))) + gff_clusters +} + +clean_domain_clusters <- function(gcl) { + ## remove clusters wich does not have enough domains or domains + ## are on different strand + N_domains <- sapply(gcl, nrow) + N_unique_domains <- sapply(gcl, function(x)length(unique(x$Name))) + S <- sapply(gcl, function(x)paste(sort(unique(x$strand)), collapse = " ")) + S_OK <- S %in% c("+", "-") + min_domains <- 5 + maxlength <- 15000 # max span between domains + span <- sapply(gcl, function(x)max(x$end) - min(x$start)) + cond1 <- S_OK & + N_unique_domains == N_domains & + N_domains >= min_domains & + span <= maxlength + return(gcl[cond1]) +} + +check_ranges <- function(gx, s, offset = OFFSET) { + # check is range is not out of sequence length + START <- sapply(gx, function(x)min(x$start)) - offset + END <- sapply(gx, function(x)max(x$end)) + offset + MAX <- seqlengths(s)[sapply(gx, function(x)as.character(x$seqnames[1]))] + good_ranges <- (START > 0) & (END <= MAX) + return(good_ranges) +} + +get_ranges <- function(gx, offset = OFFSET) { + S <- sapply(gx, function(x)min(x$start)) + E <- sapply(gx, function(x)max(x$end)) + gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset, end = E + offset)) +} + +get_ranges_left <- function(gx, offset = OFFSET, offset2 = 300) { + S <- sapply(gx, function(x)min(x$start)) + max_offset <- S - sapply(gx, function(x)min(x$upstream_domain)) + offset_adjusted <- ifelse(max_offset < offset, max_offset, offset) + gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset_adjusted, end = S + offset2)) + return(gr) +} + +get_ranges_right <- function(gx, offset = OFFSET, offset2 = 300) { + E <- sapply(gx, function(x)max(x$end)) + max_offset <- sapply(gx, function(x)max(x$downstream_domain)) - E + offset_adjusted <- ifelse(max_offset < offset, max_offset, offset) + gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = E - offset2, end = E + offset_adjusted)) + return(gr) +} + +firstTG <- function(ss) { + x <- matchPattern("TG", ss) + if (length(x) == 0) { + return(0) + }else { + return(min(start(x))) + } +} + +lastCA <- function(ss) { + x <- matchPattern("CA", ss) + if (length(x) == 0) { + return(0) + }else { + return(max(start(x))) + } +} + +trim2TGAC <- function(bl) { + for (i in 1:nrow(bl)) { + tg_L <- firstTG(bl$qseq[i]) + tg_R <- firstTG(bl$sseq[i]) + ca_L <- lastCA(bl$qseq[i]) + ca_R <- lastCA(bl$sseq[i]) + e_dist <- bl$length[i] - ca_R + no_match <- any(tg_L == 0, tg_R == 0, ca_L == 0, ca_R == 0) + if (!no_match & + tg_L == tg_R & + ca_L == ca_R & + tg_L < 8 & + e_dist < 8) { + ## trim alignment + bl[i,] <- trim_blast_table(bl[i,], tg_L, e_dist - 1) + } + } + return(bl) +} + +trim_blast_table <- function(b, T1, T2) { + b$qstart <- b$qstart + T1 + b$sstart <- b$sstart + T1 + b$qend <- b$qend - T2 + b$send <- b$send - T2 + b$sseq <- substring(b$sseq, T1, b$length - T2) + b$qseq <- substring(b$qseq, T1, b$length - T2) + b$length <- nchar(b$sseq) + return(b) +} + +blast_all2all <- function(seqs, db=NULL, ncpus=1, word_size=20, perc_identity=90, max_target_seq = 5000, task = "megablast", additional_params= ""){ + if (ncpus == 1){ + query <- list(seqs) + }else{ + query <-split(seqs, round(seq(1,ncpus,length.out = length(seqs)))) + } + if(is.null(db)){ + # search against itself + db <- seqs + } + qf <-tempfile(fileext=paste0("_",1:ncpus,".fasta")) + outf <-tempfile(fileext=paste0("_",1:ncpus,".csv")) + dbf <- tempfile() + script <- tempfile() + writeXStringSet(db, dbf) + mapply(query, qf, FUN = writeXStringSet) + cols <- "qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen" + cmd_db <- paste("makeblastdb -dbtype nucl -in ", dbf) + cmd_blast <- paste("blastn -task ", task, " -word_size", word_size, + "-outfmt \"6 ", cols, "\" ", + "-perc_identity", perc_identity, " -min_raw_gapped_score 500", + "-max_target_seqs", max_target_seq, additional_params, + "-query", qf, "-db", dbf, "-out", outf, + "&" + ) + + # TODO - inspect only forward strand?? + system(cmd_db) + cmd_all <- paste(paste(cmd_blast,collapse="\n"),"\nwait") + cat(cmd_all, file = script) + system(paste("sh ", script)) + + bl_list <- lapply(outf, read.table, stringsAsFactors = FALSE, col.names = unlist(strsplit(cols, " ")), sep="\t", comment.char = "") + bl_table <- do.call(rbind, bl_list) + unlink(qf) + #unlink(outf) + print(outf) + unlink(dbf) + unlink(script) + return(bl_table) +} + +identify_conflicts <- function (bl){ + QL <- gsub(".+[|]", "", bl$qaccver) + SL <- gsub(".+[|]", "", bl$saccver) + id_with_conflict <- unique(c(bl$qaccver[QL != SL],bl$saccver[QL != SL])) + id_ok <- setdiff(unique(c(bl$qaccver,bl$saccver)), id_with_conflict) + single_hit <- sapply( + sapply( + split(bl$qaccver, bl$saccver), unique + ), length) == 1 + id_with_no_hits <- names(single_hit)[single_hit] # except hit to itself + return(list( + ok = id_ok, + conflict = id_with_conflict, + no_hit = id_with_no_hits) + ) +} + + +analyze_TE <- function(seqs, ncpus = 10, word_size = 20){ + blt <- blast_all2all(seqs, ncpus = ncpus, word_size = word_size) + te_conflict_info <- identify_conflicts(blt) + blt_te_ok <- blast_table_subset(blt, te_conflict_info$ok) + te_ok_lineages <- split(blt_te_ok, + gsub( + ".+[|]", + "", + blt_te_ok$qaccver)) + gr_representative <- GRangesList(mclapply(te_ok_lineages, + FUN = get_representative_ranges, + mc.cores = ncpus + )) + seqs_representative <- getSeq(seqs, Reduce(c, gr_representative)) + names(seqs_representative) <- seqnames(Reduce(c, gr_representative)) + # TODO - add swithin group verification here, ! exclude self hits!! + + return( + list( + seqs_representative = seqs_representative, + te_conflict_info = te_conflict_info, + gr_representative = gr_representative, + blast = blt + ) + ) +} + +query_coverage <- function(blt){ + blt <- blt[blt$qaccver != blt$saccver,] + Q_lengths <- blt$qlen[!duplicated(blt$qaccver)] + names(Q_lengths) <- blt$qaccver[!duplicated(blt$qaccver)] + gr <- GRanges(seqnames = blt$qaccver, seqlengths = Q_lengths, + IRanges(start = blt$qstart, end = blt$qend)) + return(coverage(gr)) +} + +multiplicity_of_te <- function(blt){ + # exclude self to self hits and calculate coverage + mean_multiplicity of TE + # assuption is that TE which are 'identical' to other TE from the same lineage are + # likely correct + blt_no_self <- blt[blt$qaccver != blt$saccver,] + cvr <- query_coverage(blt_no_self) + L <- sapply(cvr, function(x) sum(width(x))) + C1 <- sapply(cvr, function(x) sum(as.numeric(runValue(x) >= 1) * runLength(x))) + multiplicity <- sapply(cvr, function(x) sum(as.numeric(runValue(x)) * runLength(x)))/L + data.frame(L = L, C1 = C1, multiplicity = multiplicity ) +} + +verify_based_on_multiplicity <- function(TE_info, min_coverage=0.99, min_multiplicity=3){ + blt <- TE_info$blast[TE_info$blast$qaccver %in% TE_info$te_conflict_info$ok,] + mp <- multiplicity_of_te(blt) + id_ok_mp_verified <- rownames(mp)[mp$C1/mp$L > min_coverage & mp$multiplicity >= min_multiplicity] + return(list(multiplicity = mp, + id_ok_mp_verified = id_ok_mp_verified)) + +} + +compare_TE_datasets <- function(Q, S, word_size = 20, min_coverage = 0.95, ncpus=10){ + blt <- blast_all2all(seqs = Q, db = S, ncpus = ncpus, word_size = word_size) + QL <- gsub(".+[|]", "", blt$qaccver) + SL <- gsub(".+[|]", "", blt$saccver) + id_with_conflict <- unique(c(blt$qaccver[QL != SL])) + id_ok <- setdiff(unique(blt$qaccver), id_with_conflict) + # check coverage hits + blt_ok <- blt[blt$qaccver %in% id_ok,] + Q_lengths <- blt_ok$qlen[!duplicated(blt_ok$qaccver)] + names(Q_lengths) <- blt_ok$qaccver[!duplicated(blt_ok$qaccver)] + gr <- GRanges(seqnames = blt_ok$qaccver, seqlengths = Q_lengths, + IRanges(start = blt_ok$qstart, end = blt_ok$qend)) + cvr <- coverage(gr) + L <- sapply(cvr, function(x) sum(width(x))) + C1 <- sapply(cvr, function(x) sum(as.numeric(runValue(x) >= 1) * runLength(x))) + Max_uncovered <- sapply(cvr, function(x){ + if(any(runValue(x)==0)){ + return(max(runLength(x)[runValue(x) == 0])) + }else{ + return(0) + } + }) + + # verified based on hit to reference - S + C1_prop <- C1/L + pass <- (C1_prop >= min_coverage & Max_uncovered < 500) + if (any(pass)){ + id_ok_verified <- names(C1_prop) + }else { + id_ok_verified <- NULL + } + return(list(id_with_conflict = id_with_conflict, + id_ok = id_ok, + id_ok_verified = id_ok_verified + )) +} + + + +blast_table_subset <- function(bl,id){ + return(bl[bl$qaccver %in% id & bl$saccver %in% id,, drop = FALSE]) +} + +get_representative_ranges <- function(bl, min_length = 60){ + score <- sort(unlist(by(bl$bitscore, bl$qaccver, sum, simplify = FALSE)), + decreasing = TRUE) + L <- bl$qlen[!duplicated(bl$qaccver)] + names(L) <- bl$qaccver[!duplicated(bl$qaccver)] + gr <- GRanges(seqnames = bl$qaccver, + IRanges(start = bl$qstart, end = bl$qend), + seqlengths = L, + subject = bl$saccver, + sstart = ifelse(bl$send < bl$sstart, bl$send, bl$sstart), + send = ifelse(bl$send > bl$sstart, bl$send, bl$sstart)) + SN <- levels(seqnames(gr)) + inc <- rep(TRUE, length(gr)) + MSK <- GRangesList() + for (i in names(score)){ + inc[gr$subject %in% i] <- FALSE + gr_part <- gr[seqnames(gr) %in% i & inc] + MSK[[i]] <- GRanges(seqnames = factor(gr_part$subject, levels = SN), + IRanges(start = gr_part$sstart, end = gr_part$send), + seqlengths = L + ) + } + gout <- unlist(MSK) + + full_gr <- GRanges(seqnames = factor(SN, levels = SN), + IRanges(start = rep(1,length(L)), + end = L) + ) + unmasked_gr <- GenomicRanges::setdiff(full_gr, gout) + return(unmasked_gr[width(unmasked_gr) >= min_length]) +} + +expected_diversity <- function(seqs, niter=100, km = 6){ + L <- nchar(seqs) + R <- matrix(ncol = niter, nrow = length(seqs)) + for (i in 1:niter){ + print(i) + seqs_rnd <- DNAStringSet(sapply(L, function(n) paste(sample(c("A", "C", "T", "G"), n, replace=TRUE), collapse=""))) + R[,i] <- seq_diversity(seqs_rnd, km = km)$richness + } + R + +} + +seq_diversity <- function (seqs, km=6){ + K <- oligonucleotideFrequency(seqs, width=km)>0 + P <- t(K)/rowSums(K) + # shannon index + SI <- apply(P, 2, function(x) {x1 <- x[x>0]; -sum(x1*log(x1))}) + # richness + R <- rowSums(K) + list(richness=R, shannon_index=SI) +} + + + +blast <- function(s1, s2) { + tmp1 <- tempfile() + tmp2 <- tempfile() + tmp_out <- tempfile() + writeXStringSet(DNAStringSet(s1), tmp1) + writeXStringSet(DNAStringSet(s2), tmp2) + # alternative blast: + cmd <- paste("blastn -task blastn -word_size 7 -dust no -gapextend 1 -gapopen 2 -reward 1 -penalty -1", + " -query ", tmp1, ' -subject ', tmp2, ' -strand plus ', + '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq"', + ' -out', tmp_out) + + system(cmd) + out_raw <- read.table(tmp_out, as.is = TRUE, sep = "\t", + col.names = strsplit("qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq", split = ' ')[[1]]) + + if (nrow(out_raw) == 0) { + return(out_raw) + } + out <- trim2TGAC(out_raw) + # remove alingment shorted that + out <- out[out$length > 100,] + if (nrow(out) == 0) { + return(out) + } + ## filter for TGCA + TG_L <- substring(out$qseq, 1, 2) == "TG" + TG_R <- substring(out$sseq, 1, 2) == "TG" + CA_L <- substring(out$qseq, out$length - 1, out$length) == "CA" + CA_R <- substring(out$sseq, out$length - 1, out$length) == "CA" + cond <- TG_L & TG_R & CA_L & CA_R + out <- out[cond, , drop = FALSE] + unlink(tmp1) + unlink(tmp2) + unlink(tmp_out) + # TODO - detele all temp files! + # unlink(tmp1, tmp2, tmp_out) + return(out) +} + +get_correct_coordinates <- function(b) { + do.call(rbind, strsplit(b$qaccver, split = "_")) +} + +evaluate_ltr <- function(GR, GRL, GRR, blast_line, Lseq, Rseq) { + LTR_L <- makeGRangesFromDataFrame(data.frame(seqnames = seqnames(GR), + start = start(GRL) + blast_line$qstart - 2, + end = start(GRL) + blast_line$qend - 1)) + LTR_R <- makeGRangesFromDataFrame(data.frame(seqnames = seqnames(GR), + start = start(GRR) + blast_line$sstart - 2, + end = start(GRR) + blast_line$send - 1)) + + TSD_L <- makeGRangesFromDataFrame(data.frame(seqnames = seqnames(GR), + start = start(GRL) + blast_line$qstart - 3:10, + end = start(GRL) + blast_line$qstart - 3)) + TSD_R <- makeGRangesFromDataFrame(data.frame(seqnames = seqnames(GR), + start = start(GRR) + blast_line$send, + end = start(GRR) + blast_line$send + 0:7)) + + TSD_L_seq <- DNAStringSet(substring(Lseq, blast_line$qstart - 2:9, blast_line$qstart - 2)) + TSD_R_seq <- DNAStringSet(substring(Rseq, blast_line$send + 1, blast_line$send + 1:8)) + + matching_tsd <- TSD_R_seq == TSD_L_seq + matching_tsd[1:3] <- FALSE # remove short tsd + p <- which(matching_tsd) + if (length(p) > 0) { + TSD_Length <- max(p) + TSD_sequence <- TSD_L_seq[TSD_Length] + TSD_position <- append(TSD_R[TSD_Length], TSD_L[TSD_Length]) + }else { + TSD_Length <- 0 + TSD_sequence <- "" + TSD_position <- NA + } + + TE_Length <- end(LTR_R) - start(LTR_L) + LTR_Identity <- blast_line$pident + out <- list(TSD_position = TSD_position, TSD_sequence = TSD_sequence, TSD_Length = TSD_Length, + LTR_R_position = LTR_R, LTR_L_position = LTR_L, TE_Length = TE_Length, LTR_Identity = LTR_Identity) + return(out) +} + +get_best_ltr <- function(x) { + tsd_ok <- sapply(x, function(k)k$TSD_Length > 3) + te_length_ok <- sapply(x, function(k)k$TE_Length < 30000) + ltr_length_ok <- sapply(x, function(k)width(k$LTR_R_position) >= 100 & width(k$LTR_L_position) >= 100) + if (sum(tsd_ok & te_length_ok & ltr_length_ok) >= 1) { + # return the first one (best bitscore) + return(x[tsd_ok & te_length_ok][1]) + } + if (any(te_length_ok & ltr_length_ok)) { + return(x[te_length_ok & ltr_length_ok][1]) + }else { + return(NULL) + } +} + +get_te_gff3 <- function(g, ID) { + D <- makeGRangesFromDataFrame(g$domain, keep.extra.columns = TRUE) + sn <- seqnames(D)[1] + S <- strand(D)[1] + TE <- GRanges(seqnames = sn, + IRanges(start = start(g$ltr_info[[1]]$LTR_L_position), + end = end(g$ltr_info[[1]]$LTR_R_position)), strand = S) + TE$type <- "transposable_element" + TE$ID <- ID + + if (as.character(S) == "+") { + LTR_5 <- g$ltr_info[[1]]$LTR_L + LTR_3 <- g$ltr_info[[1]]$LTR_R + }else { + LTR_3 <- g$ltr_info[[1]]$LTR_L + LTR_5 <- g$ltr_info[[1]]$LTR_R + } + LTR_5$LTR <- '5LTR' + LTR_3$LTR <- '3LTR' + LTR_5$type <- "long_terminal_repeat" + LTR_3$type <- "long_terminal_repeat" + strand(LTR_3) <- S + strand(LTR_5) <- S + LTR_3$Parent <- ID + LTR_5$Parent <- ID + LTR_3$Final_Classification <- D$Final_Classification[1] + LTR_5$Final_Classification <- D$Final_Classification[1] + LTR_5$LTR_Identity <- g$ltr_info[[1]]$LTR_Identity + LTR_3$LTR_Identity <- g$ltr_info[[1]]$LTR_Identity + + TE$LTR_Identity <- g$ltr_info[[1]]$LTR_Identity + TE$LTR5_length <- width(LTR_5) + TE$LTR3_length <- width(LTR_3) + + if (is.na(g$ltr_info[[1]]$TSD_position)[1]) { + # no TSD found + TSD <- NULL + TE$TSD <- 'not_found' + }else { + TSD <- g$ltr_info[[1]]$TSD_position + TSD$type <- "target_site_duplication" + TSD$Parent <- ID + TE$TSD <- as.character(g$ltr_info[[1]]$TSD_sequence) + } + + TE$Final_Classification <- D$Final_Classification[1] + + D$Parent <- ID + out <- c(TE, LTR_3, LTR_5, D, TSD) + return(out) +} + +get_TE <- function(Lseq, Rseq, domains_gff, GR, GRL, GRR) { + xx <- blast(Lseq, Rseq) + if (nrow(xx) == 0) { + return(NULL) + }else { + ltr_tmp <- list() + for (j in 1:nrow(xx)) { + ltr_tmp[[j]] <- evaluate_ltr(GR, GRL, GRR, xx[j, , drop = FALSE], Lseq, Rseq) + } + ltr <- get_best_ltr(ltr_tmp) + if (length(ltr) == 0) { + return(NULL) + ## add good ltr + }else { + return(list(domain = domains_gff, ltr_info = ltr, blast_out = xx)) + } + } +} + +add_pbs <- function(te, s, trna_db) { + ltr5 <- te[which(te$LTR == "5LTR")] + STRAND <- as.character(strand(te)[1]) + if (STRAND == "+") { + pbs_gr <- GRanges(seqnames(ltr5), IRanges(start = end(ltr5) + 1, end = end(ltr5) + 31)) + pbs_s <- reverseComplement(getSeq(s, pbs_gr)) + }else { + pbs_gr <- GRanges(seqnames(ltr5), IRanges(end = start(ltr5) - 1, start = start(ltr5) - 30)) + pbs_s <- getSeq(s, pbs_gr) + } + + names(pbs_s) <- "pbs_region" + # find trna match + tmp <- tempfile() + tmp_out <- tempfile() + writeXStringSet(DNAStringSet(pbs_s), tmp) + # alternative blast: + cmd <- paste("blastn -task blastn -word_size 7 -dust no -perc_identity 100", + " -query ", tmp, ' -db ', trna_db, ' -strand plus ', + '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq"', + ' -out', tmp_out) + + system(cmd) + pbs_exact_gr <- NULL + out_raw <- read.table(tmp_out, as.is = TRUE, sep = "\t", + col.names = strsplit( + "qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq", + split = ' ')[[1]]) + if (nrow(out_raw) > 0) { + cca <- grepl("CCA$", out_raw$qseq) + to_end <- out_raw$send == 23 # align to end of sequence + max_dist <- out_raw$qend > 25 # max 5 bp from ltr + min_length <- out_raw$length >= 10 + out_pass <- out_raw[cca & to_end & max_dist & min_length,] + if (nrow(out_pass) > 0) { + trna_id <- out_pass$saccver[1] + if (STRAND == "+") { + S <- end(ltr5) + 32 - out_pass$qend[1] + E <- end(ltr5) + 32 - out_pass$qstart[1] + }else { + S <- start(ltr5) - 31 + out_pass$qstart[1] + E <- start(ltr5) - 31 + out_pass$qend[1] + } + pbs_exact_gr <- GRanges(seqnames(ltr5), IRanges(start = S, end = E)) + pbs_exact_gr$trna_id <- trna_id + pbs_exact_gr$Length <- out_pass$Length + strand(pbs_exact_gr) <- STRAND + pbs_exact_gr$type <- 'primer_binding_site' + pbs_exact_gr$Parent <- te[1]$ID + te$trna_id <- c(trna_id, rep(NA, length(te) - 1)) + + } + } + te <- append(te, pbs_exact_gr) + unlink(tmp) + unlink(tmp_out) + return(te) +} + +get_te_statistics <- function(gr, RT) { + DOMAINS_LTR <- gr[gr$type == "transposable_element" & + gr$TSD == "not_found" & + is.na(gr$trna_id)] + DOMAINS_LTR_TSD <- gr[gr$type == "transposable_element" & + gr$TSD != "not_found" & + is.na(gr$trna_id)] + DOMAINS_LTR_TSD_PBS <- gr[gr$type == "transposable_element" & + gr$TSD != "not_found" & + !is.na(gr$trna_id)] + DOMAINS_LTR_PBS <- gr[gr$type == "transposable_element" & + gr$TSD == "not_found" & + !is.na(gr$trna_id)] + + all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE)) + RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class))) + DL <- as.integer(table(factor(DOMAINS_LTR$Final_Classification, levels = all_class))) + DLT <- as.integer(table(factor(DOMAINS_LTR_TSD$Final_Classification, levels = all_class))) + DLTP <- as.integer(table(factor(DOMAINS_LTR_TSD_PBS$Final_Classification, levels = all_class))) + DLP <- as.integer(table(factor(DOMAINS_LTR_PBS$Final_Classification, levels = all_class))) + out <- data.frame(RT_domain = RT_domain, + DOMAINS_LTR = DL, + DOMAINS_LTR_TSD = DLT, + DOMAINS_LTR_PBS = DLP, + DOMAINS_LTR_TSD_PBS = DLTP, + row.names = all_class + ) + total <- colSums(out) + out <- rbind(out, Total = total) + return(out) +} + +getSeqNamed <- function(s, gr) { + spart <- getSeq(s, gr) + id1 <- paste0(seqnames(gr), '_', start(gr), "_", end(gr)) + id2 <- gr$Final_Classification + names(spart) <- paste0(id1, "#", id2) + spart +} + +get_TE_id <- function (gr){ + gr_te <- gr[gr$type == "transposable_element"] + ID <- gr_te$ID + A <- paste0(seqnames(gr_te), '_', start(gr_te), "_", end(gr_te)) + B <- gr_te$Final_Classification + names(ID) <- paste0(A, "#", B) + +} + +get_te_sequences <- function(gr, s) { + # return list of biostrings + DOMAINS_LTR <- gr[gr$type == "transposable_element" & + gr$TSD == "not_found" & + is.na(gr$trna_id)] + DOMAINS_LTR_TSD <- gr[gr$type == "transposable_element" & + gr$TSD != "not_found" & + is.na(gr$trna_id)] + DOMAINS_LTR_TSD_PBS <- gr[gr$type == "transposable_element" & + gr$TSD != "not_found" & + !is.na(gr$trna_id)] + DOMAINS_LTR_PBS <- gr[gr$type == "transposable_element" & + gr$TSD == "not_found" & + !is.na(gr$trna_id)] + s_DL <- getSeqNamed(s, DOMAINS_LTR) + s_DLT <- getSeqNamed(s, DOMAINS_LTR_TSD) + s_DLP <- getSeqNamed(s, DOMAINS_LTR_PBS) + s_DLTP <- getSeqNamed(s, DOMAINS_LTR_TSD_PBS) + return(DNAStringSetList( + DL = s_DL, + DLT = s_DLT, + DLP = s_DLP, + DLTP = s_DLTP + )) + +} + +cd_hit_est <- function(seqs, min_identity = 0.9, word_size = 10, ncpu = 2){ + # runs cd-hi-est and return table with cluster membership, and size and if reads was repesentative + # input sequences must be in the same orientation!!! + sfile <- tempfile() + fasta_out <- tempfile() + clstr <- paste0(fasta_out,".clstr") + # cdhit is triming names!! + ori_names <- names(seqs) + names(seqs) <- seq_along(seqs) + writeXStringSet(seqs, sfile) + cmd <- paste("cd-hit-est", + "-i", sfile, + "-o", fasta_out, + "-c", min_identity, + "-n", word_size, + "-T", ncpu, + "-r", 0) + system(cmd) + cls_raw <- grep("^>", readLines(clstr), invert = TRUE, value = TRUE) + unlink(fasta_out) + unlink(clstr) + index <- gsub("\t.+","",cls_raw) + id <- as.numeric(gsub("[.].+","", + gsub(".+>", "", cls_raw)) + ) + is_representative <- id %in% id[grep("[*]$",cls_raw)] + membership <- cumsum(index=="0") + cluster_size <- tabulate(membership)[membership] + # reorder + ord <- order(id) + cls_info <- data.frame( + seq_id = ori_names, + membership = membership[ord], + cluster_size = cluster_size[ord], + is_representative = is_representative[ord] + ) + return(cls_info) +} +