diff R/ltr_utils.R @ 7:c33d6583e548 draft

"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
author petr-novak
date Fri, 24 Jun 2022 14:19:48 +0000
parents 6ae4a341d1f3
children 9de392f2fc02
line wrap: on
line diff
--- a/R/ltr_utils.R	Thu May 19 08:21:55 2022 +0000
+++ b/R/ltr_utils.R	Fri Jun 24 14:19:48 2022 +0000
@@ -11,35 +11,115 @@
   return(gff_updated)
 }
 
+get_domain_clusters_alt <- function(gff, dist_models, threshold=0.99){
+  gff <- sort(gff, by = ~ seqnames * start)
+  domain_pairs <- data.frame(
+    D1 = paste0(head(gff$Name,-1),"_S"),
+    D2 = paste0(gff$Name[-1],"_S"),
+    C1 =  head(gff$Final_Classification,-1),
+    C2 =  gff$Final_Classification[-1],
+    S1 =  head(strand(gff),-1),
+    S2 =  strand(gff)[-1],
+    start1 =  head(start(gff),-1),
+    start2 = start(gff)[-1],
+    chrpos= paste0(seqnames(gff)[-1],"_",start(gff)[-1])
+  )
+  domain_pairs$D_A <- with(domain_pairs, ifelse(D1 < D2, D1, D2))
+  domain_pairs$D_B <- with(domain_pairs, ifelse(D1 > D2, D1, D2))
+  domain_pairs$quantile <- 1
+  same_element_cluster <- domain_pairs$S1 == domain_pairs$S2 & domain_pairs$C1 == domain_pairs$C2
+  domain_pairs$same_element_cluster <- same_element_cluster
+
+  q_plus_function <- apply(domain_pairs[same_element_cluster,],
+                                         1,
+                                         function(x) dist_models$plus[[x["C1"]]][[x["D_A"]]][[x["D_B"]]])
+
+  D <- abs(domain_pairs[same_element_cluster,]$start1 - domain_pairs[same_element_cluster,]$start2)
+
+  q_plus_value <- mapply(function(x, D)if(is.null(x)){0}else{x(D)}, x= q_plus_function, D = D)
+
+  domain_pairs$quantile[same_element_cluster] <- q_plus_value
+
+  domain_pairs$split_position <- !same_element_cluster | domain_pairs$quantile > threshold
+  # combine clusters based on distances and original cluster
+  clusters <- paste(cumsum(c(TRUE, domain_pairs$split_position)), get_domain_clusters(gff))
+  return(clusters)
+}
+
+
 get_domain_clusters <- function(gff) {
   # get consecutive domains from same linage
   # must be sorted!
-  gag_plus <- as.numeric(cumsum(gff$Name == "GAG" & strand(gff) == '+'))
-  gag_minus <- rev(as.numeric(cumsum(rev(gff$Name == "GAG" & strand(gff) == '-'))))
+  plus_order_split <- c(0, as.numeric(cumsum(head(gff$domain_order, -1) >= gff$domain_order[-1] & strand(gff)[-1] == '+')))
+  minus_order_split <- rev(as.numeric(cumsum(rev(c(gff$domain_order[-1] >= head(gff$domain_order,-1) & strand(gff)[-1] == '-', FALSE)))))
   # split based on classification - must be same and consecutive
   x <- rle(gff$Final_Classification)
   # split on strand change
   s <- rep(seq_along(runLength(strand(gff))), runLength(strand(gff)))
   domain_cluster <- paste0(rep(seq_along(x$lengths), x$lengths), "_", seqnames(gff),
-                           "_", gag_plus, "_", gag_minus, "_", s)
-  gff_clusters <- split(as.data.frame(gff), factor(domain_cluster, levels = unique(domain_cluster)))
-  gff_clusters
+                           "_", plus_order_split, "_", minus_order_split, "_", s)
+  clusters <- factor(domain_cluster, levels = unique(domain_cluster))
+  return(clusters)
+}
+
+# create partial TE from clusters
+get_partial_te_from_cluster_of_domains <- function (gpart){
+  ID <- paste("TE_partial_", gpart$Cluster[1], sep="")
+  te_partial <- GRanges(type="transposable_element_partial",
+                        strand=strand(gpart)[1],
+                        ID = ID,
+                        source = "dante_ltr",
+                        seqnames=seqnames(gpart)[1],
+                        Final_Classification=gpart$Final_Classification[1],
+                        Name=gpart$Final_Classification[1],
+                        IRanges(min(start(gpart)), max(end(gpart))))
+  gpart$Parent <- ID
+  return(c(te_partial,gpart))
 }
 
-clean_domain_clusters <- function(gcl) {
+# create partial TE from clusters
+get_partial_te_from_cluster_of_domains <- function(gpart, ID) {
+  te_partial <- makeGRangesFromDataFrame(
+    data.frame(type = "transposable_element_partial",
+               strand = gpart$strand[1],
+               ID = ID,
+               source = "dante_ltr",
+               seqnames = gpart$seqnames[1],
+               Final_Classification =
+                 gpart$Final_Classification[1],
+               Name = gpart$Final_Classification[1],
+               IRanges(min(gpart$start), max(gpart$end))),
+    keep.extra.columns = TRUE)
+
+
+  gpart$Parent <- ID
+  gpart_gr = makeGRangesFromDataFrame(gpart, keep.extra.columns = TRUE)
+  return(c(te_partial, gpart_gr))
+}
+
+# function to count for each element number of occurences:
+count_occurences_for_each_element <- function(x) {
+  counts_unique <- table(x)
+  counts <- counts_unique[x]
+  counts
+}
+
+
+clean_domain_clusters <- function(gcl, lineage_domain_span, min_domains) {
   ## remove clusters wich does not have enough domains or domains
   ## are on different strand
   N_domains <- sapply(gcl, nrow)
   N_unique_domains <- sapply(gcl, function(x)length(unique(x$Name)))
   S <- sapply(gcl, function(x)paste(sort(unique(x$strand)), collapse = " "))
   S_OK <- S %in% c("+", "-")
-  min_domains <- 5
-  maxlength <- 15000 # max span between domains
+  max_span <- lineage_domain_span[sapply(gcl, function(x)  x$Final_Classification[1])]
+  # set to zero if lineage is not covered in lineage domain span
+  max_span[is.na(max_span)] <- 0
   span <- sapply(gcl, function(x)max(x$end) - min(x$start))
   cond1 <- S_OK &
     N_unique_domains == N_domains &
     N_domains >= min_domains &
-    span <= maxlength
+    span <= max_span
   return(gcl[cond1])
 }
 
@@ -92,6 +172,15 @@
   }
 }
 
+domain_distance <- function(d_query, d_reference){
+  if (d_query == d_reference){
+    return (0)
+  }
+  d_query_p <-  strsplit(d_query," ")[[1]]
+  d_reference_p <-  strsplit(d_reference," ")[[1]]
+  d <- length(d_reference_p) - sum(d_query_p == d_reference_p[d_reference_p %in% d_query_p])
+  return(d)
+}
 
 
 trim2TGAC <- function(bl) {
@@ -129,6 +218,9 @@
     query <- list(seqs)
   }else{
     query <-split(seqs, round(seq(1,ncpus,length.out = length(seqs))))
+    if (length(query) < ncpus){
+      ncpus <- length(query)
+    }
   }
   if(is.null(db)){
     # search against itself
@@ -183,8 +275,8 @@
 }
 
 
-analyze_TE <- function(seqs, ncpus = 10, word_size = 20){
-  blt <- blast_all2all(seqs, ncpus = ncpus, word_size = word_size, perc_identity = 90)
+analyze_TE <- function(seqs, ncpus = 10, word_size = 20, perc_identity = 90, ...){
+  blt <- blast_all2all(seqs, ncpus = ncpus, word_size = word_size, perc_identity = perc_identity, ...)
   te_conflict_info <- identify_conflicts(blt)
   blt_te_ok <- blast_table_subset(blt, te_conflict_info$ok)
   te_ok_lineages <- split(blt_te_ok,
@@ -198,7 +290,6 @@
   ))
   seqs_representative <- getSeq(seqs, Reduce(c, gr_representative))
   names(seqs_representative) <- seqnames(Reduce(c, gr_representative))
-  # TODO - add swithin group verification here, ! exclude self hits!!
 
   return(
     list(
@@ -240,8 +331,8 @@
 
 }
 
-compare_TE_datasets <- function(Q, S, word_size = 20, min_coverage = 0.95, ncpus=10){
-  blt <- blast_all2all(seqs = Q, db = S, ncpus = ncpus, word_size = word_size)
+compare_TE_datasets <- function(Q, S, word_size = 20, min_coverage = 0.95, ncpus=10, ...){
+  blt <- blast_all2all(seqs = Q, db = S, ncpus = ncpus, word_size = word_size, ...)
   QL <- gsub(".+[|]", "", blt$qaccver)
   SL <- gsub(".+[|]", "", blt$saccver)
   id_with_conflict <- unique(c(blt$qaccver[QL != SL]))
@@ -340,7 +431,7 @@
 
 
 
-blast <- function(s1, s2) {
+blast <- function(s1, s2, expected_aln_lenght=NULL) {
   tmp1 <- tempfile()
   tmp2 <- tempfile()
   tmp_out <- tempfile()
@@ -361,7 +452,9 @@
   }
   out <- trim2TGAC(out_raw)
   # remove alingment shorted that
-  out <- out[out$length > 100,]
+  # out <- out[out$length > 100,]
+  # alngment must be at least 80% of expected LTRmin95
+  out <- out[out$length > (expected_aln_lenght *0.8),]
   if (nrow(out) == 0) {
     return(out)
   }
@@ -482,15 +575,15 @@
     TE$TSD <- as.character(g$ltr_info[[1]]$TSD_sequence)
   }
 
-  TE$Final_Classification <- D$Final_Classification[1]
+  TE$Name <- TE$Final_Classification <- D$Final_Classification[1]
 
   D$Parent <- ID
   out <- c(TE, LTR_3, LTR_5, D, TSD)
   return(out)
 }
 
-get_TE <- function(Lseq, Rseq, domains_gff, GR, GRL, GRR) {
-  xx <- blast(Lseq, Rseq)
+get_TE <- function(Lseq, Rseq, domains_gff, GR, GRL, GRR,LTR_length) {
+  xx <- blast(Lseq, Rseq, LTR_length)
   if (nrow(xx) == 0) {
     return(NULL)
   }else {
@@ -591,27 +684,40 @@
   return(Rank)
 }
 
-get_te_statistics <- function(gr, RT) {
-  DOMAINS_LTR <- gr[gr$type == "transposable_element" &
-                      gr$TSD == "not_found" &
-                      is.na(gr$trna_id)]
-  DOMAINS_LTR_TSD <- gr[gr$type == "transposable_element" &
-                          gr$TSD != "not_found" &
-                          is.na(gr$trna_id)]
-  DOMAINS_LTR_TSD_PBS <- gr[gr$type == "transposable_element" &
-                              gr$TSD != "not_found" &
-                              !is.na(gr$trna_id)]
-  DOMAINS_LTR_PBS <- gr[gr$type == "transposable_element" &
-                          gr$TSD == "not_found" &
-                          !is.na(gr$trna_id)]
+
+get_te_statistics <- function(gr, RT){
+  Ranks <- c("D", "DL", "DLT", "DLP", "DLTP")
+  all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE))
+  RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class)))
+  rank_table <- list()
+  for (i in 1:length(Ranks)) {
+    gr_part <- gr[gr$type == "transposable_element" & gr$Rank == Ranks[i]]
+    rank_table[[Ranks[i]]] <- as.integer(table(factor(gr_part$Final_Classification, levels = all_class)))
+
+  }
+  out <- cbind(do.call(cbind, rank_table), RT_domain=RT_domain)
+  out <- rbind(out, Total = colSums(out))
+  rownames(out) <- c(all_class, "Total")
+  return(out)
+}
+
+
+get_te_statistics_old <- function(gr, RT) {
+  DOMAINS <- gr[gr$type == "transposable_element" & gr$Rank == "D"]
+  DOMAINS_LTR <- gr[gr$type == "transposable_element" & gr$Rank == "DL"]
+  DOMAINS_LTR_TSD <- gr[gr$type == "transposable_element" & gr$Rank == "DLT"]
+  DOMAINS_LTR_TSD_PBS <- gr[gr$type == "transposable_element" &  gr$Rank == "DLTP"]
+  DOMAINS_LTR_PBS <- gr[gr$type == "transposable_element"  & gr$Rank == "DLP"]
 
   all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE))
   RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class)))
+  D <- as.integer(table(factor(DOMAINS$Final_Classification, levels = all_class)))
   DL <- as.integer(table(factor(DOMAINS_LTR$Final_Classification, levels = all_class)))
   DLT <- as.integer(table(factor(DOMAINS_LTR_TSD$Final_Classification, levels = all_class)))
   DLTP <- as.integer(table(factor(DOMAINS_LTR_TSD_PBS$Final_Classification, levels = all_class)))
   DLP <- as.integer(table(factor(DOMAINS_LTR_PBS$Final_Classification, levels = all_class)))
   out <- data.frame(RT_domain = RT_domain,
+                    DOMAINS = D,
                     DOMAINS_LTR = DL,
                     DOMAINS_LTR_TSD = DLT,
                     DOMAINS_LTR_PBS = DLP,
@@ -644,33 +750,20 @@
 
 }
 
-get_te_sequences <- function(gr, s) {
-  # return list of biostrings
-  DOMAINS_LTR <- gr[gr$type == "transposable_element" &
-                      gr$TSD == "not_found" &
-                      is.na(gr$trna_id)]
-  DOMAINS_LTR_TSD <- gr[gr$type == "transposable_element" &
-                          gr$TSD != "not_found" &
-                          is.na(gr$trna_id)]
-  DOMAINS_LTR_TSD_PBS <- gr[gr$type == "transposable_element" &
-                              gr$TSD != "not_found" &
-                              !is.na(gr$trna_id)]
-  DOMAINS_LTR_PBS <- gr[gr$type == "transposable_element" &
-                          gr$TSD == "not_found" &
-                          !is.na(gr$trna_id)]
-  s_DL <- getSeqNamed(s, DOMAINS_LTR)
-  s_DLT <- getSeqNamed(s, DOMAINS_LTR_TSD)
-  s_DLP <- getSeqNamed(s, DOMAINS_LTR_PBS)
-  s_DLTP <- getSeqNamed(s, DOMAINS_LTR_TSD_PBS)
-  return(DNAStringSetList(
-    DL = s_DL,
-    DLT = s_DLT,
-    DLP = s_DLP,
-    DLTP = s_DLTP
-  ))
 
+get_te_sequences <- function (gr, s) {
+  Ranks <- c("D", "DL", "DLT", "DLP", "DLTP")
+  s_te <- list()
+    for (i in 1:length(Ranks)) {
+        gr_te <- gr[gr$type == "transposable_element" & gr$Rank == Ranks[i]]
+        if (length(gr_te) > 0) {
+        s_te[[Ranks[i]]] <- getSeqNamed(s, gr_te)
+        }
+    }
+  return(s_te)
 }
 
+
 cd_hit_est <- function(seqs, min_identity = 0.9, word_size = 10, ncpu = 2){
   # runs cd-hi-est and return table with cluster membership, and size and if reads was repesentative
   # input sequences must be in the same orientation!!!
@@ -710,3 +803,54 @@
   return(cls_info)
 }
 
+
+dante2dist <-  function(dc){
+  # dc - cluster of domains, granges
+  dpos <- get_dom_pos(dc)
+  D <-  c(dist(dpos))
+  NN <-  combn(names(dpos), 2)
+  # order feature in pair alphabeticaly
+  N1 <-  ifelse(NN[1,]<NN[2,], NN[1,], NN[2, ])
+  N2 <-  ifelse(NN[1,]>NN[2,], NN[1,], NN[2, ])
+  dfd <-  data.frame(F1 = N1, F2 = N2, distance = D)
+}
+
+
+
+dante_to_quantiles <- function(dc, model_function, lineage=NULL){
+  dfd <- dante2dist(dc)
+  if (is.null(lineage)){
+    lineage <- dc$Final_Classification[1]
+  }
+  # check if lineage has defined ecdf
+  if (lineage %in% names(model_function$plus)){
+    fn <- model_function$plus[[lineage]]
+    fnm <- model_function$minus[[lineage]]
+    dstat1 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fn[[a]][[b]]), dfd$distance, FUN=function(f, x)f(x))
+    dstat2 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fnm[[a]][[b]]), dfd$distance, FUN=function(f, x)f(-x))
+    dout <- cbind(dfd, dstat1, dstat2, dstat12 = ifelse(dstat1>dstat2, dstat2, dstat1))
+    return(dout)
+  }else{
+    return(NULL)
+  }
+}
+
+get_dom_pos <- function(g){
+  if (length(g) == 0){
+    return(NULL)
+  }
+  gdf <-  data.frame(rexdb = as.character(seqnames(g)),
+                   domain = g$Name,
+                   S = start(g),
+                   E = end(g),
+                   stringsAsFactors = FALSE)
+  gSmat <- xtabs(S  ~ rexdb + domain, data = gdf)
+  colnames(gSmat) <-  paste0(colnames(gSmat),"_S")
+  gEmat <- xtabs(E  ~ rexdb + domain, data = gdf)
+  colnames(gEmat) <-  paste0(colnames(gEmat),"_E")
+  SEmat <- cbind(gSmat,gEmat)
+  # reorder
+  dom_position <- colMeans(SEmat)
+  SEmat_sorted <-  SEmat[,order(dom_position)]
+  return(SEmat_sorted)
+}