view do_not_track_info/te_analysis.R @ 8:9de392f2fc02 draft

"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
author petr-novak
date Tue, 28 Jun 2022 12:33:22 +0000
parents c33d6583e548
children
line wrap: on
line source

#!/usr/bin/env Rscript
library(ggplot2)
lf <- read.table("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/ltr_finder_13100_2018_144_MOESM1_ESM.csv",
                as.is=TRUE, header=TRUE, sep="\t", quote = "")
cls <-  read.table("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/classification__domain_features_13100_2018_144_MOESM1_ESM.csv",
                 as.is=TRUE, header=TRUE, sep="\t", quote='', comment.char = "")

# add classification to ltr
rexdb <- cls$Classification..new03.
names(rexdb) <- cls$REXdb.name


lf$classification <- cls$Classification..new03.[match(lf$REXdb.name, cls$REXdb.name)]




length_quantiles = sapply(split(lf$size..including.TSD., lf$classification), quantile, seq(0,1,by=0.025))
length95min = length_quantiles[2,]
length95max = length_quantiles[40,]

ltr_length_quantiles = sapply(split(lf$X3.LTR.size, lf$classification), quantile, seq(0,1,by=0.025))
ltr_length95min = ltr_length_quantiles[2,]
ltr_length95max = ltr_length_quantiles[40,]

write.table(length_quantiles, file = "/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/TE_length_quantiles.csv", sep="\t")
write.table(ltr_length_quantiles, file = "/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/LTR_length_quantiles.csv", sep="\t")

library(rtracklayer)
d = import("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/dante_on_rexdb.gff3")

# some elements have duplicated domains - remove it
dom_dup = sapply(split(d$Name, seqnames(d)), function (x)any(duplicated(x)))
d2 = d[seqnames(d) %in% names(dom_dup)[!dom_dup]]
dparts = split(d2, d2$Final_Classification)

dparts_rex = sapply(dparts, function(x) split(x, seqnames(x), drop = TRUE))


get_dom_pos <- function(g){
  if (length(g) == 0){
    return(NULL)
  }
  gdf <-  data.frame(rexdb = as.character(seqnames(g)),
                   domain = g$Name,
                   S = start(g),
                   E = end(g),
                   stringsAsFactors = FALSE)
  gSmat <- xtabs(S  ~ rexdb + domain, data = gdf)
  colnames(gSmat) <-  paste0(colnames(gSmat),"_S")
  gEmat <- xtabs(E  ~ rexdb + domain, data = gdf)
  colnames(gEmat) <-  paste0(colnames(gEmat),"_E")
  SEmat <- cbind(gSmat,gEmat)
  # reorder
  dom_position <- colMeans(SEmat)
  SEmat_sorted <-  SEmat[,order(dom_position)]
  return(SEmat_sorted)
}


domains_SE = lapply(dparts, get_dom_pos)

# add ltr5_E, ltr5_S, ..
LSE = list()
for (i in seq_along(domains_SE)){
  ltr3 = lf$X3.LTR.position[match(rownames(domains_SE[[i]]), lf$REXdb.name)]
  ltr5 = lf$X5.LTR..position[match(rownames(domains_SE[[i]]), lf$REXdb.name)]
  if (all(is.na(ltr3))){
    # not ltr retrotransposon
    LSE[[i]] = domains_SE[[i]]
    next
  }
  ltr5_S = sapply(strsplit(ltr5, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[1])})
  ltr5_E = sapply(strsplit(ltr5, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[2])})
  ltr3_S = sapply(strsplit(ltr3, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[1])})
  ltr3_E = sapply(strsplit(ltr3, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[2])})
  LSE[[i]] = cbind(ltr5_S, ltr5_E, domains_SE[[i]], ltr3_S, ltr3_E)
}

names(LSE) = names(domains_SE)
newN = function(a,b){
  ap = gsub("_.+","", a)
  bp = gsub("_.+","", b)
  ifelse(ap == bp, ap, paste0(ap,"_", bp))
}

# positions to distances:
LSED = lapply(LSE, function(x){
  x[x==0]=NA
  N = ncol(x)
  if (is.null(N)){
    return(NULL)
  }
  NM2 = colnames(x[,2:N])
  NM1 = colnames(x[,1:(N - 1)])
  d = x[,2:N, drop = FALSE] - x[,1:(N-1), drop=FALSE]
  colnames(d) = newN(NM1, NM2)
  return(d)
}
)


# create empirical cumulative distribution function
get_ecdf <- function(dd, s = 1) {
  ecdf_list = list()
  for (i1 in colnames(dd)){
    for (i2 in colnames(dd)){
      if (i1 == i2){
        next
      }else{
        ii = sort(c(i1,i2))
        ecdf_list[[ii[1]]][[ii[2]]]=ecdf(s * abs(dd[,i2]-dd[,i1]))

      }
    }
  }
return(ecdf_list)
}


ecdf_list = lapply(LSE, get_ecdf)
ecdf_list_m = lapply(LSE, get_ecdf, -1)

ecdf_model = list(plus = ecdf_list, minus = ecdf_list_m)

saveRDS(ecdf_model, "databases/feature_distances_model.RDS")

dante2dist <-  function(dc){
  # dc - cluster of domains, granges
  dpos <- get_dom_pos(dc)
  D <-  c(dist(dpos))
  NN <-  combn(names(dpos), 2)
  # order feature in pair alphabeticaly
  N1 <-  ifelse(NN[1,]<NN[2,], NN[1,], NN[2, ])
  N2 <-  ifelse(NN[1,]>NN[2,], NN[1,], NN[2, ])
  dfd <-  data.frame(F1 = N1, F2 = N2, distance = D)
}



dante_to_quantiles <- function(dc, model_function, lineage=NULL){
  dfd <- dante2dist(dc)
  if (is.null(lineage)){
    lineage <- dc$Final_Classification[1]
  }
  fn <- model_function$plus[[lineage]]
  fnm <- model_function$minus[[lineage]]
  dstat1 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fn[[a]][[b]]), dfd$distance, FUN=function(f, x)f(x))
  dstat2 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fnm[[a]][[b]]), dfd$distance, FUN=function(f, x)f(-x))
  dout <- cbind(dfd, dstat1, dstat2, dstat12 = ifelse(dstat1>dstat2, dstat2, dstat1))
}



q=dante_to_quantiles(dparts_rex[[29]][[500]], ecdf_model)

qq = lapply(dparts_rex[[29]], dante_to_quantiles, ecdf_model, lineage = dparts_rex[[43]][[1]]$Final_Classification[1])
qq = lapply(dparts_rex[[29]], dante_to_quantiles, ecdf_model, lineage = dparts_rex[[29]][[1]]$Final_Classification[1])

ll = sapply(qq, function(x)(sum(x$dstat12<0.01))/nrow(x))
lln = sapply(qq, function(x)(sum(x$dstat12<0.01)))


table(ll > 0.1) # more that 10% dist are outliers

table(ll)
table(lln)
lll = sapply(ll, min)
hist(lll, breaks= 500)
dc = dparts_rex[[30]][[30]]

print(min(dout[4:5]))
plot(dout[4:5], xlim=c(0,1), ylim=c(0,1))

hist(dout[,6], breaks=50, xlim=c(0,1))
mtext(min(dout[,6]))

apply(LSED[[30]],2, median, na.rm = TRUE)