Mercurial > repos > petr-novak > dante_ltr
diff do_not_track_info/te_analysis.R @ 7:c33d6583e548 draft
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
author | petr-novak |
---|---|
date | Fri, 24 Jun 2022 14:19:48 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/do_not_track_info/te_analysis.R Fri Jun 24 14:19:48 2022 +0000 @@ -0,0 +1,178 @@ +#!/usr/bin/env Rscript +library(ggplot2) +lf <- read.table("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/ltr_finder_13100_2018_144_MOESM1_ESM.csv", + as.is=TRUE, header=TRUE, sep="\t", quote = "") +cls <- read.table("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/classification__domain_features_13100_2018_144_MOESM1_ESM.csv", + as.is=TRUE, header=TRUE, sep="\t", quote='', comment.char = "") + +# add classification to ltr +rexdb <- cls$Classification..new03. +names(rexdb) <- cls$REXdb.name + + +lf$classification <- cls$Classification..new03.[match(lf$REXdb.name, cls$REXdb.name)] + + + + +length_quantiles = sapply(split(lf$size..including.TSD., lf$classification), quantile, seq(0,1,by=0.025)) +length95min = length_quantiles[2,] +length95max = length_quantiles[40,] + +ltr_length_quantiles = sapply(split(lf$X3.LTR.size, lf$classification), quantile, seq(0,1,by=0.025)) +ltr_length95min = ltr_length_quantiles[2,] +ltr_length95max = ltr_length_quantiles[40,] + +write.table(length_quantiles, file = "/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/TE_length_quantiles.csv", sep="\t") +write.table(ltr_length_quantiles, file = "/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/LTR_length_quantiles.csv", sep="\t") + +library(rtracklayer) +d = import("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/dante_on_rexdb.gff3") + +# some elements have duplicated domains - remove it +dom_dup = sapply(split(d$Name, seqnames(d)), function (x)any(duplicated(x))) +d2 = d[seqnames(d) %in% names(dom_dup)[!dom_dup]] +dparts = split(d2, d2$Final_Classification) + +dparts_rex = sapply(dparts, function(x) split(x, seqnames(x), drop = TRUE)) + + +get_dom_pos <- function(g){ + if (length(g) == 0){ + return(NULL) + } + gdf <- data.frame(rexdb = as.character(seqnames(g)), + domain = g$Name, + S = start(g), + E = end(g), + stringsAsFactors = FALSE) + gSmat <- xtabs(S ~ rexdb + domain, data = gdf) + colnames(gSmat) <- paste0(colnames(gSmat),"_S") + gEmat <- xtabs(E ~ rexdb + domain, data = gdf) + colnames(gEmat) <- paste0(colnames(gEmat),"_E") + SEmat <- cbind(gSmat,gEmat) + # reorder + dom_position <- colMeans(SEmat) + SEmat_sorted <- SEmat[,order(dom_position)] + return(SEmat_sorted) +} + + +domains_SE = lapply(dparts, get_dom_pos) + +# add ltr5_E, ltr5_S, .. +LSE = list() +for (i in seq_along(domains_SE)){ + ltr3 = lf$X3.LTR.position[match(rownames(domains_SE[[i]]), lf$REXdb.name)] + ltr5 = lf$X5.LTR..position[match(rownames(domains_SE[[i]]), lf$REXdb.name)] + if (all(is.na(ltr3))){ + # not ltr retrotransposon + LSE[[i]] = domains_SE[[i]] + next + } + ltr5_S = sapply(strsplit(ltr5, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[1])}) + ltr5_E = sapply(strsplit(ltr5, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[2])}) + ltr3_S = sapply(strsplit(ltr3, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[1])}) + ltr3_E = sapply(strsplit(ltr3, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[2])}) + LSE[[i]] = cbind(ltr5_S, ltr5_E, domains_SE[[i]], ltr3_S, ltr3_E) +} + +names(LSE) = names(domains_SE) +newN = function(a,b){ + ap = gsub("_.+","", a) + bp = gsub("_.+","", b) + ifelse(ap == bp, ap, paste0(ap,"_", bp)) +} + +# positions to distances: +LSED = lapply(LSE, function(x){ + x[x==0]=NA + N = ncol(x) + if (is.null(N)){ + return(NULL) + } + NM2 = colnames(x[,2:N]) + NM1 = colnames(x[,1:(N - 1)]) + d = x[,2:N, drop = FALSE] - x[,1:(N-1), drop=FALSE] + colnames(d) = newN(NM1, NM2) + return(d) +} +) + + +# create empirical cumulative distribution function +get_ecdf <- function(dd, s = 1) { + ecdf_list = list() + for (i1 in colnames(dd)){ + for (i2 in colnames(dd)){ + if (i1 == i2){ + next + }else{ + ii = sort(c(i1,i2)) + ecdf_list[[ii[1]]][[ii[2]]]=ecdf(s * abs(dd[,i2]-dd[,i1])) + + } + } + } +return(ecdf_list) +} + + +ecdf_list = lapply(LSE, get_ecdf) +ecdf_list_m = lapply(LSE, get_ecdf, -1) + +ecdf_model = list(plus = ecdf_list, minus = ecdf_list_m) + +saveRDS(ecdf_model, "databases/feature_distances_model.RDS") + +dante2dist <- function(dc){ + # dc - cluster of domains, granges + dpos <- get_dom_pos(dc) + D <- c(dist(dpos)) + NN <- combn(names(dpos), 2) + # order feature in pair alphabeticaly + N1 <- ifelse(NN[1,]<NN[2,], NN[1,], NN[2, ]) + N2 <- ifelse(NN[1,]>NN[2,], NN[1,], NN[2, ]) + dfd <- data.frame(F1 = N1, F2 = N2, distance = D) +} + + + +dante_to_quantiles <- function(dc, model_function, lineage=NULL){ + dfd <- dante2dist(dc) + if (is.null(lineage)){ + lineage <- dc$Final_Classification[1] + } + fn <- model_function$plus[[lineage]] + fnm <- model_function$minus[[lineage]] + dstat1 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fn[[a]][[b]]), dfd$distance, FUN=function(f, x)f(x)) + dstat2 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fnm[[a]][[b]]), dfd$distance, FUN=function(f, x)f(-x)) + dout <- cbind(dfd, dstat1, dstat2, dstat12 = ifelse(dstat1>dstat2, dstat2, dstat1)) +} + + + +q=dante_to_quantiles(dparts_rex[[29]][[500]], ecdf_model) + +qq = lapply(dparts_rex[[29]], dante_to_quantiles, ecdf_model, lineage = dparts_rex[[43]][[1]]$Final_Classification[1]) +qq = lapply(dparts_rex[[29]], dante_to_quantiles, ecdf_model, lineage = dparts_rex[[29]][[1]]$Final_Classification[1]) + +ll = sapply(qq, function(x)(sum(x$dstat12<0.01))/nrow(x)) +lln = sapply(qq, function(x)(sum(x$dstat12<0.01))) + + +table(ll > 0.1) # more that 10% dist are outliers + +table(ll) +table(lln) +lll = sapply(ll, min) +hist(lll, breaks= 500) +dc = dparts_rex[[30]][[30]] + +print(min(dout[4:5])) +plot(dout[4:5], xlim=c(0,1), ylim=c(0,1)) + +hist(dout[,6], breaks=50, xlim=c(0,1)) +mtext(min(dout[,6])) + +apply(LSED[[30]],2, median, na.rm = TRUE) \ No newline at end of file