Mercurial > repos > petr-novak > dante_ltr
view do_not_track_info/te_analysis.R @ 7:c33d6583e548 draft
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
author | petr-novak |
---|---|
date | Fri, 24 Jun 2022 14:19:48 +0000 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env Rscript library(ggplot2) lf <- read.table("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/ltr_finder_13100_2018_144_MOESM1_ESM.csv", as.is=TRUE, header=TRUE, sep="\t", quote = "") cls <- read.table("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/classification__domain_features_13100_2018_144_MOESM1_ESM.csv", as.is=TRUE, header=TRUE, sep="\t", quote='', comment.char = "") # add classification to ltr rexdb <- cls$Classification..new03. names(rexdb) <- cls$REXdb.name lf$classification <- cls$Classification..new03.[match(lf$REXdb.name, cls$REXdb.name)] length_quantiles = sapply(split(lf$size..including.TSD., lf$classification), quantile, seq(0,1,by=0.025)) length95min = length_quantiles[2,] length95max = length_quantiles[40,] ltr_length_quantiles = sapply(split(lf$X3.LTR.size, lf$classification), quantile, seq(0,1,by=0.025)) ltr_length95min = ltr_length_quantiles[2,] ltr_length95max = ltr_length_quantiles[40,] write.table(length_quantiles, file = "/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/TE_length_quantiles.csv", sep="\t") write.table(ltr_length_quantiles, file = "/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/LTR_length_quantiles.csv", sep="\t") library(rtracklayer) d = import("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/dante_on_rexdb.gff3") # some elements have duplicated domains - remove it dom_dup = sapply(split(d$Name, seqnames(d)), function (x)any(duplicated(x))) d2 = d[seqnames(d) %in% names(dom_dup)[!dom_dup]] dparts = split(d2, d2$Final_Classification) dparts_rex = sapply(dparts, function(x) split(x, seqnames(x), drop = TRUE)) get_dom_pos <- function(g){ if (length(g) == 0){ return(NULL) } gdf <- data.frame(rexdb = as.character(seqnames(g)), domain = g$Name, S = start(g), E = end(g), stringsAsFactors = FALSE) gSmat <- xtabs(S ~ rexdb + domain, data = gdf) colnames(gSmat) <- paste0(colnames(gSmat),"_S") gEmat <- xtabs(E ~ rexdb + domain, data = gdf) colnames(gEmat) <- paste0(colnames(gEmat),"_E") SEmat <- cbind(gSmat,gEmat) # reorder dom_position <- colMeans(SEmat) SEmat_sorted <- SEmat[,order(dom_position)] return(SEmat_sorted) } domains_SE = lapply(dparts, get_dom_pos) # add ltr5_E, ltr5_S, .. LSE = list() for (i in seq_along(domains_SE)){ ltr3 = lf$X3.LTR.position[match(rownames(domains_SE[[i]]), lf$REXdb.name)] ltr5 = lf$X5.LTR..position[match(rownames(domains_SE[[i]]), lf$REXdb.name)] if (all(is.na(ltr3))){ # not ltr retrotransposon LSE[[i]] = domains_SE[[i]] next } ltr5_S = sapply(strsplit(ltr5, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[1])}) ltr5_E = sapply(strsplit(ltr5, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[2])}) ltr3_S = sapply(strsplit(ltr3, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[1])}) ltr3_E = sapply(strsplit(ltr3, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[2])}) LSE[[i]] = cbind(ltr5_S, ltr5_E, domains_SE[[i]], ltr3_S, ltr3_E) } names(LSE) = names(domains_SE) newN = function(a,b){ ap = gsub("_.+","", a) bp = gsub("_.+","", b) ifelse(ap == bp, ap, paste0(ap,"_", bp)) } # positions to distances: LSED = lapply(LSE, function(x){ x[x==0]=NA N = ncol(x) if (is.null(N)){ return(NULL) } NM2 = colnames(x[,2:N]) NM1 = colnames(x[,1:(N - 1)]) d = x[,2:N, drop = FALSE] - x[,1:(N-1), drop=FALSE] colnames(d) = newN(NM1, NM2) return(d) } ) # create empirical cumulative distribution function get_ecdf <- function(dd, s = 1) { ecdf_list = list() for (i1 in colnames(dd)){ for (i2 in colnames(dd)){ if (i1 == i2){ next }else{ ii = sort(c(i1,i2)) ecdf_list[[ii[1]]][[ii[2]]]=ecdf(s * abs(dd[,i2]-dd[,i1])) } } } return(ecdf_list) } ecdf_list = lapply(LSE, get_ecdf) ecdf_list_m = lapply(LSE, get_ecdf, -1) ecdf_model = list(plus = ecdf_list, minus = ecdf_list_m) saveRDS(ecdf_model, "databases/feature_distances_model.RDS") dante2dist <- function(dc){ # dc - cluster of domains, granges dpos <- get_dom_pos(dc) D <- c(dist(dpos)) NN <- combn(names(dpos), 2) # order feature in pair alphabeticaly N1 <- ifelse(NN[1,]<NN[2,], NN[1,], NN[2, ]) N2 <- ifelse(NN[1,]>NN[2,], NN[1,], NN[2, ]) dfd <- data.frame(F1 = N1, F2 = N2, distance = D) } dante_to_quantiles <- function(dc, model_function, lineage=NULL){ dfd <- dante2dist(dc) if (is.null(lineage)){ lineage <- dc$Final_Classification[1] } fn <- model_function$plus[[lineage]] fnm <- model_function$minus[[lineage]] dstat1 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fn[[a]][[b]]), dfd$distance, FUN=function(f, x)f(x)) dstat2 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fnm[[a]][[b]]), dfd$distance, FUN=function(f, x)f(-x)) dout <- cbind(dfd, dstat1, dstat2, dstat12 = ifelse(dstat1>dstat2, dstat2, dstat1)) } q=dante_to_quantiles(dparts_rex[[29]][[500]], ecdf_model) qq = lapply(dparts_rex[[29]], dante_to_quantiles, ecdf_model, lineage = dparts_rex[[43]][[1]]$Final_Classification[1]) qq = lapply(dparts_rex[[29]], dante_to_quantiles, ecdf_model, lineage = dparts_rex[[29]][[1]]$Final_Classification[1]) ll = sapply(qq, function(x)(sum(x$dstat12<0.01))/nrow(x)) lln = sapply(qq, function(x)(sum(x$dstat12<0.01))) table(ll > 0.1) # more that 10% dist are outliers table(ll) table(lln) lll = sapply(ll, min) hist(lll, breaks= 500) dc = dparts_rex[[30]][[30]] print(min(dout[4:5])) plot(dout[4:5], xlim=c(0,1), ylim=c(0,1)) hist(dout[,6], breaks=50, xlim=c(0,1)) mtext(min(dout[,6])) apply(LSED[[30]],2, median, na.rm = TRUE)