Mercurial > repos > petr-novak > dante_ltr
comparison do_not_track_info/te_analysis.R @ 7:c33d6583e548 draft
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
author | petr-novak |
---|---|
date | Fri, 24 Jun 2022 14:19:48 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
6:b91ca438a1cb | 7:c33d6583e548 |
---|---|
1 #!/usr/bin/env Rscript | |
2 library(ggplot2) | |
3 lf <- read.table("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/ltr_finder_13100_2018_144_MOESM1_ESM.csv", | |
4 as.is=TRUE, header=TRUE, sep="\t", quote = "") | |
5 cls <- read.table("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/classification__domain_features_13100_2018_144_MOESM1_ESM.csv", | |
6 as.is=TRUE, header=TRUE, sep="\t", quote='', comment.char = "") | |
7 | |
8 # add classification to ltr | |
9 rexdb <- cls$Classification..new03. | |
10 names(rexdb) <- cls$REXdb.name | |
11 | |
12 | |
13 lf$classification <- cls$Classification..new03.[match(lf$REXdb.name, cls$REXdb.name)] | |
14 | |
15 | |
16 | |
17 | |
18 length_quantiles = sapply(split(lf$size..including.TSD., lf$classification), quantile, seq(0,1,by=0.025)) | |
19 length95min = length_quantiles[2,] | |
20 length95max = length_quantiles[40,] | |
21 | |
22 ltr_length_quantiles = sapply(split(lf$X3.LTR.size, lf$classification), quantile, seq(0,1,by=0.025)) | |
23 ltr_length95min = ltr_length_quantiles[2,] | |
24 ltr_length95max = ltr_length_quantiles[40,] | |
25 | |
26 write.table(length_quantiles, file = "/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/TE_length_quantiles.csv", sep="\t") | |
27 write.table(ltr_length_quantiles, file = "/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/LTR_length_quantiles.csv", sep="\t") | |
28 | |
29 library(rtracklayer) | |
30 d = import("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/dante_on_rexdb.gff3") | |
31 | |
32 # some elements have duplicated domains - remove it | |
33 dom_dup = sapply(split(d$Name, seqnames(d)), function (x)any(duplicated(x))) | |
34 d2 = d[seqnames(d) %in% names(dom_dup)[!dom_dup]] | |
35 dparts = split(d2, d2$Final_Classification) | |
36 | |
37 dparts_rex = sapply(dparts, function(x) split(x, seqnames(x), drop = TRUE)) | |
38 | |
39 | |
40 get_dom_pos <- function(g){ | |
41 if (length(g) == 0){ | |
42 return(NULL) | |
43 } | |
44 gdf <- data.frame(rexdb = as.character(seqnames(g)), | |
45 domain = g$Name, | |
46 S = start(g), | |
47 E = end(g), | |
48 stringsAsFactors = FALSE) | |
49 gSmat <- xtabs(S ~ rexdb + domain, data = gdf) | |
50 colnames(gSmat) <- paste0(colnames(gSmat),"_S") | |
51 gEmat <- xtabs(E ~ rexdb + domain, data = gdf) | |
52 colnames(gEmat) <- paste0(colnames(gEmat),"_E") | |
53 SEmat <- cbind(gSmat,gEmat) | |
54 # reorder | |
55 dom_position <- colMeans(SEmat) | |
56 SEmat_sorted <- SEmat[,order(dom_position)] | |
57 return(SEmat_sorted) | |
58 } | |
59 | |
60 | |
61 domains_SE = lapply(dparts, get_dom_pos) | |
62 | |
63 # add ltr5_E, ltr5_S, .. | |
64 LSE = list() | |
65 for (i in seq_along(domains_SE)){ | |
66 ltr3 = lf$X3.LTR.position[match(rownames(domains_SE[[i]]), lf$REXdb.name)] | |
67 ltr5 = lf$X5.LTR..position[match(rownames(domains_SE[[i]]), lf$REXdb.name)] | |
68 if (all(is.na(ltr3))){ | |
69 # not ltr retrotransposon | |
70 LSE[[i]] = domains_SE[[i]] | |
71 next | |
72 } | |
73 ltr5_S = sapply(strsplit(ltr5, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[1])}) | |
74 ltr5_E = sapply(strsplit(ltr5, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[2])}) | |
75 ltr3_S = sapply(strsplit(ltr3, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[1])}) | |
76 ltr3_E = sapply(strsplit(ltr3, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[2])}) | |
77 LSE[[i]] = cbind(ltr5_S, ltr5_E, domains_SE[[i]], ltr3_S, ltr3_E) | |
78 } | |
79 | |
80 names(LSE) = names(domains_SE) | |
81 newN = function(a,b){ | |
82 ap = gsub("_.+","", a) | |
83 bp = gsub("_.+","", b) | |
84 ifelse(ap == bp, ap, paste0(ap,"_", bp)) | |
85 } | |
86 | |
87 # positions to distances: | |
88 LSED = lapply(LSE, function(x){ | |
89 x[x==0]=NA | |
90 N = ncol(x) | |
91 if (is.null(N)){ | |
92 return(NULL) | |
93 } | |
94 NM2 = colnames(x[,2:N]) | |
95 NM1 = colnames(x[,1:(N - 1)]) | |
96 d = x[,2:N, drop = FALSE] - x[,1:(N-1), drop=FALSE] | |
97 colnames(d) = newN(NM1, NM2) | |
98 return(d) | |
99 } | |
100 ) | |
101 | |
102 | |
103 # create empirical cumulative distribution function | |
104 get_ecdf <- function(dd, s = 1) { | |
105 ecdf_list = list() | |
106 for (i1 in colnames(dd)){ | |
107 for (i2 in colnames(dd)){ | |
108 if (i1 == i2){ | |
109 next | |
110 }else{ | |
111 ii = sort(c(i1,i2)) | |
112 ecdf_list[[ii[1]]][[ii[2]]]=ecdf(s * abs(dd[,i2]-dd[,i1])) | |
113 | |
114 } | |
115 } | |
116 } | |
117 return(ecdf_list) | |
118 } | |
119 | |
120 | |
121 ecdf_list = lapply(LSE, get_ecdf) | |
122 ecdf_list_m = lapply(LSE, get_ecdf, -1) | |
123 | |
124 ecdf_model = list(plus = ecdf_list, minus = ecdf_list_m) | |
125 | |
126 saveRDS(ecdf_model, "databases/feature_distances_model.RDS") | |
127 | |
128 dante2dist <- function(dc){ | |
129 # dc - cluster of domains, granges | |
130 dpos <- get_dom_pos(dc) | |
131 D <- c(dist(dpos)) | |
132 NN <- combn(names(dpos), 2) | |
133 # order feature in pair alphabeticaly | |
134 N1 <- ifelse(NN[1,]<NN[2,], NN[1,], NN[2, ]) | |
135 N2 <- ifelse(NN[1,]>NN[2,], NN[1,], NN[2, ]) | |
136 dfd <- data.frame(F1 = N1, F2 = N2, distance = D) | |
137 } | |
138 | |
139 | |
140 | |
141 dante_to_quantiles <- function(dc, model_function, lineage=NULL){ | |
142 dfd <- dante2dist(dc) | |
143 if (is.null(lineage)){ | |
144 lineage <- dc$Final_Classification[1] | |
145 } | |
146 fn <- model_function$plus[[lineage]] | |
147 fnm <- model_function$minus[[lineage]] | |
148 dstat1 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fn[[a]][[b]]), dfd$distance, FUN=function(f, x)f(x)) | |
149 dstat2 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fnm[[a]][[b]]), dfd$distance, FUN=function(f, x)f(-x)) | |
150 dout <- cbind(dfd, dstat1, dstat2, dstat12 = ifelse(dstat1>dstat2, dstat2, dstat1)) | |
151 } | |
152 | |
153 | |
154 | |
155 q=dante_to_quantiles(dparts_rex[[29]][[500]], ecdf_model) | |
156 | |
157 qq = lapply(dparts_rex[[29]], dante_to_quantiles, ecdf_model, lineage = dparts_rex[[43]][[1]]$Final_Classification[1]) | |
158 qq = lapply(dparts_rex[[29]], dante_to_quantiles, ecdf_model, lineage = dparts_rex[[29]][[1]]$Final_Classification[1]) | |
159 | |
160 ll = sapply(qq, function(x)(sum(x$dstat12<0.01))/nrow(x)) | |
161 lln = sapply(qq, function(x)(sum(x$dstat12<0.01))) | |
162 | |
163 | |
164 table(ll > 0.1) # more that 10% dist are outliers | |
165 | |
166 table(ll) | |
167 table(lln) | |
168 lll = sapply(ll, min) | |
169 hist(lll, breaks= 500) | |
170 dc = dparts_rex[[30]][[30]] | |
171 | |
172 print(min(dout[4:5])) | |
173 plot(dout[4:5], xlim=c(0,1), ylim=c(0,1)) | |
174 | |
175 hist(dout[,6], breaks=50, xlim=c(0,1)) | |
176 mtext(min(dout[,6])) | |
177 | |
178 apply(LSED[[30]],2, median, na.rm = TRUE) |