annotate do_not_track_info/te_analysis.R @ 9:1aa578e6c8b3 draft

"planemo upload commit 9488b982bae902f1868785ec4ad47134dac50ff3"
author petr-novak
date Wed, 29 Jun 2022 09:25:54 +0000
parents c33d6583e548
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
1 #!/usr/bin/env Rscript
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
2 library(ggplot2)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
3 lf <- read.table("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/ltr_finder_13100_2018_144_MOESM1_ESM.csv",
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
4 as.is=TRUE, header=TRUE, sep="\t", quote = "")
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
5 cls <- read.table("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/classification__domain_features_13100_2018_144_MOESM1_ESM.csv",
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
6 as.is=TRUE, header=TRUE, sep="\t", quote='', comment.char = "")
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
8 # add classification to ltr
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
9 rexdb <- cls$Classification..new03.
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
10 names(rexdb) <- cls$REXdb.name
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
11
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
12
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
13 lf$classification <- cls$Classification..new03.[match(lf$REXdb.name, cls$REXdb.name)]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
14
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
15
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
16
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
17
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
18 length_quantiles = sapply(split(lf$size..including.TSD., lf$classification), quantile, seq(0,1,by=0.025))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
19 length95min = length_quantiles[2,]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
20 length95max = length_quantiles[40,]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
21
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
22 ltr_length_quantiles = sapply(split(lf$X3.LTR.size, lf$classification), quantile, seq(0,1,by=0.025))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
23 ltr_length95min = ltr_length_quantiles[2,]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
24 ltr_length95max = ltr_length_quantiles[40,]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
25
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
26 write.table(length_quantiles, file = "/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/TE_length_quantiles.csv", sep="\t")
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
27 write.table(ltr_length_quantiles, file = "/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/LTR_length_quantiles.csv", sep="\t")
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
28
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
29 library(rtracklayer)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
30 d = import("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/dante_on_rexdb.gff3")
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
31
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
32 # some elements have duplicated domains - remove it
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
33 dom_dup = sapply(split(d$Name, seqnames(d)), function (x)any(duplicated(x)))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
34 d2 = d[seqnames(d) %in% names(dom_dup)[!dom_dup]]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
35 dparts = split(d2, d2$Final_Classification)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
36
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
37 dparts_rex = sapply(dparts, function(x) split(x, seqnames(x), drop = TRUE))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
38
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
39
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
40 get_dom_pos <- function(g){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
41 if (length(g) == 0){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
42 return(NULL)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
43 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
44 gdf <- data.frame(rexdb = as.character(seqnames(g)),
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
45 domain = g$Name,
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
46 S = start(g),
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
47 E = end(g),
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
48 stringsAsFactors = FALSE)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
49 gSmat <- xtabs(S ~ rexdb + domain, data = gdf)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
50 colnames(gSmat) <- paste0(colnames(gSmat),"_S")
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
51 gEmat <- xtabs(E ~ rexdb + domain, data = gdf)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
52 colnames(gEmat) <- paste0(colnames(gEmat),"_E")
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
53 SEmat <- cbind(gSmat,gEmat)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
54 # reorder
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
55 dom_position <- colMeans(SEmat)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
56 SEmat_sorted <- SEmat[,order(dom_position)]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
57 return(SEmat_sorted)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
58 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
59
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
60
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
61 domains_SE = lapply(dparts, get_dom_pos)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
62
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
63 # add ltr5_E, ltr5_S, ..
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
64 LSE = list()
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
65 for (i in seq_along(domains_SE)){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
66 ltr3 = lf$X3.LTR.position[match(rownames(domains_SE[[i]]), lf$REXdb.name)]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
67 ltr5 = lf$X5.LTR..position[match(rownames(domains_SE[[i]]), lf$REXdb.name)]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
68 if (all(is.na(ltr3))){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
69 # not ltr retrotransposon
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
70 LSE[[i]] = domains_SE[[i]]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
71 next
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
72 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
73 ltr5_S = sapply(strsplit(ltr5, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[1])})
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
74 ltr5_E = sapply(strsplit(ltr5, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[2])})
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
75 ltr3_S = sapply(strsplit(ltr3, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[1])})
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
76 ltr3_E = sapply(strsplit(ltr3, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[2])})
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
77 LSE[[i]] = cbind(ltr5_S, ltr5_E, domains_SE[[i]], ltr3_S, ltr3_E)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
78 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
79
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
80 names(LSE) = names(domains_SE)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
81 newN = function(a,b){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
82 ap = gsub("_.+","", a)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
83 bp = gsub("_.+","", b)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
84 ifelse(ap == bp, ap, paste0(ap,"_", bp))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
85 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
86
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
87 # positions to distances:
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
88 LSED = lapply(LSE, function(x){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
89 x[x==0]=NA
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
90 N = ncol(x)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
91 if (is.null(N)){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
92 return(NULL)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
93 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
94 NM2 = colnames(x[,2:N])
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
95 NM1 = colnames(x[,1:(N - 1)])
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
96 d = x[,2:N, drop = FALSE] - x[,1:(N-1), drop=FALSE]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
97 colnames(d) = newN(NM1, NM2)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
98 return(d)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
99 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
100 )
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
101
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
102
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
103 # create empirical cumulative distribution function
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
104 get_ecdf <- function(dd, s = 1) {
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
105 ecdf_list = list()
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
106 for (i1 in colnames(dd)){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
107 for (i2 in colnames(dd)){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
108 if (i1 == i2){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
109 next
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
110 }else{
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
111 ii = sort(c(i1,i2))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
112 ecdf_list[[ii[1]]][[ii[2]]]=ecdf(s * abs(dd[,i2]-dd[,i1]))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
113
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
114 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
115 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
116 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
117 return(ecdf_list)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
118 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
119
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
120
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
121 ecdf_list = lapply(LSE, get_ecdf)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
122 ecdf_list_m = lapply(LSE, get_ecdf, -1)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
123
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
124 ecdf_model = list(plus = ecdf_list, minus = ecdf_list_m)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
125
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
126 saveRDS(ecdf_model, "databases/feature_distances_model.RDS")
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
127
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
128 dante2dist <- function(dc){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
129 # dc - cluster of domains, granges
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
130 dpos <- get_dom_pos(dc)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
131 D <- c(dist(dpos))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
132 NN <- combn(names(dpos), 2)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
133 # order feature in pair alphabeticaly
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
134 N1 <- ifelse(NN[1,]<NN[2,], NN[1,], NN[2, ])
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
135 N2 <- ifelse(NN[1,]>NN[2,], NN[1,], NN[2, ])
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
136 dfd <- data.frame(F1 = N1, F2 = N2, distance = D)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
137 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
138
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
139
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
140
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
141 dante_to_quantiles <- function(dc, model_function, lineage=NULL){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
142 dfd <- dante2dist(dc)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
143 if (is.null(lineage)){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
144 lineage <- dc$Final_Classification[1]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
145 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
146 fn <- model_function$plus[[lineage]]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
147 fnm <- model_function$minus[[lineage]]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
148 dstat1 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fn[[a]][[b]]), dfd$distance, FUN=function(f, x)f(x))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
149 dstat2 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fnm[[a]][[b]]), dfd$distance, FUN=function(f, x)f(-x))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
150 dout <- cbind(dfd, dstat1, dstat2, dstat12 = ifelse(dstat1>dstat2, dstat2, dstat1))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
151 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
152
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
153
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
154
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
155 q=dante_to_quantiles(dparts_rex[[29]][[500]], ecdf_model)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
156
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
157 qq = lapply(dparts_rex[[29]], dante_to_quantiles, ecdf_model, lineage = dparts_rex[[43]][[1]]$Final_Classification[1])
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
158 qq = lapply(dparts_rex[[29]], dante_to_quantiles, ecdf_model, lineage = dparts_rex[[29]][[1]]$Final_Classification[1])
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
159
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
160 ll = sapply(qq, function(x)(sum(x$dstat12<0.01))/nrow(x))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
161 lln = sapply(qq, function(x)(sum(x$dstat12<0.01)))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
162
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
163
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
164 table(ll > 0.1) # more that 10% dist are outliers
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
165
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
166 table(ll)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
167 table(lln)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
168 lll = sapply(ll, min)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
169 hist(lll, breaks= 500)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
170 dc = dparts_rex[[30]][[30]]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
171
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
172 print(min(dout[4:5]))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
173 plot(dout[4:5], xlim=c(0,1), ylim=c(0,1))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
174
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
175 hist(dout[,6], breaks=50, xlim=c(0,1))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
176 mtext(min(dout[,6]))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
177
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff changeset
178 apply(LSED[[30]],2, median, na.rm = TRUE)