Mercurial > repos > petr-novak > dante_ltr
annotate do_not_track_info/te_analysis.R @ 9:1aa578e6c8b3 draft
"planemo upload commit 9488b982bae902f1868785ec4ad47134dac50ff3"
author | petr-novak |
---|---|
date | Wed, 29 Jun 2022 09:25:54 +0000 |
parents | c33d6583e548 |
children |
rev | line source |
---|---|
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
1 #!/usr/bin/env Rscript |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
2 library(ggplot2) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
3 lf <- read.table("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/ltr_finder_13100_2018_144_MOESM1_ESM.csv", |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
4 as.is=TRUE, header=TRUE, sep="\t", quote = "") |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
5 cls <- read.table("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/classification__domain_features_13100_2018_144_MOESM1_ESM.csv", |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
6 as.is=TRUE, header=TRUE, sep="\t", quote='', comment.char = "") |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
7 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
8 # add classification to ltr |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
9 rexdb <- cls$Classification..new03. |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
10 names(rexdb) <- cls$REXdb.name |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
11 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
12 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
13 lf$classification <- cls$Classification..new03.[match(lf$REXdb.name, cls$REXdb.name)] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
14 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
15 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
16 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
17 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
18 length_quantiles = sapply(split(lf$size..including.TSD., lf$classification), quantile, seq(0,1,by=0.025)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
19 length95min = length_quantiles[2,] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
20 length95max = length_quantiles[40,] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
21 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
22 ltr_length_quantiles = sapply(split(lf$X3.LTR.size, lf$classification), quantile, seq(0,1,by=0.025)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
23 ltr_length95min = ltr_length_quantiles[2,] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
24 ltr_length95max = ltr_length_quantiles[40,] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
25 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
26 write.table(length_quantiles, file = "/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/TE_length_quantiles.csv", sep="\t") |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
27 write.table(ltr_length_quantiles, file = "/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/LTR_length_quantiles.csv", sep="\t") |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
28 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
29 library(rtracklayer) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
30 d = import("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/dante_on_rexdb.gff3") |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
31 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
32 # some elements have duplicated domains - remove it |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
33 dom_dup = sapply(split(d$Name, seqnames(d)), function (x)any(duplicated(x))) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
34 d2 = d[seqnames(d) %in% names(dom_dup)[!dom_dup]] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
35 dparts = split(d2, d2$Final_Classification) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
36 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
37 dparts_rex = sapply(dparts, function(x) split(x, seqnames(x), drop = TRUE)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
38 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
39 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
40 get_dom_pos <- function(g){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
41 if (length(g) == 0){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
42 return(NULL) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
43 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
44 gdf <- data.frame(rexdb = as.character(seqnames(g)), |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
45 domain = g$Name, |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
46 S = start(g), |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
47 E = end(g), |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
48 stringsAsFactors = FALSE) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
49 gSmat <- xtabs(S ~ rexdb + domain, data = gdf) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
50 colnames(gSmat) <- paste0(colnames(gSmat),"_S") |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
51 gEmat <- xtabs(E ~ rexdb + domain, data = gdf) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
52 colnames(gEmat) <- paste0(colnames(gEmat),"_E") |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
53 SEmat <- cbind(gSmat,gEmat) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
54 # reorder |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
55 dom_position <- colMeans(SEmat) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
56 SEmat_sorted <- SEmat[,order(dom_position)] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
57 return(SEmat_sorted) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
58 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
59 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
60 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
61 domains_SE = lapply(dparts, get_dom_pos) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
62 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
63 # add ltr5_E, ltr5_S, .. |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
64 LSE = list() |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
65 for (i in seq_along(domains_SE)){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
66 ltr3 = lf$X3.LTR.position[match(rownames(domains_SE[[i]]), lf$REXdb.name)] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
67 ltr5 = lf$X5.LTR..position[match(rownames(domains_SE[[i]]), lf$REXdb.name)] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
68 if (all(is.na(ltr3))){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
69 # not ltr retrotransposon |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
70 LSE[[i]] = domains_SE[[i]] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
71 next |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
72 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
73 ltr5_S = sapply(strsplit(ltr5, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[1])}) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
74 ltr5_E = sapply(strsplit(ltr5, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[2])}) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
75 ltr3_S = sapply(strsplit(ltr3, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[1])}) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
76 ltr3_E = sapply(strsplit(ltr3, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[2])}) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
77 LSE[[i]] = cbind(ltr5_S, ltr5_E, domains_SE[[i]], ltr3_S, ltr3_E) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
78 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
79 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
80 names(LSE) = names(domains_SE) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
81 newN = function(a,b){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
82 ap = gsub("_.+","", a) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
83 bp = gsub("_.+","", b) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
84 ifelse(ap == bp, ap, paste0(ap,"_", bp)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
85 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
86 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
87 # positions to distances: |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
88 LSED = lapply(LSE, function(x){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
89 x[x==0]=NA |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
90 N = ncol(x) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
91 if (is.null(N)){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
92 return(NULL) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
93 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
94 NM2 = colnames(x[,2:N]) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
95 NM1 = colnames(x[,1:(N - 1)]) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
96 d = x[,2:N, drop = FALSE] - x[,1:(N-1), drop=FALSE] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
97 colnames(d) = newN(NM1, NM2) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
98 return(d) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
99 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
100 ) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
101 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
102 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
103 # create empirical cumulative distribution function |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
104 get_ecdf <- function(dd, s = 1) { |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
105 ecdf_list = list() |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
106 for (i1 in colnames(dd)){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
107 for (i2 in colnames(dd)){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
108 if (i1 == i2){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
109 next |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
110 }else{ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
111 ii = sort(c(i1,i2)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
112 ecdf_list[[ii[1]]][[ii[2]]]=ecdf(s * abs(dd[,i2]-dd[,i1])) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
113 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
114 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
115 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
116 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
117 return(ecdf_list) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
118 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
119 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
120 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
121 ecdf_list = lapply(LSE, get_ecdf) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
122 ecdf_list_m = lapply(LSE, get_ecdf, -1) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
123 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
124 ecdf_model = list(plus = ecdf_list, minus = ecdf_list_m) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
125 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
126 saveRDS(ecdf_model, "databases/feature_distances_model.RDS") |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
127 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
128 dante2dist <- function(dc){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
129 # dc - cluster of domains, granges |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
130 dpos <- get_dom_pos(dc) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
131 D <- c(dist(dpos)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
132 NN <- combn(names(dpos), 2) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
133 # order feature in pair alphabeticaly |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
134 N1 <- ifelse(NN[1,]<NN[2,], NN[1,], NN[2, ]) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
135 N2 <- ifelse(NN[1,]>NN[2,], NN[1,], NN[2, ]) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
136 dfd <- data.frame(F1 = N1, F2 = N2, distance = D) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
137 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
138 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
139 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
140 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
141 dante_to_quantiles <- function(dc, model_function, lineage=NULL){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
142 dfd <- dante2dist(dc) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
143 if (is.null(lineage)){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
144 lineage <- dc$Final_Classification[1] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
145 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
146 fn <- model_function$plus[[lineage]] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
147 fnm <- model_function$minus[[lineage]] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
148 dstat1 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fn[[a]][[b]]), dfd$distance, FUN=function(f, x)f(x)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
149 dstat2 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fnm[[a]][[b]]), dfd$distance, FUN=function(f, x)f(-x)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
150 dout <- cbind(dfd, dstat1, dstat2, dstat12 = ifelse(dstat1>dstat2, dstat2, dstat1)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
151 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
152 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
153 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
154 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
155 q=dante_to_quantiles(dparts_rex[[29]][[500]], ecdf_model) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
156 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
157 qq = lapply(dparts_rex[[29]], dante_to_quantiles, ecdf_model, lineage = dparts_rex[[43]][[1]]$Final_Classification[1]) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
158 qq = lapply(dparts_rex[[29]], dante_to_quantiles, ecdf_model, lineage = dparts_rex[[29]][[1]]$Final_Classification[1]) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
159 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
160 ll = sapply(qq, function(x)(sum(x$dstat12<0.01))/nrow(x)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
161 lln = sapply(qq, function(x)(sum(x$dstat12<0.01))) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
162 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
163 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
164 table(ll > 0.1) # more that 10% dist are outliers |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
165 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
166 table(ll) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
167 table(lln) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
168 lll = sapply(ll, min) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
169 hist(lll, breaks= 500) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
170 dc = dparts_rex[[30]][[30]] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
171 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
172 print(min(dout[4:5])) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
173 plot(dout[4:5], xlim=c(0,1), ylim=c(0,1)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
174 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
175 hist(dout[,6], breaks=50, xlim=c(0,1)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
176 mtext(min(dout[,6])) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
177 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
diff
changeset
|
178 apply(LSED[[30]],2, median, na.rm = TRUE) |