Mercurial > repos > petr-novak > repeat_annotation_pipeline3
annotate compare_gff.R @ 9:4a068d23fda6 draft
Uploaded
author | petr-novak |
---|---|
date | Wed, 03 May 2023 11:19:06 +0000 |
parents | b53f5a456d01 |
children | 5366d5ea04bc |
rev | line source |
---|---|
6
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
1 #!/usr/bin/env Rscript |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
2 # function from questionr R package: |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
3 wtable <- function(x, y = NULL, weights = NULL, digits = 3, normwt = FALSE, useNA = c("no", "ifany", "always"), na.rm = TRUE, na.show = FALSE, exclude = NULL) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
4 { |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
5 if (is.null(weights)) { |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
6 warning("no weights argument given, using uniform weights of 1") |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
7 weights <- rep(1, length(x)) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
8 } |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
9 if (length(x) != length(weights)) stop("x and weights lengths must be the same") |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
10 if (!is.null(y) & (length(x) != length(y))) stop("x and y lengths must be the same") |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
11 miss.usena <- missing(useNA) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
12 useNA <- match.arg(useNA) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
13 weights[is.na(weights)] <- 0 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
14 if (normwt) { |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
15 weights <- weights * length(x)/sum(weights) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
16 } |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
17 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
18 if (!missing(na.show) || !missing(na.rm)) { |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
19 warning("'na.rm' and 'na.show' are ignored when 'useNA' is provided.") |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
20 } |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
21 if (useNA != "no" || (na.show && miss.usena)) { |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
22 if (match(NA, exclude, nomatch = 0L)) { |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
23 warning("'exclude' containing NA and 'useNA' != \"no\"' are a bit contradicting") |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
24 } |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
25 x <- addNA(x) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
26 if (!is.null(y)) y <- addNA(y) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
27 } |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
28 if (useNA == "no" || (na.rm && miss.usena)) { |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
29 s <- !is.na(x) & !is.na(weights) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
30 if (!is.null(y)) s <- s & !is.na(y) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
31 x <- x[s, drop = FALSE] |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
32 if (!is.null(y)) y <- y[s, drop = FALSE] |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
33 weights <- weights[s] |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
34 } |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
35 if (!is.null(exclude)) { |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
36 s <- !(x %in% exclude) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
37 if (!is.null(y)) s <- s & !(y %in% exclude) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
38 x <- factor(x[s, drop = FALSE]) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
39 if (!is.null(y)) y <- factor(y[s, drop = FALSE]) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
40 weights <- weights[s] |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
41 } |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
42 if (is.null(y)) { |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
43 result <- tapply(weights, x, sum, simplify = TRUE) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
44 } |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
45 else { |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
46 result <- tapply(weights, list(x,y), sum, simplify = TRUE) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
47 } |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
48 result[is.na(result)] <- 0 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
49 tab <- as.table(result) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
50 if (useNA == "ifany") { |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
51 if (!is.null(y)) { |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
52 if (sum(tab[,is.na(colnames(tab))]) == 0) tab <- tab[,!is.na(colnames(tab))] |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
53 if (sum(tab[is.na(rownames(tab)),]) == 0) tab <- tab[!is.na(rownames(tab)),] |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
54 } else { |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
55 if (tab[is.na(names(tab))] == 0) tab <- tab[!is.na(names(tab))] |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
56 } |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
57 } |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
58 tab |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
59 } |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
60 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
61 suppressPackageStartupMessages(library(rtracklayer)) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
62 library(parallel) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
63 g1 <- import(commandArgs(T)[1], format = "GFF") |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
64 g2 <- import(commandArgs(T)[2], format = "GFF") |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
65 # assume non inrarange overlaps! |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
66 attribute_name <- commandArgs(T)[3] |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
67 if (FALSE){ |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
68 g1 <- import("test_data/RM_LTR_TE.gff3") |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
69 g2 <- import("test_data/RM_RE.gff3") |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
70 g2 <- import("test_data/RM_RE_CL.gff3") |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
71 g2 <- import("test_data/RM_RE_v3.gff3") |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
72 g2 <- import("test_data/RM_RE_v4.gff3") |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
73 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
74 attribute_name <- "Name" |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
75 } |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
76 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
77 g1$SOURCE <- 1 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
78 g2$SOURCE <- 2 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
79 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
80 g12 <- append(g1,g2) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
81 g12_disjoin <- disjoin(g12, ignore.strand=TRUE, with.revmap=TRUE) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
82 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
83 g12$sel_attr <- mcols(g12)[,attribute_name] |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
84 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
85 annot_name <- mclapply(as.list(g12_disjoin$revmap), FUN = function(x)g12$sel_attr[x], mc.cores = 8) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
86 annot_source <- mclapply(as.list(g12_disjoin$revmap), FUN = function(x)g12$SOURCE[x], mc.cores = 8) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
87 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
88 c1 <- sapply(annot_source, function (x) 1 %in% x & !(2 %in% x)) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
89 c2 <- sapply(annot_source, function (x) 2 %in% x & !(1 %in% x)) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
90 c12 <- !(c1 | c2) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
91 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
92 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
93 n1 <- cbind(sapply(annot_name[c1], "[", 1 ), "No Annotation") |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
94 n2 <- cbind("No Annotation", sapply(annot_name[c2], "[", 1)) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
95 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
96 n12 <- do.call(rbind, annot_name[c12]) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
97 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
98 n12all <- matrix(character(), ncol = 2, nrow = length(g12_disjoin)) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
99 n12all[c1,] <- n1 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
100 n12all[c2,] <- n2 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
101 n12all[c12,] <- n12 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
102 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
103 traw <- wtable(n12all[,1], n12all[,2], weights = width(g12_disjoin)) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
104 tbl_df <- as.data.frame((traw)) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
105 tbl_df <- tbl_df[order(tbl_df$Freq, decreasing = TRUE),] |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
106 colnames(tbl_df) <- c("Annotation1", "Annotation2", "Overlap[bp]") |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
107 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
108 ## NOTE - if name change - modify it in xml too |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
109 write.table(tbl_df, file = 'annotation_overlap_long.csv', sep="\t", quote=FALSE, row.names = FALSE) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
110 |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
111 tbl <- as.matrix(traw) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
112 tbl <- tbl[order(rowSums(tbl), decreasing = TRUE),] |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
113 tbl <- tbl[, order(colSums(tbl), decreasing = TRUE)] |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
114 write.table(tbl, file= 'annotation_overlap.csv', sep="\t", quote = FALSE) |
b53f5a456d01
"planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff
changeset
|
115 |