annotate compare_gff.R @ 11:5366d5ea04bc draft

planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
author petr-novak
date Fri, 04 Aug 2023 12:35:32 +0000
parents b53f5a456d01
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
1 #!/usr/bin/env Rscript
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
2 # function from questionr R package:
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
3 wtable <- function(x, y = NULL, weights = NULL, digits = 3, normwt = FALSE, useNA = c("no", "ifany", "always"), na.rm = TRUE, na.show = FALSE, exclude = NULL)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
4 {
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
5 if (is.null(weights)) {
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
6 warning("no weights argument given, using uniform weights of 1")
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
7 weights <- rep(1, length(x))
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
8 }
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
9 if (length(x) != length(weights)) stop("x and weights lengths must be the same")
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
10 if (!is.null(y) & (length(x) != length(y))) stop("x and y lengths must be the same")
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
11 miss.usena <- missing(useNA)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
12 useNA <- match.arg(useNA)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
13 weights[is.na(weights)] <- 0
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
14 if (normwt) {
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
15 weights <- weights * length(x)/sum(weights)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
16 }
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
17
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
18 if (!missing(na.show) || !missing(na.rm)) {
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
19 warning("'na.rm' and 'na.show' are ignored when 'useNA' is provided.")
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
20 }
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
21 if (useNA != "no" || (na.show && miss.usena)) {
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
22 if (match(NA, exclude, nomatch = 0L)) {
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
23 warning("'exclude' containing NA and 'useNA' != \"no\"' are a bit contradicting")
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
24 }
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
25 x <- addNA(x)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
26 if (!is.null(y)) y <- addNA(y)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
27 }
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
28 if (useNA == "no" || (na.rm && miss.usena)) {
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
29 s <- !is.na(x) & !is.na(weights)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
30 if (!is.null(y)) s <- s & !is.na(y)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
31 x <- x[s, drop = FALSE]
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
32 if (!is.null(y)) y <- y[s, drop = FALSE]
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
33 weights <- weights[s]
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
34 }
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
35 if (!is.null(exclude)) {
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
36 s <- !(x %in% exclude)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
37 if (!is.null(y)) s <- s & !(y %in% exclude)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
38 x <- factor(x[s, drop = FALSE])
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
39 if (!is.null(y)) y <- factor(y[s, drop = FALSE])
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
40 weights <- weights[s]
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
41 }
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
42 if (is.null(y)) {
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
43 result <- tapply(weights, x, sum, simplify = TRUE)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
44 }
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
45 else {
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
46 result <- tapply(weights, list(x,y), sum, simplify = TRUE)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
47 }
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
48 result[is.na(result)] <- 0
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
49 tab <- as.table(result)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
50 if (useNA == "ifany") {
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
51 if (!is.null(y)) {
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
52 if (sum(tab[,is.na(colnames(tab))]) == 0) tab <- tab[,!is.na(colnames(tab))]
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
53 if (sum(tab[is.na(rownames(tab)),]) == 0) tab <- tab[!is.na(rownames(tab)),]
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
54 } else {
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
55 if (tab[is.na(names(tab))] == 0) tab <- tab[!is.na(names(tab))]
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
56 }
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
57 }
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
58 tab
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
59 }
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
60
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
61 suppressPackageStartupMessages(library(rtracklayer))
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
62 library(parallel)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
63 g1 <- import(commandArgs(T)[1], format = "GFF")
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
64 g2 <- import(commandArgs(T)[2], format = "GFF")
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
65 # assume non inrarange overlaps!
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
66 attribute_name <- commandArgs(T)[3]
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
67 if (FALSE){
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
68 g1 <- import("test_data/RM_LTR_TE.gff3")
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
69 g2 <- import("test_data/RM_RE.gff3")
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
70 g2 <- import("test_data/RM_RE_CL.gff3")
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
71 g2 <- import("test_data/RM_RE_v3.gff3")
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
72 g2 <- import("test_data/RM_RE_v4.gff3")
11
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents: 6
diff changeset
73 g1 <- import("test_data/RM_RE_v4.gff3")
6
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
74
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
75 attribute_name <- "Name"
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
76 }
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
77
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
78 g1$SOURCE <- 1
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
79 g2$SOURCE <- 2
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
80
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
81 g12 <- append(g1,g2)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
82 g12_disjoin <- disjoin(g12, ignore.strand=TRUE, with.revmap=TRUE)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
83
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
84 g12$sel_attr <- mcols(g12)[,attribute_name]
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
85
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
86 annot_name <- mclapply(as.list(g12_disjoin$revmap), FUN = function(x)g12$sel_attr[x], mc.cores = 8)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
87 annot_source <- mclapply(as.list(g12_disjoin$revmap), FUN = function(x)g12$SOURCE[x], mc.cores = 8)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
88
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
89 c1 <- sapply(annot_source, function (x) 1 %in% x & !(2 %in% x))
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
90 c2 <- sapply(annot_source, function (x) 2 %in% x & !(1 %in% x))
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
91 c12 <- !(c1 | c2)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
92
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
93
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
94 n1 <- cbind(sapply(annot_name[c1], "[", 1 ), "No Annotation")
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
95 n2 <- cbind("No Annotation", sapply(annot_name[c2], "[", 1))
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
96
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
97 n12 <- do.call(rbind, annot_name[c12])
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
98
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
99 n12all <- matrix(character(), ncol = 2, nrow = length(g12_disjoin))
11
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents: 6
diff changeset
100
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents: 6
diff changeset
101 if (any(c1)){
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents: 6
diff changeset
102 n12all[c1,] <- n1
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents: 6
diff changeset
103 }
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents: 6
diff changeset
104 if (any(c2)){
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents: 6
diff changeset
105 n12all[c2,] <- n2
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents: 6
diff changeset
106 }
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents: 6
diff changeset
107 if (any(c12)){
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents: 6
diff changeset
108 n12all[c12,] <- n12
5366d5ea04bc planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents: 6
diff changeset
109 }
6
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
110
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
111 traw <- wtable(n12all[,1], n12all[,2], weights = width(g12_disjoin))
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
112 tbl_df <- as.data.frame((traw))
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
113 tbl_df <- tbl_df[order(tbl_df$Freq, decreasing = TRUE),]
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
114 colnames(tbl_df) <- c("Annotation1", "Annotation2", "Overlap[bp]")
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
115
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
116 ## NOTE - if name change - modify it in xml too
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
117 write.table(tbl_df, file = 'annotation_overlap_long.csv', sep="\t", quote=FALSE, row.names = FALSE)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
118
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
119 tbl <- as.matrix(traw)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
120 tbl <- tbl[order(rowSums(tbl), decreasing = TRUE),]
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
121 tbl <- tbl[, order(colSums(tbl), decreasing = TRUE)]
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
122 write.table(tbl, file= 'annotation_overlap.csv', sep="\t", quote = FALSE)
b53f5a456d01 "planemo upload commit 3aefb0555456837d10fe69e4ad25de08d5972cb2"
petr-novak
parents:
diff changeset
123