Mercurial > repos > petr-novak > dante_ltr
annotate R/ltr_utils.R @ 10:a004cd05177d draft
"planemo upload commit ca5a8b9bbf761419a408bce11a17e880d1b1152c"
author | petr-novak |
---|---|
date | Wed, 13 Jul 2022 10:59:58 +0000 |
parents | 1aa578e6c8b3 |
children | ff01d4263391 |
rev | line source |
---|---|
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
1 add_coordinates_of_closest_neighbor <- function(gff) { |
10
a004cd05177d
"planemo upload commit ca5a8b9bbf761419a408bce11a17e880d1b1152c"
petr-novak
parents:
9
diff
changeset
|
2 gff <- gff[order(seqnames(gff), as.vector(start(gff)))] |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
3 # split to chromosomes: |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
4 gff_parts <- split(gff, seqnames(gff)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
5 upstreams <- c(sapply(gff_parts, function(x) c(1, head(end(x), -1)))) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
6 downstreams <- c(sapply(gff_parts, function(x) c(start(x)[-1], seqlengths(x)[runValue(seqnames(x[1]))]))) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
7 gff_updated <- unlist(gff_parts) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
8 gff_updated$upstream_domain <- unlist(upstreams) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
9 gff_updated$downstream_domain <- unlist(downstreams) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
10 names(gff_updated) <- NULL |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
11 return(gff_updated) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
12 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
13 |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
14 get_domain_clusters_alt <- function(gff, dist_models, threshold=0.99){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
15 gff <- sort(gff, by = ~ seqnames * start) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
16 domain_pairs <- data.frame( |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
17 D1 = paste0(head(gff$Name,-1),"_S"), |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
18 D2 = paste0(gff$Name[-1],"_S"), |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
19 C1 = head(gff$Final_Classification,-1), |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
20 C2 = gff$Final_Classification[-1], |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
21 S1 = head(strand(gff),-1), |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
22 S2 = strand(gff)[-1], |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
23 start1 = head(start(gff),-1), |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
24 start2 = start(gff)[-1], |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
25 chrpos= paste0(seqnames(gff)[-1],"_",start(gff)[-1]) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
26 ) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
27 domain_pairs$D_A <- with(domain_pairs, ifelse(D1 < D2, D1, D2)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
28 domain_pairs$D_B <- with(domain_pairs, ifelse(D1 > D2, D1, D2)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
29 domain_pairs$quantile <- 1 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
30 same_element_cluster <- domain_pairs$S1 == domain_pairs$S2 & domain_pairs$C1 == domain_pairs$C2 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
31 domain_pairs$same_element_cluster <- same_element_cluster |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
32 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
33 q_plus_function <- apply(domain_pairs[same_element_cluster,], |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
34 1, |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
35 function(x) dist_models$plus[[x["C1"]]][[x["D_A"]]][[x["D_B"]]]) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
36 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
37 D <- abs(domain_pairs[same_element_cluster,]$start1 - domain_pairs[same_element_cluster,]$start2) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
38 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
39 q_plus_value <- mapply(function(x, D)if(is.null(x)){0}else{x(D)}, x= q_plus_function, D = D) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
40 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
41 domain_pairs$quantile[same_element_cluster] <- q_plus_value |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
42 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
43 domain_pairs$split_position <- !same_element_cluster | domain_pairs$quantile > threshold |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
44 # combine clusters based on distances and original cluster |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
45 clusters <- paste(cumsum(c(TRUE, domain_pairs$split_position)), get_domain_clusters(gff)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
46 return(clusters) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
47 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
48 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
49 |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
50 get_domain_clusters <- function(gff) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
51 # get consecutive domains from same linage |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
52 # must be sorted! |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
53 plus_order_split <- c(0, as.numeric(cumsum(head(gff$domain_order, -1) >= gff$domain_order[-1] & strand(gff)[-1] == '+'))) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
54 minus_order_split <- rev(as.numeric(cumsum(rev(c(gff$domain_order[-1] >= head(gff$domain_order,-1) & strand(gff)[-1] == '-', FALSE))))) |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
55 # split based on classification - must be same and consecutive |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
56 x <- rle(gff$Final_Classification) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
57 # split on strand change |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
58 s <- rep(seq_along(runLength(strand(gff))), runLength(strand(gff))) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
59 domain_cluster <- paste0(rep(seq_along(x$lengths), x$lengths), "_", seqnames(gff), |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
60 "_", plus_order_split, "_", minus_order_split, "_", s) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
61 clusters <- factor(domain_cluster, levels = unique(domain_cluster)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
62 return(clusters) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
63 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
64 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
65 # create partial TE from clusters |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
66 get_partial_te_from_cluster_of_domains <- function (gpart){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
67 ID <- paste("TE_partial_", gpart$Cluster[1], sep="") |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
68 te_partial <- GRanges(type="transposable_element_partial", |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
69 strand=strand(gpart)[1], |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
70 ID = ID, |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
71 source = "dante_ltr", |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
72 seqnames=seqnames(gpart)[1], |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
73 Final_Classification=gpart$Final_Classification[1], |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
74 Name=gpart$Final_Classification[1], |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
75 IRanges(min(start(gpart)), max(end(gpart)))) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
76 gpart$Parent <- ID |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
77 return(c(te_partial,gpart)) |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
78 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
79 |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
80 # create partial TE from clusters |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
81 get_partial_te_from_cluster_of_domains <- function(gpart, ID) { |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
82 te_partial <- makeGRangesFromDataFrame( |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
83 data.frame(type = "transposable_element_partial", |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
84 strand = gpart$strand[1], |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
85 ID = ID, |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
86 source = "dante_ltr", |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
87 seqnames = gpart$seqnames[1], |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
88 Final_Classification = |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
89 gpart$Final_Classification[1], |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
90 Name = gpart$Final_Classification[1], |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
91 IRanges(min(gpart$start), max(gpart$end))), |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
92 keep.extra.columns = TRUE) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
93 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
94 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
95 gpart$Parent <- ID |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
96 gpart_gr = makeGRangesFromDataFrame(gpart, keep.extra.columns = TRUE) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
97 return(c(te_partial, gpart_gr)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
98 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
99 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
100 # function to count for each element number of occurences: |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
101 count_occurences_for_each_element <- function(x) { |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
102 counts_unique <- table(x) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
103 counts <- counts_unique[x] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
104 counts |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
105 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
106 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
107 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
108 clean_domain_clusters <- function(gcl, lineage_domain_span, min_domains) { |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
109 ## remove clusters wich does not have enough domains or domains |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
110 ## are on different strand |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
111 N_domains <- sapply(gcl, nrow) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
112 N_unique_domains <- sapply(gcl, function(x)length(unique(x$Name))) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
113 S <- sapply(gcl, function(x)paste(sort(unique(x$strand)), collapse = " ")) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
114 S_OK <- S %in% c("+", "-") |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
115 max_span <- lineage_domain_span[sapply(gcl, function(x) x$Final_Classification[1])] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
116 # set to zero if lineage is not covered in lineage domain span |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
117 max_span[is.na(max_span)] <- 0 |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
118 span <- sapply(gcl, function(x)max(x$end) - min(x$start)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
119 cond1 <- S_OK & |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
120 N_unique_domains == N_domains & |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
121 N_domains >= min_domains & |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
122 span <= max_span |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
123 return(gcl[cond1]) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
124 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
125 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
126 check_ranges <- function(gx, s, offset = OFFSET) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
127 # check is range is not out of sequence length |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
128 START <- sapply(gx, function(x)min(x$start)) - offset |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
129 END <- sapply(gx, function(x)max(x$end)) + offset |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
130 MAX <- seqlengths(s)[sapply(gx, function(x)as.character(x$seqnames[1]))] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
131 good_ranges <- (START > 0) & (END <= MAX) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
132 return(good_ranges) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
133 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
134 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
135 get_ranges <- function(gx, offset = OFFSET) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
136 S <- sapply(gx, function(x)min(x$start)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
137 E <- sapply(gx, function(x)max(x$end)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
138 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset, end = E + offset)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
139 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
140 |
9
1aa578e6c8b3
"planemo upload commit 9488b982bae902f1868785ec4ad47134dac50ff3"
petr-novak
parents:
8
diff
changeset
|
141 get_ranges_left <- function(gx, offset = OFFSET, offset2 = 300) { |
8
9de392f2fc02
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents:
7
diff
changeset
|
142 ## offset2 - how many nt cen LTR extend to closes protein domain |
9de392f2fc02
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents:
7
diff
changeset
|
143 ## this is necassary as some detected proteins domains does not have correct bopundaries |
9de392f2fc02
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents:
7
diff
changeset
|
144 ## if LTR retrotransposons insters to other protein domain. |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
145 S <- sapply(gx, function(x)min(x$start)) |
9
1aa578e6c8b3
"planemo upload commit 9488b982bae902f1868785ec4ad47134dac50ff3"
petr-novak
parents:
8
diff
changeset
|
146 max_offset <- S - sapply(gx, function(x)min(x$upstream_domain)) + 10 |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
147 offset_adjusted <- ifelse(max_offset < offset, max_offset, offset) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
148 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset_adjusted, end = S + offset2)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
149 return(gr) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
150 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
151 |
9
1aa578e6c8b3
"planemo upload commit 9488b982bae902f1868785ec4ad47134dac50ff3"
petr-novak
parents:
8
diff
changeset
|
152 get_ranges_right <- function(gx, offset = OFFSET, offset2 = 300) { |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
153 E <- sapply(gx, function(x)max(x$end)) |
9
1aa578e6c8b3
"planemo upload commit 9488b982bae902f1868785ec4ad47134dac50ff3"
petr-novak
parents:
8
diff
changeset
|
154 max_offset <- sapply(gx, function(x)max(x$downstream_domain)) - E + 10 |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
155 offset_adjusted <- ifelse(max_offset < offset, max_offset, offset) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
156 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = E - offset2, end = E + offset_adjusted)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
157 return(gr) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
158 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
159 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
160 firstTG <- function(ss) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
161 x <- matchPattern("TG", ss) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
162 if (length(x) == 0) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
163 return(0) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
164 }else { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
165 return(min(start(x))) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
166 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
167 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
168 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
169 lastCA <- function(ss) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
170 x <- matchPattern("CA", ss) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
171 if (length(x) == 0) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
172 return(0) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
173 }else { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
174 return(max(start(x))) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
175 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
176 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
177 |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
178 domain_distance <- function(d_query, d_reference){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
179 if (d_query == d_reference){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
180 return (0) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
181 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
182 d_query_p <- strsplit(d_query," ")[[1]] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
183 d_reference_p <- strsplit(d_reference," ")[[1]] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
184 d <- length(d_reference_p) - sum(d_query_p == d_reference_p[d_reference_p %in% d_query_p]) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
185 return(d) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
186 } |
2
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
187 |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
188 |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
189 trim2TGAC <- function(bl) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
190 for (i in 1:nrow(bl)) { |
2
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
191 cons <- consensusString(c(bl$qseq[i], bl$sseq[i]), ambiguityMap="?") |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
192 tg_P <- firstTG(cons) |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
193 ca_P <- lastCA(cons) |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
194 e_dist <- bl$length[i] - ca_P |
8
9de392f2fc02
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents:
7
diff
changeset
|
195 max_dist <- 50 # was 25 |
2
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
196 no_match <- any(tg_P == 0, ca_P == 0) |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
197 if (!no_match & |
2
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
198 tg_P < max_dist & |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
199 e_dist < max_dist) { |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
200 ## trim alignment |
2
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
201 bl[i,] <- trim_blast_table(bl[i,], tg_P, e_dist - 1) |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
202 } |
2
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
203 |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
204 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
205 return(bl) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
206 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
207 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
208 trim_blast_table <- function(b, T1, T2) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
209 b$qstart <- b$qstart + T1 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
210 b$sstart <- b$sstart + T1 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
211 b$qend <- b$qend - T2 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
212 b$send <- b$send - T2 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
213 b$sseq <- substring(b$sseq, T1, b$length - T2) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
214 b$qseq <- substring(b$qseq, T1, b$length - T2) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
215 b$length <- nchar(b$sseq) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
216 return(b) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
217 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
218 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
219 blast_all2all <- function(seqs, db=NULL, ncpus=1, word_size=20, perc_identity=90, max_target_seq = 5000, task = "megablast", additional_params= ""){ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
220 if (ncpus == 1){ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
221 query <- list(seqs) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
222 }else{ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
223 query <-split(seqs, round(seq(1,ncpus,length.out = length(seqs)))) |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
224 if (length(query) < ncpus){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
225 ncpus <- length(query) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
226 } |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
227 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
228 if(is.null(db)){ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
229 # search against itself |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
230 db <- seqs |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
231 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
232 qf <-tempfile(fileext=paste0("_",1:ncpus,".fasta")) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
233 outf <-tempfile(fileext=paste0("_",1:ncpus,".csv")) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
234 dbf <- tempfile() |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
235 script <- tempfile() |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
236 writeXStringSet(db, dbf) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
237 mapply(query, qf, FUN = writeXStringSet) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
238 cols <- "qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen" |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
239 cmd_db <- paste("makeblastdb -dbtype nucl -in ", dbf) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
240 cmd_blast <- paste("blastn -task ", task, " -word_size", word_size, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
241 "-outfmt \"6 ", cols, "\" ", |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
242 "-perc_identity", perc_identity, " -min_raw_gapped_score 500", |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
243 "-max_target_seqs", max_target_seq, additional_params, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
244 "-query", qf, "-db", dbf, "-out", outf, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
245 "&" |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
246 ) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
247 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
248 # TODO - inspect only forward strand?? |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
249 system(cmd_db) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
250 cmd_all <- paste(paste(cmd_blast,collapse="\n"),"\nwait") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
251 cat(cmd_all, file = script) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
252 system(paste("sh ", script)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
253 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
254 bl_list <- lapply(outf, read.table, stringsAsFactors = FALSE, col.names = unlist(strsplit(cols, " ")), sep="\t", comment.char = "") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
255 bl_table <- do.call(rbind, bl_list) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
256 unlink(qf) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
257 #unlink(outf) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
258 unlink(dbf) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
259 unlink(script) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
260 return(bl_table) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
261 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
262 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
263 identify_conflicts <- function (bl){ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
264 QL <- gsub(".+[|]", "", bl$qaccver) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
265 SL <- gsub(".+[|]", "", bl$saccver) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
266 id_with_conflict <- unique(c(bl$qaccver[QL != SL],bl$saccver[QL != SL])) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
267 id_ok <- setdiff(unique(c(bl$qaccver,bl$saccver)), id_with_conflict) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
268 single_hit <- sapply( |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
269 sapply( |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
270 split(bl$qaccver, bl$saccver), unique |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
271 ), length) == 1 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
272 id_with_no_hits <- names(single_hit)[single_hit] # except hit to itself |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
273 return(list( |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
274 ok = id_ok, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
275 conflict = id_with_conflict, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
276 no_hit = id_with_no_hits) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
277 ) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
278 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
279 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
280 |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
281 analyze_TE <- function(seqs, ncpus = 10, word_size = 20, perc_identity = 90, ...){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
282 blt <- blast_all2all(seqs, ncpus = ncpus, word_size = word_size, perc_identity = perc_identity, ...) |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
283 te_conflict_info <- identify_conflicts(blt) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
284 blt_te_ok <- blast_table_subset(blt, te_conflict_info$ok) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
285 te_ok_lineages <- split(blt_te_ok, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
286 gsub( |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
287 ".+[|]", |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
288 "", |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
289 blt_te_ok$qaccver)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
290 gr_representative <- GRangesList(mclapply(te_ok_lineages, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
291 FUN = get_representative_ranges, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
292 mc.cores = ncpus |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
293 )) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
294 seqs_representative <- getSeq(seqs, Reduce(c, gr_representative)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
295 names(seqs_representative) <- seqnames(Reduce(c, gr_representative)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
296 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
297 return( |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
298 list( |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
299 seqs_representative = seqs_representative, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
300 te_conflict_info = te_conflict_info, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
301 gr_representative = gr_representative, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
302 blast = blt |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
303 ) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
304 ) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
305 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
306 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
307 query_coverage <- function(blt){ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
308 blt <- blt[blt$qaccver != blt$saccver,] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
309 Q_lengths <- blt$qlen[!duplicated(blt$qaccver)] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
310 names(Q_lengths) <- blt$qaccver[!duplicated(blt$qaccver)] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
311 gr <- GRanges(seqnames = blt$qaccver, seqlengths = Q_lengths, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
312 IRanges(start = blt$qstart, end = blt$qend)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
313 return(coverage(gr)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
314 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
315 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
316 multiplicity_of_te <- function(blt){ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
317 # exclude self to self hits and calculate coverage + mean_multiplicity of TE |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
318 # assuption is that TE which are 'identical' to other TE from the same lineage are |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
319 # likely correct |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
320 blt_no_self <- blt[blt$qaccver != blt$saccver,] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
321 cvr <- query_coverage(blt_no_self) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
322 L <- sapply(cvr, function(x) sum(width(x))) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
323 C1 <- sapply(cvr, function(x) sum(as.numeric(runValue(x) >= 1) * runLength(x))) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
324 multiplicity <- sapply(cvr, function(x) sum(as.numeric(runValue(x)) * runLength(x)))/L |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
325 data.frame(L = L, C1 = C1, multiplicity = multiplicity ) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
326 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
327 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
328 verify_based_on_multiplicity <- function(TE_info, min_coverage=0.99, min_multiplicity=3){ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
329 blt <- TE_info$blast[TE_info$blast$qaccver %in% TE_info$te_conflict_info$ok,] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
330 mp <- multiplicity_of_te(blt) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
331 id_ok_mp_verified <- rownames(mp)[mp$C1/mp$L > min_coverage & mp$multiplicity >= min_multiplicity] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
332 return(list(multiplicity = mp, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
333 id_ok_mp_verified = id_ok_mp_verified)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
334 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
335 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
336 |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
337 compare_TE_datasets <- function(Q, S, word_size = 20, min_coverage = 0.95, ncpus=10, ...){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
338 blt <- blast_all2all(seqs = Q, db = S, ncpus = ncpus, word_size = word_size, ...) |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
339 QL <- gsub(".+[|]", "", blt$qaccver) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
340 SL <- gsub(".+[|]", "", blt$saccver) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
341 id_with_conflict <- unique(c(blt$qaccver[QL != SL])) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
342 id_ok <- setdiff(unique(blt$qaccver), id_with_conflict) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
343 # check coverage hits |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
344 blt_ok <- blt[blt$qaccver %in% id_ok,] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
345 Q_lengths <- blt_ok$qlen[!duplicated(blt_ok$qaccver)] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
346 names(Q_lengths) <- blt_ok$qaccver[!duplicated(blt_ok$qaccver)] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
347 gr <- GRanges(seqnames = blt_ok$qaccver, seqlengths = Q_lengths, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
348 IRanges(start = blt_ok$qstart, end = blt_ok$qend)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
349 cvr <- coverage(gr) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
350 L <- sapply(cvr, function(x) sum(width(x))) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
351 C1 <- sapply(cvr, function(x) sum(as.numeric(runValue(x) >= 1) * runLength(x))) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
352 Max_uncovered <- sapply(cvr, function(x){ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
353 if(any(runValue(x)==0)){ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
354 return(max(runLength(x)[runValue(x) == 0])) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
355 }else{ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
356 return(0) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
357 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
358 }) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
359 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
360 # verified based on hit to reference - S |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
361 C1_prop <- C1/L |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
362 pass <- (C1_prop >= min_coverage & Max_uncovered < 500) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
363 if (any(pass)){ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
364 id_ok_verified <- names(C1_prop) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
365 }else { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
366 id_ok_verified <- NULL |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
367 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
368 return(list(id_with_conflict = id_with_conflict, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
369 id_ok = id_ok, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
370 id_ok_verified = id_ok_verified |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
371 )) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
372 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
373 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
374 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
375 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
376 blast_table_subset <- function(bl,id){ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
377 return(bl[bl$qaccver %in% id & bl$saccver %in% id,, drop = FALSE]) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
378 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
379 |
3
6ae4a341d1f3
"planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents:
2
diff
changeset
|
380 get_representative_ranges <- function(bl, min_length = 200, min_identity = 98){ |
6ae4a341d1f3
"planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents:
2
diff
changeset
|
381 bl <- bl[bl$pident>=min_identity, , drop=FALSE] |
6ae4a341d1f3
"planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents:
2
diff
changeset
|
382 bl <- bl[bl$pident>=min_identity & bl$length >= min_length, , drop=FALSE] |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
383 score <- sort(unlist(by(bl$bitscore, bl$qaccver, sum, simplify = FALSE)), |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
384 decreasing = TRUE) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
385 L <- bl$qlen[!duplicated(bl$qaccver)] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
386 names(L) <- bl$qaccver[!duplicated(bl$qaccver)] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
387 gr <- GRanges(seqnames = bl$qaccver, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
388 IRanges(start = bl$qstart, end = bl$qend), |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
389 seqlengths = L, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
390 subject = bl$saccver, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
391 sstart = ifelse(bl$send < bl$sstart, bl$send, bl$sstart), |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
392 send = ifelse(bl$send > bl$sstart, bl$send, bl$sstart)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
393 SN <- levels(seqnames(gr)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
394 inc <- rep(TRUE, length(gr)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
395 MSK <- GRangesList() |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
396 for (i in names(score)){ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
397 inc[gr$subject %in% i] <- FALSE |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
398 gr_part <- gr[seqnames(gr) %in% i & inc] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
399 MSK[[i]] <- GRanges(seqnames = factor(gr_part$subject, levels = SN), |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
400 IRanges(start = gr_part$sstart, end = gr_part$send), |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
401 seqlengths = L |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
402 ) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
403 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
404 gout <- unlist(MSK) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
405 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
406 full_gr <- GRanges(seqnames = factor(SN, levels = SN), |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
407 IRanges(start = rep(1,length(L)), |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
408 end = L) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
409 ) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
410 unmasked_gr <- GenomicRanges::setdiff(full_gr, gout) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
411 return(unmasked_gr[width(unmasked_gr) >= min_length]) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
412 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
413 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
414 expected_diversity <- function(seqs, niter=100, km = 6){ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
415 L <- nchar(seqs) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
416 R <- matrix(ncol = niter, nrow = length(seqs)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
417 for (i in 1:niter){ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
418 seqs_rnd <- DNAStringSet(sapply(L, function(n) paste(sample(c("A", "C", "T", "G"), n, replace=TRUE), collapse=""))) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
419 R[,i] <- seq_diversity(seqs_rnd, km = km)$richness |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
420 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
421 R |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
422 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
423 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
424 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
425 seq_diversity <- function (seqs, km=6){ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
426 K <- oligonucleotideFrequency(seqs, width=km)>0 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
427 P <- t(K)/rowSums(K) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
428 # shannon index |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
429 SI <- apply(P, 2, function(x) {x1 <- x[x>0]; -sum(x1*log(x1))}) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
430 # richness |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
431 R <- rowSums(K) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
432 list(richness=R, shannon_index=SI) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
433 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
434 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
435 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
436 |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
437 blast <- function(s1, s2, expected_aln_lenght=NULL) { |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
438 tmp1 <- tempfile() |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
439 tmp2 <- tempfile() |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
440 tmp_out <- tempfile() |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
441 writeXStringSet(DNAStringSet(s1), tmp1) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
442 writeXStringSet(DNAStringSet(s2), tmp2) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
443 # alternative blast: |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
444 cmd <- paste("blastn -task blastn -word_size 7 -dust no -gapextend 1 -gapopen 2 -reward 1 -penalty -1", |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
445 " -query ", tmp1, ' -subject ', tmp2, ' -strand plus ', |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
446 '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq"', |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
447 ' -out', tmp_out) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
448 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
449 system(cmd) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
450 out_raw <- read.table(tmp_out, as.is = TRUE, sep = "\t", |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
451 col.names = strsplit("qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq", split = ' ')[[1]]) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
452 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
453 if (nrow(out_raw) == 0) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
454 return(out_raw) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
455 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
456 out <- trim2TGAC(out_raw) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
457 # remove alingment shorted that |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
458 # out <- out[out$length > 100,] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
459 # alngment must be at least 80% of expected LTRmin95 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
460 out <- out[out$length > (expected_aln_lenght *0.8),] |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
461 if (nrow(out) == 0) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
462 return(out) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
463 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
464 ## filter for TGCA |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
465 TG_L <- substring(out$qseq, 1, 2) == "TG" |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
466 TG_R <- substring(out$sseq, 1, 2) == "TG" |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
467 CA_L <- substring(out$qseq, out$length - 1, out$length) == "CA" |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
468 CA_R <- substring(out$sseq, out$length - 1, out$length) == "CA" |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
469 cond <- TG_L & TG_R & CA_L & CA_R |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
470 out <- out[cond, , drop = FALSE] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
471 unlink(tmp1) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
472 unlink(tmp2) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
473 unlink(tmp_out) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
474 # TODO - detele all temp files! |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
475 # unlink(tmp1, tmp2, tmp_out) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
476 return(out) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
477 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
478 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
479 get_correct_coordinates <- function(b) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
480 do.call(rbind, strsplit(b$qaccver, split = "_")) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
481 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
482 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
483 evaluate_ltr <- function(GR, GRL, GRR, blast_line, Lseq, Rseq) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
484 LTR_L <- makeGRangesFromDataFrame(data.frame(seqnames = seqnames(GR), |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
485 start = start(GRL) + blast_line$qstart - 2, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
486 end = start(GRL) + blast_line$qend - 1)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
487 LTR_R <- makeGRangesFromDataFrame(data.frame(seqnames = seqnames(GR), |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
488 start = start(GRR) + blast_line$sstart - 2, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
489 end = start(GRR) + blast_line$send - 1)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
490 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
491 TSD_L <- makeGRangesFromDataFrame(data.frame(seqnames = seqnames(GR), |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
492 start = start(GRL) + blast_line$qstart - 3:10, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
493 end = start(GRL) + blast_line$qstart - 3)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
494 TSD_R <- makeGRangesFromDataFrame(data.frame(seqnames = seqnames(GR), |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
495 start = start(GRR) + blast_line$send, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
496 end = start(GRR) + blast_line$send + 0:7)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
497 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
498 TSD_L_seq <- DNAStringSet(substring(Lseq, blast_line$qstart - 2:9, blast_line$qstart - 2)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
499 TSD_R_seq <- DNAStringSet(substring(Rseq, blast_line$send + 1, blast_line$send + 1:8)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
500 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
501 matching_tsd <- TSD_R_seq == TSD_L_seq |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
502 matching_tsd[1:3] <- FALSE # remove short tsd |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
503 p <- which(matching_tsd) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
504 if (length(p) > 0) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
505 TSD_Length <- max(p) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
506 TSD_sequence <- TSD_L_seq[TSD_Length] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
507 TSD_position <- append(TSD_R[TSD_Length], TSD_L[TSD_Length]) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
508 }else { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
509 TSD_Length <- 0 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
510 TSD_sequence <- "" |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
511 TSD_position <- NA |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
512 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
513 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
514 TE_Length <- end(LTR_R) - start(LTR_L) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
515 LTR_Identity <- blast_line$pident |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
516 out <- list(TSD_position = TSD_position, TSD_sequence = TSD_sequence, TSD_Length = TSD_Length, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
517 LTR_R_position = LTR_R, LTR_L_position = LTR_L, TE_Length = TE_Length, LTR_Identity = LTR_Identity) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
518 return(out) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
519 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
520 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
521 get_best_ltr <- function(x) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
522 tsd_ok <- sapply(x, function(k)k$TSD_Length > 3) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
523 te_length_ok <- sapply(x, function(k)k$TE_Length < 30000) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
524 ltr_length_ok <- sapply(x, function(k)width(k$LTR_R_position) >= 100 & width(k$LTR_L_position) >= 100) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
525 if (sum(tsd_ok & te_length_ok & ltr_length_ok) >= 1) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
526 # return the first one (best bitscore) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
527 return(x[tsd_ok & te_length_ok][1]) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
528 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
529 if (any(te_length_ok & ltr_length_ok)) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
530 return(x[te_length_ok & ltr_length_ok][1]) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
531 }else { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
532 return(NULL) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
533 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
534 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
535 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
536 get_te_gff3 <- function(g, ID) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
537 D <- makeGRangesFromDataFrame(g$domain, keep.extra.columns = TRUE) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
538 sn <- seqnames(D)[1] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
539 S <- strand(D)[1] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
540 TE <- GRanges(seqnames = sn, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
541 IRanges(start = start(g$ltr_info[[1]]$LTR_L_position), |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
542 end = end(g$ltr_info[[1]]$LTR_R_position)), strand = S) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
543 TE$type <- "transposable_element" |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
544 TE$ID <- ID |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
545 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
546 if (as.character(S) == "+") { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
547 LTR_5 <- g$ltr_info[[1]]$LTR_L |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
548 LTR_3 <- g$ltr_info[[1]]$LTR_R |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
549 }else { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
550 LTR_3 <- g$ltr_info[[1]]$LTR_L |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
551 LTR_5 <- g$ltr_info[[1]]$LTR_R |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
552 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
553 LTR_5$LTR <- '5LTR' |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
554 LTR_3$LTR <- '3LTR' |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
555 LTR_5$type <- "long_terminal_repeat" |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
556 LTR_3$type <- "long_terminal_repeat" |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
557 strand(LTR_3) <- S |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
558 strand(LTR_5) <- S |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
559 LTR_3$Parent <- ID |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
560 LTR_5$Parent <- ID |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
561 LTR_3$Final_Classification <- D$Final_Classification[1] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
562 LTR_5$Final_Classification <- D$Final_Classification[1] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
563 LTR_5$LTR_Identity <- g$ltr_info[[1]]$LTR_Identity |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
564 LTR_3$LTR_Identity <- g$ltr_info[[1]]$LTR_Identity |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
565 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
566 TE$LTR_Identity <- g$ltr_info[[1]]$LTR_Identity |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
567 TE$LTR5_length <- width(LTR_5) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
568 TE$LTR3_length <- width(LTR_3) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
569 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
570 if (is.na(g$ltr_info[[1]]$TSD_position)[1]) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
571 # no TSD found |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
572 TSD <- NULL |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
573 TE$TSD <- 'not_found' |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
574 }else { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
575 TSD <- g$ltr_info[[1]]$TSD_position |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
576 TSD$type <- "target_site_duplication" |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
577 TSD$Parent <- ID |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
578 TE$TSD <- as.character(g$ltr_info[[1]]$TSD_sequence) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
579 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
580 |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
581 TE$Name <- TE$Final_Classification <- D$Final_Classification[1] |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
582 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
583 D$Parent <- ID |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
584 out <- c(TE, LTR_3, LTR_5, D, TSD) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
585 return(out) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
586 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
587 |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
588 get_TE <- function(Lseq, Rseq, domains_gff, GR, GRL, GRR,LTR_length) { |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
589 xx <- blast(Lseq, Rseq, LTR_length) |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
590 if (nrow(xx) == 0) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
591 return(NULL) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
592 }else { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
593 ltr_tmp <- list() |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
594 for (j in 1:nrow(xx)) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
595 ltr_tmp[[j]] <- evaluate_ltr(GR, GRL, GRR, xx[j, , drop = FALSE], Lseq, Rseq) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
596 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
597 ltr <- get_best_ltr(ltr_tmp) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
598 if (length(ltr) == 0) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
599 return(NULL) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
600 ## add good ltr |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
601 }else { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
602 return(list(domain = domains_gff, ltr_info = ltr, blast_out = xx)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
603 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
604 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
605 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
606 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
607 add_pbs <- function(te, s, trna_db) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
608 ltr5 <- te[which(te$LTR == "5LTR")] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
609 STRAND <- as.character(strand(te)[1]) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
610 if (STRAND == "+") { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
611 pbs_gr <- GRanges(seqnames(ltr5), IRanges(start = end(ltr5) + 1, end = end(ltr5) + 31)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
612 pbs_s <- reverseComplement(getSeq(s, pbs_gr)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
613 }else { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
614 pbs_gr <- GRanges(seqnames(ltr5), IRanges(end = start(ltr5) - 1, start = start(ltr5) - 30)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
615 pbs_s <- getSeq(s, pbs_gr) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
616 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
617 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
618 names(pbs_s) <- "pbs_region" |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
619 # find trna match |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
620 tmp <- tempfile() |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
621 tmp_out <- tempfile() |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
622 writeXStringSet(DNAStringSet(pbs_s), tmp) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
623 # alternative blast: |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
624 cmd <- paste("blastn -task blastn -word_size 7 -dust no -perc_identity 100", |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
625 " -query ", tmp, ' -db ', trna_db, ' -strand plus ', |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
626 '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq"', |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
627 ' -out', tmp_out) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
628 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
629 system(cmd) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
630 pbs_exact_gr <- NULL |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
631 out_raw <- read.table(tmp_out, as.is = TRUE, sep = "\t", |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
632 col.names = strsplit( |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
633 "qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq", |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
634 split = ' ')[[1]]) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
635 if (nrow(out_raw) > 0) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
636 cca <- grepl("CCA$", out_raw$qseq) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
637 to_end <- out_raw$send == 23 # align to end of sequence |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
638 max_dist <- out_raw$qend > 25 # max 5 bp from ltr |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
639 min_length <- out_raw$length >= 10 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
640 out_pass <- out_raw[cca & to_end & max_dist & min_length,] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
641 if (nrow(out_pass) > 0) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
642 trna_id <- out_pass$saccver[1] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
643 if (STRAND == "+") { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
644 S <- end(ltr5) + 32 - out_pass$qend[1] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
645 E <- end(ltr5) + 32 - out_pass$qstart[1] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
646 }else { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
647 S <- start(ltr5) - 31 + out_pass$qstart[1] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
648 E <- start(ltr5) - 31 + out_pass$qend[1] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
649 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
650 pbs_exact_gr <- GRanges(seqnames(ltr5), IRanges(start = S, end = E)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
651 pbs_exact_gr$trna_id <- trna_id |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
652 pbs_exact_gr$Length <- out_pass$Length |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
653 strand(pbs_exact_gr) <- STRAND |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
654 pbs_exact_gr$type <- 'primer_binding_site' |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
655 pbs_exact_gr$Parent <- te[1]$ID |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
656 te$trna_id <- c(trna_id, rep(NA, length(te) - 1)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
657 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
658 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
659 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
660 te <- append(te, pbs_exact_gr) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
661 unlink(tmp) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
662 unlink(tmp_out) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
663 return(te) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
664 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
665 |
2
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
666 |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
667 get_te_rank <- function (gr){ |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
668 DL_id <- gr$ID[gr$type == "transposable_element" & |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
669 gr$TSD == "not_found" & |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
670 is.na(gr$trna_id)] |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
671 DLT_id <- gr$ID[gr$type == "transposable_element" & |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
672 gr$TSD != "not_found" & |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
673 is.na(gr$trna_id)] |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
674 DLTP_id <- gr$ID[gr$type == "transposable_element" & |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
675 gr$TSD != "not_found" & |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
676 !is.na(gr$trna_id)] |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
677 DLP_id <- gr$ID[gr$type == "transposable_element" & |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
678 gr$TSD == "not_found" & |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
679 !is.na(gr$trna_id)] |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
680 Rank <- character(length(gr)) |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
681 ID <- unlist(ifelse(is.na(gr$ID), gr$Parent, gr$ID)) |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
682 |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
683 Rank[ID %in% DL_id] <- "DL" |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
684 Rank[ID %in% DLT_id] <- "DLT" |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
685 Rank[ID %in% DLP_id] <- "DLP" |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
686 Rank[ID %in% DLTP_id] <- "DLTP" |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
687 return(Rank) |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
688 } |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
689 |
8
9de392f2fc02
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents:
7
diff
changeset
|
690 dante_filtering <- function(dante_gff, min_similarity=0.4, |
9de392f2fc02
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents:
7
diff
changeset
|
691 min_identity=0.2, Relative_Length=0.6, |
9de392f2fc02
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents:
7
diff
changeset
|
692 min_relat_interuptions=8) { |
9de392f2fc02
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents:
7
diff
changeset
|
693 include <- as.numeric(dante_gff$Similarity) >= min_similarity & |
9de392f2fc02
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents:
7
diff
changeset
|
694 as.numeric(dante_gff$Identity) >= min_identity & |
9de392f2fc02
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents:
7
diff
changeset
|
695 as.numeric(dante_gff$Relat_Length) >= Relative_Length & |
9de392f2fc02
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents:
7
diff
changeset
|
696 as.numeric(dante_gff$Relat_Interruptions) <= min_relat_interuptions |
9de392f2fc02
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents:
7
diff
changeset
|
697 |
9de392f2fc02
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents:
7
diff
changeset
|
698 include[is.na(include)] <- FALSE |
9de392f2fc02
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents:
7
diff
changeset
|
699 return(dante_gff[include]) |
9de392f2fc02
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents:
7
diff
changeset
|
700 } |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
701 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
702 get_te_statistics <- function(gr, RT){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
703 Ranks <- c("D", "DL", "DLT", "DLP", "DLTP") |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
704 all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
705 RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class))) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
706 rank_table <- list() |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
707 for (i in 1:length(Ranks)) { |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
708 gr_part <- gr[gr$type == "transposable_element" & gr$Rank == Ranks[i]] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
709 rank_table[[Ranks[i]]] <- as.integer(table(factor(gr_part$Final_Classification, levels = all_class))) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
710 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
711 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
712 out <- cbind(do.call(cbind, rank_table), RT_domain=RT_domain) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
713 out <- rbind(out, Total = colSums(out)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
714 rownames(out) <- c(all_class, "Total") |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
715 return(out) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
716 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
717 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
718 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
719 get_te_statistics_old <- function(gr, RT) { |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
720 DOMAINS <- gr[gr$type == "transposable_element" & gr$Rank == "D"] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
721 DOMAINS_LTR <- gr[gr$type == "transposable_element" & gr$Rank == "DL"] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
722 DOMAINS_LTR_TSD <- gr[gr$type == "transposable_element" & gr$Rank == "DLT"] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
723 DOMAINS_LTR_TSD_PBS <- gr[gr$type == "transposable_element" & gr$Rank == "DLTP"] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
724 DOMAINS_LTR_PBS <- gr[gr$type == "transposable_element" & gr$Rank == "DLP"] |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
725 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
726 all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
727 RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class))) |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
728 D <- as.integer(table(factor(DOMAINS$Final_Classification, levels = all_class))) |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
729 DL <- as.integer(table(factor(DOMAINS_LTR$Final_Classification, levels = all_class))) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
730 DLT <- as.integer(table(factor(DOMAINS_LTR_TSD$Final_Classification, levels = all_class))) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
731 DLTP <- as.integer(table(factor(DOMAINS_LTR_TSD_PBS$Final_Classification, levels = all_class))) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
732 DLP <- as.integer(table(factor(DOMAINS_LTR_PBS$Final_Classification, levels = all_class))) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
733 out <- data.frame(RT_domain = RT_domain, |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
734 DOMAINS = D, |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
735 DOMAINS_LTR = DL, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
736 DOMAINS_LTR_TSD = DLT, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
737 DOMAINS_LTR_PBS = DLP, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
738 DOMAINS_LTR_TSD_PBS = DLTP, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
739 row.names = all_class |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
740 ) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
741 total <- colSums(out) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
742 out <- rbind(out, Total = total) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
743 return(out) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
744 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
745 |
3
6ae4a341d1f3
"planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents:
2
diff
changeset
|
746 getSeqNamed <- function(s, gr, name = NULL) { |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
747 spart <- getSeq(s, gr) |
3
6ae4a341d1f3
"planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents:
2
diff
changeset
|
748 if (is.null(name)){ |
6ae4a341d1f3
"planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents:
2
diff
changeset
|
749 id1 <- paste0(seqnames(gr), '_', start(gr), "_", end(gr)) |
6ae4a341d1f3
"planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents:
2
diff
changeset
|
750 }else{ |
6ae4a341d1f3
"planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents:
2
diff
changeset
|
751 id1 <- mcols(gr)[,name] |
6ae4a341d1f3
"planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents:
2
diff
changeset
|
752 } |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
753 id2 <- gr$Final_Classification |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
754 names(spart) <- paste0(id1, "#", id2) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
755 spart |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
756 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
757 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
758 get_TE_id <- function (gr){ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
759 gr_te <- gr[gr$type == "transposable_element"] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
760 ID <- gr_te$ID |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
761 A <- paste0(seqnames(gr_te), '_', start(gr_te), "_", end(gr_te)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
762 B <- gr_te$Final_Classification |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
763 names(ID) <- paste0(A, "#", B) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
764 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
765 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
766 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
767 |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
768 get_te_sequences <- function (gr, s) { |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
769 Ranks <- c("D", "DL", "DLT", "DLP", "DLTP") |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
770 s_te <- list() |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
771 for (i in 1:length(Ranks)) { |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
772 gr_te <- gr[gr$type == "transposable_element" & gr$Rank == Ranks[i]] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
773 if (length(gr_te) > 0) { |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
774 s_te[[Ranks[i]]] <- getSeqNamed(s, gr_te) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
775 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
776 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
777 return(s_te) |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
778 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
779 |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
780 |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
781 cd_hit_est <- function(seqs, min_identity = 0.9, word_size = 10, ncpu = 2){ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
782 # runs cd-hi-est and return table with cluster membership, and size and if reads was repesentative |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
783 # input sequences must be in the same orientation!!! |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
784 sfile <- tempfile() |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
785 fasta_out <- tempfile() |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
786 clstr <- paste0(fasta_out,".clstr") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
787 # cdhit is triming names!! |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
788 ori_names <- names(seqs) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
789 names(seqs) <- seq_along(seqs) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
790 writeXStringSet(seqs, sfile) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
791 cmd <- paste("cd-hit-est", |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
792 "-i", sfile, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
793 "-o", fasta_out, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
794 "-c", min_identity, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
795 "-n", word_size, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
796 "-T", ncpu, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
797 "-r", 0) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
798 system(cmd) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
799 cls_raw <- grep("^>", readLines(clstr), invert = TRUE, value = TRUE) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
800 unlink(fasta_out) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
801 unlink(clstr) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
802 index <- gsub("\t.+","",cls_raw) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
803 id <- as.numeric(gsub("[.].+","", |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
804 gsub(".+>", "", cls_raw)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
805 ) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
806 is_representative <- id %in% id[grep("[*]$",cls_raw)] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
807 membership <- cumsum(index=="0") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
808 cluster_size <- tabulate(membership)[membership] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
809 # reorder |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
810 ord <- order(id) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
811 cls_info <- data.frame( |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
812 seq_id = ori_names, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
813 membership = membership[ord], |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
814 cluster_size = cluster_size[ord], |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
815 is_representative = is_representative[ord] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
816 ) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
817 return(cls_info) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
818 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
819 |
7
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
820 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
821 dante2dist <- function(dc){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
822 # dc - cluster of domains, granges |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
823 dpos <- get_dom_pos(dc) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
824 D <- c(dist(dpos)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
825 NN <- combn(names(dpos), 2) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
826 # order feature in pair alphabeticaly |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
827 N1 <- ifelse(NN[1,]<NN[2,], NN[1,], NN[2, ]) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
828 N2 <- ifelse(NN[1,]>NN[2,], NN[1,], NN[2, ]) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
829 dfd <- data.frame(F1 = N1, F2 = N2, distance = D) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
830 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
831 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
832 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
833 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
834 dante_to_quantiles <- function(dc, model_function, lineage=NULL){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
835 dfd <- dante2dist(dc) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
836 if (is.null(lineage)){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
837 lineage <- dc$Final_Classification[1] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
838 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
839 # check if lineage has defined ecdf |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
840 if (lineage %in% names(model_function$plus)){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
841 fn <- model_function$plus[[lineage]] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
842 fnm <- model_function$minus[[lineage]] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
843 dstat1 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fn[[a]][[b]]), dfd$distance, FUN=function(f, x)f(x)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
844 dstat2 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fnm[[a]][[b]]), dfd$distance, FUN=function(f, x)f(-x)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
845 dout <- cbind(dfd, dstat1, dstat2, dstat12 = ifelse(dstat1>dstat2, dstat2, dstat1)) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
846 return(dout) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
847 }else{ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
848 return(NULL) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
849 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
850 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
851 |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
852 get_dom_pos <- function(g){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
853 if (length(g) == 0){ |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
854 return(NULL) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
855 } |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
856 gdf <- data.frame(rexdb = as.character(seqnames(g)), |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
857 domain = g$Name, |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
858 S = start(g), |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
859 E = end(g), |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
860 stringsAsFactors = FALSE) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
861 gSmat <- xtabs(S ~ rexdb + domain, data = gdf) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
862 colnames(gSmat) <- paste0(colnames(gSmat),"_S") |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
863 gEmat <- xtabs(E ~ rexdb + domain, data = gdf) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
864 colnames(gEmat) <- paste0(colnames(gEmat),"_E") |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
865 SEmat <- cbind(gSmat,gEmat) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
866 # reorder |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
867 dom_position <- colMeans(SEmat) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
868 SEmat_sorted <- SEmat[,order(dom_position)] |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
869 return(SEmat_sorted) |
c33d6583e548
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents:
3
diff
changeset
|
870 } |