Mercurial > repos > petr-novak > dante_ltr
comparison R/ltr_utils.R @ 7:c33d6583e548 draft
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
author | petr-novak |
---|---|
date | Fri, 24 Jun 2022 14:19:48 +0000 |
parents | 6ae4a341d1f3 |
children | 9de392f2fc02 |
comparison
equal
deleted
inserted
replaced
6:b91ca438a1cb | 7:c33d6583e548 |
---|---|
9 gff_updated$downstream_domain <- unlist(downstreams) | 9 gff_updated$downstream_domain <- unlist(downstreams) |
10 names(gff_updated) <- NULL | 10 names(gff_updated) <- NULL |
11 return(gff_updated) | 11 return(gff_updated) |
12 } | 12 } |
13 | 13 |
14 get_domain_clusters_alt <- function(gff, dist_models, threshold=0.99){ | |
15 gff <- sort(gff, by = ~ seqnames * start) | |
16 domain_pairs <- data.frame( | |
17 D1 = paste0(head(gff$Name,-1),"_S"), | |
18 D2 = paste0(gff$Name[-1],"_S"), | |
19 C1 = head(gff$Final_Classification,-1), | |
20 C2 = gff$Final_Classification[-1], | |
21 S1 = head(strand(gff),-1), | |
22 S2 = strand(gff)[-1], | |
23 start1 = head(start(gff),-1), | |
24 start2 = start(gff)[-1], | |
25 chrpos= paste0(seqnames(gff)[-1],"_",start(gff)[-1]) | |
26 ) | |
27 domain_pairs$D_A <- with(domain_pairs, ifelse(D1 < D2, D1, D2)) | |
28 domain_pairs$D_B <- with(domain_pairs, ifelse(D1 > D2, D1, D2)) | |
29 domain_pairs$quantile <- 1 | |
30 same_element_cluster <- domain_pairs$S1 == domain_pairs$S2 & domain_pairs$C1 == domain_pairs$C2 | |
31 domain_pairs$same_element_cluster <- same_element_cluster | |
32 | |
33 q_plus_function <- apply(domain_pairs[same_element_cluster,], | |
34 1, | |
35 function(x) dist_models$plus[[x["C1"]]][[x["D_A"]]][[x["D_B"]]]) | |
36 | |
37 D <- abs(domain_pairs[same_element_cluster,]$start1 - domain_pairs[same_element_cluster,]$start2) | |
38 | |
39 q_plus_value <- mapply(function(x, D)if(is.null(x)){0}else{x(D)}, x= q_plus_function, D = D) | |
40 | |
41 domain_pairs$quantile[same_element_cluster] <- q_plus_value | |
42 | |
43 domain_pairs$split_position <- !same_element_cluster | domain_pairs$quantile > threshold | |
44 # combine clusters based on distances and original cluster | |
45 clusters <- paste(cumsum(c(TRUE, domain_pairs$split_position)), get_domain_clusters(gff)) | |
46 return(clusters) | |
47 } | |
48 | |
49 | |
14 get_domain_clusters <- function(gff) { | 50 get_domain_clusters <- function(gff) { |
15 # get consecutive domains from same linage | 51 # get consecutive domains from same linage |
16 # must be sorted! | 52 # must be sorted! |
17 gag_plus <- as.numeric(cumsum(gff$Name == "GAG" & strand(gff) == '+')) | 53 plus_order_split <- c(0, as.numeric(cumsum(head(gff$domain_order, -1) >= gff$domain_order[-1] & strand(gff)[-1] == '+'))) |
18 gag_minus <- rev(as.numeric(cumsum(rev(gff$Name == "GAG" & strand(gff) == '-')))) | 54 minus_order_split <- rev(as.numeric(cumsum(rev(c(gff$domain_order[-1] >= head(gff$domain_order,-1) & strand(gff)[-1] == '-', FALSE))))) |
19 # split based on classification - must be same and consecutive | 55 # split based on classification - must be same and consecutive |
20 x <- rle(gff$Final_Classification) | 56 x <- rle(gff$Final_Classification) |
21 # split on strand change | 57 # split on strand change |
22 s <- rep(seq_along(runLength(strand(gff))), runLength(strand(gff))) | 58 s <- rep(seq_along(runLength(strand(gff))), runLength(strand(gff))) |
23 domain_cluster <- paste0(rep(seq_along(x$lengths), x$lengths), "_", seqnames(gff), | 59 domain_cluster <- paste0(rep(seq_along(x$lengths), x$lengths), "_", seqnames(gff), |
24 "_", gag_plus, "_", gag_minus, "_", s) | 60 "_", plus_order_split, "_", minus_order_split, "_", s) |
25 gff_clusters <- split(as.data.frame(gff), factor(domain_cluster, levels = unique(domain_cluster))) | 61 clusters <- factor(domain_cluster, levels = unique(domain_cluster)) |
26 gff_clusters | 62 return(clusters) |
27 } | 63 } |
28 | 64 |
29 clean_domain_clusters <- function(gcl) { | 65 # create partial TE from clusters |
66 get_partial_te_from_cluster_of_domains <- function (gpart){ | |
67 ID <- paste("TE_partial_", gpart$Cluster[1], sep="") | |
68 te_partial <- GRanges(type="transposable_element_partial", | |
69 strand=strand(gpart)[1], | |
70 ID = ID, | |
71 source = "dante_ltr", | |
72 seqnames=seqnames(gpart)[1], | |
73 Final_Classification=gpart$Final_Classification[1], | |
74 Name=gpart$Final_Classification[1], | |
75 IRanges(min(start(gpart)), max(end(gpart)))) | |
76 gpart$Parent <- ID | |
77 return(c(te_partial,gpart)) | |
78 } | |
79 | |
80 # create partial TE from clusters | |
81 get_partial_te_from_cluster_of_domains <- function(gpart, ID) { | |
82 te_partial <- makeGRangesFromDataFrame( | |
83 data.frame(type = "transposable_element_partial", | |
84 strand = gpart$strand[1], | |
85 ID = ID, | |
86 source = "dante_ltr", | |
87 seqnames = gpart$seqnames[1], | |
88 Final_Classification = | |
89 gpart$Final_Classification[1], | |
90 Name = gpart$Final_Classification[1], | |
91 IRanges(min(gpart$start), max(gpart$end))), | |
92 keep.extra.columns = TRUE) | |
93 | |
94 | |
95 gpart$Parent <- ID | |
96 gpart_gr = makeGRangesFromDataFrame(gpart, keep.extra.columns = TRUE) | |
97 return(c(te_partial, gpart_gr)) | |
98 } | |
99 | |
100 # function to count for each element number of occurences: | |
101 count_occurences_for_each_element <- function(x) { | |
102 counts_unique <- table(x) | |
103 counts <- counts_unique[x] | |
104 counts | |
105 } | |
106 | |
107 | |
108 clean_domain_clusters <- function(gcl, lineage_domain_span, min_domains) { | |
30 ## remove clusters wich does not have enough domains or domains | 109 ## remove clusters wich does not have enough domains or domains |
31 ## are on different strand | 110 ## are on different strand |
32 N_domains <- sapply(gcl, nrow) | 111 N_domains <- sapply(gcl, nrow) |
33 N_unique_domains <- sapply(gcl, function(x)length(unique(x$Name))) | 112 N_unique_domains <- sapply(gcl, function(x)length(unique(x$Name))) |
34 S <- sapply(gcl, function(x)paste(sort(unique(x$strand)), collapse = " ")) | 113 S <- sapply(gcl, function(x)paste(sort(unique(x$strand)), collapse = " ")) |
35 S_OK <- S %in% c("+", "-") | 114 S_OK <- S %in% c("+", "-") |
36 min_domains <- 5 | 115 max_span <- lineage_domain_span[sapply(gcl, function(x) x$Final_Classification[1])] |
37 maxlength <- 15000 # max span between domains | 116 # set to zero if lineage is not covered in lineage domain span |
117 max_span[is.na(max_span)] <- 0 | |
38 span <- sapply(gcl, function(x)max(x$end) - min(x$start)) | 118 span <- sapply(gcl, function(x)max(x$end) - min(x$start)) |
39 cond1 <- S_OK & | 119 cond1 <- S_OK & |
40 N_unique_domains == N_domains & | 120 N_unique_domains == N_domains & |
41 N_domains >= min_domains & | 121 N_domains >= min_domains & |
42 span <= maxlength | 122 span <= max_span |
43 return(gcl[cond1]) | 123 return(gcl[cond1]) |
44 } | 124 } |
45 | 125 |
46 check_ranges <- function(gx, s, offset = OFFSET) { | 126 check_ranges <- function(gx, s, offset = OFFSET) { |
47 # check is range is not out of sequence length | 127 # check is range is not out of sequence length |
90 }else { | 170 }else { |
91 return(max(start(x))) | 171 return(max(start(x))) |
92 } | 172 } |
93 } | 173 } |
94 | 174 |
175 domain_distance <- function(d_query, d_reference){ | |
176 if (d_query == d_reference){ | |
177 return (0) | |
178 } | |
179 d_query_p <- strsplit(d_query," ")[[1]] | |
180 d_reference_p <- strsplit(d_reference," ")[[1]] | |
181 d <- length(d_reference_p) - sum(d_query_p == d_reference_p[d_reference_p %in% d_query_p]) | |
182 return(d) | |
183 } | |
95 | 184 |
96 | 185 |
97 trim2TGAC <- function(bl) { | 186 trim2TGAC <- function(bl) { |
98 for (i in 1:nrow(bl)) { | 187 for (i in 1:nrow(bl)) { |
99 cons <- consensusString(c(bl$qseq[i], bl$sseq[i]), ambiguityMap="?") | 188 cons <- consensusString(c(bl$qseq[i], bl$sseq[i]), ambiguityMap="?") |
127 blast_all2all <- function(seqs, db=NULL, ncpus=1, word_size=20, perc_identity=90, max_target_seq = 5000, task = "megablast", additional_params= ""){ | 216 blast_all2all <- function(seqs, db=NULL, ncpus=1, word_size=20, perc_identity=90, max_target_seq = 5000, task = "megablast", additional_params= ""){ |
128 if (ncpus == 1){ | 217 if (ncpus == 1){ |
129 query <- list(seqs) | 218 query <- list(seqs) |
130 }else{ | 219 }else{ |
131 query <-split(seqs, round(seq(1,ncpus,length.out = length(seqs)))) | 220 query <-split(seqs, round(seq(1,ncpus,length.out = length(seqs)))) |
221 if (length(query) < ncpus){ | |
222 ncpus <- length(query) | |
223 } | |
132 } | 224 } |
133 if(is.null(db)){ | 225 if(is.null(db)){ |
134 # search against itself | 226 # search against itself |
135 db <- seqs | 227 db <- seqs |
136 } | 228 } |
181 no_hit = id_with_no_hits) | 273 no_hit = id_with_no_hits) |
182 ) | 274 ) |
183 } | 275 } |
184 | 276 |
185 | 277 |
186 analyze_TE <- function(seqs, ncpus = 10, word_size = 20){ | 278 analyze_TE <- function(seqs, ncpus = 10, word_size = 20, perc_identity = 90, ...){ |
187 blt <- blast_all2all(seqs, ncpus = ncpus, word_size = word_size, perc_identity = 90) | 279 blt <- blast_all2all(seqs, ncpus = ncpus, word_size = word_size, perc_identity = perc_identity, ...) |
188 te_conflict_info <- identify_conflicts(blt) | 280 te_conflict_info <- identify_conflicts(blt) |
189 blt_te_ok <- blast_table_subset(blt, te_conflict_info$ok) | 281 blt_te_ok <- blast_table_subset(blt, te_conflict_info$ok) |
190 te_ok_lineages <- split(blt_te_ok, | 282 te_ok_lineages <- split(blt_te_ok, |
191 gsub( | 283 gsub( |
192 ".+[|]", | 284 ".+[|]", |
196 FUN = get_representative_ranges, | 288 FUN = get_representative_ranges, |
197 mc.cores = ncpus | 289 mc.cores = ncpus |
198 )) | 290 )) |
199 seqs_representative <- getSeq(seqs, Reduce(c, gr_representative)) | 291 seqs_representative <- getSeq(seqs, Reduce(c, gr_representative)) |
200 names(seqs_representative) <- seqnames(Reduce(c, gr_representative)) | 292 names(seqs_representative) <- seqnames(Reduce(c, gr_representative)) |
201 # TODO - add swithin group verification here, ! exclude self hits!! | |
202 | 293 |
203 return( | 294 return( |
204 list( | 295 list( |
205 seqs_representative = seqs_representative, | 296 seqs_representative = seqs_representative, |
206 te_conflict_info = te_conflict_info, | 297 te_conflict_info = te_conflict_info, |
238 return(list(multiplicity = mp, | 329 return(list(multiplicity = mp, |
239 id_ok_mp_verified = id_ok_mp_verified)) | 330 id_ok_mp_verified = id_ok_mp_verified)) |
240 | 331 |
241 } | 332 } |
242 | 333 |
243 compare_TE_datasets <- function(Q, S, word_size = 20, min_coverage = 0.95, ncpus=10){ | 334 compare_TE_datasets <- function(Q, S, word_size = 20, min_coverage = 0.95, ncpus=10, ...){ |
244 blt <- blast_all2all(seqs = Q, db = S, ncpus = ncpus, word_size = word_size) | 335 blt <- blast_all2all(seqs = Q, db = S, ncpus = ncpus, word_size = word_size, ...) |
245 QL <- gsub(".+[|]", "", blt$qaccver) | 336 QL <- gsub(".+[|]", "", blt$qaccver) |
246 SL <- gsub(".+[|]", "", blt$saccver) | 337 SL <- gsub(".+[|]", "", blt$saccver) |
247 id_with_conflict <- unique(c(blt$qaccver[QL != SL])) | 338 id_with_conflict <- unique(c(blt$qaccver[QL != SL])) |
248 id_ok <- setdiff(unique(blt$qaccver), id_with_conflict) | 339 id_ok <- setdiff(unique(blt$qaccver), id_with_conflict) |
249 # check coverage hits | 340 # check coverage hits |
338 list(richness=R, shannon_index=SI) | 429 list(richness=R, shannon_index=SI) |
339 } | 430 } |
340 | 431 |
341 | 432 |
342 | 433 |
343 blast <- function(s1, s2) { | 434 blast <- function(s1, s2, expected_aln_lenght=NULL) { |
344 tmp1 <- tempfile() | 435 tmp1 <- tempfile() |
345 tmp2 <- tempfile() | 436 tmp2 <- tempfile() |
346 tmp_out <- tempfile() | 437 tmp_out <- tempfile() |
347 writeXStringSet(DNAStringSet(s1), tmp1) | 438 writeXStringSet(DNAStringSet(s1), tmp1) |
348 writeXStringSet(DNAStringSet(s2), tmp2) | 439 writeXStringSet(DNAStringSet(s2), tmp2) |
359 if (nrow(out_raw) == 0) { | 450 if (nrow(out_raw) == 0) { |
360 return(out_raw) | 451 return(out_raw) |
361 } | 452 } |
362 out <- trim2TGAC(out_raw) | 453 out <- trim2TGAC(out_raw) |
363 # remove alingment shorted that | 454 # remove alingment shorted that |
364 out <- out[out$length > 100,] | 455 # out <- out[out$length > 100,] |
456 # alngment must be at least 80% of expected LTRmin95 | |
457 out <- out[out$length > (expected_aln_lenght *0.8),] | |
365 if (nrow(out) == 0) { | 458 if (nrow(out) == 0) { |
366 return(out) | 459 return(out) |
367 } | 460 } |
368 ## filter for TGCA | 461 ## filter for TGCA |
369 TG_L <- substring(out$qseq, 1, 2) == "TG" | 462 TG_L <- substring(out$qseq, 1, 2) == "TG" |
480 TSD$type <- "target_site_duplication" | 573 TSD$type <- "target_site_duplication" |
481 TSD$Parent <- ID | 574 TSD$Parent <- ID |
482 TE$TSD <- as.character(g$ltr_info[[1]]$TSD_sequence) | 575 TE$TSD <- as.character(g$ltr_info[[1]]$TSD_sequence) |
483 } | 576 } |
484 | 577 |
485 TE$Final_Classification <- D$Final_Classification[1] | 578 TE$Name <- TE$Final_Classification <- D$Final_Classification[1] |
486 | 579 |
487 D$Parent <- ID | 580 D$Parent <- ID |
488 out <- c(TE, LTR_3, LTR_5, D, TSD) | 581 out <- c(TE, LTR_3, LTR_5, D, TSD) |
489 return(out) | 582 return(out) |
490 } | 583 } |
491 | 584 |
492 get_TE <- function(Lseq, Rseq, domains_gff, GR, GRL, GRR) { | 585 get_TE <- function(Lseq, Rseq, domains_gff, GR, GRL, GRR,LTR_length) { |
493 xx <- blast(Lseq, Rseq) | 586 xx <- blast(Lseq, Rseq, LTR_length) |
494 if (nrow(xx) == 0) { | 587 if (nrow(xx) == 0) { |
495 return(NULL) | 588 return(NULL) |
496 }else { | 589 }else { |
497 ltr_tmp <- list() | 590 ltr_tmp <- list() |
498 for (j in 1:nrow(xx)) { | 591 for (j in 1:nrow(xx)) { |
589 Rank[ID %in% DLP_id] <- "DLP" | 682 Rank[ID %in% DLP_id] <- "DLP" |
590 Rank[ID %in% DLTP_id] <- "DLTP" | 683 Rank[ID %in% DLTP_id] <- "DLTP" |
591 return(Rank) | 684 return(Rank) |
592 } | 685 } |
593 | 686 |
594 get_te_statistics <- function(gr, RT) { | 687 |
595 DOMAINS_LTR <- gr[gr$type == "transposable_element" & | 688 get_te_statistics <- function(gr, RT){ |
596 gr$TSD == "not_found" & | 689 Ranks <- c("D", "DL", "DLT", "DLP", "DLTP") |
597 is.na(gr$trna_id)] | |
598 DOMAINS_LTR_TSD <- gr[gr$type == "transposable_element" & | |
599 gr$TSD != "not_found" & | |
600 is.na(gr$trna_id)] | |
601 DOMAINS_LTR_TSD_PBS <- gr[gr$type == "transposable_element" & | |
602 gr$TSD != "not_found" & | |
603 !is.na(gr$trna_id)] | |
604 DOMAINS_LTR_PBS <- gr[gr$type == "transposable_element" & | |
605 gr$TSD == "not_found" & | |
606 !is.na(gr$trna_id)] | |
607 | |
608 all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE)) | 690 all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE)) |
609 RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class))) | 691 RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class))) |
692 rank_table <- list() | |
693 for (i in 1:length(Ranks)) { | |
694 gr_part <- gr[gr$type == "transposable_element" & gr$Rank == Ranks[i]] | |
695 rank_table[[Ranks[i]]] <- as.integer(table(factor(gr_part$Final_Classification, levels = all_class))) | |
696 | |
697 } | |
698 out <- cbind(do.call(cbind, rank_table), RT_domain=RT_domain) | |
699 out <- rbind(out, Total = colSums(out)) | |
700 rownames(out) <- c(all_class, "Total") | |
701 return(out) | |
702 } | |
703 | |
704 | |
705 get_te_statistics_old <- function(gr, RT) { | |
706 DOMAINS <- gr[gr$type == "transposable_element" & gr$Rank == "D"] | |
707 DOMAINS_LTR <- gr[gr$type == "transposable_element" & gr$Rank == "DL"] | |
708 DOMAINS_LTR_TSD <- gr[gr$type == "transposable_element" & gr$Rank == "DLT"] | |
709 DOMAINS_LTR_TSD_PBS <- gr[gr$type == "transposable_element" & gr$Rank == "DLTP"] | |
710 DOMAINS_LTR_PBS <- gr[gr$type == "transposable_element" & gr$Rank == "DLP"] | |
711 | |
712 all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE)) | |
713 RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class))) | |
714 D <- as.integer(table(factor(DOMAINS$Final_Classification, levels = all_class))) | |
610 DL <- as.integer(table(factor(DOMAINS_LTR$Final_Classification, levels = all_class))) | 715 DL <- as.integer(table(factor(DOMAINS_LTR$Final_Classification, levels = all_class))) |
611 DLT <- as.integer(table(factor(DOMAINS_LTR_TSD$Final_Classification, levels = all_class))) | 716 DLT <- as.integer(table(factor(DOMAINS_LTR_TSD$Final_Classification, levels = all_class))) |
612 DLTP <- as.integer(table(factor(DOMAINS_LTR_TSD_PBS$Final_Classification, levels = all_class))) | 717 DLTP <- as.integer(table(factor(DOMAINS_LTR_TSD_PBS$Final_Classification, levels = all_class))) |
613 DLP <- as.integer(table(factor(DOMAINS_LTR_PBS$Final_Classification, levels = all_class))) | 718 DLP <- as.integer(table(factor(DOMAINS_LTR_PBS$Final_Classification, levels = all_class))) |
614 out <- data.frame(RT_domain = RT_domain, | 719 out <- data.frame(RT_domain = RT_domain, |
720 DOMAINS = D, | |
615 DOMAINS_LTR = DL, | 721 DOMAINS_LTR = DL, |
616 DOMAINS_LTR_TSD = DLT, | 722 DOMAINS_LTR_TSD = DLT, |
617 DOMAINS_LTR_PBS = DLP, | 723 DOMAINS_LTR_PBS = DLP, |
618 DOMAINS_LTR_TSD_PBS = DLTP, | 724 DOMAINS_LTR_TSD_PBS = DLTP, |
619 row.names = all_class | 725 row.names = all_class |
642 B <- gr_te$Final_Classification | 748 B <- gr_te$Final_Classification |
643 names(ID) <- paste0(A, "#", B) | 749 names(ID) <- paste0(A, "#", B) |
644 | 750 |
645 } | 751 } |
646 | 752 |
647 get_te_sequences <- function(gr, s) { | 753 |
648 # return list of biostrings | 754 get_te_sequences <- function (gr, s) { |
649 DOMAINS_LTR <- gr[gr$type == "transposable_element" & | 755 Ranks <- c("D", "DL", "DLT", "DLP", "DLTP") |
650 gr$TSD == "not_found" & | 756 s_te <- list() |
651 is.na(gr$trna_id)] | 757 for (i in 1:length(Ranks)) { |
652 DOMAINS_LTR_TSD <- gr[gr$type == "transposable_element" & | 758 gr_te <- gr[gr$type == "transposable_element" & gr$Rank == Ranks[i]] |
653 gr$TSD != "not_found" & | 759 if (length(gr_te) > 0) { |
654 is.na(gr$trna_id)] | 760 s_te[[Ranks[i]]] <- getSeqNamed(s, gr_te) |
655 DOMAINS_LTR_TSD_PBS <- gr[gr$type == "transposable_element" & | 761 } |
656 gr$TSD != "not_found" & | 762 } |
657 !is.na(gr$trna_id)] | 763 return(s_te) |
658 DOMAINS_LTR_PBS <- gr[gr$type == "transposable_element" & | 764 } |
659 gr$TSD == "not_found" & | 765 |
660 !is.na(gr$trna_id)] | |
661 s_DL <- getSeqNamed(s, DOMAINS_LTR) | |
662 s_DLT <- getSeqNamed(s, DOMAINS_LTR_TSD) | |
663 s_DLP <- getSeqNamed(s, DOMAINS_LTR_PBS) | |
664 s_DLTP <- getSeqNamed(s, DOMAINS_LTR_TSD_PBS) | |
665 return(DNAStringSetList( | |
666 DL = s_DL, | |
667 DLT = s_DLT, | |
668 DLP = s_DLP, | |
669 DLTP = s_DLTP | |
670 )) | |
671 | |
672 } | |
673 | 766 |
674 cd_hit_est <- function(seqs, min_identity = 0.9, word_size = 10, ncpu = 2){ | 767 cd_hit_est <- function(seqs, min_identity = 0.9, word_size = 10, ncpu = 2){ |
675 # runs cd-hi-est and return table with cluster membership, and size and if reads was repesentative | 768 # runs cd-hi-est and return table with cluster membership, and size and if reads was repesentative |
676 # input sequences must be in the same orientation!!! | 769 # input sequences must be in the same orientation!!! |
677 sfile <- tempfile() | 770 sfile <- tempfile() |
708 is_representative = is_representative[ord] | 801 is_representative = is_representative[ord] |
709 ) | 802 ) |
710 return(cls_info) | 803 return(cls_info) |
711 } | 804 } |
712 | 805 |
806 | |
807 dante2dist <- function(dc){ | |
808 # dc - cluster of domains, granges | |
809 dpos <- get_dom_pos(dc) | |
810 D <- c(dist(dpos)) | |
811 NN <- combn(names(dpos), 2) | |
812 # order feature in pair alphabeticaly | |
813 N1 <- ifelse(NN[1,]<NN[2,], NN[1,], NN[2, ]) | |
814 N2 <- ifelse(NN[1,]>NN[2,], NN[1,], NN[2, ]) | |
815 dfd <- data.frame(F1 = N1, F2 = N2, distance = D) | |
816 } | |
817 | |
818 | |
819 | |
820 dante_to_quantiles <- function(dc, model_function, lineage=NULL){ | |
821 dfd <- dante2dist(dc) | |
822 if (is.null(lineage)){ | |
823 lineage <- dc$Final_Classification[1] | |
824 } | |
825 # check if lineage has defined ecdf | |
826 if (lineage %in% names(model_function$plus)){ | |
827 fn <- model_function$plus[[lineage]] | |
828 fnm <- model_function$minus[[lineage]] | |
829 dstat1 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fn[[a]][[b]]), dfd$distance, FUN=function(f, x)f(x)) | |
830 dstat2 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fnm[[a]][[b]]), dfd$distance, FUN=function(f, x)f(-x)) | |
831 dout <- cbind(dfd, dstat1, dstat2, dstat12 = ifelse(dstat1>dstat2, dstat2, dstat1)) | |
832 return(dout) | |
833 }else{ | |
834 return(NULL) | |
835 } | |
836 } | |
837 | |
838 get_dom_pos <- function(g){ | |
839 if (length(g) == 0){ | |
840 return(NULL) | |
841 } | |
842 gdf <- data.frame(rexdb = as.character(seqnames(g)), | |
843 domain = g$Name, | |
844 S = start(g), | |
845 E = end(g), | |
846 stringsAsFactors = FALSE) | |
847 gSmat <- xtabs(S ~ rexdb + domain, data = gdf) | |
848 colnames(gSmat) <- paste0(colnames(gSmat),"_S") | |
849 gEmat <- xtabs(E ~ rexdb + domain, data = gdf) | |
850 colnames(gEmat) <- paste0(colnames(gEmat),"_E") | |
851 SEmat <- cbind(gSmat,gEmat) | |
852 # reorder | |
853 dom_position <- colMeans(SEmat) | |
854 SEmat_sorted <- SEmat[,order(dom_position)] | |
855 return(SEmat_sorted) | |
856 } |