comparison R/ltr_utils.R @ 8:9de392f2fc02 draft

"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
author petr-novak
date Tue, 28 Jun 2022 12:33:22 +0000
parents c33d6583e548
children 1aa578e6c8b3
comparison
equal deleted inserted replaced
7:c33d6583e548 8:9de392f2fc02
136 S <- sapply(gx, function(x)min(x$start)) 136 S <- sapply(gx, function(x)min(x$start))
137 E <- sapply(gx, function(x)max(x$end)) 137 E <- sapply(gx, function(x)max(x$end))
138 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset, end = E + offset)) 138 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset, end = E + offset))
139 } 139 }
140 140
141 get_ranges_left <- function(gx, offset = OFFSET, offset2 = 300) { 141 get_ranges_left <- function(gx, offset = OFFSET, offset2 = 10) {
142 ## offset2 - how many nt cen LTR extend to closes protein domain
143 ## this is necassary as some detected proteins domains does not have correct bopundaries
144 ## if LTR retrotransposons insters to other protein domain.
142 S <- sapply(gx, function(x)min(x$start)) 145 S <- sapply(gx, function(x)min(x$start))
143 max_offset <- S - sapply(gx, function(x)min(x$upstream_domain)) 146 max_offset <- S - sapply(gx, function(x)min(x$upstream_domain)) + offset2
144 offset_adjusted <- ifelse(max_offset < offset, max_offset, offset) 147 offset_adjusted <- ifelse(max_offset < offset, max_offset, offset)
145 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset_adjusted, end = S + offset2)) 148 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset_adjusted, end = S + offset2))
146 return(gr) 149 return(gr)
147 } 150 }
148 151
149 get_ranges_right <- function(gx, offset = OFFSET, offset2 = 300) { 152 get_ranges_right <- function(gx, offset = OFFSET, offset2 = 10) {
150 E <- sapply(gx, function(x)max(x$end)) 153 E <- sapply(gx, function(x)max(x$end))
151 max_offset <- sapply(gx, function(x)max(x$downstream_domain)) - E 154 max_offset <- sapply(gx, function(x)max(x$downstream_domain)) - E + offset2
152 offset_adjusted <- ifelse(max_offset < offset, max_offset, offset) 155 offset_adjusted <- ifelse(max_offset < offset, max_offset, offset)
153 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = E - offset2, end = E + offset_adjusted)) 156 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = E - offset2, end = E + offset_adjusted))
154 return(gr) 157 return(gr)
155 } 158 }
156 159
187 for (i in 1:nrow(bl)) { 190 for (i in 1:nrow(bl)) {
188 cons <- consensusString(c(bl$qseq[i], bl$sseq[i]), ambiguityMap="?") 191 cons <- consensusString(c(bl$qseq[i], bl$sseq[i]), ambiguityMap="?")
189 tg_P <- firstTG(cons) 192 tg_P <- firstTG(cons)
190 ca_P <- lastCA(cons) 193 ca_P <- lastCA(cons)
191 e_dist <- bl$length[i] - ca_P 194 e_dist <- bl$length[i] - ca_P
192 max_dist = 50 # was 25 195 max_dist <- 50 # was 25
193 no_match <- any(tg_P == 0, ca_P == 0) 196 no_match <- any(tg_P == 0, ca_P == 0)
194 if (!no_match & 197 if (!no_match &
195 tg_P < max_dist & 198 tg_P < max_dist &
196 e_dist < max_dist) { 199 e_dist < max_dist) {
197 ## trim alignment 200 ## trim alignment
682 Rank[ID %in% DLP_id] <- "DLP" 685 Rank[ID %in% DLP_id] <- "DLP"
683 Rank[ID %in% DLTP_id] <- "DLTP" 686 Rank[ID %in% DLTP_id] <- "DLTP"
684 return(Rank) 687 return(Rank)
685 } 688 }
686 689
690 dante_filtering <- function(dante_gff, min_similarity=0.4,
691 min_identity=0.2, Relative_Length=0.6,
692 min_relat_interuptions=8) {
693 include <- as.numeric(dante_gff$Similarity) >= min_similarity &
694 as.numeric(dante_gff$Identity) >= min_identity &
695 as.numeric(dante_gff$Relat_Length) >= Relative_Length &
696 as.numeric(dante_gff$Relat_Interruptions) <= min_relat_interuptions
697
698 include[is.na(include)] <- FALSE
699 return(dante_gff[include])
700 }
687 701
688 get_te_statistics <- function(gr, RT){ 702 get_te_statistics <- function(gr, RT){
689 Ranks <- c("D", "DL", "DLT", "DLP", "DLTP") 703 Ranks <- c("D", "DL", "DLT", "DLP", "DLTP")
690 all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE)) 704 all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE))
691 RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class))) 705 RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class)))