Mercurial > repos > petr-novak > dante_ltr
comparison R/ltr_utils.R @ 8:9de392f2fc02 draft
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
author | petr-novak |
---|---|
date | Tue, 28 Jun 2022 12:33:22 +0000 |
parents | c33d6583e548 |
children | 1aa578e6c8b3 |
comparison
equal
deleted
inserted
replaced
7:c33d6583e548 | 8:9de392f2fc02 |
---|---|
136 S <- sapply(gx, function(x)min(x$start)) | 136 S <- sapply(gx, function(x)min(x$start)) |
137 E <- sapply(gx, function(x)max(x$end)) | 137 E <- sapply(gx, function(x)max(x$end)) |
138 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset, end = E + offset)) | 138 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset, end = E + offset)) |
139 } | 139 } |
140 | 140 |
141 get_ranges_left <- function(gx, offset = OFFSET, offset2 = 300) { | 141 get_ranges_left <- function(gx, offset = OFFSET, offset2 = 10) { |
142 ## offset2 - how many nt cen LTR extend to closes protein domain | |
143 ## this is necassary as some detected proteins domains does not have correct bopundaries | |
144 ## if LTR retrotransposons insters to other protein domain. | |
142 S <- sapply(gx, function(x)min(x$start)) | 145 S <- sapply(gx, function(x)min(x$start)) |
143 max_offset <- S - sapply(gx, function(x)min(x$upstream_domain)) | 146 max_offset <- S - sapply(gx, function(x)min(x$upstream_domain)) + offset2 |
144 offset_adjusted <- ifelse(max_offset < offset, max_offset, offset) | 147 offset_adjusted <- ifelse(max_offset < offset, max_offset, offset) |
145 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset_adjusted, end = S + offset2)) | 148 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset_adjusted, end = S + offset2)) |
146 return(gr) | 149 return(gr) |
147 } | 150 } |
148 | 151 |
149 get_ranges_right <- function(gx, offset = OFFSET, offset2 = 300) { | 152 get_ranges_right <- function(gx, offset = OFFSET, offset2 = 10) { |
150 E <- sapply(gx, function(x)max(x$end)) | 153 E <- sapply(gx, function(x)max(x$end)) |
151 max_offset <- sapply(gx, function(x)max(x$downstream_domain)) - E | 154 max_offset <- sapply(gx, function(x)max(x$downstream_domain)) - E + offset2 |
152 offset_adjusted <- ifelse(max_offset < offset, max_offset, offset) | 155 offset_adjusted <- ifelse(max_offset < offset, max_offset, offset) |
153 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = E - offset2, end = E + offset_adjusted)) | 156 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = E - offset2, end = E + offset_adjusted)) |
154 return(gr) | 157 return(gr) |
155 } | 158 } |
156 | 159 |
187 for (i in 1:nrow(bl)) { | 190 for (i in 1:nrow(bl)) { |
188 cons <- consensusString(c(bl$qseq[i], bl$sseq[i]), ambiguityMap="?") | 191 cons <- consensusString(c(bl$qseq[i], bl$sseq[i]), ambiguityMap="?") |
189 tg_P <- firstTG(cons) | 192 tg_P <- firstTG(cons) |
190 ca_P <- lastCA(cons) | 193 ca_P <- lastCA(cons) |
191 e_dist <- bl$length[i] - ca_P | 194 e_dist <- bl$length[i] - ca_P |
192 max_dist = 50 # was 25 | 195 max_dist <- 50 # was 25 |
193 no_match <- any(tg_P == 0, ca_P == 0) | 196 no_match <- any(tg_P == 0, ca_P == 0) |
194 if (!no_match & | 197 if (!no_match & |
195 tg_P < max_dist & | 198 tg_P < max_dist & |
196 e_dist < max_dist) { | 199 e_dist < max_dist) { |
197 ## trim alignment | 200 ## trim alignment |
682 Rank[ID %in% DLP_id] <- "DLP" | 685 Rank[ID %in% DLP_id] <- "DLP" |
683 Rank[ID %in% DLTP_id] <- "DLTP" | 686 Rank[ID %in% DLTP_id] <- "DLTP" |
684 return(Rank) | 687 return(Rank) |
685 } | 688 } |
686 | 689 |
690 dante_filtering <- function(dante_gff, min_similarity=0.4, | |
691 min_identity=0.2, Relative_Length=0.6, | |
692 min_relat_interuptions=8) { | |
693 include <- as.numeric(dante_gff$Similarity) >= min_similarity & | |
694 as.numeric(dante_gff$Identity) >= min_identity & | |
695 as.numeric(dante_gff$Relat_Length) >= Relative_Length & | |
696 as.numeric(dante_gff$Relat_Interruptions) <= min_relat_interuptions | |
697 | |
698 include[is.na(include)] <- FALSE | |
699 return(dante_gff[include]) | |
700 } | |
687 | 701 |
688 get_te_statistics <- function(gr, RT){ | 702 get_te_statistics <- function(gr, RT){ |
689 Ranks <- c("D", "DL", "DLT", "DLP", "DLTP") | 703 Ranks <- c("D", "DL", "DLT", "DLP", "DLTP") |
690 all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE)) | 704 all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE)) |
691 RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class))) | 705 RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class))) |