Mercurial > repos > petr-novak > dante_ltr
comparison detect_putative_ltr.R @ 13:559940c04c44 draft
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
author | petr-novak |
---|---|
date | Thu, 11 Aug 2022 07:29:06 +0000 |
parents | ff01d4263391 |
children |
comparison
equal
deleted
inserted
replaced
12:ff01d4263391 | 13:559940c04c44 |
---|---|
107 } | 107 } |
108 | 108 |
109 # MAIN ############################################################# | 109 # MAIN ############################################################# |
110 | 110 |
111 # load data: | 111 # load data: |
112 | |
113 cat("reading gff...") | 112 cat("reading gff...") |
114 g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3 | 113 g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3 |
115 cat("done\n") | 114 cat("done\n") |
116 cat("reading fasta...") | 115 cat("reading fasta...") |
117 s <- readDNAStringSet(opt$reference_sequence) # genome assembly | 116 s <- readDNAStringSet(opt$reference_sequence) # genome assembly |
132 names(lineage_ltr_mean_length) <- ln | 131 names(lineage_ltr_mean_length) <- ln |
133 lineage_domains_sequence <- unlist(mapply(function(d,l) { | 132 lineage_domains_sequence <- unlist(mapply(function(d,l) { |
134 paste(strsplit(d, " ")[[1]], ":", l, sep = "") | 133 paste(strsplit(d, " ")[[1]], ":", l, sep = "") |
135 }, d = lineage_domain, l = names(lineage_domain))) | 134 }, d = lineage_domain, l = names(lineage_domain))) |
136 | 135 |
137 # filter g gff3 | 136 # this repeat block is run just once |
138 g <- dante_filtering(g, Relative_Length = opt$min_relative_length) # default | 137 # it can breaks in eny point if zero TE is found |
139 | 138 repeat{ |
140 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))] | 139 |
141 g <- add_coordinates_of_closest_neighbor(g) | 140 # filter g gff3 |
142 | 141 g <- dante_filtering(g, Relative_Length = opt$min_relative_length) # default |
143 # add info about domain order: | 142 |
144 g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence)) | 143 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))] |
145 # set NA to 0 in g$domain_order ( some domains are not fromm ClassI elements | 144 g <- add_coordinates_of_closest_neighbor(g) |
146 g$domain_order[is.na(g$domain_order)] <- 0 | 145 |
147 | 146 # add info about domain order: |
148 # NOTE - some operation is faster of GrangesList but some on list of data.frames | 147 g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence)) |
149 # this is primary clusteing | 148 # set NA to 0 in g$domain_order ( some domains are not fromm ClassI elements |
150 cls <- get_domain_clusters(g) | 149 g$domain_order[is.na(g$domain_order)] <- 0 |
151 gcl <- split(as.data.frame(g), cls) | 150 |
152 # gcl_as_GRL <- split(g, cls) # delete? | 151 # NOTE - some operation is faster of GrangesList but some on list of data.frames |
153 | 152 # this is primary clusteing |
154 cls_alt <- get_domain_clusters_alt(g, FDM) | 153 cls <- get_domain_clusters(g) |
155 g$Cluster <- as.numeric(factor(cls_alt)) | 154 gcl <- split(as.data.frame(g), cls) |
156 | 155 # gcl_as_GRL <- split(g, cls) # delete? |
157 gcl_alt <- split(as.data.frame(g), cls_alt) | 156 |
158 | 157 cls_alt <- get_domain_clusters_alt(g, FDM) |
159 TE_partial <- GRanges(seqnames = sapply(gcl_alt, function(x) x$seqnames[1]), | 158 g$Cluster <- as.numeric(factor(cls_alt)) |
160 Name = sapply(gcl_alt, function(x) x$Final_Classification[1]), | 159 |
161 Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]), | 160 gcl_alt <- split(as.data.frame(g), cls_alt) |
162 ID = sapply(gcl_alt, function(x) paste0("TE_partial_", sprintf("%08d", x$Cluster[1]))), | 161 |
163 strand = sapply(gcl_alt, function(x) x$strand[1]), | 162 TE_partial <- GRanges(seqnames = sapply(gcl_alt, function(x) x$seqnames[1]), |
164 Ndomains = sapply(gcl_alt, function(x) nrow(x)), | 163 Name = sapply(gcl_alt, function(x) x$Final_Classification[1]), |
165 type = "transposable_element", | 164 Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]), |
166 source = "dante_ltr", | 165 ID = sapply(gcl_alt, function(x) paste0("TE_partial_", sprintf("%08d", x$Cluster[1]))), |
167 Rank="D", | 166 strand = sapply(gcl_alt, function(x) x$strand[1]), |
168 IRanges(start = sapply(gcl_alt, function(x) min(x$start)), | 167 Ndomains = sapply(gcl_alt, function(x) nrow(x)), |
169 end = sapply(gcl_alt, function(x) max(x$end))) | 168 type = "transposable_element", |
170 ) | 169 source = "dante_ltr", |
171 g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster) | 170 Rank="D", |
172 g$Parent <- paste0("TE_partial_", sprintf("%08d", g$Cluster)) | 171 IRanges(start = sapply(gcl_alt, function(x) min(x$start)), |
173 g$Rank <- "D" | 172 end = sapply(gcl_alt, function(x) max(x$end))) |
174 | 173 ) |
175 # keep only partial TE with more than one domain | 174 g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster) |
176 TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1] | 175 g$Parent <- paste0("TE_partial_", sprintf("%08d", g$Cluster)) |
177 g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)] | 176 g$Rank <- "D" |
178 | 177 # for statistics |
179 # first filtering - remove TEs with low number of domains | 178 RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"] |
180 gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains) | 179 |
181 | 180 # keep only partial TE with more than one domain |
182 # glc annotation | 181 TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1] |
183 gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1]) | 182 g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)] |
184 gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-", | 183 |
185 paste(rev(x$Name), collapse = " "), | 184 # first filtering - remove TEs with low number of domains |
186 paste(x$Name, collapse = " ")) | 185 gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains) |
187 ) | 186 |
188 | 187 # glc annotation |
189 # compare detected domains with domains in lineages from REXdb database | 188 gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1]) |
190 dd <- mapply(domain_distance, | 189 gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-", |
191 d_query = gcl_clean_domains, | 190 paste(rev(x$Name), collapse = " "), |
192 d_reference = lineage_domain[gcl_clean_lineage]) | 191 paste(x$Name, collapse = " ")) |
193 | 192 ) |
194 # get lineages which has correct number and order of domains | 193 |
195 # gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]] | 194 # compare detected domains with domains in lineages from REXdb database |
196 gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains] | 195 dd <- mapply(domain_distance, |
197 | 196 d_query = gcl_clean_domains, |
198 gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)] | 197 d_reference = lineage_domain[gcl_clean_lineage]) |
199 gr <- get_ranges(gcl_clean_with_domains) | 198 |
200 | 199 # get lineages which has correct number and order of domains |
201 | 200 # gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]] |
202 cat('Number of analyzed regions:\n') | 201 |
203 cat('Total number of domain clusters : ', length(gcl), '\n') | 202 |
204 cat('Number of clean clusters : ', length(gcl_clean), '\n') | 203 gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains] |
205 cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n') | 204 |
206 | 205 if(length(gcl_clean2) == 0) { |
207 | 206 cat("No complete TE found\n") |
208 te_strand <- sapply(gcl_clean_with_domains, function(x)x$strand[1]) | 207 good_TE <- list() |
209 te_lineage <- sapply(gcl_clean_with_domains, function(x)x$Final_Classification[1]) | 208 break |
210 | 209 } |
211 max_left_offset <- ifelse(te_strand == "+", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage]) | 210 gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)] |
212 max_right_offset <- ifelse(te_strand == "-", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage]) | 211 gr <- get_ranges(gcl_clean_with_domains) |
213 | 212 |
214 grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset) | 213 cat('Number of analyzed regions:\n') |
215 grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset) | 214 cat('Total number of domain clusters : ', length(gcl), '\n') |
216 | 215 cat('Number of clean clusters : ', length(gcl_clean), '\n') |
217 s_left <- getSeq(s, grL) | 216 cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n') |
218 s_right <- getSeq(s, grR) | 217 |
219 | 218 |
220 expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function (x)x$Final_Classification[1])] | 219 te_strand <- sapply(gcl_clean_with_domains, function(x)x$strand[1]) |
221 # for statistics | 220 te_lineage <- sapply(gcl_clean_with_domains, function(x)x$Final_Classification[1]) |
222 RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"] | 221 |
223 # cleanup | 222 max_left_offset <- ifelse(te_strand == "+", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage]) |
224 #rm(g) | 223 max_right_offset <- ifelse(te_strand == "-", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage]) |
225 rm(gcl) | 224 |
226 rm(gcl_clean) | 225 grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset) |
227 rm(gcl_clean2) | 226 grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset) |
228 | 227 |
229 names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_") | 228 s_left <- getSeq(s, grL) |
230 names(s_left) <- paste(seqnames(grL), start(grL), end(grL), sep = "_") | 229 s_right <- getSeq(s, grR) |
231 names(s_right) <- paste(seqnames(grR), start(grR), end(grR), sep = "_") | 230 |
232 cat('Identification of LTRs...') | 231 expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function (x)x$Final_Classification[1])] |
233 TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x], | 232 |
234 s_right[x], | 233 # cleanup |
235 gcl_clean_with_domains[[x]], | 234 #rm(g) |
236 gr[x], | 235 rm(gcl) |
237 grL[x], | 236 rm(gcl_clean) |
238 grR[x], | 237 rm(gcl_clean2) |
239 expected_ltr_length[x]), | 238 |
240 mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE | 239 names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_") |
241 ) | 240 names(s_left) <- paste(seqnames(grL), start(grL), end(grL), sep = "_") |
242 | 241 names(s_right) <- paste(seqnames(grR), start(grR), end(grR), sep = "_") |
243 cat('done.\n') | 242 cat('Identification of LTRs...') |
244 | 243 TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x], |
245 good_TE <- TE[!sapply(TE, is.null)] | 244 s_right[x], |
246 cat('Number of putative TE with identified LTR :', length(good_TE), '\n') | 245 gcl_clean_with_domains[[x]], |
247 | 246 gr[x], |
248 # TODO - extent TE region to cover also TSD | 247 grL[x], |
249 ID <- paste0('TE_', sprintf("%08d", seq(good_TE))) | 248 grR[x], |
250 gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu) | 249 expected_ltr_length[x]), |
251 | 250 mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE |
252 cat('Identification of PBS ...') | 251 ) |
253 gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE) | 252 |
254 cat('done\n') | 253 cat('done.\n') |
255 gff3_out <- do.call(c, gff3_list2) | 254 good_TE <- TE[!sapply(TE, is.null)] |
256 | 255 cat('Number of putative TE with identified LTR :', length(good_TE), '\n') |
257 # define new source | 256 break |
258 src <- as.character(gff3_out$source) | 257 } |
259 src[is.na(src)] <- "dante_ltr" | 258 |
260 gff3_out$source <- src | 259 if (length(good_TE)>0){ # handle empty list |
261 gff3_out$Rank <- get_te_rank(gff3_out) | 260 ID <- paste0('TE_', sprintf("%08d", seq(good_TE))) |
262 | 261 gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu) |
263 # add partial TEs but first remove all ovelaping | 262 |
264 TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out] | 263 cat('Identification of PBS ...') |
265 TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID] | 264 gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE) |
266 | 265 cat('done\n') |
267 gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start) | 266 gff3_out <- do.call(c, gff3_list2) |
267 | |
268 # define new source | |
269 src <- as.character(gff3_out$source) | |
270 src[is.na(src)] <- "dante_ltr" | |
271 gff3_out$source <- src | |
272 gff3_out$Rank <- get_te_rank(gff3_out) | |
273 # add partial TEs but first remove all ovelaping | |
274 TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out] | |
275 TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID] | |
276 gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start) | |
277 }else{ | |
278 # but this could be a problem if there are no TEs in the sequence | |
279 if (length(TE_partial_with_more_than_one_domain)>0){ | |
280 TE_partial_parent_part <- TE_partial_with_more_than_one_domain | |
281 TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID] | |
282 gff3_out <- sort(c(TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start) | |
283 }else{ | |
284 gff3_out <- NULL | |
285 } | |
286 } | |
287 | |
288 if (is.null(gff3_out)){ | |
289 cat('No TEs found.\n') | |
290 }else{ | |
268 # modify ID and Parent - add seqname - this will ensure it is unique is done on chunk level | 291 # modify ID and Parent - add seqname - this will ensure it is unique is done on chunk level |
269 gff3_out$ID[!is.na(gff3_out$ID)] <- paste0(gff3_out$ID[!is.na(gff3_out$ID)], "_", seqnames(gff3_out)[!is.na(gff3_out$ID)]) | 292 gff3_out$ID[!is.na(gff3_out$ID)] <- paste0(gff3_out$ID[!is.na(gff3_out$ID)], "_", seqnames(gff3_out)[!is.na(gff3_out$ID)]) |
270 gff3_out$Parent[!is.na(gff3_out$Parent)] <- paste0(gff3_out$Parent[!is.na(gff3_out$Parent)], "_", seqnames(gff3_out)[!is.na(gff3_out$Parent)]) | 293 gff3_out$Parent[!is.na(gff3_out$Parent)] <- paste0(gff3_out$Parent[!is.na(gff3_out$Parent)], "_", seqnames(gff3_out)[!is.na(gff3_out$Parent)]) |
271 | 294 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3') |
272 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3') | 295 all_tbl <- get_te_statistics(gff3_out, RT) |
273 | 296 all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl) |
274 all_tbl <- get_te_statistics(gff3_out, RT) | 297 write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE) |
275 all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl) | 298 # export fasta files: |
276 write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE) | 299 s_te <- get_te_sequences(gff3_out, s) |
277 # export fasta files: | 300 for (i in seq_along(s_te)) { |
278 s_te <- get_te_sequences(gff3_out, s) | 301 outname <- paste0(outfile, "_", names(s_te)[i], ".fasta") |
279 for (i in seq_along(s_te)) { | 302 writeXStringSet(s_te[[i]], filepath = outname) |
280 outname <- paste0(outfile, "_", names(s_te)[i], ".fasta") | 303 } |
281 writeXStringSet(s_te[[i]], filepath = outname) | |
282 } | 304 } |
283 | 305 |