comparison detect_putative_ltr.R @ 13:559940c04c44 draft

"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
author petr-novak
date Thu, 11 Aug 2022 07:29:06 +0000
parents ff01d4263391
children
comparison
equal deleted inserted replaced
12:ff01d4263391 13:559940c04c44
107 } 107 }
108 108
109 # MAIN ############################################################# 109 # MAIN #############################################################
110 110
111 # load data: 111 # load data:
112
113 cat("reading gff...") 112 cat("reading gff...")
114 g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3 113 g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3
115 cat("done\n") 114 cat("done\n")
116 cat("reading fasta...") 115 cat("reading fasta...")
117 s <- readDNAStringSet(opt$reference_sequence) # genome assembly 116 s <- readDNAStringSet(opt$reference_sequence) # genome assembly
132 names(lineage_ltr_mean_length) <- ln 131 names(lineage_ltr_mean_length) <- ln
133 lineage_domains_sequence <- unlist(mapply(function(d,l) { 132 lineage_domains_sequence <- unlist(mapply(function(d,l) {
134 paste(strsplit(d, " ")[[1]], ":", l, sep = "") 133 paste(strsplit(d, " ")[[1]], ":", l, sep = "")
135 }, d = lineage_domain, l = names(lineage_domain))) 134 }, d = lineage_domain, l = names(lineage_domain)))
136 135
137 # filter g gff3 136 # this repeat block is run just once
138 g <- dante_filtering(g, Relative_Length = opt$min_relative_length) # default 137 # it can breaks in eny point if zero TE is found
139 138 repeat{
140 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))] 139
141 g <- add_coordinates_of_closest_neighbor(g) 140 # filter g gff3
142 141 g <- dante_filtering(g, Relative_Length = opt$min_relative_length) # default
143 # add info about domain order: 142
144 g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence)) 143 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))]
145 # set NA to 0 in g$domain_order ( some domains are not fromm ClassI elements 144 g <- add_coordinates_of_closest_neighbor(g)
146 g$domain_order[is.na(g$domain_order)] <- 0 145
147 146 # add info about domain order:
148 # NOTE - some operation is faster of GrangesList but some on list of data.frames 147 g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence))
149 # this is primary clusteing 148 # set NA to 0 in g$domain_order ( some domains are not fromm ClassI elements
150 cls <- get_domain_clusters(g) 149 g$domain_order[is.na(g$domain_order)] <- 0
151 gcl <- split(as.data.frame(g), cls) 150
152 # gcl_as_GRL <- split(g, cls) # delete? 151 # NOTE - some operation is faster of GrangesList but some on list of data.frames
153 152 # this is primary clusteing
154 cls_alt <- get_domain_clusters_alt(g, FDM) 153 cls <- get_domain_clusters(g)
155 g$Cluster <- as.numeric(factor(cls_alt)) 154 gcl <- split(as.data.frame(g), cls)
156 155 # gcl_as_GRL <- split(g, cls) # delete?
157 gcl_alt <- split(as.data.frame(g), cls_alt) 156
158 157 cls_alt <- get_domain_clusters_alt(g, FDM)
159 TE_partial <- GRanges(seqnames = sapply(gcl_alt, function(x) x$seqnames[1]), 158 g$Cluster <- as.numeric(factor(cls_alt))
160 Name = sapply(gcl_alt, function(x) x$Final_Classification[1]), 159
161 Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]), 160 gcl_alt <- split(as.data.frame(g), cls_alt)
162 ID = sapply(gcl_alt, function(x) paste0("TE_partial_", sprintf("%08d", x$Cluster[1]))), 161
163 strand = sapply(gcl_alt, function(x) x$strand[1]), 162 TE_partial <- GRanges(seqnames = sapply(gcl_alt, function(x) x$seqnames[1]),
164 Ndomains = sapply(gcl_alt, function(x) nrow(x)), 163 Name = sapply(gcl_alt, function(x) x$Final_Classification[1]),
165 type = "transposable_element", 164 Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]),
166 source = "dante_ltr", 165 ID = sapply(gcl_alt, function(x) paste0("TE_partial_", sprintf("%08d", x$Cluster[1]))),
167 Rank="D", 166 strand = sapply(gcl_alt, function(x) x$strand[1]),
168 IRanges(start = sapply(gcl_alt, function(x) min(x$start)), 167 Ndomains = sapply(gcl_alt, function(x) nrow(x)),
169 end = sapply(gcl_alt, function(x) max(x$end))) 168 type = "transposable_element",
170 ) 169 source = "dante_ltr",
171 g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster) 170 Rank="D",
172 g$Parent <- paste0("TE_partial_", sprintf("%08d", g$Cluster)) 171 IRanges(start = sapply(gcl_alt, function(x) min(x$start)),
173 g$Rank <- "D" 172 end = sapply(gcl_alt, function(x) max(x$end)))
174 173 )
175 # keep only partial TE with more than one domain 174 g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster)
176 TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1] 175 g$Parent <- paste0("TE_partial_", sprintf("%08d", g$Cluster))
177 g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)] 176 g$Rank <- "D"
178 177 # for statistics
179 # first filtering - remove TEs with low number of domains 178 RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"]
180 gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains) 179
181 180 # keep only partial TE with more than one domain
182 # glc annotation 181 TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1]
183 gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1]) 182 g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)]
184 gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-", 183
185 paste(rev(x$Name), collapse = " "), 184 # first filtering - remove TEs with low number of domains
186 paste(x$Name, collapse = " ")) 185 gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains)
187 ) 186
188 187 # glc annotation
189 # compare detected domains with domains in lineages from REXdb database 188 gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1])
190 dd <- mapply(domain_distance, 189 gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-",
191 d_query = gcl_clean_domains, 190 paste(rev(x$Name), collapse = " "),
192 d_reference = lineage_domain[gcl_clean_lineage]) 191 paste(x$Name, collapse = " "))
193 192 )
194 # get lineages which has correct number and order of domains 193
195 # gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]] 194 # compare detected domains with domains in lineages from REXdb database
196 gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains] 195 dd <- mapply(domain_distance,
197 196 d_query = gcl_clean_domains,
198 gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)] 197 d_reference = lineage_domain[gcl_clean_lineage])
199 gr <- get_ranges(gcl_clean_with_domains) 198
200 199 # get lineages which has correct number and order of domains
201 200 # gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]]
202 cat('Number of analyzed regions:\n') 201
203 cat('Total number of domain clusters : ', length(gcl), '\n') 202
204 cat('Number of clean clusters : ', length(gcl_clean), '\n') 203 gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains]
205 cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n') 204
206 205 if(length(gcl_clean2) == 0) {
207 206 cat("No complete TE found\n")
208 te_strand <- sapply(gcl_clean_with_domains, function(x)x$strand[1]) 207 good_TE <- list()
209 te_lineage <- sapply(gcl_clean_with_domains, function(x)x$Final_Classification[1]) 208 break
210 209 }
211 max_left_offset <- ifelse(te_strand == "+", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage]) 210 gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)]
212 max_right_offset <- ifelse(te_strand == "-", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage]) 211 gr <- get_ranges(gcl_clean_with_domains)
213 212
214 grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset) 213 cat('Number of analyzed regions:\n')
215 grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset) 214 cat('Total number of domain clusters : ', length(gcl), '\n')
216 215 cat('Number of clean clusters : ', length(gcl_clean), '\n')
217 s_left <- getSeq(s, grL) 216 cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n')
218 s_right <- getSeq(s, grR) 217
219 218
220 expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function (x)x$Final_Classification[1])] 219 te_strand <- sapply(gcl_clean_with_domains, function(x)x$strand[1])
221 # for statistics 220 te_lineage <- sapply(gcl_clean_with_domains, function(x)x$Final_Classification[1])
222 RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"] 221
223 # cleanup 222 max_left_offset <- ifelse(te_strand == "+", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
224 #rm(g) 223 max_right_offset <- ifelse(te_strand == "-", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
225 rm(gcl) 224
226 rm(gcl_clean) 225 grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset)
227 rm(gcl_clean2) 226 grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset)
228 227
229 names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_") 228 s_left <- getSeq(s, grL)
230 names(s_left) <- paste(seqnames(grL), start(grL), end(grL), sep = "_") 229 s_right <- getSeq(s, grR)
231 names(s_right) <- paste(seqnames(grR), start(grR), end(grR), sep = "_") 230
232 cat('Identification of LTRs...') 231 expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function (x)x$Final_Classification[1])]
233 TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x], 232
234 s_right[x], 233 # cleanup
235 gcl_clean_with_domains[[x]], 234 #rm(g)
236 gr[x], 235 rm(gcl)
237 grL[x], 236 rm(gcl_clean)
238 grR[x], 237 rm(gcl_clean2)
239 expected_ltr_length[x]), 238
240 mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE 239 names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_")
241 ) 240 names(s_left) <- paste(seqnames(grL), start(grL), end(grL), sep = "_")
242 241 names(s_right) <- paste(seqnames(grR), start(grR), end(grR), sep = "_")
243 cat('done.\n') 242 cat('Identification of LTRs...')
244 243 TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x],
245 good_TE <- TE[!sapply(TE, is.null)] 244 s_right[x],
246 cat('Number of putative TE with identified LTR :', length(good_TE), '\n') 245 gcl_clean_with_domains[[x]],
247 246 gr[x],
248 # TODO - extent TE region to cover also TSD 247 grL[x],
249 ID <- paste0('TE_', sprintf("%08d", seq(good_TE))) 248 grR[x],
250 gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu) 249 expected_ltr_length[x]),
251 250 mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE
252 cat('Identification of PBS ...') 251 )
253 gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE) 252
254 cat('done\n') 253 cat('done.\n')
255 gff3_out <- do.call(c, gff3_list2) 254 good_TE <- TE[!sapply(TE, is.null)]
256 255 cat('Number of putative TE with identified LTR :', length(good_TE), '\n')
257 # define new source 256 break
258 src <- as.character(gff3_out$source) 257 }
259 src[is.na(src)] <- "dante_ltr" 258
260 gff3_out$source <- src 259 if (length(good_TE)>0){ # handle empty list
261 gff3_out$Rank <- get_te_rank(gff3_out) 260 ID <- paste0('TE_', sprintf("%08d", seq(good_TE)))
262 261 gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu)
263 # add partial TEs but first remove all ovelaping 262
264 TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out] 263 cat('Identification of PBS ...')
265 TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID] 264 gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE)
266 265 cat('done\n')
267 gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start) 266 gff3_out <- do.call(c, gff3_list2)
267
268 # define new source
269 src <- as.character(gff3_out$source)
270 src[is.na(src)] <- "dante_ltr"
271 gff3_out$source <- src
272 gff3_out$Rank <- get_te_rank(gff3_out)
273 # add partial TEs but first remove all ovelaping
274 TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out]
275 TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID]
276 gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start)
277 }else{
278 # but this could be a problem if there are no TEs in the sequence
279 if (length(TE_partial_with_more_than_one_domain)>0){
280 TE_partial_parent_part <- TE_partial_with_more_than_one_domain
281 TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID]
282 gff3_out <- sort(c(TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start)
283 }else{
284 gff3_out <- NULL
285 }
286 }
287
288 if (is.null(gff3_out)){
289 cat('No TEs found.\n')
290 }else{
268 # modify ID and Parent - add seqname - this will ensure it is unique is done on chunk level 291 # modify ID and Parent - add seqname - this will ensure it is unique is done on chunk level
269 gff3_out$ID[!is.na(gff3_out$ID)] <- paste0(gff3_out$ID[!is.na(gff3_out$ID)], "_", seqnames(gff3_out)[!is.na(gff3_out$ID)]) 292 gff3_out$ID[!is.na(gff3_out$ID)] <- paste0(gff3_out$ID[!is.na(gff3_out$ID)], "_", seqnames(gff3_out)[!is.na(gff3_out$ID)])
270 gff3_out$Parent[!is.na(gff3_out$Parent)] <- paste0(gff3_out$Parent[!is.na(gff3_out$Parent)], "_", seqnames(gff3_out)[!is.na(gff3_out$Parent)]) 293 gff3_out$Parent[!is.na(gff3_out$Parent)] <- paste0(gff3_out$Parent[!is.na(gff3_out$Parent)], "_", seqnames(gff3_out)[!is.na(gff3_out$Parent)])
271 294 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3')
272 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3') 295 all_tbl <- get_te_statistics(gff3_out, RT)
273 296 all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl)
274 all_tbl <- get_te_statistics(gff3_out, RT) 297 write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE)
275 all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl) 298 # export fasta files:
276 write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE) 299 s_te <- get_te_sequences(gff3_out, s)
277 # export fasta files: 300 for (i in seq_along(s_te)) {
278 s_te <- get_te_sequences(gff3_out, s) 301 outname <- paste0(outfile, "_", names(s_te)[i], ".fasta")
279 for (i in seq_along(s_te)) { 302 writeXStringSet(s_te[[i]], filepath = outname)
280 outname <- paste0(outfile, "_", names(s_te)[i], ".fasta") 303 }
281 writeXStringSet(s_te[[i]], filepath = outname)
282 } 304 }
283 305