Mercurial > repos > petr-novak > dante_ltr
diff detect_putative_ltr.R @ 13:559940c04c44 draft
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
author | petr-novak |
---|---|
date | Thu, 11 Aug 2022 07:29:06 +0000 |
parents | ff01d4263391 |
children |
line wrap: on
line diff
--- a/detect_putative_ltr.R Thu Jul 21 08:23:15 2022 +0000 +++ b/detect_putative_ltr.R Thu Aug 11 07:29:06 2022 +0000 @@ -109,7 +109,6 @@ # MAIN ############################################################# # load data: - cat("reading gff...") g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3 cat("done\n") @@ -134,150 +133,173 @@ paste(strsplit(d, " ")[[1]], ":", l, sep = "") }, d = lineage_domain, l = names(lineage_domain))) -# filter g gff3 -g <- dante_filtering(g, Relative_Length = opt$min_relative_length) # default +# this repeat block is run just once +# it can breaks in eny point if zero TE is found +repeat{ -seqlengths(g) <- seqlengths(s)[names(seqlengths(g))] -g <- add_coordinates_of_closest_neighbor(g) + # filter g gff3 + g <- dante_filtering(g, Relative_Length = opt$min_relative_length) # default -# add info about domain order: -g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence)) -# set NA to 0 in g$domain_order ( some domains are not fromm ClassI elements -g$domain_order[is.na(g$domain_order)] <- 0 + seqlengths(g) <- seqlengths(s)[names(seqlengths(g))] + g <- add_coordinates_of_closest_neighbor(g) -# NOTE - some operation is faster of GrangesList but some on list of data.frames -# this is primary clusteing -cls <- get_domain_clusters(g) -gcl <- split(as.data.frame(g), cls) -# gcl_as_GRL <- split(g, cls) # delete? + # add info about domain order: + g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence)) + # set NA to 0 in g$domain_order ( some domains are not fromm ClassI elements + g$domain_order[is.na(g$domain_order)] <- 0 -cls_alt <- get_domain_clusters_alt(g, FDM) -g$Cluster <- as.numeric(factor(cls_alt)) + # NOTE - some operation is faster of GrangesList but some on list of data.frames + # this is primary clusteing + cls <- get_domain_clusters(g) + gcl <- split(as.data.frame(g), cls) + # gcl_as_GRL <- split(g, cls) # delete? -gcl_alt <- split(as.data.frame(g), cls_alt) + cls_alt <- get_domain_clusters_alt(g, FDM) + g$Cluster <- as.numeric(factor(cls_alt)) + + gcl_alt <- split(as.data.frame(g), cls_alt) -TE_partial <- GRanges(seqnames = sapply(gcl_alt, function(x) x$seqnames[1]), - Name = sapply(gcl_alt, function(x) x$Final_Classification[1]), - Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]), - ID = sapply(gcl_alt, function(x) paste0("TE_partial_", sprintf("%08d", x$Cluster[1]))), - strand = sapply(gcl_alt, function(x) x$strand[1]), - Ndomains = sapply(gcl_alt, function(x) nrow(x)), - type = "transposable_element", - source = "dante_ltr", - Rank="D", - IRanges(start = sapply(gcl_alt, function(x) min(x$start)), - end = sapply(gcl_alt, function(x) max(x$end))) -) -g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster) -g$Parent <- paste0("TE_partial_", sprintf("%08d", g$Cluster)) -g$Rank <- "D" - -# keep only partial TE with more than one domain -TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1] -g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)] + TE_partial <- GRanges(seqnames = sapply(gcl_alt, function(x) x$seqnames[1]), + Name = sapply(gcl_alt, function(x) x$Final_Classification[1]), + Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]), + ID = sapply(gcl_alt, function(x) paste0("TE_partial_", sprintf("%08d", x$Cluster[1]))), + strand = sapply(gcl_alt, function(x) x$strand[1]), + Ndomains = sapply(gcl_alt, function(x) nrow(x)), + type = "transposable_element", + source = "dante_ltr", + Rank="D", + IRanges(start = sapply(gcl_alt, function(x) min(x$start)), + end = sapply(gcl_alt, function(x) max(x$end))) + ) + g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster) + g$Parent <- paste0("TE_partial_", sprintf("%08d", g$Cluster)) + g$Rank <- "D" + # for statistics + RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"] -# first filtering - remove TEs with low number of domains -gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains) + # keep only partial TE with more than one domain + TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1] + g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)] -# glc annotation -gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1]) -gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-", - paste(rev(x$Name), collapse = " "), - paste(x$Name, collapse = " ")) -) + # first filtering - remove TEs with low number of domains + gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains) -# compare detected domains with domains in lineages from REXdb database -dd <- mapply(domain_distance, - d_query = gcl_clean_domains, - d_reference = lineage_domain[gcl_clean_lineage]) + # glc annotation + gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1]) + gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-", + paste(rev(x$Name), collapse = " "), + paste(x$Name, collapse = " ")) + ) -# get lineages which has correct number and order of domains -# gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]] -gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains] + # compare detected domains with domains in lineages from REXdb database + dd <- mapply(domain_distance, + d_query = gcl_clean_domains, + d_reference = lineage_domain[gcl_clean_lineage]) -gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)] -gr <- get_ranges(gcl_clean_with_domains) + # get lineages which has correct number and order of domains + # gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]] -cat('Number of analyzed regions:\n') -cat('Total number of domain clusters : ', length(gcl), '\n') -cat('Number of clean clusters : ', length(gcl_clean), '\n') -cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n') + gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains] + + if(length(gcl_clean2) == 0) { + cat("No complete TE found\n") + good_TE <- list() + break + } + gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)] + gr <- get_ranges(gcl_clean_with_domains) + + cat('Number of analyzed regions:\n') + cat('Total number of domain clusters : ', length(gcl), '\n') + cat('Number of clean clusters : ', length(gcl_clean), '\n') + cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n') -te_strand <- sapply(gcl_clean_with_domains, function(x)x$strand[1]) -te_lineage <- sapply(gcl_clean_with_domains, function(x)x$Final_Classification[1]) + te_strand <- sapply(gcl_clean_with_domains, function(x)x$strand[1]) + te_lineage <- sapply(gcl_clean_with_domains, function(x)x$Final_Classification[1]) -max_left_offset <- ifelse(te_strand == "+", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage]) -max_right_offset <- ifelse(te_strand == "-", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage]) + max_left_offset <- ifelse(te_strand == "+", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage]) + max_right_offset <- ifelse(te_strand == "-", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage]) -grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset) -grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset) + grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset) + grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset) -s_left <- getSeq(s, grL) -s_right <- getSeq(s, grR) + s_left <- getSeq(s, grL) + s_right <- getSeq(s, grR) + + expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function (x)x$Final_Classification[1])] -expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function (x)x$Final_Classification[1])] -# for statistics -RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"] -# cleanup -#rm(g) -rm(gcl) -rm(gcl_clean) -rm(gcl_clean2) + # cleanup + #rm(g) + rm(gcl) + rm(gcl_clean) + rm(gcl_clean2) -names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_") -names(s_left) <- paste(seqnames(grL), start(grL), end(grL), sep = "_") -names(s_right) <- paste(seqnames(grR), start(grR), end(grR), sep = "_") -cat('Identification of LTRs...') -TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x], - s_right[x], - gcl_clean_with_domains[[x]], - gr[x], - grL[x], - grR[x], - expected_ltr_length[x]), - mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE -) - -cat('done.\n') + names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_") + names(s_left) <- paste(seqnames(grL), start(grL), end(grL), sep = "_") + names(s_right) <- paste(seqnames(grR), start(grR), end(grR), sep = "_") + cat('Identification of LTRs...') + TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x], + s_right[x], + gcl_clean_with_domains[[x]], + gr[x], + grL[x], + grR[x], + expected_ltr_length[x]), + mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE + ) -good_TE <- TE[!sapply(TE, is.null)] -cat('Number of putative TE with identified LTR :', length(good_TE), '\n') - -# TODO - extent TE region to cover also TSD -ID <- paste0('TE_', sprintf("%08d", seq(good_TE))) -gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu) + cat('done.\n') + good_TE <- TE[!sapply(TE, is.null)] + cat('Number of putative TE with identified LTR :', length(good_TE), '\n') + break + } -cat('Identification of PBS ...') -gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE) -cat('done\n') -gff3_out <- do.call(c, gff3_list2) +if (length(good_TE)>0){ # handle empty list + ID <- paste0('TE_', sprintf("%08d", seq(good_TE))) + gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu) -# define new source -src <- as.character(gff3_out$source) -src[is.na(src)] <- "dante_ltr" -gff3_out$source <- src -gff3_out$Rank <- get_te_rank(gff3_out) + cat('Identification of PBS ...') + gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE) + cat('done\n') + gff3_out <- do.call(c, gff3_list2) -# add partial TEs but first remove all ovelaping -TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out] -TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID] - -gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start) -# modify ID and Parent - add seqname - this will ensure it is unique is done on chunk level -gff3_out$ID[!is.na(gff3_out$ID)] <- paste0(gff3_out$ID[!is.na(gff3_out$ID)], "_", seqnames(gff3_out)[!is.na(gff3_out$ID)]) -gff3_out$Parent[!is.na(gff3_out$Parent)] <- paste0(gff3_out$Parent[!is.na(gff3_out$Parent)], "_", seqnames(gff3_out)[!is.na(gff3_out$Parent)]) - -export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3') - -all_tbl <- get_te_statistics(gff3_out, RT) -all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl) -write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE) -# export fasta files: -s_te <- get_te_sequences(gff3_out, s) -for (i in seq_along(s_te)) { - outname <- paste0(outfile, "_", names(s_te)[i], ".fasta") - writeXStringSet(s_te[[i]], filepath = outname) + # define new source + src <- as.character(gff3_out$source) + src[is.na(src)] <- "dante_ltr" + gff3_out$source <- src + gff3_out$Rank <- get_te_rank(gff3_out) + # add partial TEs but first remove all ovelaping + TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out] + TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID] + gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start) +}else{ + # but this could be a problem if there are no TEs in the sequence + if (length(TE_partial_with_more_than_one_domain)>0){ + TE_partial_parent_part <- TE_partial_with_more_than_one_domain + TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID] + gff3_out <- sort(c(TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start) + }else{ + gff3_out <- NULL + } } +if (is.null(gff3_out)){ + cat('No TEs found.\n') +}else{ +# modify ID and Parent - add seqname - this will ensure it is unique is done on chunk level + gff3_out$ID[!is.na(gff3_out$ID)] <- paste0(gff3_out$ID[!is.na(gff3_out$ID)], "_", seqnames(gff3_out)[!is.na(gff3_out$ID)]) + gff3_out$Parent[!is.na(gff3_out$Parent)] <- paste0(gff3_out$Parent[!is.na(gff3_out$Parent)], "_", seqnames(gff3_out)[!is.na(gff3_out$Parent)]) + export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3') + all_tbl <- get_te_statistics(gff3_out, RT) + all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl) + write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE) + # export fasta files: + s_te <- get_te_sequences(gff3_out, s) + for (i in seq_along(s_te)) { + outname <- paste0(outfile, "_", names(s_te)[i], ".fasta") + writeXStringSet(s_te[[i]], filepath = outname) + } +} +