diff extract_putative_ltr.R @ 7:c33d6583e548 draft

"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
author petr-novak
date Fri, 24 Jun 2022 14:19:48 +0000
parents f131886ea194
children 9de392f2fc02
line wrap: on
line diff
--- a/extract_putative_ltr.R	Thu May 19 08:21:55 2022 +0000
+++ b/extract_putative_ltr.R	Fri Jun 24 14:19:48 2022 +0000
@@ -14,7 +14,11 @@
   make_option(c("-o", "--output"), action = "store", type = "character",
               help = "output file path and prefix", default = NULL),
   make_option(c("-c", "--cpu"), type = "integer", default = 5,
-              help = "Number of cpu to use [default %default]", metavar = "number")
+              help = "Number of cpu to use [default %default]", metavar = "number"),
+  make_option(c("-M", "--max_missing_domains"), type = "integer", default = 0,
+              help = "Maximum number of missing domains is retrotransposon [default %default]",
+              metavar = "number")
+
 
 )
 description <- paste(strwrap(""))
@@ -38,18 +42,23 @@
 OFFSET <- 15000
 # load configuration files and functions:
 lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv")
+FDM_file <- paste0(script_dir, "/databases/feature_distances_model.RDS")
 trna_db <- paste0(script_dir, "/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
 ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R")
 if (file.exists(lineage_file) & file.exists(trna_db)) {
   lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
+  FDM <- readRDS(FDM_file)
   source(ltr_utils_r)
 }else {
+  # this destination work is installed using conda
   lineage_file <- paste0(script_dir, "/../share/dante_ltr/databases/lineage_domain_order.csv")
+  FDM_file <- paste0(script_dir, "/../share/dante_ltr/databases/feature_distances_model.RDS")
   trna_db <- paste0(script_dir, "/../share/dante_ltr/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
   ltr_utils_r <- paste0(script_dir, "/../share/dante_ltr/R/ltr_utils.R")
   if (file.exists(lineage_file) & file.exists((trna_db))) {
     lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
     source(ltr_utils_r)
+    FDM <- readRDS(FDM_file)
   }else(
     stop("configuration files not found")
   )
@@ -62,10 +71,8 @@
   s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa")
   lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE)
 
-  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data
-  /DANTE_filtered_part.gff3")
-  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data
-  /Rbp_part.fa")
+  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/DANTE_filtered_part.gff3")
+  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/Rbp_part.fa")
 
   g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data
   /DANTE_Vfaba_chr5.gff3")
@@ -76,7 +83,8 @@
   s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/Cocoa_theobroma_chr1.fasta.gz")
 
   source("R/ltr_utils.R")
-
+  ## feature distance model
+  FDM <- readRDS("./databases/feature_distances_model.RDS")
   g <- rtracklayer::import("./test_data/sample_DANTE.gff3")
   s <- readDNAStringSet("./test_data/sample_genome.fasta")
   outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3"
@@ -100,19 +108,63 @@
 # clean sequence names:
 names(s) <- gsub(" .+", "", names(s))
 lineage_domain <- lineage_info$Domains.order
+lineage_domain_span <- lineage_info$domain_span
+lineage_ltr_mean_length <- lineage_info$ltr_length
 lineage_offset5prime <- lineage_info$offset5prime
 lineage_offset3prime <- lineage_info$offset3prime
 ln <- gsub("ss/I", "ss_I", gsub("_", "/", gsub("/", "|", lineage_info$Lineage)))
 names(lineage_offset3prime) <-  ln
 names(lineage_offset5prime) <-  ln
 names(lineage_domain) <- ln
+names(lineage_domain_span) <- ln
+names(lineage_ltr_mean_length) <- ln
+lineage_domains_sequence <- unlist(mapply(function(d,l) {
+  paste(strsplit(d, " ")[[1]], ":", l, sep = "")
+}, d = lineage_domain, l = names(lineage_domain)))
 
 
 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))]
 g <- add_coordinates_of_closest_neighbor(g)
 
-gcl <- get_domain_clusters(g)
-gcl_clean <- clean_domain_clusters(gcl)
+# add info about domain order:
+g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence))
+# set NA to 0 in  g$domain_order ( some domains are not fromm ClassI elements
+g$domain_order[is.na(g$domain_order)] <- 0
+
+# NOTE - some operation is faster of GrangesList but some on list of data.frames
+# this is primary clusteing
+cls <- get_domain_clusters(g)
+gcl <- split(as.data.frame(g), cls)
+# gcl_as_GRL <- split(g, cls)  # delete?
+
+cls_alt <- get_domain_clusters_alt(g, FDM)
+g$Cluster <- as.numeric(factor(cls_alt))
+
+gcl_alt <- split(as.data.frame(g), cls_alt)
+
+TE_partial <-  GRanges(seqnames =  sapply(gcl_alt, function(x) x$seqnames[1]),
+                       Name = sapply(gcl_alt, function(x) x$Final_Classification[1]),
+                       Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]),
+                       ID = sapply(gcl_alt, function(x) paste0("TE_partial_", x$Cluster[1])),
+                       strand = sapply(gcl_alt, function(x) x$strand[1]),
+                       Ndomains = sapply(gcl_alt, function(x) nrow(x)),
+                       type = "transposable_element",
+                       source = "dante_ltr",
+                       Rank="D",
+                       IRanges(start = sapply(gcl_alt, function(x) min(x$start)),
+                               end = sapply(gcl_alt, function(x) max(x$end)))
+)
+g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster)
+g$Parent <- paste0("TE_partial_", g$Cluster)
+g$Rank="D"
+
+# keep only partial TE with more than one domain
+TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1]
+g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)]
+
+# first filtering  - remove TEs with low number of domains
+gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains)
+
 # glc annotation
 gcl_clean_lineage <- sapply(gcl_clean, function(x)  x$Final_Classification[1])
 gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-",
@@ -120,8 +172,15 @@
                                                 paste(x$Name, collapse = " "))
 )
 
+# compare detected domains with domains in lineages from REXdb database
+dd <- mapply(domain_distance,
+             d_query = gcl_clean_domains,
+             d_reference = lineage_domain[gcl_clean_lineage])
+
 # get lineages which has correct number and order of domains
-gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]]
+# gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]]
+gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains]
+
 gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)]
 gr <- get_ranges(gcl_clean_with_domains)
 
@@ -144,10 +203,11 @@
 s_left <- getSeq(s, grL)
 s_right <- getSeq(s, grR)
 
+expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function (x)x$Final_Classification[1])]
 # for statistics
 RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"]
 # cleanup
-rm(g)
+#rm(g)
 rm(gcl)
 rm(gcl_clean)
 rm(gcl_clean2)
@@ -161,7 +221,8 @@
                                                 gcl_clean_with_domains[[x]],
                                                 gr[x],
                                                 grL[x],
-                                                grR[x]),
+                                                grR[x],
+                                                expected_ltr_length[x]),
                mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE
 )
 
@@ -170,7 +231,7 @@
 good_TE <- TE[!sapply(TE, is.null)]
 cat('Number of putative TE with identified LTR   :', length(good_TE), '\n')
 
-
+# TODO - extent TE region to cover also TSD
 ID <- paste0('TE_', sprintf("%08d", seq(good_TE)))
 gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu)
 
@@ -184,15 +245,20 @@
 src[is.na(src)] <- "dante_ltr"
 gff3_out$source <- src
 gff3_out$Rank <- get_te_rank(gff3_out)
-# TODO add attributte specifying individual groups DL, DLT, DLP DLPT gff3
+
+# add partial TEs but first remove all ovelaping
+TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out]
+TE_partial_domain_part <-  g[g$Parent %in% TE_partial_parent_part$ID]
+
+gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start)
 
 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3')
 
 all_tbl <- get_te_statistics(gff3_out, RT)
-write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = TRUE)
+all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl)
+write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE)
 # export fasta files:
 s_te <- get_te_sequences(gff3_out, s)
-
 for (i in seq_along(s_te)) {
   outname <- paste0(outfile, "_", names(s_te)[i], ".fasta")
   writeXStringSet(s_te[[i]], filepath = outname)