Mercurial > repos > petr-novak > dante_ltr
diff extract_putative_ltr.R @ 7:c33d6583e548 draft
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
author | petr-novak |
---|---|
date | Fri, 24 Jun 2022 14:19:48 +0000 |
parents | f131886ea194 |
children | 9de392f2fc02 |
line wrap: on
line diff
--- a/extract_putative_ltr.R Thu May 19 08:21:55 2022 +0000 +++ b/extract_putative_ltr.R Fri Jun 24 14:19:48 2022 +0000 @@ -14,7 +14,11 @@ make_option(c("-o", "--output"), action = "store", type = "character", help = "output file path and prefix", default = NULL), make_option(c("-c", "--cpu"), type = "integer", default = 5, - help = "Number of cpu to use [default %default]", metavar = "number") + help = "Number of cpu to use [default %default]", metavar = "number"), + make_option(c("-M", "--max_missing_domains"), type = "integer", default = 0, + help = "Maximum number of missing domains is retrotransposon [default %default]", + metavar = "number") + ) description <- paste(strwrap("")) @@ -38,18 +42,23 @@ OFFSET <- 15000 # load configuration files and functions: lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv") +FDM_file <- paste0(script_dir, "/databases/feature_distances_model.RDS") trna_db <- paste0(script_dir, "/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta") ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R") if (file.exists(lineage_file) & file.exists(trna_db)) { lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE) + FDM <- readRDS(FDM_file) source(ltr_utils_r) }else { + # this destination work is installed using conda lineage_file <- paste0(script_dir, "/../share/dante_ltr/databases/lineage_domain_order.csv") + FDM_file <- paste0(script_dir, "/../share/dante_ltr/databases/feature_distances_model.RDS") trna_db <- paste0(script_dir, "/../share/dante_ltr/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta") ltr_utils_r <- paste0(script_dir, "/../share/dante_ltr/R/ltr_utils.R") if (file.exists(lineage_file) & file.exists((trna_db))) { lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE) source(ltr_utils_r) + FDM <- readRDS(FDM_file) }else( stop("configuration files not found") ) @@ -62,10 +71,8 @@ s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa") lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE) - g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data - /DANTE_filtered_part.gff3") - s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data - /Rbp_part.fa") + g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/DANTE_filtered_part.gff3") + s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/Rbp_part.fa") g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data /DANTE_Vfaba_chr5.gff3") @@ -76,7 +83,8 @@ s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/Cocoa_theobroma_chr1.fasta.gz") source("R/ltr_utils.R") - + ## feature distance model + FDM <- readRDS("./databases/feature_distances_model.RDS") g <- rtracklayer::import("./test_data/sample_DANTE.gff3") s <- readDNAStringSet("./test_data/sample_genome.fasta") outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3" @@ -100,19 +108,63 @@ # clean sequence names: names(s) <- gsub(" .+", "", names(s)) lineage_domain <- lineage_info$Domains.order +lineage_domain_span <- lineage_info$domain_span +lineage_ltr_mean_length <- lineage_info$ltr_length lineage_offset5prime <- lineage_info$offset5prime lineage_offset3prime <- lineage_info$offset3prime ln <- gsub("ss/I", "ss_I", gsub("_", "/", gsub("/", "|", lineage_info$Lineage))) names(lineage_offset3prime) <- ln names(lineage_offset5prime) <- ln names(lineage_domain) <- ln +names(lineage_domain_span) <- ln +names(lineage_ltr_mean_length) <- ln +lineage_domains_sequence <- unlist(mapply(function(d,l) { + paste(strsplit(d, " ")[[1]], ":", l, sep = "") +}, d = lineage_domain, l = names(lineage_domain))) seqlengths(g) <- seqlengths(s)[names(seqlengths(g))] g <- add_coordinates_of_closest_neighbor(g) -gcl <- get_domain_clusters(g) -gcl_clean <- clean_domain_clusters(gcl) +# add info about domain order: +g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence)) +# set NA to 0 in g$domain_order ( some domains are not fromm ClassI elements +g$domain_order[is.na(g$domain_order)] <- 0 + +# NOTE - some operation is faster of GrangesList but some on list of data.frames +# this is primary clusteing +cls <- get_domain_clusters(g) +gcl <- split(as.data.frame(g), cls) +# gcl_as_GRL <- split(g, cls) # delete? + +cls_alt <- get_domain_clusters_alt(g, FDM) +g$Cluster <- as.numeric(factor(cls_alt)) + +gcl_alt <- split(as.data.frame(g), cls_alt) + +TE_partial <- GRanges(seqnames = sapply(gcl_alt, function(x) x$seqnames[1]), + Name = sapply(gcl_alt, function(x) x$Final_Classification[1]), + Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]), + ID = sapply(gcl_alt, function(x) paste0("TE_partial_", x$Cluster[1])), + strand = sapply(gcl_alt, function(x) x$strand[1]), + Ndomains = sapply(gcl_alt, function(x) nrow(x)), + type = "transposable_element", + source = "dante_ltr", + Rank="D", + IRanges(start = sapply(gcl_alt, function(x) min(x$start)), + end = sapply(gcl_alt, function(x) max(x$end))) +) +g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster) +g$Parent <- paste0("TE_partial_", g$Cluster) +g$Rank="D" + +# keep only partial TE with more than one domain +TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1] +g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)] + +# first filtering - remove TEs with low number of domains +gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains) + # glc annotation gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1]) gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-", @@ -120,8 +172,15 @@ paste(x$Name, collapse = " ")) ) +# compare detected domains with domains in lineages from REXdb database +dd <- mapply(domain_distance, + d_query = gcl_clean_domains, + d_reference = lineage_domain[gcl_clean_lineage]) + # get lineages which has correct number and order of domains -gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]] +# gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]] +gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains] + gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)] gr <- get_ranges(gcl_clean_with_domains) @@ -144,10 +203,11 @@ s_left <- getSeq(s, grL) s_right <- getSeq(s, grR) +expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function (x)x$Final_Classification[1])] # for statistics RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"] # cleanup -rm(g) +#rm(g) rm(gcl) rm(gcl_clean) rm(gcl_clean2) @@ -161,7 +221,8 @@ gcl_clean_with_domains[[x]], gr[x], grL[x], - grR[x]), + grR[x], + expected_ltr_length[x]), mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE ) @@ -170,7 +231,7 @@ good_TE <- TE[!sapply(TE, is.null)] cat('Number of putative TE with identified LTR :', length(good_TE), '\n') - +# TODO - extent TE region to cover also TSD ID <- paste0('TE_', sprintf("%08d", seq(good_TE))) gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu) @@ -184,15 +245,20 @@ src[is.na(src)] <- "dante_ltr" gff3_out$source <- src gff3_out$Rank <- get_te_rank(gff3_out) -# TODO add attributte specifying individual groups DL, DLT, DLP DLPT gff3 + +# add partial TEs but first remove all ovelaping +TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out] +TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID] + +gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start) export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3') all_tbl <- get_te_statistics(gff3_out, RT) -write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = TRUE) +all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl) +write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE) # export fasta files: s_te <- get_te_sequences(gff3_out, s) - for (i in seq_along(s_te)) { outname <- paste0(outfile, "_", names(s_te)[i], ".fasta") writeXStringSet(s_te[[i]], filepath = outname)