diff extract_putative_ltr.R @ 0:7b0bbe7477c4 draft

"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
author petr-novak
date Tue, 08 Mar 2022 13:24:33 +0000
parents
children f131886ea194
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_putative_ltr.R	Tue Mar 08 13:24:33 2022 +0000
@@ -0,0 +1,195 @@
+#!/usr/bin/env Rscript
+initial_options <- commandArgs(trailingOnly = FALSE)
+file_arg_name <- "--file="
+script_name <- normalizePath(sub(file_arg_name, "", initial_options[grep(file_arg_name, initial_options)]))
+script_dir <- dirname(script_name)
+library(optparse)
+
+parser <- OptionParser()
+option_list <- list(
+  make_option(c("-g", "--gff3"), action = "store", type = "character",
+              help = "gff3 with dante results", default = NULL),
+  make_option(c("-s", "--reference_sequence"), action = "store", type = "character",
+              help = "reference sequence as fasta", default = NULL),
+  make_option(c("-o", "--output"), action = "store", type = "character",
+              help = "output file path and prefix", default = NULL),
+  make_option(c("-c", "--cpu"), type = "integer", default = 5,
+              help = "Number of cpu to use [default %default]", metavar = "number")
+
+)
+description <- paste(strwrap(""))
+
+epilogue <- ""
+parser <- OptionParser(option_list = option_list, epilogue = epilogue, description = description,
+                       usage = "usage: %prog COMMAND [OPTIONS]")
+opt <- parse_args(parser, args = commandArgs(TRUE))
+
+
+# load packages
+suppressPackageStartupMessages({
+  library(rtracklayer)
+  library(Biostrings)
+  library(BSgenome)
+  library(parallel)
+})
+
+
+# CONFIGURATION
+OFFSET <- 15000
+# load configuration files and functions:
+lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv")
+trna_db <- paste0(script_dir, "/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
+ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R")
+if (file.exists(lineage_file) & file.exists(trna_db)) {
+  lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
+  source(ltr_utils_r)
+}else {
+  lineage_file <- paste0(script_dir, "/../share/dante_ltr/databases/lineage_domain_order.csv")
+  trna_db <- paste0(script_dir, "/../share/dante_ltr/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
+  ltr_utils_r <- paste0(script_dir, "/../share/dante_ltr/R/ltr_utils.R")
+  if (file.exists(lineage_file) & file.exists((trna_db))) {
+    lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
+    source(ltr_utils_r)
+  }else(
+    stop("configuration files not found")
+  )
+}
+
+
+# for testing
+if (FALSE) {
+  g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3")
+  s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa")
+  lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE)
+
+  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data
+  /DANTE_filtered_part.gff3")
+  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data
+  /Rbp_part.fa")
+
+  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data
+  /DANTE_Vfaba_chr5.gff3")
+  s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910
+  /fasta_parts/211010_Vfaba_chr5.fasta")
+
+  g <- rtracklayer::import("./test_data/sample_DANTE.gff3")
+  s <- readDNAStringSet("./test_data/sample_genome.fasta")
+  outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3"
+  lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header =
+    TRUE, as.is = TRUE)
+  trna_db <- "./databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta"
+
+}
+
+# MAIN #############################################################
+
+# load data:
+
+cat("reading gff...")
+g <- rtracklayer::import(opt$gff3, format = 'gff3')  # DANTE gff3
+cat("done\n")
+cat("reading fasta...")
+s <- readDNAStringSet(opt$reference_sequence)  # genome assembly
+cat("done\n")
+outfile <- opt$output
+# clean sequence names:
+names(s) <- gsub(" .+", "", names(s))
+lineage_domain <- lineage_info$Domains.order
+lineage_offset5prime <- lineage_info$offset5prime
+lineage_offset3prime <- lineage_info$offset3prime
+ln <- gsub("ss/I", "ss_I", gsub("_", "/", gsub("/", "|", lineage_info$Lineage)))
+names(lineage_offset3prime) <-  ln
+names(lineage_offset5prime) <-  ln
+names(lineage_domain) <- ln
+
+
+seqlengths(g) <- seqlengths(s)[names(seqlengths(g))]
+g <- add_coordinates_of_closest_neighbor(g)
+
+gcl <- get_domain_clusters(g)
+gcl_clean <- clean_domain_clusters(gcl)
+# glc annotation
+gcl_clean_lineage <- sapply(gcl_clean, function(x)  x$Final_Classification[1])
+gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-",
+                                                paste(rev(x$Name), collapse = " "),
+                                                paste(x$Name, collapse = " "))
+)
+
+# get lineages which has correct number and order of domains
+gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]]
+gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)]
+gr <- get_ranges(gcl_clean_with_domains)
+
+
+cat('Number of analyzed regions:\n')
+cat('Total number of domain clusters             : ', length(gcl), '\n')
+cat('Number of clean clusters                    : ', length(gcl_clean), '\n')
+cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n')
+
+
+te_strand <- sapply(gcl_clean_with_domains, function(x)x$strand[1])
+te_lineage <- sapply(gcl_clean_with_domains, function(x)x$Final_Classification[1])
+
+max_left_offset <- ifelse(te_strand == "+", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
+max_right_offset <- ifelse(te_strand == "-", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
+
+grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset)
+grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset)
+
+s_left <- getSeq(s, grL)
+s_right <- getSeq(s, grR)
+
+# for statistics
+RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"]
+# cleanup
+rm(g)
+rm(gcl)
+rm(gcl_clean)
+rm(gcl_clean2)
+
+names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_")
+names(s_left) <- paste(seqnames(grL), start(grL), end(grL), sep = "_")
+names(s_right) <- paste(seqnames(grR), start(grR), end(grR), sep = "_")
+cat('Identification of LTRs...')
+TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x],
+                                                s_right[x],
+                                                gcl_clean_with_domains[[x]],
+                                                gr[x],
+                                                grL[x],
+                                                grR[x]),
+               mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE
+)
+cat('done.\n')
+
+good_TE <- TE[!sapply(TE, is.null)]
+cat('Number of putative TE with identified LTR   :', length(good_TE), '\n')
+
+
+ID <- paste0('TE_', sprintf("%08d", seq(good_TE)))
+gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu)
+
+cat('Identification of PBS ...')
+gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE)
+cat('done\n')
+gff3_out <- do.call(c, gff3_list2)
+
+# define new source
+src <- as.character(gff3_out$source)
+src[is.na(src)] <- "dante_ltr"
+gff3_out$source <- src
+
+# TODO export all files to single directory
+# TODO export individual groups DL, DLT, DLP DLPT gff3
+
+export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3')
+# summary statistics
+all_tbl <- get_te_statistics(gff3_out, RT)
+write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE)
+# export fasta files:
+s_te <- get_te_sequences(gff3_out, s)
+
+for (i in seq_along(s_te)) {
+  outname <- paste0(outfile, "_", names(s_te)[i], ".fasta")
+  writeXStringSet(s_te[[i]], filepath = outname)
+}
+