Mercurial > repos > petr-novak > dante_ltr
diff clean_ltr.R @ 0:7b0bbe7477c4 draft
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
author | petr-novak |
---|---|
date | Tue, 08 Mar 2022 13:24:33 +0000 |
parents | |
children | 6ae4a341d1f3 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/clean_ltr.R Tue Mar 08 13:24:33 2022 +0000 @@ -0,0 +1,175 @@ +#!/usr/bin/env Rscript +initial_options <- commandArgs(trailingOnly = FALSE) +file_arg_name <- "--file=" +script_name <- normalizePath(sub(file_arg_name, "", initial_options[grep + (file_arg_name, + initial_options)])) +script_dir <- dirname(script_name) +library(optparse) + +parser <- OptionParser() +option_list <- list( + make_option(c("-g", "--gff3"), action = "store", type = "character", + help = "gff3 with LTR Transposable elements", default = NULL), + make_option(c("-s", "--reference_sequence"), action = "store", type = "character", + help = "reference sequence as fasta", + default = NULL), + make_option(c("-o", "--output"), action = "store", type = "character", + help = "output file prefix", default = NULL), + make_option(c("-c", "--cpu"), type = + "integer", default = 5, help = "Number of cpu to use [default %default]", + metavar = "number") + +) +description <- paste(strwrap("")) + +epilogue <- "" +parser <- OptionParser(option_list = option_list, epilogue = epilogue, description = + description, usage = "usage: %prog COMMAND [OPTIONS]") +opt <- parse_args(parser, args = commandArgs(TRUE)) + + +# load packages +suppressPackageStartupMessages({ library(rtracklayer) + library(Biostrings) + library(BSgenome) + library(parallel) + }) + +# CONFIGURATION +# load configuration files and functions: +lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv") +ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R") +if (file.exists(lineage_file)) { + lineage_info <- read.table(lineage_file, sep = "\t", + header = TRUE, + as.is = TRUE) + source(ltr_utils_r) +}else { + lineage_file <- paste0(script_dir, "/. + ./share/dante_ltr/databases/lineage_domain_order.csv") + ltr_utils_r <- paste0(script_dir, "/. + ./share/dante_ltr/R/ltr_utils.R") + if (file.exists(lineage_file)) { + lineage_info <- read.table(lineage_file, sep = "\t", + header = TRUE, + as.is = TRUE) + source(ltr_utils_r) + }else { + (stop("configuration files not found")) + } +} + + +ncpus <- opt$cpu + + +# load data +cat("reading fasta...") +s <- readDNAStringSet(opt$reference_sequence) # genome assembly +cat("done\n") +outfile <- opt$output +# clean sequence names: +names(s) <- gsub(" .+", "", names(s)) +cat("reading gff...") +g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3 +cat("done\n") +# testing +if (FALSE) { + g <- rtracklayer::import("./test_data/sample_ltr_annotation.gff3") + s <- readDNAStringSet("./test_data/sample_genome.fasta") + + g <- rtracklayer::import("./test_data/DANTE_LTR_Vfaba_chr5.gff3") + s <- readDNAStringSet("./test_data/211010_Vfaba_chr5.fasta") + names(s) <- gsub(" .+", "", names(s)) + ncpus <- 10 + lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header = + TRUE, as.is = TRUE) + source("./R/ltr_utils.R") +} + + +# get te sequence based on rank + +# evaluate best TE - DLTP grou +s_te <- get_te_sequences(g, s) # split by 'element quality' +# best quality - split by lineage +word_size <- 15 +# best TE +TE_DLTP_info <- analyze_TE(s_te$DLTP, word_size = word_size, ncpus = ncpus) + +# TE rank 2: +TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT), word_size = word_size, ncpus + = ncpus) +TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(c(s_te$DLT, s_te$DLP), ncpus + = ncpus, + TE_DLTP_info$seqs_representative, + word_size = word_size +) + +TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info) + +# create additional library from rank 2 verified by multiplicity +id_for_additional_library <- setdiff( + TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, + TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified) + +if (length(id_for_additional_library) > 1) { + seqs_for_additional_library <- c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in% + id_for_additional_library] + seqs_additional_info <- analyze_TE(seqs_for_additional_library, word_size = + word_size, ncpus = ncpus) + seq_representative <- c( + TE_DLTP_info$seqs_representative, + seqs_additional_info$seqs_representative + ) +}else { + if (length(id_for_additional_library) == 1) { + seq_representative <- c( + TE_DLTP_info$seqs_representative, + c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in% id_for_additional_library] + ) + }else { + seq_representative <- TE_DLTP_info$seqs_representative + } +} + + +# TE rank 3 +TE_DL_info_DLTP_verified <- compare_TE_datasets( + s_te$DL, + TE_DLTP_info$seqs_representative, min_coverage = 0.98, + ncpus = ncpus +) + + +R <- seq_diversity(seq_representative)$richness +SI <- seq_diversity(seq_representative)$shannon_index + +# final RM library: +seq_representative_no_ssr <- seq_representative[R > 20] + +ID <- g$ID[g$type == "transposable_element"] +names(ID) <- paste0(seqnames(g), "_", + start(g), "_", + end(g), "#", + g$Final_Classification +)[g$type %in% "transposable_element"] + +# create clean gff3 +id_of_good_te <- unique(c( + TE_DLTP_info$te_conflict_info$ok, + TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified, + TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, + TE_DL_info_DLTP_verified$id_ok_verified) +) + +c1 <- g$ID %in% ID[id_of_good_te] +c2 <- sapply(g$Parent, function(x)ifelse(length(x) == 0, "", x)) %in% ID[id_of_good_te] + +gff_out <- g[c1 | c2] + + +writeXStringSet(seq_representative, paste0(opt$output, "_RM_lib.fasta")) +export(gff_out, paste0(opt$output, "_clean.gff3"), format = "gff3") +