Mercurial > repos > petr-novak > dante_ltr
view clean_ltr.R @ 6:b91ca438a1cb draft
"planemo upload commit 9633fb98932151f059ce02a0ce202a4374ef8d68"
author | petr-novak |
---|---|
date | Thu, 19 May 2022 08:21:55 +0000 |
parents | 93d35ae65e1b |
children | c33d6583e548 |
line wrap: on
line source
#!/usr/bin/env Rscript initial_options <- commandArgs(trailingOnly = FALSE) file_arg_name <- "--file=" script_name <- normalizePath(sub(file_arg_name, "", initial_options[grep (file_arg_name, initial_options)])) script_dir <- dirname(script_name) library(optparse) parser <- OptionParser() option_list <- list( make_option(c("-g", "--gff3"), action = "store", type = "character", help = "gff3 with LTR Transposable elements", default = NULL), make_option(c("-s", "--reference_sequence"), action = "store", type = "character", help = "reference sequence as fasta", default = NULL), make_option(c("-o", "--output"), action = "store", type = "character", help = "output file prefix", default = NULL), make_option(c("-c", "--cpu"), type = "integer", default = 5, help = "Number of cpu to use [default %default]", metavar = "number") ) description <- paste(strwrap("")) epilogue <- "" parser <- OptionParser(option_list = option_list, epilogue = epilogue, description = description, usage = "usage: %prog COMMAND [OPTIONS]") opt <- parse_args(parser, args = commandArgs(TRUE)) # load packages suppressPackageStartupMessages({ library(rtracklayer) library(Biostrings) library(BSgenome) library(parallel) }) # CONFIGURATION # load configuration files and functions: lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv") ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R") if (file.exists(lineage_file)) { lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE) source(ltr_utils_r) }else { lineage_file <- paste0(script_dir, "/. ./share/dante_ltr/databases/lineage_domain_order.csv") ltr_utils_r <- paste0(script_dir, "/. ./share/dante_ltr/R/ltr_utils.R") if (file.exists(lineage_file)) { lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE) source(ltr_utils_r) }else { (stop("configuration files not found")) } } ncpus <- opt$cpu # load data cat("reading fasta...") s <- readDNAStringSet(opt$reference_sequence) # genome assembly cat("done\n") outfile <- opt$output # clean sequence names: names(s) <- gsub(" .+", "", names(s)) cat("reading gff...") g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3 cat("done\n") # testing if (FALSE) { g <- rtracklayer::import("./test_data/big_test_data/dante_ltr_unfiltered_t.cacao.gff3") s <- readDNAStringSet("./test_data/big_test_data/T_cacao_chromosomes.fasta") g <- rtracklayer::import("./test_data/sample_ltr_annotation.gff3") s <- readDNAStringSet("./test_data/sample_genome.fasta") g <- rtracklayer::import("./test_data/DANTE_LTR_Vfaba_chr5.gff3") s <- readDNAStringSet("./test_data/211010_Vfaba_chr5.fasta") names(s) <- gsub(" .+", "", names(s)) ncpus <- 10 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE) source("./R/ltr_utils.R") } ## ID in g must be unique - this could be a problem if gff is concatenated from multiple files! ## id ID is renamed - rename parent to! ## add chromosom index to disctinguish same IDs suffix <- as.numeric(seqnames(g)) g$ID <- ifelse(is.na(g$ID), NA, paste0(g$ID,"_", suffix)) g$Parent <- ifelse(is.na(g$Parent), NA, paste0(g$Parent,"_", suffix)) # get te sequence based on rank # evaluate best TE - DLTP grou s_te <- get_te_sequences(g, s) # split by 'element quality' # best quality - split by lineage word_size <- 15 # best TE TE_DLTP_info <- analyze_TE(s_te$DLTP, word_size = word_size, ncpus = ncpus) # TE rank 2: TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT), word_size = word_size, ncpus = ncpus) TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(c(s_te$DLT, s_te$DLP), ncpus = ncpus,TE_DLTP_info$seqs_representative, word_size = word_size ) TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info) # create additional library from rank 2 verified by multiplicity id_for_additional_library <- setdiff( TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified) if (length(id_for_additional_library) > 1) { seqs_for_additional_library <- c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in% id_for_additional_library] seqs_additional_info <- analyze_TE(seqs_for_additional_library, word_size = word_size, ncpus = ncpus) seq_representative <- c( TE_DLTP_info$seqs_representative, seqs_additional_info$seqs_representative ) }else { if (length(id_for_additional_library) == 1) { seq_representative <- c( TE_DLTP_info$seqs_representative, c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in% id_for_additional_library] ) }else { seq_representative <- TE_DLTP_info$seqs_representative } } # TE rank 3 - verify agains good DLTP TE_DL_info_DLTP_verified <- compare_TE_datasets( s_te$DL, TE_DLTP_info$seqs_representative, min_coverage = 0.95, ncpus = ncpus, word_size = word_size ) R <- seq_diversity(seq_representative)$richness SI <- seq_diversity(seq_representative)$shannon_index # final RM library: seq_representative_no_ssr <- seq_representative[R > 20] ID <- g$ID[g$type == "transposable_element"] names(ID) <- paste0(seqnames(g), "_", start(g), "_", end(g), "#", g$Final_Classification )[g$type %in% "transposable_element"] # create clean gff3 id_of_good_te <- unique(c( TE_DLTP_info$te_conflict_info$ok, TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified, TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, TE_DL_info_DLTP_verified$id_ok_verified) ) c1 <- g$ID %in% ID[id_of_good_te] c2 <- sapply(g$Parent, function(x)ifelse(length(x) == 0, "", x)) %in% ID[id_of_good_te] gff_out <- g[c1 | c2] gff_te <- gff_out[gff_out$type %in% "transposable_element"] gff_5ltr <- gff_out[gff_out$LTR %in% "5LTR"] gff_3ltr <- gff_out[gff_out$LTR %in% "3LTR"] full_te <- getSeqNamed(s, gff_te) names(full_te) <- paste0(gff_te$ID,":",names(full_te)) ltr5 <- getSeqNamed(s, gff_5ltr) names(ltr5) <- paste0(gff_5ltr$Parent,":",names(ltr5)) ltr3 <- getSeqNamed(s, gff_3ltr) names(ltr3) <- paste0(gff_3ltr$Parent,":",names(ltr3)) inc <- gff_te$Rank != "DL" writeXStringSet(seq_representative, paste0(opt$output, "_RM_lib_non_redundant.fasta")) writeXStringSet(full_te, paste0(opt$output, "_RM_lib_full_TE.fasta")) writeXStringSet(ltr5, paste0(opt$output, "_RM_lib_5LTR.fasta")) writeXStringSet(ltr3, paste0(opt$output, "_RM_lib_3LTR.fasta")) export(gff_out, paste0(opt$output, "_clean.gff3"), format = "gff3") lv <- sort(unique(gff_te$Final_Classification)) te_count <- table(factor(gff_te$Final_Classification, levels=lv)) pdf(paste0(opt$output, "_summary.pdf"), width = 13, height=8, pointsize = 10) par(mfrow=c(1,2), mar=c(5,7,2,0), las=1) boxplot(width(gff_te) ~ factor(gff_te$Final_Classification, levels=lv), horizontal = TRUE, xlab="length[bp]", ylab="", names = paste0(gsub("^.+[|]", "", lv), " (", te_count, ")"), main = 'Full TE', at = seq_along(lv)*4 ) boxplot(width(gff_te[inc]) ~ factor(gff_te$Final_Classification[inc], levels=lv), horizontal = TRUE, xlab="length[bp]", ylab="", names = rep("", length(lv)), main = 'Full TE', at = seq_along(lv)*4-1, add=TRUE, col=2 ) par(mar=c(5,0,2,7)) boxplot(width(gff_5ltr) ~ factor(gff_5ltr$Final_Classification, levels=lv), horizontal = TRUE, xlab="length[bp]", ylab="", names = rep("", length(lv)), main = "5'LTR", at = seq_along(lv)*4 ) boxplot(width(gff_5ltr[inc]) ~ factor(gff_5ltr$Final_Classification[inc], levels=lv), horizontal = TRUE, xlab="length[bp]", ylab="", names = rep("", length(lv)), main = "5'LTR", at = seq_along(lv)*4-1, add=TRUE, col=2 ) legend('bottomright', col=c("grey","2"), legend=c("All TE", "TE with PBS/TSD"), pch=15) dev.off()