Mercurial > repos > petr-novak > dante_ltr
changeset 3:6ae4a341d1f3 draft
"planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
author | petr-novak |
---|---|
date | Tue, 03 May 2022 12:38:12 +0000 |
parents | f131886ea194 |
children | 93d35ae65e1b |
files | R/ltr_utils.R clean_dante_ltr.xml clean_ltr.R dante_ltr_search.xml |
diffstat | 4 files changed, 75 insertions(+), 12 deletions(-) [+] |
line wrap: on
line diff
--- a/R/ltr_utils.R Tue Apr 12 12:55:32 2022 +0000 +++ b/R/ltr_utils.R Tue May 03 12:38:12 2022 +0000 @@ -160,7 +160,6 @@ bl_table <- do.call(rbind, bl_list) unlink(qf) #unlink(outf) - print(outf) unlink(dbf) unlink(script) return(bl_table) @@ -185,7 +184,7 @@ analyze_TE <- function(seqs, ncpus = 10, word_size = 20){ - blt <- blast_all2all(seqs, ncpus = ncpus, word_size = word_size) + blt <- blast_all2all(seqs, ncpus = ncpus, word_size = word_size, perc_identity = 90) te_conflict_info <- identify_conflicts(blt) blt_te_ok <- blast_table_subset(blt, te_conflict_info$ok) te_ok_lineages <- split(blt_te_ok, @@ -284,7 +283,9 @@ return(bl[bl$qaccver %in% id & bl$saccver %in% id,, drop = FALSE]) } -get_representative_ranges <- function(bl, min_length = 60){ +get_representative_ranges <- function(bl, min_length = 200, min_identity = 98){ + bl <- bl[bl$pident>=min_identity, , drop=FALSE] + bl <- bl[bl$pident>=min_identity & bl$length >= min_length, , drop=FALSE] score <- sort(unlist(by(bl$bitscore, bl$qaccver, sum, simplify = FALSE)), decreasing = TRUE) L <- bl$qlen[!duplicated(bl$qaccver)] @@ -320,7 +321,6 @@ L <- nchar(seqs) R <- matrix(ncol = niter, nrow = length(seqs)) for (i in 1:niter){ - print(i) seqs_rnd <- DNAStringSet(sapply(L, function(n) paste(sample(c("A", "C", "T", "G"), n, replace=TRUE), collapse=""))) R[,i] <- seq_diversity(seqs_rnd, km = km)$richness } @@ -623,9 +623,13 @@ return(out) } -getSeqNamed <- function(s, gr) { +getSeqNamed <- function(s, gr, name = NULL) { spart <- getSeq(s, gr) - id1 <- paste0(seqnames(gr), '_', start(gr), "_", end(gr)) + if (is.null(name)){ + id1 <- paste0(seqnames(gr), '_', start(gr), "_", end(gr)) + }else{ + id1 <- mcols(gr)[,name] + } id2 <- gr$Final_Classification names(spart) <- paste0(id1, "#", id2) spart
--- a/clean_dante_ltr.xml Tue Apr 12 12:55:32 2022 +0000 +++ b/clean_dante_ltr.xml Tue May 03 12:38:12 2022 +0000 @@ -1,4 +1,4 @@ -<tool id="clean_dante_ltr" name="DANTE_LTR transposamble elements filtering" version="0.1.4" python_template_version="3.5"> +<tool id="clean_dante_ltr" name="DANTE_LTR transposamble elements filtering" version="0.1.5" python_template_version="3.5"> <requirements> <requirement type="package">r-optparse</requirement> @@ -12,7 +12,16 @@ && mv output_clean.gff3 $dante_ltr_clean && - mv output_RM_lib.fasta $rm_lib + mv output_RM_lib_non_redundant.fasta $rm_lib + && + mv output_RM_lib_full_TE.fasta $te_full + && + mv output_RM_lib_5LTR.fasta $ltr5 + && + mv output_RM_lib_3LTR.fasta $ltr3 + && + mv output_summary.pdf $summary + ]]></command> <inputs> <param type="data" name="dante_ltr" format="gff3" /> @@ -23,6 +32,19 @@ elements based on annotation $dante_ltr.hid and reference $reference.hid"/> <data name="rm_lib" format="fasta" label="Non-redundant library of LTR transposable elements based on annotation $dante_ltr.hid and reference $reference.hid"/> + + <data name="te_full" format="fasta" label="Full length LTR transposable + elements based on annotation $dante_ltr.hid and reference $reference.hid"/> + + <data name="ltr5" format="fasta" label="5'LTR of transposable + elements based on annotation $dante_ltr.hid and reference $reference.hid"/> + + <data name="ltr3" format="fasta" label="3'LTR of transposable + elements based on annotation $dante_ltr.hid and reference $reference.hid"/> + + <data name="summary" format="pdf" label="Summary of TE and LTR lenghts based on + $dante_ltr.hid and reference $reference.hid"/> + </outputs> <help><![CDATA[ This tool takes output from DANTE_LTR search identifies good quality transposable elements.
--- a/clean_ltr.R Tue Apr 12 12:55:32 2022 +0000 +++ b/clean_ltr.R Tue May 03 12:38:12 2022 +0000 @@ -108,7 +108,6 @@ ) TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info) - # create additional library from rank 2 verified by multiplicity id_for_additional_library <- setdiff( TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, @@ -134,7 +133,6 @@ } } - # TE rank 3 TE_DL_info_DLTP_verified <- compare_TE_datasets( s_te$DL, @@ -169,7 +167,46 @@ gff_out <- g[c1 | c2] +gff_te <- gff_out[gff_out$type %in% "transposable_element"] +gff_5ltr <- gff_out[gff_out$LTR %in% "5LTR"] +gff_3ltr <- gff_out[gff_out$LTR %in% "3LTR"] +full_te <- getSeqNamed(s, gff_te) +ltr5 <- getSeqNamed(s, gff_5ltr) +ltr3 <- getSeqNamed(s, gff_3ltr) +inc <- gff_te$Rank != "DL" -writeXStringSet(seq_representative, paste0(opt$output, "_RM_lib.fasta")) +writeXStringSet(seq_representative, paste0(opt$output, "_RM_lib_non_redundant.fasta")) +writeXStringSet(full_te, paste0(opt$output, "_RM_lib_full_TE.fasta")) +writeXStringSet(ltr5, paste0(opt$output, "_RM_lib_5LTR.fasta")) +writeXStringSet(ltr3, paste0(opt$output, "_RM_lib_3LTR.fasta")) + export(gff_out, paste0(opt$output, "_clean.gff3"), format = "gff3") +lv <- sort(unique(gff_te$Final_Classification)) +te_count <- table(factor(gff_te$Final_Classification, levels=lv)) + +pdf(paste0(opt$output, "_summary.pdf"), width = 13, height=8, pointsize = 10) +par(mfrow=c(1,2), mar=c(5,7,2,0), las=1) +boxplot(width(gff_te) ~ factor(gff_te$Final_Classification, levels=lv), + horizontal = TRUE, xlab="length[bp]", ylab="", + names = paste0(gsub("^.+[|]", "", lv), " (", te_count, ")"), + main = 'Full TE', at = seq_along(lv)*4 +) +boxplot(width(gff_te[inc]) ~ factor(gff_te$Final_Classification[inc], levels=lv), + horizontal = TRUE, xlab="length[bp]", ylab="", + names = rep("", length(lv)), + main = 'Full TE', at = seq_along(lv)*4-1, add=TRUE, col=2 +) +par(mar=c(5,0,2,7)) +boxplot(width(gff_5ltr) ~ factor(gff_5ltr$Final_Classification, levels=lv), + horizontal = TRUE, xlab="length[bp]", ylab="", + names = rep("", length(lv)), + main = "5'LTR", at = seq_along(lv)*4 +) +boxplot(width(gff_5ltr[inc]) ~ factor(gff_5ltr$Final_Classification[inc], levels=lv), + horizontal = TRUE, xlab="length[bp]", ylab="", + names = rep("", length(lv)), + main = "5'LTR", at = seq_along(lv)*4-1, add=TRUE, col=2 +) +legend('bottomright', col=c("grey","2"), legend=c("All TE", "TE with PBS/TSD"), pch=15) +dev.off()
--- a/dante_ltr_search.xml Tue Apr 12 12:55:32 2022 +0000 +++ b/dante_ltr_search.xml Tue May 03 12:38:12 2022 +0000 @@ -1,4 +1,4 @@ -<tool id="dante_ltr_search" name="DANTE_LTR transposable element identification" version="0.1.4" python_template_version="3.5"> +<tool id="dante_ltr_search" name="DANTE_LTR transposable element identification" version="0.1.5" python_template_version="3.5"> <requirements> <requirement type="package">blast</requirement> <requirement type="package">r-optparse</requirement>