Mercurial > repos > petr-novak > dante_ltr
changeset 8:9de392f2fc02 draft
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
author | petr-novak |
---|---|
date | Tue, 28 Jun 2022 12:33:22 +0000 |
parents | c33d6583e548 |
children | 1aa578e6c8b3 |
files | R/ltr_utils.R README.md clean_dante_ltr.xml dante_ltr_search.xml dante_ltr_workflow.odg dante_ltr_workflow.png extract_putative_ltr.R |
diffstat | 7 files changed, 93 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
--- a/R/ltr_utils.R Fri Jun 24 14:19:48 2022 +0000 +++ b/R/ltr_utils.R Tue Jun 28 12:33:22 2022 +0000 @@ -138,17 +138,20 @@ gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset, end = E + offset)) } -get_ranges_left <- function(gx, offset = OFFSET, offset2 = 300) { +get_ranges_left <- function(gx, offset = OFFSET, offset2 = 10) { + ## offset2 - how many nt cen LTR extend to closes protein domain + ## this is necassary as some detected proteins domains does not have correct bopundaries + ## if LTR retrotransposons insters to other protein domain. S <- sapply(gx, function(x)min(x$start)) - max_offset <- S - sapply(gx, function(x)min(x$upstream_domain)) + max_offset <- S - sapply(gx, function(x)min(x$upstream_domain)) + offset2 offset_adjusted <- ifelse(max_offset < offset, max_offset, offset) gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset_adjusted, end = S + offset2)) return(gr) } -get_ranges_right <- function(gx, offset = OFFSET, offset2 = 300) { +get_ranges_right <- function(gx, offset = OFFSET, offset2 = 10) { E <- sapply(gx, function(x)max(x$end)) - max_offset <- sapply(gx, function(x)max(x$downstream_domain)) - E + max_offset <- sapply(gx, function(x)max(x$downstream_domain)) - E + offset2 offset_adjusted <- ifelse(max_offset < offset, max_offset, offset) gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = E - offset2, end = E + offset_adjusted)) return(gr) @@ -189,7 +192,7 @@ tg_P <- firstTG(cons) ca_P <- lastCA(cons) e_dist <- bl$length[i] - ca_P - max_dist = 50 # was 25 + max_dist <- 50 # was 25 no_match <- any(tg_P == 0, ca_P == 0) if (!no_match & tg_P < max_dist & @@ -684,6 +687,17 @@ return(Rank) } +dante_filtering <- function(dante_gff, min_similarity=0.4, + min_identity=0.2, Relative_Length=0.6, + min_relat_interuptions=8) { + include <- as.numeric(dante_gff$Similarity) >= min_similarity & + as.numeric(dante_gff$Identity) >= min_identity & + as.numeric(dante_gff$Relat_Length) >= Relative_Length & + as.numeric(dante_gff$Relat_Interruptions) <= min_relat_interuptions + + include[is.na(include)] <- FALSE + return(dante_gff[include]) +} get_te_statistics <- function(gr, RT){ Ranks <- c("D", "DL", "DLT", "DLP", "DLTP")
--- a/README.md Fri Jun 24 14:19:48 2022 +0000 +++ b/README.md Tue Jun 28 12:33:22 2022 +0000 @@ -1,15 +1,33 @@ -# dante_ltr +# DANTE_LTR + +Tool for identifying complete LTR retrotransposons based on analysis of protein domains identified with the [DANTE tool](https://github.com/kavonrtep/dante). Both DANTE and DANTE_LTR are available on [Galaxy server](ttps://repeatexplorer-elixir.cerit-sc.cz/). -Tool for identification of complete LTR retrotransposons based on analysis of protein -domains identified by DANTE tool. +## Principle of DANTE _LTR +Complete retrotransposons are identified as clusters of protein domains recognized by the DANTE tool. The domains in the clusters must be assigned to a single retrotransposon lineage by DANTE. In addition, the orientation and order of the protein domains, as well as the distances between them, must conform to the characteristics of elements from REXXdb database [Neumann et al. (2019)](https://mobilednajournal.biomedcentral.com/articles/10.1186/s13100-018-0144-1). +In the next step, the 5' and 3' regions of the putative retrotransposon are examined for the presence of 5' and 3' long terminal repeats. If 5'- and 3'-long terminal repeats are detected, detection of target site duplication (TSD) and primer binding site (PSB) is performed. The detected LTR retrotranspsons are classified into 5 categories: +- Elements with protein domains, 5'LTR, 3'LTR, TSD and PBS - rank **DLTP**. +- Elements with protein domains, 5'LTR, 3'LTR, and PBS (TSD was not found) Rank **DLP** +- Elements with protein domains, 5' LTR, 3'LTR, TSD (PBS was not found) - rank **DTL** +- Elements with protein domains, 5'LTR and 3'LTR (PBS and TDS were not found) - rank **DL** +- Elements as clusters of protein domains with the same classification, no LTRs - rank **D**. + +![dante_ltr_workflow.png](dante_ltr_workflow.png) + ## Installation: ```shell conda create -n dante_ltr -c bioconda -c conda-forge -c petrnovak dante_ltr ``` + +## Input data +One input is a reference sequence in fasta fromat. The second input is an annotation of the reference genome using the tool DANTE in GFF3 format. For better results, use the unfiltered full output of the DANTE pipeline. + + ## Usage +### Detection of complete LTR retrotransposons + ```shell Usage: ./extract_putative_ltr.R COMMAND [OPTIONS] @@ -27,25 +45,63 @@ -c NUMBER, --cpu=NUMBER Number of cpu to use [default 5] + -M NUMBER, --max_missing_domains=NUMBER + Maximum number of missing domains is retrotransposon [default 0] + + -L NUMBER, --min_relative_length=NUMBER + Minimum relative length of protein domain to be considered for retrostransposon detection [default 0.6] -h, --help Show this help message and exit + ``` -## Example +#### Example: + ```shell mkdir -p tmp ./extract_putative_ltr.R -g test_data/sample_DANTE.gff3 -s test_data/sample_genome.fasta -o tmp/ltr_annotation ``` -## Output files - - -### Output of script `extract_putative_ltr.R`: - +#### Files in the output of `extract_putative_ltr.R`: - `prefix.gff3` - annotation of all identified elements +- `prefix_D.fasta` - partial elements with protein **d**omains - `prefix_DL.fasta` - elements with protein **d**omains and **L**TR - `prefix_DLTP.fasta` - elements with **d**omains, **L**TR, **T**SD and **P**BS - `prefix_DLP.fasta` - elements with **d**omains, **L**TR and **P**BS - `prefix_DLT.fasta` - elements with **d**omains, **L**TR, **T**SD - `prefix_statistics.csv` - number of elements in individual categories + + + +### Validation of LTR retrotransposons detected un previous step: + +```shell +./clean_ltr.R --help +Usage: ./clean_ltr.R COMMAND [OPTIONS] + + +Options: + -g GFF3, --gff3=GFF3 + gff3 with LTR Transposable elements + + -s REFERENCE_SEQUENCE, --reference_sequence=REFERENCE_SEQUENCE + reference sequence as fasta + + -o OUTPUT, --output=OUTPUT + output file prefix + + -c NUMBER, --cpu=NUMBER + Number of cpu to use [default 5] + + -h, --help + Show this help message and exit +``` + +This script check for potentially chimeric elements and removes them from GFF3 file. + +#### Example +```shell +./clean_ltr.R -g test_data/sample_DANTE_LTR_annotation.gff3 -s test_data/sample_genome.fasta -o tmp/ltr_annotation_clean +``` +
--- a/clean_dante_ltr.xml Fri Jun 24 14:19:48 2022 +0000 +++ b/clean_dante_ltr.xml Tue Jun 28 12:33:22 2022 +0000 @@ -1,4 +1,4 @@ -<tool id="clean_dante_ltr" name="DANTE_LTR retrotransposons filtering" version="0.1.6" python_template_version="3.5"> +<tool id="clean_dante_ltr" name="DANTE_LTR retrotransposons filtering" version="0.1.6.1" python_template_version="3.5"> <requirements> <requirement type="package">r-optparse</requirement>
--- a/dante_ltr_search.xml Fri Jun 24 14:19:48 2022 +0000 +++ b/dante_ltr_search.xml Tue Jun 28 12:33:22 2022 +0000 @@ -1,4 +1,4 @@ -<tool id="dante_ltr_search" name="DANTE_LTR retrotransposon identification" version="0.1.6" python_template_version="3.5"> +<tool id="dante_ltr_search" name="DANTE_LTR retrotransposon identification" version="0.1.6.1" python_template_version="3.5"> <requirements> <requirement type="package">blast</requirement> <requirement type="package">r-optparse</requirement>
--- a/extract_putative_ltr.R Fri Jun 24 14:19:48 2022 +0000 +++ b/extract_putative_ltr.R Tue Jun 28 12:33:22 2022 +0000 @@ -17,9 +17,11 @@ help = "Number of cpu to use [default %default]", metavar = "number"), make_option(c("-M", "--max_missing_domains"), type = "integer", default = 0, help = "Maximum number of missing domains is retrotransposon [default %default]", + metavar = "number"), + make_option(c("-L", "--min_relative_length"), type = "numeric", default = 0.6, + help = "Minimum relative length of protein domain to be considered for retrostransposon detection [default %default]", metavar = "number") - ) description <- paste(strwrap("")) @@ -71,6 +73,7 @@ s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa") lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE) + g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/sample_DANTE_unfiltered.gff3") g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/DANTE_filtered_part.gff3") s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/Rbp_part.fa") @@ -122,6 +125,8 @@ paste(strsplit(d, " ")[[1]], ":", l, sep = "") }, d = lineage_domain, l = names(lineage_domain))) +# filter g gff3 +g <- dante_filtering(g, Relative_Length = opt$min_relative_length) # default seqlengths(g) <- seqlengths(s)[names(seqlengths(g))] g <- add_coordinates_of_closest_neighbor(g) @@ -156,7 +161,7 @@ ) g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster) g$Parent <- paste0("TE_partial_", g$Cluster) -g$Rank="D" +g$Rank <- "D" # keep only partial TE with more than one domain TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1]