annotate detect_putative_ltr.R @ 13:559940c04c44 draft

"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
author petr-novak
date Thu, 11 Aug 2022 07:29:06 +0000
parents ff01d4263391
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
1 #!/usr/bin/env Rscript
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
2 initial_options <- commandArgs(trailingOnly = FALSE)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
3 file_arg_name <- "--file="
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
4 script_name <- normalizePath(sub(file_arg_name, "", initial_options[grep(file_arg_name, initial_options)]))
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
5 script_dir <- dirname(script_name)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
6 library(optparse)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
7
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
8 parser <- OptionParser()
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
9 option_list <- list(
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
10 make_option(c("-g", "--gff3"), action = "store", type = "character",
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
11 help = "gff3 with dante results", default = NULL),
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
12 make_option(c("-s", "--reference_sequence"), action = "store", type = "character",
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
13 help = "reference sequence as fasta", default = NULL),
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
14 make_option(c("-o", "--output"), action = "store", type = "character",
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
15 help = "output file path and prefix", default = NULL),
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
16 make_option(c("-c", "--cpu"), type = "integer", default = 5,
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
17 help = "Number of cpu to use [default %default]", metavar = "number"),
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
18 make_option(c("-M", "--max_missing_domains"), type = "integer", default = 0,
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
19 help = "Maximum number of missing domains is retrotransposon [default %default]",
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
20 metavar = "number"),
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
21 make_option(c("-L", "--min_relative_length"), type = "numeric", default = 0.6,
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
22 help = "Minimum relative length of protein domain to be considered for retrostransposon detection [default %default]",
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
23 metavar = "number")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
24
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
25 )
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
26 description <- paste(strwrap(""))
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
27
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
28 epilogue <- ""
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
29 parser <- OptionParser(option_list = option_list, epilogue = epilogue, description = description,
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
30 usage = "usage: %prog COMMAND [OPTIONS]")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
31 opt <- parse_args(parser, args = commandArgs(TRUE))
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
32
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
33
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
34 # load packages
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
35 suppressPackageStartupMessages({
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
36 library(rtracklayer)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
37 library(Biostrings)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
38 library(BSgenome)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
39 library(parallel)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
40 })
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
41
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
42
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
43 # CONFIGURATION
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
44 OFFSET <- 15000
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
45 # load configuration files and functions:
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
46 lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
47 FDM_file <- paste0(script_dir, "/databases/feature_distances_model.RDS")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
48 trna_db <- paste0(script_dir, "/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
49 ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
50 if (file.exists(lineage_file) & file.exists(trna_db)) {
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
51 lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
52 FDM <- readRDS(FDM_file)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
53 source(ltr_utils_r)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
54 }else {
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
55 # this destination work is installed using conda
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
56 lineage_file <- paste0(script_dir, "/../share/dante_ltr/databases/lineage_domain_order.csv")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
57 FDM_file <- paste0(script_dir, "/../share/dante_ltr/databases/feature_distances_model.RDS")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
58 trna_db <- paste0(script_dir, "/../share/dante_ltr/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
59 ltr_utils_r <- paste0(script_dir, "/../share/dante_ltr/R/ltr_utils.R")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
60 if (file.exists(lineage_file) & file.exists((trna_db))) {
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
61 lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
62 source(ltr_utils_r)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
63 FDM <- readRDS(FDM_file)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
64 }else(
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
65 stop("configuration files not found")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
66 )
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
67 }
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
68
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
69
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
70 # for testing
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
71 if (FALSE) {
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
72 g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
73 s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
74 lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
75
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
76 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/sample_DANTE_unfiltered.gff3")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
77 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/DANTE_filtered_part.gff3")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
78 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/Rbp_part.fa")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
79
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
80 # oriza
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
81 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/DANTE_full_oryza.gff3")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
82 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/o_sativa_msu7.0.fasta")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
83
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
84 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
85 /DANTE_Vfaba_chr5.gff3")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
86 s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
87 /fasta_parts/211010_Vfaba_chr5.fasta")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
88
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
89 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data//Cocoa_theobroma_DANTE_filtered.gff3")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
90 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/Cocoa_theobroma_chr1.fasta.gz")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
91 # test on bigger data:
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
92
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
93 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/tmp/DANTE_unfiltered/1.gff3")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
94 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/tmp/fasta_parts/1.fasta")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
95
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
96 source("R/ltr_utils.R")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
97 ## feature distance model
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
98 FDM <- readRDS("./databases/feature_distances_model.RDS")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
99 g <- rtracklayer::import("./test_data/sample_DANTE.gff3")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
100 s <- readDNAStringSet("./test_data/sample_genome.fasta")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
101 outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3"
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
102 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header =
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
103 TRUE, as.is = TRUE)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
104 trna_db <- "./databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta"
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
105 opt <- list(min_relative_length=0.6, cpu = 8)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
106
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
107 }
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
108
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
109 # MAIN #############################################################
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
110
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
111 # load data:
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
112 cat("reading gff...")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
113 g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
114 cat("done\n")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
115 cat("reading fasta...")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
116 s <- readDNAStringSet(opt$reference_sequence) # genome assembly
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
117 cat("done\n")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
118 outfile <- opt$output
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
119 # clean sequence names:
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
120 names(s) <- gsub(" .+", "", names(s))
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
121 lineage_domain <- lineage_info$Domains.order
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
122 lineage_domain_span <- lineage_info$domain_span
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
123 lineage_ltr_mean_length <- lineage_info$ltr_length
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
124 lineage_offset5prime <- lineage_info$offset5prime
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
125 lineage_offset3prime <- lineage_info$offset3prime
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
126 ln <- gsub("ss/I", "ss_I", gsub("_", "/", gsub("/", "|", lineage_info$Lineage)))
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
127 names(lineage_offset3prime) <- ln
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
128 names(lineage_offset5prime) <- ln
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
129 names(lineage_domain) <- ln
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
130 names(lineage_domain_span) <- ln
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
131 names(lineage_ltr_mean_length) <- ln
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
132 lineage_domains_sequence <- unlist(mapply(function(d,l) {
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
133 paste(strsplit(d, " ")[[1]], ":", l, sep = "")
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
134 }, d = lineage_domain, l = names(lineage_domain)))
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
135
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
136 # this repeat block is run just once
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
137 # it can breaks in eny point if zero TE is found
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
138 repeat{
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
139
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
140 # filter g gff3
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
141 g <- dante_filtering(g, Relative_Length = opt$min_relative_length) # default
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
142
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
143 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))]
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
144 g <- add_coordinates_of_closest_neighbor(g)
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
145
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
146 # add info about domain order:
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
147 g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence))
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
148 # set NA to 0 in g$domain_order ( some domains are not fromm ClassI elements
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
149 g$domain_order[is.na(g$domain_order)] <- 0
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
150
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
151 # NOTE - some operation is faster of GrangesList but some on list of data.frames
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
152 # this is primary clusteing
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
153 cls <- get_domain_clusters(g)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
154 gcl <- split(as.data.frame(g), cls)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
155 # gcl_as_GRL <- split(g, cls) # delete?
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
156
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
157 cls_alt <- get_domain_clusters_alt(g, FDM)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
158 g$Cluster <- as.numeric(factor(cls_alt))
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
159
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
160 gcl_alt <- split(as.data.frame(g), cls_alt)
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
161
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
162 TE_partial <- GRanges(seqnames = sapply(gcl_alt, function(x) x$seqnames[1]),
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
163 Name = sapply(gcl_alt, function(x) x$Final_Classification[1]),
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
164 Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]),
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
165 ID = sapply(gcl_alt, function(x) paste0("TE_partial_", sprintf("%08d", x$Cluster[1]))),
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
166 strand = sapply(gcl_alt, function(x) x$strand[1]),
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
167 Ndomains = sapply(gcl_alt, function(x) nrow(x)),
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
168 type = "transposable_element",
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
169 source = "dante_ltr",
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
170 Rank="D",
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
171 IRanges(start = sapply(gcl_alt, function(x) min(x$start)),
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
172 end = sapply(gcl_alt, function(x) max(x$end)))
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
173 )
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
174 g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
175 g$Parent <- paste0("TE_partial_", sprintf("%08d", g$Cluster))
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
176 g$Rank <- "D"
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
177 # for statistics
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
178 RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"]
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
179
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
180 # keep only partial TE with more than one domain
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
181 TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1]
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
182 g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)]
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
183
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
184 # first filtering - remove TEs with low number of domains
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
185 gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains)
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
186
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
187 # glc annotation
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
188 gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1])
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
189 gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-",
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
190 paste(rev(x$Name), collapse = " "),
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
191 paste(x$Name, collapse = " "))
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
192 )
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
193
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
194 # compare detected domains with domains in lineages from REXdb database
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
195 dd <- mapply(domain_distance,
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
196 d_query = gcl_clean_domains,
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
197 d_reference = lineage_domain[gcl_clean_lineage])
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
198
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
199 # get lineages which has correct number and order of domains
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
200 # gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]]
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
201
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
202
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
203 gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains]
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
204
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
205 if(length(gcl_clean2) == 0) {
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
206 cat("No complete TE found\n")
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
207 good_TE <- list()
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
208 break
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
209 }
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
210 gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)]
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
211 gr <- get_ranges(gcl_clean_with_domains)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
212
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
213 cat('Number of analyzed regions:\n')
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
214 cat('Total number of domain clusters : ', length(gcl), '\n')
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
215 cat('Number of clean clusters : ', length(gcl_clean), '\n')
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
216 cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n')
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
217
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
218
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
219 te_strand <- sapply(gcl_clean_with_domains, function(x)x$strand[1])
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
220 te_lineage <- sapply(gcl_clean_with_domains, function(x)x$Final_Classification[1])
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
221
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
222 max_left_offset <- ifelse(te_strand == "+", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
223 max_right_offset <- ifelse(te_strand == "-", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
224
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
225 grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
226 grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset)
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
227
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
228 s_left <- getSeq(s, grL)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
229 s_right <- getSeq(s, grR)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
230
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
231 expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function (x)x$Final_Classification[1])]
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
232
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
233 # cleanup
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
234 #rm(g)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
235 rm(gcl)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
236 rm(gcl_clean)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
237 rm(gcl_clean2)
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
238
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
239 names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_")
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
240 names(s_left) <- paste(seqnames(grL), start(grL), end(grL), sep = "_")
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
241 names(s_right) <- paste(seqnames(grR), start(grR), end(grR), sep = "_")
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
242 cat('Identification of LTRs...')
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
243 TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x],
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
244 s_right[x],
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
245 gcl_clean_with_domains[[x]],
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
246 gr[x],
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
247 grL[x],
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
248 grR[x],
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
249 expected_ltr_length[x]),
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
250 mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
251 )
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
252
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
253 cat('done.\n')
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
254 good_TE <- TE[!sapply(TE, is.null)]
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
255 cat('Number of putative TE with identified LTR :', length(good_TE), '\n')
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
256 break
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
257 }
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
258
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
259 if (length(good_TE)>0){ # handle empty list
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
260 ID <- paste0('TE_', sprintf("%08d", seq(good_TE)))
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
261 gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu)
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
262
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
263 cat('Identification of PBS ...')
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
264 gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
265 cat('done\n')
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
266 gff3_out <- do.call(c, gff3_list2)
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
267
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
268 # define new source
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
269 src <- as.character(gff3_out$source)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
270 src[is.na(src)] <- "dante_ltr"
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
271 gff3_out$source <- src
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
272 gff3_out$Rank <- get_te_rank(gff3_out)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
273 # add partial TEs but first remove all ovelaping
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
274 TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out]
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
275 TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID]
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
276 gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
277 }else{
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
278 # but this could be a problem if there are no TEs in the sequence
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
279 if (length(TE_partial_with_more_than_one_domain)>0){
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
280 TE_partial_parent_part <- TE_partial_with_more_than_one_domain
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
281 TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID]
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
282 gff3_out <- sort(c(TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
283 }else{
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
284 gff3_out <- NULL
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
285 }
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
286 }
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
287
13
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
288 if (is.null(gff3_out)){
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
289 cat('No TEs found.\n')
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
290 }else{
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
291 # modify ID and Parent - add seqname - this will ensure it is unique is done on chunk level
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
292 gff3_out$ID[!is.na(gff3_out$ID)] <- paste0(gff3_out$ID[!is.na(gff3_out$ID)], "_", seqnames(gff3_out)[!is.na(gff3_out$ID)])
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
293 gff3_out$Parent[!is.na(gff3_out$Parent)] <- paste0(gff3_out$Parent[!is.na(gff3_out$Parent)], "_", seqnames(gff3_out)[!is.na(gff3_out$Parent)])
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
294 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3')
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
295 all_tbl <- get_te_statistics(gff3_out, RT)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
296 all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
297 write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
298 # export fasta files:
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
299 s_te <- get_te_sequences(gff3_out, s)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
300 for (i in seq_along(s_te)) {
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
301 outname <- paste0(outfile, "_", names(s_te)[i], ".fasta")
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
302 writeXStringSet(s_te[[i]], filepath = outname)
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
303 }
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
304 }
559940c04c44 "planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents: 12
diff changeset
305