annotate extract_putative_ltr.R @ 0:7b0bbe7477c4 draft

"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
author petr-novak
date Tue, 08 Mar 2022 13:24:33 +0000
parents
children f131886ea194
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
1 #!/usr/bin/env Rscript
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
2 initial_options <- commandArgs(trailingOnly = FALSE)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
3 file_arg_name <- "--file="
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
4 script_name <- normalizePath(sub(file_arg_name, "", initial_options[grep(file_arg_name, initial_options)]))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
5 script_dir <- dirname(script_name)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
6 library(optparse)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
7
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
8 parser <- OptionParser()
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
9 option_list <- list(
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
10 make_option(c("-g", "--gff3"), action = "store", type = "character",
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
11 help = "gff3 with dante results", default = NULL),
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
12 make_option(c("-s", "--reference_sequence"), action = "store", type = "character",
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
13 help = "reference sequence as fasta", default = NULL),
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
14 make_option(c("-o", "--output"), action = "store", type = "character",
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
15 help = "output file path and prefix", default = NULL),
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
16 make_option(c("-c", "--cpu"), type = "integer", default = 5,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
17 help = "Number of cpu to use [default %default]", metavar = "number")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
18
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
19 )
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
20 description <- paste(strwrap(""))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
21
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
22 epilogue <- ""
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
23 parser <- OptionParser(option_list = option_list, epilogue = epilogue, description = description,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
24 usage = "usage: %prog COMMAND [OPTIONS]")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
25 opt <- parse_args(parser, args = commandArgs(TRUE))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
26
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
27
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
28 # load packages
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
29 suppressPackageStartupMessages({
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
30 library(rtracklayer)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
31 library(Biostrings)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
32 library(BSgenome)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
33 library(parallel)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
34 })
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
35
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
36
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
37 # CONFIGURATION
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
38 OFFSET <- 15000
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
39 # load configuration files and functions:
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
40 lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
41 trna_db <- paste0(script_dir, "/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
42 ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
43 if (file.exists(lineage_file) & file.exists(trna_db)) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
44 lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
45 source(ltr_utils_r)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
46 }else {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
47 lineage_file <- paste0(script_dir, "/../share/dante_ltr/databases/lineage_domain_order.csv")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
48 trna_db <- paste0(script_dir, "/../share/dante_ltr/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
49 ltr_utils_r <- paste0(script_dir, "/../share/dante_ltr/R/ltr_utils.R")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
50 if (file.exists(lineage_file) & file.exists((trna_db))) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
51 lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
52 source(ltr_utils_r)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
53 }else(
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
54 stop("configuration files not found")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
55 )
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
56 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
57
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
58
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
59 # for testing
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
60 if (FALSE) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
61 g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
62 s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
63 lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
64
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
65 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
66 /DANTE_filtered_part.gff3")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
67 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
68 /Rbp_part.fa")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
69
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
70 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
71 /DANTE_Vfaba_chr5.gff3")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
72 s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
73 /fasta_parts/211010_Vfaba_chr5.fasta")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
74
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
75 g <- rtracklayer::import("./test_data/sample_DANTE.gff3")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
76 s <- readDNAStringSet("./test_data/sample_genome.fasta")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
77 outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3"
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
78 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header =
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
79 TRUE, as.is = TRUE)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
80 trna_db <- "./databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta"
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
81
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
82 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
83
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
84 # MAIN #############################################################
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
85
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
86 # load data:
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
87
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
88 cat("reading gff...")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
89 g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
90 cat("done\n")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
91 cat("reading fasta...")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
92 s <- readDNAStringSet(opt$reference_sequence) # genome assembly
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
93 cat("done\n")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
94 outfile <- opt$output
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
95 # clean sequence names:
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
96 names(s) <- gsub(" .+", "", names(s))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
97 lineage_domain <- lineage_info$Domains.order
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
98 lineage_offset5prime <- lineage_info$offset5prime
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
99 lineage_offset3prime <- lineage_info$offset3prime
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
100 ln <- gsub("ss/I", "ss_I", gsub("_", "/", gsub("/", "|", lineage_info$Lineage)))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
101 names(lineage_offset3prime) <- ln
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
102 names(lineage_offset5prime) <- ln
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
103 names(lineage_domain) <- ln
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
104
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
105
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
106 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
107 g <- add_coordinates_of_closest_neighbor(g)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
108
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
109 gcl <- get_domain_clusters(g)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
110 gcl_clean <- clean_domain_clusters(gcl)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
111 # glc annotation
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
112 gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1])
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
113 gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-",
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
114 paste(rev(x$Name), collapse = " "),
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
115 paste(x$Name, collapse = " "))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
116 )
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
117
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
118 # get lineages which has correct number and order of domains
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
119 gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
120 gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
121 gr <- get_ranges(gcl_clean_with_domains)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
122
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
123
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
124 cat('Number of analyzed regions:\n')
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
125 cat('Total number of domain clusters : ', length(gcl), '\n')
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
126 cat('Number of clean clusters : ', length(gcl_clean), '\n')
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
127 cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n')
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
128
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
129
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
130 te_strand <- sapply(gcl_clean_with_domains, function(x)x$strand[1])
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
131 te_lineage <- sapply(gcl_clean_with_domains, function(x)x$Final_Classification[1])
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
132
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
133 max_left_offset <- ifelse(te_strand == "+", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
134 max_right_offset <- ifelse(te_strand == "-", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
135
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
136 grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
137 grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
138
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
139 s_left <- getSeq(s, grL)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
140 s_right <- getSeq(s, grR)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
141
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
142 # for statistics
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
143 RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
144 # cleanup
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
145 rm(g)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
146 rm(gcl)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
147 rm(gcl_clean)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
148 rm(gcl_clean2)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
149
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
150 names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
151 names(s_left) <- paste(seqnames(grL), start(grL), end(grL), sep = "_")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
152 names(s_right) <- paste(seqnames(grR), start(grR), end(grR), sep = "_")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
153 cat('Identification of LTRs...')
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
154 TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x],
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
155 s_right[x],
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
156 gcl_clean_with_domains[[x]],
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
157 gr[x],
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
158 grL[x],
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
159 grR[x]),
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
160 mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
161 )
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
162 cat('done.\n')
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
163
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
164 good_TE <- TE[!sapply(TE, is.null)]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
165 cat('Number of putative TE with identified LTR :', length(good_TE), '\n')
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
166
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
167
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
168 ID <- paste0('TE_', sprintf("%08d", seq(good_TE)))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
169 gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
170
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
171 cat('Identification of PBS ...')
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
172 gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
173 cat('done\n')
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
174 gff3_out <- do.call(c, gff3_list2)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
175
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
176 # define new source
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
177 src <- as.character(gff3_out$source)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
178 src[is.na(src)] <- "dante_ltr"
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
179 gff3_out$source <- src
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
180
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
181 # TODO export all files to single directory
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
182 # TODO export individual groups DL, DLT, DLP DLPT gff3
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
183
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
184 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3')
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
185 # summary statistics
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
186 all_tbl <- get_te_statistics(gff3_out, RT)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
187 write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
188 # export fasta files:
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
189 s_te <- get_te_sequences(gff3_out, s)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
190
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
191 for (i in seq_along(s_te)) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
192 outname <- paste0(outfile, "_", names(s_te)[i], ".fasta")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
193 writeXStringSet(s_te[[i]], filepath = outname)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
194 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
195