Mercurial > repos > petr-novak > dante_ltr
annotate extract_putative_ltr.R @ 5:0c3111ab729b draft
"planemo upload commit 5a81f1734493a3acf5ff24a573fa2887d92b58a8"
author | petr-novak |
---|---|
date | Mon, 16 May 2022 07:50:41 +0000 |
parents | f131886ea194 |
children | c33d6583e548 |
rev | line source |
---|---|
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
1 #!/usr/bin/env Rscript |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
2 initial_options <- commandArgs(trailingOnly = FALSE) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
3 file_arg_name <- "--file=" |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
4 script_name <- normalizePath(sub(file_arg_name, "", initial_options[grep(file_arg_name, initial_options)])) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
5 script_dir <- dirname(script_name) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
6 library(optparse) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
7 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
8 parser <- OptionParser() |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
9 option_list <- list( |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
10 make_option(c("-g", "--gff3"), action = "store", type = "character", |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
11 help = "gff3 with dante results", default = NULL), |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
12 make_option(c("-s", "--reference_sequence"), action = "store", type = "character", |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
13 help = "reference sequence as fasta", default = NULL), |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
14 make_option(c("-o", "--output"), action = "store", type = "character", |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
15 help = "output file path and prefix", default = NULL), |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
16 make_option(c("-c", "--cpu"), type = "integer", default = 5, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
17 help = "Number of cpu to use [default %default]", metavar = "number") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
18 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
19 ) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
20 description <- paste(strwrap("")) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
21 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
22 epilogue <- "" |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
23 parser <- OptionParser(option_list = option_list, epilogue = epilogue, description = description, |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
24 usage = "usage: %prog COMMAND [OPTIONS]") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
25 opt <- parse_args(parser, args = commandArgs(TRUE)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
26 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
27 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
28 # load packages |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
29 suppressPackageStartupMessages({ |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
30 library(rtracklayer) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
31 library(Biostrings) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
32 library(BSgenome) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
33 library(parallel) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
34 }) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
35 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
36 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
37 # CONFIGURATION |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
38 OFFSET <- 15000 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
39 # load configuration files and functions: |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
40 lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
41 trna_db <- paste0(script_dir, "/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
42 ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
43 if (file.exists(lineage_file) & file.exists(trna_db)) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
44 lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
45 source(ltr_utils_r) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
46 }else { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
47 lineage_file <- paste0(script_dir, "/../share/dante_ltr/databases/lineage_domain_order.csv") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
48 trna_db <- paste0(script_dir, "/../share/dante_ltr/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
49 ltr_utils_r <- paste0(script_dir, "/../share/dante_ltr/R/ltr_utils.R") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
50 if (file.exists(lineage_file) & file.exists((trna_db))) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
51 lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
52 source(ltr_utils_r) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
53 }else( |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
54 stop("configuration files not found") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
55 ) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
56 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
57 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
58 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
59 # for testing |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
60 if (FALSE) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
61 g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
62 s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
63 lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
64 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
65 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
66 /DANTE_filtered_part.gff3") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
67 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
68 /Rbp_part.fa") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
69 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
70 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
71 /DANTE_Vfaba_chr5.gff3") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
72 s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
73 /fasta_parts/211010_Vfaba_chr5.fasta") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
74 |
2
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
75 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data//Cocoa_theobroma_DANTE_filtered.gff3") |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
76 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/Cocoa_theobroma_chr1.fasta.gz") |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
77 |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
78 source("R/ltr_utils.R") |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
79 |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
80 g <- rtracklayer::import("./test_data/sample_DANTE.gff3") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
81 s <- readDNAStringSet("./test_data/sample_genome.fasta") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
82 outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3" |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
83 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header = |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
84 TRUE, as.is = TRUE) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
85 trna_db <- "./databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta" |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
86 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
87 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
88 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
89 # MAIN ############################################################# |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
90 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
91 # load data: |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
92 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
93 cat("reading gff...") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
94 g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
95 cat("done\n") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
96 cat("reading fasta...") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
97 s <- readDNAStringSet(opt$reference_sequence) # genome assembly |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
98 cat("done\n") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
99 outfile <- opt$output |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
100 # clean sequence names: |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
101 names(s) <- gsub(" .+", "", names(s)) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
102 lineage_domain <- lineage_info$Domains.order |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
103 lineage_offset5prime <- lineage_info$offset5prime |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
104 lineage_offset3prime <- lineage_info$offset3prime |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
105 ln <- gsub("ss/I", "ss_I", gsub("_", "/", gsub("/", "|", lineage_info$Lineage))) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
106 names(lineage_offset3prime) <- ln |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
107 names(lineage_offset5prime) <- ln |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
108 names(lineage_domain) <- ln |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
109 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
110 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
111 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
112 g <- add_coordinates_of_closest_neighbor(g) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
113 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
114 gcl <- get_domain_clusters(g) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
115 gcl_clean <- clean_domain_clusters(gcl) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
116 # glc annotation |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
117 gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1]) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
118 gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-", |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
119 paste(rev(x$Name), collapse = " "), |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
120 paste(x$Name, collapse = " ")) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
121 ) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
122 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
123 # get lineages which has correct number and order of domains |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
124 gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
125 gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
126 gr <- get_ranges(gcl_clean_with_domains) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
127 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
128 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
129 cat('Number of analyzed regions:\n') |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
130 cat('Total number of domain clusters : ', length(gcl), '\n') |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
131 cat('Number of clean clusters : ', length(gcl_clean), '\n') |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
132 cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n') |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
133 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
134 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
135 te_strand <- sapply(gcl_clean_with_domains, function(x)x$strand[1]) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
136 te_lineage <- sapply(gcl_clean_with_domains, function(x)x$Final_Classification[1]) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
137 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
138 max_left_offset <- ifelse(te_strand == "+", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage]) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
139 max_right_offset <- ifelse(te_strand == "-", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage]) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
140 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
141 grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
142 grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
143 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
144 s_left <- getSeq(s, grL) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
145 s_right <- getSeq(s, grR) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
146 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
147 # for statistics |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
148 RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
149 # cleanup |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
150 rm(g) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
151 rm(gcl) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
152 rm(gcl_clean) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
153 rm(gcl_clean2) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
154 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
155 names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
156 names(s_left) <- paste(seqnames(grL), start(grL), end(grL), sep = "_") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
157 names(s_right) <- paste(seqnames(grR), start(grR), end(grR), sep = "_") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
158 cat('Identification of LTRs...') |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
159 TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x], |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
160 s_right[x], |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
161 gcl_clean_with_domains[[x]], |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
162 gr[x], |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
163 grL[x], |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
164 grR[x]), |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
165 mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
166 ) |
2
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
167 |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
168 cat('done.\n') |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
169 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
170 good_TE <- TE[!sapply(TE, is.null)] |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
171 cat('Number of putative TE with identified LTR :', length(good_TE), '\n') |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
172 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
173 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
174 ID <- paste0('TE_', sprintf("%08d", seq(good_TE))) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
175 gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
176 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
177 cat('Identification of PBS ...') |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
178 gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
179 cat('done\n') |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
180 gff3_out <- do.call(c, gff3_list2) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
181 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
182 # define new source |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
183 src <- as.character(gff3_out$source) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
184 src[is.na(src)] <- "dante_ltr" |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
185 gff3_out$source <- src |
2
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
186 gff3_out$Rank <- get_te_rank(gff3_out) |
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
187 # TODO add attributte specifying individual groups DL, DLT, DLP DLPT gff3 |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
188 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
189 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3') |
2
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
190 |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
191 all_tbl <- get_te_statistics(gff3_out, RT) |
2
f131886ea194
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents:
0
diff
changeset
|
192 write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = TRUE) |
0
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
193 # export fasta files: |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
194 s_te <- get_te_sequences(gff3_out, s) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
195 |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
196 for (i in seq_along(s_te)) { |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
197 outname <- paste0(outfile, "_", names(s_te)[i], ".fasta") |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
198 writeXStringSet(s_te[[i]], filepath = outname) |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
199 } |
7b0bbe7477c4
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff
changeset
|
200 |