Mercurial > repos > petr-novak > dante_ltr
annotate detect_putative_ltr.R @ 13:559940c04c44 draft
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
author | petr-novak |
---|---|
date | Thu, 11 Aug 2022 07:29:06 +0000 |
parents | ff01d4263391 |
children |
rev | line source |
---|---|
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
1 #!/usr/bin/env Rscript |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
2 initial_options <- commandArgs(trailingOnly = FALSE) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
3 file_arg_name <- "--file=" |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
4 script_name <- normalizePath(sub(file_arg_name, "", initial_options[grep(file_arg_name, initial_options)])) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
5 script_dir <- dirname(script_name) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
6 library(optparse) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
7 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
8 parser <- OptionParser() |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
9 option_list <- list( |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
10 make_option(c("-g", "--gff3"), action = "store", type = "character", |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
11 help = "gff3 with dante results", default = NULL), |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
12 make_option(c("-s", "--reference_sequence"), action = "store", type = "character", |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
13 help = "reference sequence as fasta", default = NULL), |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
14 make_option(c("-o", "--output"), action = "store", type = "character", |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
15 help = "output file path and prefix", default = NULL), |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
16 make_option(c("-c", "--cpu"), type = "integer", default = 5, |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
17 help = "Number of cpu to use [default %default]", metavar = "number"), |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
18 make_option(c("-M", "--max_missing_domains"), type = "integer", default = 0, |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
19 help = "Maximum number of missing domains is retrotransposon [default %default]", |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
20 metavar = "number"), |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
21 make_option(c("-L", "--min_relative_length"), type = "numeric", default = 0.6, |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
22 help = "Minimum relative length of protein domain to be considered for retrostransposon detection [default %default]", |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
23 metavar = "number") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
24 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
25 ) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
26 description <- paste(strwrap("")) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
27 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
28 epilogue <- "" |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
29 parser <- OptionParser(option_list = option_list, epilogue = epilogue, description = description, |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
30 usage = "usage: %prog COMMAND [OPTIONS]") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
31 opt <- parse_args(parser, args = commandArgs(TRUE)) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
32 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
33 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
34 # load packages |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
35 suppressPackageStartupMessages({ |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
36 library(rtracklayer) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
37 library(Biostrings) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
38 library(BSgenome) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
39 library(parallel) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
40 }) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
41 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
42 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
43 # CONFIGURATION |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
44 OFFSET <- 15000 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
45 # load configuration files and functions: |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
46 lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
47 FDM_file <- paste0(script_dir, "/databases/feature_distances_model.RDS") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
48 trna_db <- paste0(script_dir, "/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
49 ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
50 if (file.exists(lineage_file) & file.exists(trna_db)) { |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
51 lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
52 FDM <- readRDS(FDM_file) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
53 source(ltr_utils_r) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
54 }else { |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
55 # this destination work is installed using conda |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
56 lineage_file <- paste0(script_dir, "/../share/dante_ltr/databases/lineage_domain_order.csv") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
57 FDM_file <- paste0(script_dir, "/../share/dante_ltr/databases/feature_distances_model.RDS") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
58 trna_db <- paste0(script_dir, "/../share/dante_ltr/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
59 ltr_utils_r <- paste0(script_dir, "/../share/dante_ltr/R/ltr_utils.R") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
60 if (file.exists(lineage_file) & file.exists((trna_db))) { |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
61 lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
62 source(ltr_utils_r) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
63 FDM <- readRDS(FDM_file) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
64 }else( |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
65 stop("configuration files not found") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
66 ) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
67 } |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
68 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
69 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
70 # for testing |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
71 if (FALSE) { |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
72 g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
73 s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
74 lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
75 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
76 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/sample_DANTE_unfiltered.gff3") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
77 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/DANTE_filtered_part.gff3") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
78 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/Rbp_part.fa") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
79 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
80 # oriza |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
81 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/DANTE_full_oryza.gff3") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
82 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/o_sativa_msu7.0.fasta") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
83 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
84 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
85 /DANTE_Vfaba_chr5.gff3") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
86 s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
87 /fasta_parts/211010_Vfaba_chr5.fasta") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
88 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
89 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data//Cocoa_theobroma_DANTE_filtered.gff3") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
90 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/Cocoa_theobroma_chr1.fasta.gz") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
91 # test on bigger data: |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
92 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
93 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/tmp/DANTE_unfiltered/1.gff3") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
94 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/tmp/fasta_parts/1.fasta") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
95 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
96 source("R/ltr_utils.R") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
97 ## feature distance model |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
98 FDM <- readRDS("./databases/feature_distances_model.RDS") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
99 g <- rtracklayer::import("./test_data/sample_DANTE.gff3") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
100 s <- readDNAStringSet("./test_data/sample_genome.fasta") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
101 outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3" |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
102 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header = |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
103 TRUE, as.is = TRUE) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
104 trna_db <- "./databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta" |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
105 opt <- list(min_relative_length=0.6, cpu = 8) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
106 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
107 } |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
108 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
109 # MAIN ############################################################# |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
110 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
111 # load data: |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
112 cat("reading gff...") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
113 g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
114 cat("done\n") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
115 cat("reading fasta...") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
116 s <- readDNAStringSet(opt$reference_sequence) # genome assembly |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
117 cat("done\n") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
118 outfile <- opt$output |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
119 # clean sequence names: |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
120 names(s) <- gsub(" .+", "", names(s)) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
121 lineage_domain <- lineage_info$Domains.order |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
122 lineage_domain_span <- lineage_info$domain_span |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
123 lineage_ltr_mean_length <- lineage_info$ltr_length |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
124 lineage_offset5prime <- lineage_info$offset5prime |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
125 lineage_offset3prime <- lineage_info$offset3prime |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
126 ln <- gsub("ss/I", "ss_I", gsub("_", "/", gsub("/", "|", lineage_info$Lineage))) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
127 names(lineage_offset3prime) <- ln |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
128 names(lineage_offset5prime) <- ln |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
129 names(lineage_domain) <- ln |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
130 names(lineage_domain_span) <- ln |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
131 names(lineage_ltr_mean_length) <- ln |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
132 lineage_domains_sequence <- unlist(mapply(function(d,l) { |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
133 paste(strsplit(d, " ")[[1]], ":", l, sep = "") |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
134 }, d = lineage_domain, l = names(lineage_domain))) |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
135 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
136 # this repeat block is run just once |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
137 # it can breaks in eny point if zero TE is found |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
138 repeat{ |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
139 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
140 # filter g gff3 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
141 g <- dante_filtering(g, Relative_Length = opt$min_relative_length) # default |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
142 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
143 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))] |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
144 g <- add_coordinates_of_closest_neighbor(g) |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
145 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
146 # add info about domain order: |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
147 g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence)) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
148 # set NA to 0 in g$domain_order ( some domains are not fromm ClassI elements |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
149 g$domain_order[is.na(g$domain_order)] <- 0 |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
150 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
151 # NOTE - some operation is faster of GrangesList but some on list of data.frames |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
152 # this is primary clusteing |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
153 cls <- get_domain_clusters(g) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
154 gcl <- split(as.data.frame(g), cls) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
155 # gcl_as_GRL <- split(g, cls) # delete? |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
156 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
157 cls_alt <- get_domain_clusters_alt(g, FDM) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
158 g$Cluster <- as.numeric(factor(cls_alt)) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
159 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
160 gcl_alt <- split(as.data.frame(g), cls_alt) |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
161 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
162 TE_partial <- GRanges(seqnames = sapply(gcl_alt, function(x) x$seqnames[1]), |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
163 Name = sapply(gcl_alt, function(x) x$Final_Classification[1]), |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
164 Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]), |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
165 ID = sapply(gcl_alt, function(x) paste0("TE_partial_", sprintf("%08d", x$Cluster[1]))), |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
166 strand = sapply(gcl_alt, function(x) x$strand[1]), |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
167 Ndomains = sapply(gcl_alt, function(x) nrow(x)), |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
168 type = "transposable_element", |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
169 source = "dante_ltr", |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
170 Rank="D", |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
171 IRanges(start = sapply(gcl_alt, function(x) min(x$start)), |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
172 end = sapply(gcl_alt, function(x) max(x$end))) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
173 ) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
174 g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
175 g$Parent <- paste0("TE_partial_", sprintf("%08d", g$Cluster)) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
176 g$Rank <- "D" |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
177 # for statistics |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
178 RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"] |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
179 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
180 # keep only partial TE with more than one domain |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
181 TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1] |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
182 g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)] |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
183 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
184 # first filtering - remove TEs with low number of domains |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
185 gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains) |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
186 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
187 # glc annotation |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
188 gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1]) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
189 gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-", |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
190 paste(rev(x$Name), collapse = " "), |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
191 paste(x$Name, collapse = " ")) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
192 ) |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
193 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
194 # compare detected domains with domains in lineages from REXdb database |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
195 dd <- mapply(domain_distance, |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
196 d_query = gcl_clean_domains, |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
197 d_reference = lineage_domain[gcl_clean_lineage]) |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
198 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
199 # get lineages which has correct number and order of domains |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
200 # gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]] |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
201 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
202 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
203 gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains] |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
204 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
205 if(length(gcl_clean2) == 0) { |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
206 cat("No complete TE found\n") |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
207 good_TE <- list() |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
208 break |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
209 } |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
210 gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)] |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
211 gr <- get_ranges(gcl_clean_with_domains) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
212 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
213 cat('Number of analyzed regions:\n') |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
214 cat('Total number of domain clusters : ', length(gcl), '\n') |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
215 cat('Number of clean clusters : ', length(gcl_clean), '\n') |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
216 cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n') |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
217 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
218 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
219 te_strand <- sapply(gcl_clean_with_domains, function(x)x$strand[1]) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
220 te_lineage <- sapply(gcl_clean_with_domains, function(x)x$Final_Classification[1]) |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
221 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
222 max_left_offset <- ifelse(te_strand == "+", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage]) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
223 max_right_offset <- ifelse(te_strand == "-", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage]) |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
224 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
225 grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
226 grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset) |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
227 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
228 s_left <- getSeq(s, grL) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
229 s_right <- getSeq(s, grR) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
230 |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
231 expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function (x)x$Final_Classification[1])] |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
232 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
233 # cleanup |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
234 #rm(g) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
235 rm(gcl) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
236 rm(gcl_clean) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
237 rm(gcl_clean2) |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
238 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
239 names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_") |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
240 names(s_left) <- paste(seqnames(grL), start(grL), end(grL), sep = "_") |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
241 names(s_right) <- paste(seqnames(grR), start(grR), end(grR), sep = "_") |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
242 cat('Identification of LTRs...') |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
243 TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x], |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
244 s_right[x], |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
245 gcl_clean_with_domains[[x]], |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
246 gr[x], |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
247 grL[x], |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
248 grR[x], |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
249 expected_ltr_length[x]), |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
250 mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
251 ) |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
252 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
253 cat('done.\n') |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
254 good_TE <- TE[!sapply(TE, is.null)] |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
255 cat('Number of putative TE with identified LTR :', length(good_TE), '\n') |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
256 break |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
257 } |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
258 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
259 if (length(good_TE)>0){ # handle empty list |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
260 ID <- paste0('TE_', sprintf("%08d", seq(good_TE))) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
261 gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu) |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
262 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
263 cat('Identification of PBS ...') |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
264 gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
265 cat('done\n') |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
266 gff3_out <- do.call(c, gff3_list2) |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
267 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
268 # define new source |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
269 src <- as.character(gff3_out$source) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
270 src[is.na(src)] <- "dante_ltr" |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
271 gff3_out$source <- src |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
272 gff3_out$Rank <- get_te_rank(gff3_out) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
273 # add partial TEs but first remove all ovelaping |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
274 TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out] |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
275 TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID] |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
276 gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
277 }else{ |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
278 # but this could be a problem if there are no TEs in the sequence |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
279 if (length(TE_partial_with_more_than_one_domain)>0){ |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
280 TE_partial_parent_part <- TE_partial_with_more_than_one_domain |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
281 TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID] |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
282 gff3_out <- sort(c(TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
283 }else{ |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
284 gff3_out <- NULL |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
285 } |
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
286 } |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
287 |
13
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
288 if (is.null(gff3_out)){ |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
289 cat('No TEs found.\n') |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
290 }else{ |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
291 # modify ID and Parent - add seqname - this will ensure it is unique is done on chunk level |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
292 gff3_out$ID[!is.na(gff3_out$ID)] <- paste0(gff3_out$ID[!is.na(gff3_out$ID)], "_", seqnames(gff3_out)[!is.na(gff3_out$ID)]) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
293 gff3_out$Parent[!is.na(gff3_out$Parent)] <- paste0(gff3_out$Parent[!is.na(gff3_out$Parent)], "_", seqnames(gff3_out)[!is.na(gff3_out$Parent)]) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
294 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3') |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
295 all_tbl <- get_te_statistics(gff3_out, RT) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
296 all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
297 write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
298 # export fasta files: |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
299 s_te <- get_te_sequences(gff3_out, s) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
300 for (i in seq_along(s_te)) { |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
301 outname <- paste0(outfile, "_", names(s_te)[i], ".fasta") |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
302 writeXStringSet(s_te[[i]], filepath = outname) |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
303 } |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
304 } |
559940c04c44
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
petr-novak
parents:
12
diff
changeset
|
305 |