Mercurial > repos > petr-novak > dante_ltr
comparison extract_putative_ltr.R @ 0:7b0bbe7477c4 draft
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
author | petr-novak |
---|---|
date | Tue, 08 Mar 2022 13:24:33 +0000 |
parents | |
children | f131886ea194 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7b0bbe7477c4 |
---|---|
1 #!/usr/bin/env Rscript | |
# Locate this script on disk so that bundled databases and helper code can be
# found relative to it. Rscript passes the script path as a "--file=<path>"
# argument; the pattern is anchored and only the first hit is used so that an
# unrelated argument containing "--file=" cannot turn script_dir into a vector.
initial_options <- commandArgs(trailingOnly = FALSE)
file_arg_name <- "--file="
script_arg <- grep(paste0("^", file_arg_name), initial_options, value = TRUE)[1]
script_name <- normalizePath(sub(file_arg_name, "", script_arg, fixed = TRUE))
script_dir <- dirname(script_name)
library(optparse)

# Command line interface. The three path options have no default and are
# validated after parsing.
option_list <- list(
  make_option(c("-g", "--gff3"), action = "store", type = "character",
              help = "gff3 with dante results", default = NULL),
  make_option(c("-s", "--reference_sequence"), action = "store", type = "character",
              help = "reference sequence as fasta", default = NULL),
  make_option(c("-o", "--output"), action = "store", type = "character",
              help = "output file path and prefix", default = NULL),
  make_option(c("-c", "--cpu"), type = "integer", default = 5,
              help = "Number of cpu to use [default %default]", metavar = "number")
)
description <- ""
epilogue <- ""
# The script takes options only; there is no positional COMMAND argument.
parser <- OptionParser(option_list = option_list, epilogue = epilogue,
                       description = description,
                       usage = "usage: %prog [OPTIONS]")
opt <- parse_args(parser, args = commandArgs(TRUE))

# Fail early, with the usage text, when a mandatory option is absent instead
# of letting a later import call fail on a NULL path.
required_options <- c("gff3", "reference_sequence", "output")
is_missing <- vapply(required_options,
                     function(name) is.null(opt[[name]]),
                     logical(1))
if (any(is_missing)) {
  print_help(parser)
  stop("missing required option(s): ",
       paste0("--", required_options[is_missing], collapse = ", "),
       call. = FALSE)
}
26 | |
27 | |
28 # load packages | |
29 suppressPackageStartupMessages({ | |
30 library(rtracklayer) | |
31 library(Biostrings) | |
32 library(BSgenome) | |
33 library(parallel) | |
34 }) | |
35 | |
36 | |
# CONFIGURATION
OFFSET <- 15000
# Load configuration files and helper functions.
# Two layouts are tried in order: files placed next to the script, then
# <script_dir>/../share/dante_ltr. The first root that provides the lineage
# table, the tRNA database and ltr_utils.R is used.
config_roots <- c(script_dir, file.path(script_dir, "..", "share", "dante_ltr"))
config_found <- FALSE
for (config_root in config_roots) {
  lineage_file <- file.path(config_root, "databases", "lineage_domain_order.csv")
  trna_db <- file.path(config_root, "databases",
                       "tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
  ltr_utils_r <- file.path(config_root, "R", "ltr_utils.R")
  # ltr_utils.R is checked as well, so an incomplete first location falls
  # through to the second one instead of failing inside source().
  if (all(file.exists(lineage_file, trna_db, ltr_utils_r))) {
    config_found <- TRUE
    break
  }
}
if (!config_found) {
  stop("configuration files not found")
}
lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
source(ltr_utils_r)
57 | |
58 | |
# for testing
# Developer scratch block: it is guarded by FALSE and never runs as part of the
# script. Each g/s pair is an alternative test data set meant to be evaluated
# by hand. Paths are kept on a single line each, because a string literal
# split across lines would contain a newline and not name a real file.
if (FALSE) {
  g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3")
  s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa")
  lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE)

  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/DANTE_filtered_part.gff3")
  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/Rbp_part.fa")

  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/DANTE_Vfaba_chr5.gff3")
  s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910/fasta_parts/211010_Vfaba_chr5.fasta")

  g <- rtracklayer::import("./test_data/sample_DANTE.gff3")
  s <- readDNAStringSet("./test_data/sample_genome.fasta")
  outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3"
  lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t",
                             header = TRUE, as.is = TRUE)
  trna_db <- "./databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta"
}
83 | |
# MAIN #############################################################

# Read the inputs named on the command line.
outfile <- opt$output

cat("reading gff...")
g <- rtracklayer::import(opt$gff3, format = "gff3") # DANTE gff3
cat("done\n")

cat("reading fasta...")
s <- readDNAStringSet(opt$reference_sequence) # genome assembly
cat("done\n")

# Keep only the part of each fasta header before the first space.
names(s) <- gsub(" .+", "", names(s))

# Rewrite the lineage labels of the configuration table in three steps
# ("/" -> "|", then "_" -> "/", then restore "ss_I") and use the result to
# name the per-lineage lookup vectors.
ln <- lineage_info$Lineage
ln <- gsub("/", "|", ln)
ln <- gsub("_", "/", ln)
ln <- gsub("ss/I", "ss_I", ln)

lineage_domain <- setNames(lineage_info$Domains.order, ln)
lineage_offset5prime <- setNames(lineage_info$offset5prime, ln)
lineage_offset3prime <- setNames(lineage_info$offset3prime, ln)
104 | |
105 | |
# Copy sequence lengths from the assembly onto the annotation, then let the
# helper from ltr_utils.R add the coordinates of neighbouring features.
seqlengths(g) <- seqlengths(s)[names(seqlengths(g))]
g <- add_coordinates_of_closest_neighbor(g)

gcl <- get_domain_clusters(g)
gcl_clean <- clean_domain_clusters(gcl)

# gcl annotation: lineage of each cluster and its domain names as a single
# space-separated string. Domains of minus-strand clusters are reversed.
# vapply() keeps both results character vectors even when there are no
# clusters; as.character() guards against factor columns.
gcl_clean_lineage <- vapply(
  gcl_clean,
  function(x) as.character(x$Final_Classification[1]),
  character(1)
)
gcl_clean_domains <- vapply(
  gcl_clean,
  function(x) {
    on_minus <- isTRUE(as.character(x$strand[1]) == "-")
    domains <- if (on_minus) rev(x$Name) else x$Name
    paste(domains, collapse = " ")
  },
  character(1)
)

# Keep the clusters whose domains match, in number and order, the domains
# expected for their lineage. A lineage missing from the lineage table gives
# NA in the comparison; which() drops those, because indexing a list with NA
# would insert NULL elements.
has_expected_domains <- gcl_clean_domains == lineage_domain[gcl_clean_lineage]
gcl_clean2 <- gcl_clean[unname(which(has_expected_domains))]
gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)]
gr <- get_ranges(gcl_clean_with_domains)
122 | |
123 | |
# Report how many clusters survived each filtering step.
cat('Number of analyzed regions:\n')
cat('Total number of domain clusters : ', length(gcl), '\n')
cat('Number of clean clusters : ', length(gcl_clean), '\n')
cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n')


# Strand and lineage of every retained cluster, taken from its first row.
te_strand <- sapply(gcl_clean_with_domains, function(cluster) cluster$strand[1])
te_lineage <- sapply(gcl_clean_with_domains, function(cluster) cluster$Final_Classification[1])

# The 5' offset applies to the left side on the plus strand and to the right
# side on the minus strand; the 3' offset is used otherwise.
on_plus <- te_strand == "+"
on_minus <- te_strand == "-"
max_left_offset <- ifelse(on_plus,
                          lineage_offset5prime[te_lineage],
                          lineage_offset3prime[te_lineage])
max_right_offset <- ifelse(on_minus,
                           lineage_offset5prime[te_lineage],
                           lineage_offset3prime[te_lineage])

# Ranges to the left and right of each cluster, and their sequences.
grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset)
grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset)

s_left <- getSeq(s, grL)
s_right <- getSeq(s, grR)
141 | |
# For the summary statistics: RT domains whose classification starts with
# "Class_I|LTR".
is_rt_domain <- g$Name == "RT"
is_ltr_class <- substring(g$Final_Classification, 1, 11) == "Class_I|LTR"
RT <- g[is_rt_domain & is_ltr_class]

# Drop the large intermediate objects that are no longer needed.
rm(g, gcl, gcl_clean, gcl_clean2)

# Label elements and flanking sequences as <seqname>_<start>_<end>.
label_ranges <- function(ranges) {
  paste(seqnames(ranges), start(ranges), end(ranges), sep = "_")
}
names(te_strand) <- label_ranges(gr)
names(s_left) <- label_ranges(grL)
names(s_right) <- label_ranges(grR)
# Run get_TE() on every candidate region in parallel; a NULL result is treated
# as "no LTR identified" and is filtered out afterwards.
cat('Identification of LTRs...')
TE <- mclapply(
  seq_along(gr),
  function(x) get_TE(s_left[x],
                     s_right[x],
                     gcl_clean_with_domains[[x]],
                     gr[x],
                     grL[x],
                     grR[x]),
  mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE
)
cat('done.\n')

# mclapply() returns a "try-error" object for a worker that failed. Stop here
# with the original message instead of passing it on as if it were a TE.
failed <- vapply(TE, inherits, logical(1), what = "try-error")
if (any(failed)) {
  stop("LTR identification failed for ", sum(failed),
       " region(s); first error: ", as.character(TE[[which(failed)[1]]]),
       call. = FALSE)
}

# vapply() returns logical(0) for an empty TE list, whereas sapply() returns
# list(), which cannot be negated.
good_TE <- TE[!vapply(TE, is.null, logical(1))]
cat('Number of putative TE with identified LTR :', length(good_TE), '\n')
if (length(good_TE) == 0) {
  stop("no putative TE with identified LTR found, nothing to export",
       call. = FALSE)
}
166 | |
167 | |
# Sequential element identifiers: TE_00000001, TE_00000002, ...
# Building the whole ID inside sprintf() over seq_along() gives character(0)
# for an empty input, where paste0('TE_', character(0)) would give "TE_".
ID <- sprintf("TE_%08d", seq_along(good_TE))
# SIMPLIFY = FALSE guarantees one list element per TE; the default could
# collapse the results into an array.
gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID,
                      mc.cores = opt$cpu, SIMPLIFY = FALSE)

# Run add_pbs() on each element with the assembly and the tRNA database, then
# concatenate the per-element annotations.
cat('Identification of PBS ...')
gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db,
                       mc.set.seed = TRUE, mc.cores = opt$cpu,
                       mc.preschedule = FALSE)
cat('done\n')
gff3_out <- do.call(c, gff3_list2)
175 | |
# Features that carry no source yet are attributed to this tool.
src <- as.character(gff3_out$source)
src <- replace(src, is.na(src), "dante_ltr")
gff3_out$source <- src

# TODO export all files to single directory
# TODO export individual groups DL, DLT, DLP DLPT gff3

# Annotation of all elements as gff3.
gff3_path <- paste0(outfile, ".gff3")
export(gff3_out, con = gff3_path, format = "gff3")

# Summary statistics as a tab-separated table.
all_tbl <- get_te_statistics(gff3_out, RT)
statistics_path <- paste0(outfile, "_statistics.csv")
write.table(all_tbl, file = statistics_path, sep = "\t",
            quote = FALSE, row.names = FALSE)

# One fasta file per named group of element sequences.
s_te <- get_te_sequences(gff3_out, s)
for (i in seq_along(s_te)) {
  outname <- paste0(outfile, "_", names(s_te)[i], ".fasta")
  writeXStringSet(s_te[[i]], filepath = outname)
}
195 |