comparison extract_putative_ltr.R @ 7:c33d6583e548 draft

"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
author petr-novak
date Fri, 24 Jun 2022 14:19:48 +0000
parents f131886ea194
children 9de392f2fc02
comparison
equal deleted inserted replaced
6:b91ca438a1cb 7:c33d6583e548
12 make_option(c("-s", "--reference_sequence"), action = "store", type = "character", 12 make_option(c("-s", "--reference_sequence"), action = "store", type = "character",
13 help = "reference sequence as fasta", default = NULL), 13 help = "reference sequence as fasta", default = NULL),
14 make_option(c("-o", "--output"), action = "store", type = "character", 14 make_option(c("-o", "--output"), action = "store", type = "character",
15 help = "output file path and prefix", default = NULL), 15 help = "output file path and prefix", default = NULL),
16 make_option(c("-c", "--cpu"), type = "integer", default = 5, 16 make_option(c("-c", "--cpu"), type = "integer", default = 5,
17 help = "Number of cpu to use [default %default]", metavar = "number") 17 help = "Number of cpu to use [default %default]", metavar = "number"),
18 make_option(c("-M", "--max_missing_domains"), type = "integer", default = 0,
19 help = "Maximum number of missing domains is retrotransposon [default %default]",
20 metavar = "number")
21
18 22
19 ) 23 )
20 description <- paste(strwrap("")) 24 description <- paste(strwrap(""))
21 25
22 epilogue <- "" 26 epilogue <- ""
36 40
37 # CONFIGURATION 41 # CONFIGURATION
38 OFFSET <- 15000 42 OFFSET <- 15000
39 # load configuration files and functions: 43 # load configuration files and functions:
40 lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv") 44 lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv")
45 FDM_file <- paste0(script_dir, "/databases/feature_distances_model.RDS")
41 trna_db <- paste0(script_dir, "/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta") 46 trna_db <- paste0(script_dir, "/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
42 ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R") 47 ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R")
43 if (file.exists(lineage_file) & file.exists(trna_db)) { 48 if (file.exists(lineage_file) & file.exists(trna_db)) {
44 lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE) 49 lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
50 FDM <- readRDS(FDM_file)
45 source(ltr_utils_r) 51 source(ltr_utils_r)
46 }else { 52 }else {
53 # this destination work is installed using conda
47 lineage_file <- paste0(script_dir, "/../share/dante_ltr/databases/lineage_domain_order.csv") 54 lineage_file <- paste0(script_dir, "/../share/dante_ltr/databases/lineage_domain_order.csv")
55 FDM_file <- paste0(script_dir, "/../share/dante_ltr/databases/feature_distances_model.RDS")
48 trna_db <- paste0(script_dir, "/../share/dante_ltr/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta") 56 trna_db <- paste0(script_dir, "/../share/dante_ltr/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
49 ltr_utils_r <- paste0(script_dir, "/../share/dante_ltr/R/ltr_utils.R") 57 ltr_utils_r <- paste0(script_dir, "/../share/dante_ltr/R/ltr_utils.R")
50 if (file.exists(lineage_file) & file.exists((trna_db))) { 58 if (file.exists(lineage_file) & file.exists((trna_db))) {
51 lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE) 59 lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
52 source(ltr_utils_r) 60 source(ltr_utils_r)
61 FDM <- readRDS(FDM_file)
53 }else( 62 }else(
54 stop("configuration files not found") 63 stop("configuration files not found")
55 ) 64 )
56 } 65 }
57 66
60 if (FALSE) { 69 if (FALSE) {
61 g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3") 70 g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3")
62 s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa") 71 s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa")
63 lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE) 72 lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE)
64 73
65 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data 74 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/DANTE_filtered_part.gff3")
66 /DANTE_filtered_part.gff3") 75 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/Rbp_part.fa")
67 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data
68 /Rbp_part.fa")
69 76
70 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data 77 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data
71 /DANTE_Vfaba_chr5.gff3") 78 /DANTE_Vfaba_chr5.gff3")
72 s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910 79 s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910
73 /fasta_parts/211010_Vfaba_chr5.fasta") 80 /fasta_parts/211010_Vfaba_chr5.fasta")
74 81
75 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data//Cocoa_theobroma_DANTE_filtered.gff3") 82 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data//Cocoa_theobroma_DANTE_filtered.gff3")
76 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/Cocoa_theobroma_chr1.fasta.gz") 83 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/Cocoa_theobroma_chr1.fasta.gz")
77 84
78 source("R/ltr_utils.R") 85 source("R/ltr_utils.R")
79 86 ## feature distance model
87 FDM <- readRDS("./databases/feature_distances_model.RDS")
80 g <- rtracklayer::import("./test_data/sample_DANTE.gff3") 88 g <- rtracklayer::import("./test_data/sample_DANTE.gff3")
81 s <- readDNAStringSet("./test_data/sample_genome.fasta") 89 s <- readDNAStringSet("./test_data/sample_genome.fasta")
82 outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3" 90 outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3"
83 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header = 91 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header =
84 TRUE, as.is = TRUE) 92 TRUE, as.is = TRUE)
98 cat("done\n") 106 cat("done\n")
99 outfile <- opt$output 107 outfile <- opt$output
100 # clean sequence names: 108 # clean sequence names:
101 names(s) <- gsub(" .+", "", names(s)) 109 names(s) <- gsub(" .+", "", names(s))
102 lineage_domain <- lineage_info$Domains.order 110 lineage_domain <- lineage_info$Domains.order
111 lineage_domain_span <- lineage_info$domain_span
112 lineage_ltr_mean_length <- lineage_info$ltr_length
103 lineage_offset5prime <- lineage_info$offset5prime 113 lineage_offset5prime <- lineage_info$offset5prime
104 lineage_offset3prime <- lineage_info$offset3prime 114 lineage_offset3prime <- lineage_info$offset3prime
105 ln <- gsub("ss/I", "ss_I", gsub("_", "/", gsub("/", "|", lineage_info$Lineage))) 115 ln <- gsub("ss/I", "ss_I", gsub("_", "/", gsub("/", "|", lineage_info$Lineage)))
106 names(lineage_offset3prime) <- ln 116 names(lineage_offset3prime) <- ln
107 names(lineage_offset5prime) <- ln 117 names(lineage_offset5prime) <- ln
108 names(lineage_domain) <- ln 118 names(lineage_domain) <- ln
119 names(lineage_domain_span) <- ln
120 names(lineage_ltr_mean_length) <- ln
121 lineage_domains_sequence <- unlist(mapply(function(d,l) {
122 paste(strsplit(d, " ")[[1]], ":", l, sep = "")
123 }, d = lineage_domain, l = names(lineage_domain)))
109 124
110 125
111 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))] 126 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))]
112 g <- add_coordinates_of_closest_neighbor(g) 127 g <- add_coordinates_of_closest_neighbor(g)
113 128
114 gcl <- get_domain_clusters(g) 129 # add info about domain order:
115 gcl_clean <- clean_domain_clusters(gcl) 130 g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence))
131 # set NA to 0 in g$domain_order ( some domains are not fromm ClassI elements
132 g$domain_order[is.na(g$domain_order)] <- 0
133
134 # NOTE - some operation is faster of GrangesList but some on list of data.frames
135 # this is primary clusteing
136 cls <- get_domain_clusters(g)
137 gcl <- split(as.data.frame(g), cls)
138 # gcl_as_GRL <- split(g, cls) # delete?
139
140 cls_alt <- get_domain_clusters_alt(g, FDM)
141 g$Cluster <- as.numeric(factor(cls_alt))
142
143 gcl_alt <- split(as.data.frame(g), cls_alt)
144
145 TE_partial <- GRanges(seqnames = sapply(gcl_alt, function(x) x$seqnames[1]),
146 Name = sapply(gcl_alt, function(x) x$Final_Classification[1]),
147 Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]),
148 ID = sapply(gcl_alt, function(x) paste0("TE_partial_", x$Cluster[1])),
149 strand = sapply(gcl_alt, function(x) x$strand[1]),
150 Ndomains = sapply(gcl_alt, function(x) nrow(x)),
151 type = "transposable_element",
152 source = "dante_ltr",
153 Rank="D",
154 IRanges(start = sapply(gcl_alt, function(x) min(x$start)),
155 end = sapply(gcl_alt, function(x) max(x$end)))
156 )
157 g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster)
158 g$Parent <- paste0("TE_partial_", g$Cluster)
159 g$Rank="D"
160
161 # keep only partial TE with more than one domain
162 TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1]
163 g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)]
164
165 # first filtering - remove TEs with low number of domains
166 gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains)
167
116 # glc annotation 168 # glc annotation
117 gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1]) 169 gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1])
118 gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-", 170 gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-",
119 paste(rev(x$Name), collapse = " "), 171 paste(rev(x$Name), collapse = " "),
120 paste(x$Name, collapse = " ")) 172 paste(x$Name, collapse = " "))
121 ) 173 )
122 174
175 # compare detected domains with domains in lineages from REXdb database
176 dd <- mapply(domain_distance,
177 d_query = gcl_clean_domains,
178 d_reference = lineage_domain[gcl_clean_lineage])
179
123 # get lineages which has correct number and order of domains 180 # get lineages which has correct number and order of domains
124 gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]] 181 # gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]]
182 gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains]
183
125 gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)] 184 gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)]
126 gr <- get_ranges(gcl_clean_with_domains) 185 gr <- get_ranges(gcl_clean_with_domains)
127 186
128 187
129 cat('Number of analyzed regions:\n') 188 cat('Number of analyzed regions:\n')
142 grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset) 201 grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset)
143 202
144 s_left <- getSeq(s, grL) 203 s_left <- getSeq(s, grL)
145 s_right <- getSeq(s, grR) 204 s_right <- getSeq(s, grR)
146 205
206 expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function (x)x$Final_Classification[1])]
147 # for statistics 207 # for statistics
148 RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"] 208 RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"]
149 # cleanup 209 # cleanup
150 rm(g) 210 #rm(g)
151 rm(gcl) 211 rm(gcl)
152 rm(gcl_clean) 212 rm(gcl_clean)
153 rm(gcl_clean2) 213 rm(gcl_clean2)
154 214
155 names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_") 215 names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_")
159 TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x], 219 TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x],
160 s_right[x], 220 s_right[x],
161 gcl_clean_with_domains[[x]], 221 gcl_clean_with_domains[[x]],
162 gr[x], 222 gr[x],
163 grL[x], 223 grL[x],
164 grR[x]), 224 grR[x],
225 expected_ltr_length[x]),
165 mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE 226 mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE
166 ) 227 )
167 228
168 cat('done.\n') 229 cat('done.\n')
169 230
170 good_TE <- TE[!sapply(TE, is.null)] 231 good_TE <- TE[!sapply(TE, is.null)]
171 cat('Number of putative TE with identified LTR :', length(good_TE), '\n') 232 cat('Number of putative TE with identified LTR :', length(good_TE), '\n')
172 233
173 234 # TODO - extent TE region to cover also TSD
174 ID <- paste0('TE_', sprintf("%08d", seq(good_TE))) 235 ID <- paste0('TE_', sprintf("%08d", seq(good_TE)))
175 gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu) 236 gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu)
176 237
177 cat('Identification of PBS ...') 238 cat('Identification of PBS ...')
178 gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE) 239 gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE)
182 # define new source 243 # define new source
183 src <- as.character(gff3_out$source) 244 src <- as.character(gff3_out$source)
184 src[is.na(src)] <- "dante_ltr" 245 src[is.na(src)] <- "dante_ltr"
185 gff3_out$source <- src 246 gff3_out$source <- src
186 gff3_out$Rank <- get_te_rank(gff3_out) 247 gff3_out$Rank <- get_te_rank(gff3_out)
187 # TODO add attributte specifying individual groups DL, DLT, DLP DLPT gff3 248
249 # add partial TEs but first remove all ovelaping
250 TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out]
251 TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID]
252
253 gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start)
188 254
189 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3') 255 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3')
190 256
191 all_tbl <- get_te_statistics(gff3_out, RT) 257 all_tbl <- get_te_statistics(gff3_out, RT)
192 write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = TRUE) 258 all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl)
259 write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE)
193 # export fasta files: 260 # export fasta files:
194 s_te <- get_te_sequences(gff3_out, s) 261 s_te <- get_te_sequences(gff3_out, s)
195
196 for (i in seq_along(s_te)) { 262 for (i in seq_along(s_te)) {
197 outname <- paste0(outfile, "_", names(s_te)[i], ".fasta") 263 outname <- paste0(outfile, "_", names(s_te)[i], ".fasta")
198 writeXStringSet(s_te[[i]], filepath = outname) 264 writeXStringSet(s_te[[i]], filepath = outname)
199 } 265 }
200 266