Mercurial > repos > petr-novak > dante_ltr
comparison extract_putative_ltr.R @ 7:c33d6583e548 draft
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
author | petr-novak |
---|---|
date | Fri, 24 Jun 2022 14:19:48 +0000 |
parents | f131886ea194 |
children | 9de392f2fc02 |
comparison
equal
deleted
inserted
replaced
6:b91ca438a1cb | 7:c33d6583e548 |
---|---|
12 make_option(c("-s", "--reference_sequence"), action = "store", type = "character", | 12 make_option(c("-s", "--reference_sequence"), action = "store", type = "character", |
13 help = "reference sequence as fasta", default = NULL), | 13 help = "reference sequence as fasta", default = NULL), |
14 make_option(c("-o", "--output"), action = "store", type = "character", | 14 make_option(c("-o", "--output"), action = "store", type = "character", |
15 help = "output file path and prefix", default = NULL), | 15 help = "output file path and prefix", default = NULL), |
16 make_option(c("-c", "--cpu"), type = "integer", default = 5, | 16 make_option(c("-c", "--cpu"), type = "integer", default = 5, |
17 help = "Number of cpu to use [default %default]", metavar = "number") | 17 help = "Number of cpu to use [default %default]", metavar = "number"), |
18 make_option(c("-M", "--max_missing_domains"), type = "integer", default = 0, | |
19 help = "Maximum number of missing domains is retrotransposon [default %default]", | |
20 metavar = "number") | |
21 | |
18 | 22 |
19 ) | 23 ) |
20 description <- paste(strwrap("")) | 24 description <- paste(strwrap("")) |
21 | 25 |
22 epilogue <- "" | 26 epilogue <- "" |
36 | 40 |
37 # CONFIGURATION | 41 # CONFIGURATION |
38 OFFSET <- 15000 | 42 OFFSET <- 15000 |
39 # load configuration files and functions: | 43 # load configuration files and functions: |
40 lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv") | 44 lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv") |
45 FDM_file <- paste0(script_dir, "/databases/feature_distances_model.RDS") | |
41 trna_db <- paste0(script_dir, "/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta") | 46 trna_db <- paste0(script_dir, "/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta") |
42 ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R") | 47 ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R") |
43 if (file.exists(lineage_file) & file.exists(trna_db)) { | 48 if (file.exists(lineage_file) & file.exists(trna_db)) { |
44 lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE) | 49 lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE) |
50 FDM <- readRDS(FDM_file) | |
45 source(ltr_utils_r) | 51 source(ltr_utils_r) |
46 }else { | 52 }else { |
53 # this destination work is installed using conda | |
47 lineage_file <- paste0(script_dir, "/../share/dante_ltr/databases/lineage_domain_order.csv") | 54 lineage_file <- paste0(script_dir, "/../share/dante_ltr/databases/lineage_domain_order.csv") |
55 FDM_file <- paste0(script_dir, "/../share/dante_ltr/databases/feature_distances_model.RDS") | |
48 trna_db <- paste0(script_dir, "/../share/dante_ltr/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta") | 56 trna_db <- paste0(script_dir, "/../share/dante_ltr/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta") |
49 ltr_utils_r <- paste0(script_dir, "/../share/dante_ltr/R/ltr_utils.R") | 57 ltr_utils_r <- paste0(script_dir, "/../share/dante_ltr/R/ltr_utils.R") |
50 if (file.exists(lineage_file) & file.exists((trna_db))) { | 58 if (file.exists(lineage_file) & file.exists((trna_db))) { |
51 lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE) | 59 lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE) |
52 source(ltr_utils_r) | 60 source(ltr_utils_r) |
61 FDM <- readRDS(FDM_file) | |
53 }else( | 62 }else( |
54 stop("configuration files not found") | 63 stop("configuration files not found") |
55 ) | 64 ) |
56 } | 65 } |
57 | 66 |
60 if (FALSE) { | 69 if (FALSE) { |
61 g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3") | 70 g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3") |
62 s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa") | 71 s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa") |
63 lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE) | 72 lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE) |
64 | 73 |
65 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data | 74 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/DANTE_filtered_part.gff3") |
66 /DANTE_filtered_part.gff3") | 75 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/Rbp_part.fa") |
67 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data | |
68 /Rbp_part.fa") | |
69 | 76 |
70 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data | 77 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data |
71 /DANTE_Vfaba_chr5.gff3") | 78 /DANTE_Vfaba_chr5.gff3") |
72 s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910 | 79 s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910 |
73 /fasta_parts/211010_Vfaba_chr5.fasta") | 80 /fasta_parts/211010_Vfaba_chr5.fasta") |
74 | 81 |
75 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data//Cocoa_theobroma_DANTE_filtered.gff3") | 82 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data//Cocoa_theobroma_DANTE_filtered.gff3") |
76 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/Cocoa_theobroma_chr1.fasta.gz") | 83 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/Cocoa_theobroma_chr1.fasta.gz") |
77 | 84 |
78 source("R/ltr_utils.R") | 85 source("R/ltr_utils.R") |
79 | 86 ## feature distance model |
87 FDM <- readRDS("./databases/feature_distances_model.RDS") | |
80 g <- rtracklayer::import("./test_data/sample_DANTE.gff3") | 88 g <- rtracklayer::import("./test_data/sample_DANTE.gff3") |
81 s <- readDNAStringSet("./test_data/sample_genome.fasta") | 89 s <- readDNAStringSet("./test_data/sample_genome.fasta") |
82 outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3" | 90 outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3" |
83 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header = | 91 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header = |
84 TRUE, as.is = TRUE) | 92 TRUE, as.is = TRUE) |
98 cat("done\n") | 106 cat("done\n") |
99 outfile <- opt$output | 107 outfile <- opt$output |
100 # clean sequence names: | 108 # clean sequence names: |
101 names(s) <- gsub(" .+", "", names(s)) | 109 names(s) <- gsub(" .+", "", names(s)) |
102 lineage_domain <- lineage_info$Domains.order | 110 lineage_domain <- lineage_info$Domains.order |
111 lineage_domain_span <- lineage_info$domain_span | |
112 lineage_ltr_mean_length <- lineage_info$ltr_length | |
103 lineage_offset5prime <- lineage_info$offset5prime | 113 lineage_offset5prime <- lineage_info$offset5prime |
104 lineage_offset3prime <- lineage_info$offset3prime | 114 lineage_offset3prime <- lineage_info$offset3prime |
105 ln <- gsub("ss/I", "ss_I", gsub("_", "/", gsub("/", "|", lineage_info$Lineage))) | 115 ln <- gsub("ss/I", "ss_I", gsub("_", "/", gsub("/", "|", lineage_info$Lineage))) |
106 names(lineage_offset3prime) <- ln | 116 names(lineage_offset3prime) <- ln |
107 names(lineage_offset5prime) <- ln | 117 names(lineage_offset5prime) <- ln |
108 names(lineage_domain) <- ln | 118 names(lineage_domain) <- ln |
119 names(lineage_domain_span) <- ln | |
120 names(lineage_ltr_mean_length) <- ln | |
121 lineage_domains_sequence <- unlist(mapply(function(d,l) { | |
122 paste(strsplit(d, " ")[[1]], ":", l, sep = "") | |
123 }, d = lineage_domain, l = names(lineage_domain))) | |
109 | 124 |
110 | 125 |
111 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))] | 126 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))] |
112 g <- add_coordinates_of_closest_neighbor(g) | 127 g <- add_coordinates_of_closest_neighbor(g) |
113 | 128 |
114 gcl <- get_domain_clusters(g) | 129 # add info about domain order: |
115 gcl_clean <- clean_domain_clusters(gcl) | 130 g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence)) |
131 # set NA to 0 in g$domain_order ( some domains are not fromm ClassI elements | |
132 g$domain_order[is.na(g$domain_order)] <- 0 | |
133 | |
134 # NOTE - some operation is faster of GrangesList but some on list of data.frames | |
135 # this is primary clusteing | |
136 cls <- get_domain_clusters(g) | |
137 gcl <- split(as.data.frame(g), cls) | |
138 # gcl_as_GRL <- split(g, cls) # delete? | |
139 | |
140 cls_alt <- get_domain_clusters_alt(g, FDM) | |
141 g$Cluster <- as.numeric(factor(cls_alt)) | |
142 | |
143 gcl_alt <- split(as.data.frame(g), cls_alt) | |
144 | |
145 TE_partial <- GRanges(seqnames = sapply(gcl_alt, function(x) x$seqnames[1]), | |
146 Name = sapply(gcl_alt, function(x) x$Final_Classification[1]), | |
147 Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]), | |
148 ID = sapply(gcl_alt, function(x) paste0("TE_partial_", x$Cluster[1])), | |
149 strand = sapply(gcl_alt, function(x) x$strand[1]), | |
150 Ndomains = sapply(gcl_alt, function(x) nrow(x)), | |
151 type = "transposable_element", | |
152 source = "dante_ltr", | |
153 Rank="D", | |
154 IRanges(start = sapply(gcl_alt, function(x) min(x$start)), | |
155 end = sapply(gcl_alt, function(x) max(x$end))) | |
156 ) | |
157 g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster) | |
158 g$Parent <- paste0("TE_partial_", g$Cluster) | |
159 g$Rank="D" | |
160 | |
161 # keep only partial TE with more than one domain | |
162 TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1] | |
163 g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)] | |
164 | |
165 # first filtering - remove TEs with low number of domains | |
166 gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains) | |
167 | |
116 # glc annotation | 168 # glc annotation |
117 gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1]) | 169 gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1]) |
118 gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-", | 170 gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-", |
119 paste(rev(x$Name), collapse = " "), | 171 paste(rev(x$Name), collapse = " "), |
120 paste(x$Name, collapse = " ")) | 172 paste(x$Name, collapse = " ")) |
121 ) | 173 ) |
122 | 174 |
175 # compare detected domains with domains in lineages from REXdb database | |
176 dd <- mapply(domain_distance, | |
177 d_query = gcl_clean_domains, | |
178 d_reference = lineage_domain[gcl_clean_lineage]) | |
179 | |
123 # get lineages which has correct number and order of domains | 180 # get lineages which has correct number and order of domains |
124 gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]] | 181 # gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]] |
182 gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains] | |
183 | |
125 gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)] | 184 gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)] |
126 gr <- get_ranges(gcl_clean_with_domains) | 185 gr <- get_ranges(gcl_clean_with_domains) |
127 | 186 |
128 | 187 |
129 cat('Number of analyzed regions:\n') | 188 cat('Number of analyzed regions:\n') |
142 grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset) | 201 grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset) |
143 | 202 |
144 s_left <- getSeq(s, grL) | 203 s_left <- getSeq(s, grL) |
145 s_right <- getSeq(s, grR) | 204 s_right <- getSeq(s, grR) |
146 | 205 |
206 expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function (x)x$Final_Classification[1])] | |
147 # for statistics | 207 # for statistics |
148 RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"] | 208 RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"] |
149 # cleanup | 209 # cleanup |
150 rm(g) | 210 #rm(g) |
151 rm(gcl) | 211 rm(gcl) |
152 rm(gcl_clean) | 212 rm(gcl_clean) |
153 rm(gcl_clean2) | 213 rm(gcl_clean2) |
154 | 214 |
155 names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_") | 215 names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_") |
159 TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x], | 219 TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x], |
160 s_right[x], | 220 s_right[x], |
161 gcl_clean_with_domains[[x]], | 221 gcl_clean_with_domains[[x]], |
162 gr[x], | 222 gr[x], |
163 grL[x], | 223 grL[x], |
164 grR[x]), | 224 grR[x], |
225 expected_ltr_length[x]), | |
165 mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE | 226 mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE |
166 ) | 227 ) |
167 | 228 |
168 cat('done.\n') | 229 cat('done.\n') |
169 | 230 |
170 good_TE <- TE[!sapply(TE, is.null)] | 231 good_TE <- TE[!sapply(TE, is.null)] |
171 cat('Number of putative TE with identified LTR :', length(good_TE), '\n') | 232 cat('Number of putative TE with identified LTR :', length(good_TE), '\n') |
172 | 233 |
173 | 234 # TODO - extent TE region to cover also TSD |
174 ID <- paste0('TE_', sprintf("%08d", seq(good_TE))) | 235 ID <- paste0('TE_', sprintf("%08d", seq(good_TE))) |
175 gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu) | 236 gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu) |
176 | 237 |
177 cat('Identification of PBS ...') | 238 cat('Identification of PBS ...') |
178 gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE) | 239 gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE) |
182 # define new source | 243 # define new source |
183 src <- as.character(gff3_out$source) | 244 src <- as.character(gff3_out$source) |
184 src[is.na(src)] <- "dante_ltr" | 245 src[is.na(src)] <- "dante_ltr" |
185 gff3_out$source <- src | 246 gff3_out$source <- src |
186 gff3_out$Rank <- get_te_rank(gff3_out) | 247 gff3_out$Rank <- get_te_rank(gff3_out) |
187 # TODO add attributte specifying individual groups DL, DLT, DLP DLPT gff3 | 248 |
249 # add partial TEs but first remove all ovelaping | |
250 TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out] | |
251 TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID] | |
252 | |
253 gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start) | |
188 | 254 |
189 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3') | 255 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3') |
190 | 256 |
191 all_tbl <- get_te_statistics(gff3_out, RT) | 257 all_tbl <- get_te_statistics(gff3_out, RT) |
192 write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = TRUE) | 258 all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl) |
259 write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE) | |
193 # export fasta files: | 260 # export fasta files: |
194 s_te <- get_te_sequences(gff3_out, s) | 261 s_te <- get_te_sequences(gff3_out, s) |
195 | |
196 for (i in seq_along(s_te)) { | 262 for (i in seq_along(s_te)) { |
197 outname <- paste0(outfile, "_", names(s_te)[i], ".fasta") | 263 outname <- paste0(outfile, "_", names(s_te)[i], ".fasta") |
198 writeXStringSet(s_te[[i]], filepath = outname) | 264 writeXStringSet(s_te[[i]], filepath = outname) |
199 } | 265 } |
200 | 266 |