Mercurial > repos > petr-novak > dante_ltr
comparison clean_ltr.R @ 0:7b0bbe7477c4 draft
"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
author | petr-novak |
---|---|
date | Tue, 08 Mar 2022 13:24:33 +0000 |
parents | |
children | 6ae4a341d1f3 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7b0bbe7477c4 |
---|---|
1 #!/usr/bin/env Rscript | |
2 initial_options <- commandArgs(trailingOnly = FALSE) | |
3 file_arg_name <- "--file=" | |
4 script_name <- normalizePath(sub(file_arg_name, "", initial_options[grep | |
5 (file_arg_name, | |
6 initial_options)])) | |
7 script_dir <- dirname(script_name) | |
8 library(optparse) | |
9 | |
10 parser <- OptionParser() | |
11 option_list <- list( | |
12 make_option(c("-g", "--gff3"), action = "store", type = "character", | |
13 help = "gff3 with LTR Transposable elements", default = NULL), | |
14 make_option(c("-s", "--reference_sequence"), action = "store", type = "character", | |
15 help = "reference sequence as fasta", | |
16 default = NULL), | |
17 make_option(c("-o", "--output"), action = "store", type = "character", | |
18 help = "output file prefix", default = NULL), | |
19 make_option(c("-c", "--cpu"), type = | |
20 "integer", default = 5, help = "Number of cpu to use [default %default]", | |
21 metavar = "number") | |
22 | |
23 ) | |
24 description <- paste(strwrap("")) | |
25 | |
26 epilogue <- "" | |
27 parser <- OptionParser(option_list = option_list, epilogue = epilogue, description = | |
28 description, usage = "usage: %prog COMMAND [OPTIONS]") | |
29 opt <- parse_args(parser, args = commandArgs(TRUE)) | |
30 | |
31 | |
32 # load packages | |
33 suppressPackageStartupMessages({ library(rtracklayer) | |
34 library(Biostrings) | |
35 library(BSgenome) | |
36 library(parallel) | |
37 }) | |
38 | |
39 # CONFIGURATION | |
40 # load configuration files and functions: | |
41 lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv") | |
42 ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R") | |
43 if (file.exists(lineage_file)) { | |
44 lineage_info <- read.table(lineage_file, sep = "\t", | |
45 header = TRUE, | |
46 as.is = TRUE) | |
47 source(ltr_utils_r) | |
48 }else { | |
49 lineage_file <- paste0(script_dir, "/. | |
50 ./share/dante_ltr/databases/lineage_domain_order.csv") | |
51 ltr_utils_r <- paste0(script_dir, "/. | |
52 ./share/dante_ltr/R/ltr_utils.R") | |
53 if (file.exists(lineage_file)) { | |
54 lineage_info <- read.table(lineage_file, sep = "\t", | |
55 header = TRUE, | |
56 as.is = TRUE) | |
57 source(ltr_utils_r) | |
58 }else { | |
59 (stop("configuration files not found")) | |
60 } | |
61 } | |
62 | |
63 | |
64 ncpus <- opt$cpu | |
65 | |
66 | |
67 # load data | |
68 cat("reading fasta...") | |
69 s <- readDNAStringSet(opt$reference_sequence) # genome assembly | |
70 cat("done\n") | |
71 outfile <- opt$output | |
72 # clean sequence names: | |
73 names(s) <- gsub(" .+", "", names(s)) | |
74 cat("reading gff...") | |
75 g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3 | |
76 cat("done\n") | |
77 # testing | |
78 if (FALSE) { | |
79 g <- rtracklayer::import("./test_data/sample_ltr_annotation.gff3") | |
80 s <- readDNAStringSet("./test_data/sample_genome.fasta") | |
81 | |
82 g <- rtracklayer::import("./test_data/DANTE_LTR_Vfaba_chr5.gff3") | |
83 s <- readDNAStringSet("./test_data/211010_Vfaba_chr5.fasta") | |
84 names(s) <- gsub(" .+", "", names(s)) | |
85 ncpus <- 10 | |
86 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header = | |
87 TRUE, as.is = TRUE) | |
88 source("./R/ltr_utils.R") | |
89 } | |
90 | |
91 | |
92 # get te sequence based on rank | |
93 | |
94 # evaluate best TE - DLTP grou | |
95 s_te <- get_te_sequences(g, s) # split by 'element quality' | |
96 # best quality - split by lineage | |
97 word_size <- 15 | |
98 # best TE | |
99 TE_DLTP_info <- analyze_TE(s_te$DLTP, word_size = word_size, ncpus = ncpus) | |
100 | |
101 # TE rank 2: | |
102 TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT), word_size = word_size, ncpus | |
103 = ncpus) | |
104 TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(c(s_te$DLT, s_te$DLP), ncpus | |
105 = ncpus, | |
106 TE_DLTP_info$seqs_representative, | |
107 word_size = word_size | |
108 ) | |
109 | |
110 TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info) | |
111 | |
112 # create additional library from rank 2 verified by multiplicity | |
113 id_for_additional_library <- setdiff( | |
114 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, | |
115 TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified) | |
116 | |
117 if (length(id_for_additional_library) > 1) { | |
118 seqs_for_additional_library <- c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in% | |
119 id_for_additional_library] | |
120 seqs_additional_info <- analyze_TE(seqs_for_additional_library, word_size = | |
121 word_size, ncpus = ncpus) | |
122 seq_representative <- c( | |
123 TE_DLTP_info$seqs_representative, | |
124 seqs_additional_info$seqs_representative | |
125 ) | |
126 }else { | |
127 if (length(id_for_additional_library) == 1) { | |
128 seq_representative <- c( | |
129 TE_DLTP_info$seqs_representative, | |
130 c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in% id_for_additional_library] | |
131 ) | |
132 }else { | |
133 seq_representative <- TE_DLTP_info$seqs_representative | |
134 } | |
135 } | |
136 | |
137 | |
138 # TE rank 3 | |
139 TE_DL_info_DLTP_verified <- compare_TE_datasets( | |
140 s_te$DL, | |
141 TE_DLTP_info$seqs_representative, min_coverage = 0.98, | |
142 ncpus = ncpus | |
143 ) | |
144 | |
145 | |
146 R <- seq_diversity(seq_representative)$richness | |
147 SI <- seq_diversity(seq_representative)$shannon_index | |
148 | |
149 # final RM library: | |
150 seq_representative_no_ssr <- seq_representative[R > 20] | |
151 | |
152 ID <- g$ID[g$type == "transposable_element"] | |
153 names(ID) <- paste0(seqnames(g), "_", | |
154 start(g), "_", | |
155 end(g), "#", | |
156 g$Final_Classification | |
157 )[g$type %in% "transposable_element"] | |
158 | |
159 # create clean gff3 | |
160 id_of_good_te <- unique(c( | |
161 TE_DLTP_info$te_conflict_info$ok, | |
162 TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified, | |
163 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, | |
164 TE_DL_info_DLTP_verified$id_ok_verified) | |
165 ) | |
166 | |
167 c1 <- g$ID %in% ID[id_of_good_te] | |
168 c2 <- sapply(g$Parent, function(x)ifelse(length(x) == 0, "", x)) %in% ID[id_of_good_te] | |
169 | |
170 gff_out <- g[c1 | c2] | |
171 | |
172 | |
173 writeXStringSet(seq_representative, paste0(opt$output, "_RM_lib.fasta")) | |
174 export(gff_out, paste0(opt$output, "_clean.gff3"), format = "gff3") | |
175 |