Mercurial > repos > petr-novak > dante_ltr
comparison clean_ltr.R @ 7:c33d6583e548 draft
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
author | petr-novak |
---|---|
date | Fri, 24 Jun 2022 14:19:48 +0000 |
parents | b91ca438a1cb |
children | ff01d4263391 |
comparison
equal
deleted
inserted
replaced
6:b91ca438a1cb | 7:c33d6583e548 |
---|---|
78 if (FALSE) { | 78 if (FALSE) { |
79 g <- rtracklayer::import("./test_data/big_test_data/dante_ltr_unfiltered_t.cacao.gff3") | 79 g <- rtracklayer::import("./test_data/big_test_data/dante_ltr_unfiltered_t.cacao.gff3") |
80 s <- readDNAStringSet("./test_data/big_test_data/T_cacao_chromosomes.fasta") | 80 s <- readDNAStringSet("./test_data/big_test_data/T_cacao_chromosomes.fasta") |
81 | 81 |
82 g <- rtracklayer::import("./test_data/sample_ltr_annotation.gff3") | 82 g <- rtracklayer::import("./test_data/sample_ltr_annotation.gff3") |
83 g <- rtracklayer::import("./test_data/sample_DANTE_LTR_annotation.gff3") | |
83 s <- readDNAStringSet("./test_data/sample_genome.fasta") | 84 s <- readDNAStringSet("./test_data/sample_genome.fasta") |
84 | 85 |
85 g <- rtracklayer::import("./test_data/DANTE_LTR_Vfaba_chr5.gff3") | 86 g <- rtracklayer::import("./test_data/DANTE_LTR_Vfaba_chr5.gff3") |
86 s <- readDNAStringSet("./test_data/211010_Vfaba_chr5.fasta") | 87 s <- readDNAStringSet("./test_data/211010_Vfaba_chr5.fasta") |
87 names(s) <- gsub(" .+", "", names(s)) | 88 names(s) <- gsub(" .+", "", names(s)) |
98 g$ID <- ifelse(is.na(g$ID), NA, paste0(g$ID,"_", suffix)) | 99 g$ID <- ifelse(is.na(g$ID), NA, paste0(g$ID,"_", suffix)) |
99 g$Parent <- ifelse(is.na(g$Parent), NA, paste0(g$Parent,"_", suffix)) | 100 g$Parent <- ifelse(is.na(g$Parent), NA, paste0(g$Parent,"_", suffix)) |
100 | 101 |
101 # get te sequence based on rank | 102 # get te sequence based on rank |
102 | 103 |
104 # best quality - split by lineage | |
105 s_te <- get_te_sequences(g, s) # split by 'element quality' | |
103 # evaluate best TE - DLTP grou | 106 # evaluate best TE - DLTP grou |
104 s_te <- get_te_sequences(g, s) # split by 'element quality' | 107 |
105 # best quality - split by lineage | 108 # comparison parameters |
106 word_size <- 15 | 109 word_size <- 11 |
110 task <- "blastn" | |
111 perc_identity <- 80 | |
112 | |
107 # best TE | 113 # best TE |
108 TE_DLTP_info <- analyze_TE(s_te$DLTP, word_size = word_size, ncpus = ncpus) | 114 TE_DLTP_info <- analyze_TE(s_te$DLTP, |
115 word_size = word_size, | |
116 ncpus = ncpus, | |
117 perc_identity = perc_identity, | |
118 task = task) | |
109 | 119 |
110 # TE rank 2: | 120 # TE rank 2: |
111 TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT), word_size = word_size, ncpus | 121 TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT), |
112 = ncpus) | 122 word_size = word_size, |
113 TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(c(s_te$DLT, s_te$DLP), ncpus | 123 ncpus = ncpus, |
114 = ncpus,TE_DLTP_info$seqs_representative, word_size = word_size | 124 perc_identity = perc_identity, |
115 ) | 125 task = task) |
126 | |
127 TE_D_plus_DL_info <- analyze_TE(c(s_te$DL, s_te$D), | |
128 word_size = word_size, | |
129 ncpus = ncpus, | |
130 perc_identity = perc_identity, | |
131 task = task) | |
132 | |
133 TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets( | |
134 c(s_te$DLT, s_te$DLP), | |
135 ncpus = ncpus, | |
136 TE_DLTP_info$seqs_representative, | |
137 word_size = word_size, | |
138 perc_identity = perc_identity, | |
139 task = task) | |
116 | 140 |
117 TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info) | 141 TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info) |
118 # create additional library from rank 2 verified by multiplicity | 142 TE_D_plus_DL_info_multiplicity <- verify_based_on_multiplicity(TE_D_plus_DL_info) |
119 id_for_additional_library <- setdiff( | 143 |
144 # create additional library from rank 2 verified by multiplicity and DLTP | |
145 id_for_additional_library <- union( | |
120 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, | 146 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, |
121 TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified) | 147 TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified) |
122 | 148 |
123 if (length(id_for_additional_library) > 1) { | 149 if (length(id_for_additional_library) > 1) { |
124 seqs_for_additional_library <- c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in% | 150 seqs_for_additional_library <- c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in% |
125 id_for_additional_library] | 151 id_for_additional_library] |
126 seqs_additional_info <- analyze_TE(seqs_for_additional_library, word_size = | 152 seqs_additional_info <- analyze_TE(seqs_for_additional_library, word_size = |
127 word_size, ncpus = ncpus) | 153 word_size, ncpus = ncpus) |
128 seq_representative <- c( | 154 seqs_representative <- c( |
129 TE_DLTP_info$seqs_representative, | 155 TE_DLTP_info$seqs_representative, |
130 seqs_additional_info$seqs_representative | 156 seqs_additional_info$seqs_representative |
131 ) | 157 ) |
132 }else { | 158 }else { |
133 if (length(id_for_additional_library) == 1) { | 159 if (length(id_for_additional_library) == 1) { |
134 seq_representative <- c( | 160 seqs_representative <- c( |
135 TE_DLTP_info$seqs_representative, | 161 TE_DLTP_info$seqs_representative, |
136 c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in% id_for_additional_library] | 162 c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in% id_for_additional_library] |
137 ) | 163 ) |
138 }else { | 164 }else { |
139 seq_representative <- TE_DLTP_info$seqs_representative | 165 seqs_representative <- TE_DLTP_info$seqs_representative |
140 } | 166 } |
141 } | 167 } |
142 | 168 |
143 # TE rank 3 - verify agains good DLTP | 169 # TE rank 3 - verify agains good DLTP |
144 TE_DL_info_DLTP_verified <- compare_TE_datasets( | 170 TE_DL_info_DLTP_verified <- compare_TE_datasets( |
145 s_te$DL, | 171 s_te$DL, |
146 TE_DLTP_info$seqs_representative, min_coverage = 0.95, | 172 seqs_representative, min_coverage = 0.90, |
147 ncpus = ncpus, word_size = word_size | 173 ncpus = ncpus, |
148 ) | 174 word_size = word_size, |
149 | 175 task = task, |
150 | 176 perc_identity = perc_identity) |
151 R <- seq_diversity(seq_representative)$richness | 177 |
152 SI <- seq_diversity(seq_representative)$shannon_index | 178 TE_D_info_DLTP_verified <- compare_TE_datasets( |
179 s_te$D, | |
180 seqs_representative, min_coverage = 0.90, | |
181 ncpus = ncpus, | |
182 word_size = word_size, | |
183 task = task, | |
184 perc_identity = perc_identity) | |
185 | |
186 | |
187 | |
188 R <- seq_diversity(seqs_representative)$richness | |
189 SI <- seq_diversity(seqs_representative)$shannon_index | |
153 | 190 |
154 # final RM library: | 191 # final RM library: |
155 seq_representative_no_ssr <- seq_representative[R > 20] | 192 seqs_representative_no_ssr <- seqs_representative[R > 20] |
156 | 193 |
157 ID <- g$ID[g$type == "transposable_element"] | 194 ID <- g$ID[g$type == "transposable_element"] |
158 names(ID) <- paste0(seqnames(g), "_", | 195 names(ID) <- paste0(seqnames(g), "_", |
159 start(g), "_", | 196 start(g), "_", |
160 end(g), "#", | 197 end(g), "#", |
164 # create clean gff3 | 201 # create clean gff3 |
165 id_of_good_te <- unique(c( | 202 id_of_good_te <- unique(c( |
166 TE_DLTP_info$te_conflict_info$ok, | 203 TE_DLTP_info$te_conflict_info$ok, |
167 TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified, | 204 TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified, |
168 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, | 205 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, |
169 TE_DL_info_DLTP_verified$id_ok_verified) | 206 TE_DL_info_DLTP_verified$id_ok_verified, |
207 TE_D_info_DLTP_verified$id_ok_verified, | |
208 TE_D_plus_DL_info_multiplicity$id_ok_mp_verified), | |
170 ) | 209 ) |
171 | 210 |
172 c1 <- g$ID %in% ID[id_of_good_te] | 211 c1 <- g$ID %in% ID[id_of_good_te] |
173 c2 <- sapply(g$Parent, function(x)ifelse(length(x) == 0, "", x)) %in% ID[id_of_good_te] | 212 c2 <- sapply(g$Parent, function(x)ifelse(length(x) == 0, "", x)) %in% ID[id_of_good_te] |
174 | 213 |
175 gff_out <- g[c1 | c2] | 214 gff_out <- g[c1 | c2] |
176 | 215 |
177 gff_te <- gff_out[gff_out$type %in% "transposable_element"] | 216 gff_te <- gff_out[gff_out$type %in% "transposable_element"] |
217 # remove partial elements | |
218 gff_te_with_ltr <- gff_out[gff_out$type %in% "transposable_element" & gff_out$Rank != "D"] | |
219 | |
220 | |
178 gff_5ltr <- gff_out[gff_out$LTR %in% "5LTR"] | 221 gff_5ltr <- gff_out[gff_out$LTR %in% "5LTR"] |
179 gff_3ltr <- gff_out[gff_out$LTR %in% "3LTR"] | 222 gff_3ltr <- gff_out[gff_out$LTR %in% "3LTR"] |
180 | 223 |
181 full_te <- getSeqNamed(s, gff_te) | 224 full_te <- getSeqNamed(s, gff_te) |
182 names(full_te) <- paste0(gff_te$ID,":",names(full_te)) | 225 names(full_te) <- paste0(gff_te$ID,":",names(full_te)) |
183 ltr5 <- getSeqNamed(s, gff_5ltr) | 226 ltr5 <- getSeqNamed(s, gff_5ltr) |
184 names(ltr5) <- paste0(gff_5ltr$Parent,":",names(ltr5)) | 227 names(ltr5) <- paste0(gff_5ltr$Parent,":",names(ltr5)) |
185 ltr3 <- getSeqNamed(s, gff_3ltr) | 228 ltr3 <- getSeqNamed(s, gff_3ltr) |
186 names(ltr3) <- paste0(gff_3ltr$Parent,":",names(ltr3)) | 229 names(ltr3) <- paste0(gff_3ltr$Parent,":",names(ltr3)) |
187 inc <- gff_te$Rank != "DL" | 230 inc <- gff_te_with_ltr$Rank != "DL" |
188 | 231 |
189 writeXStringSet(seq_representative, paste0(opt$output, "_RM_lib_non_redundant.fasta")) | 232 writeXStringSet(seqs_representative, paste0(opt$output, "_RM_lib_non_redundant.fasta")) |
190 writeXStringSet(full_te, paste0(opt$output, "_RM_lib_full_TE.fasta")) | 233 writeXStringSet(full_te, paste0(opt$output, "_RM_lib_full_TE.fasta")) |
191 writeXStringSet(ltr5, paste0(opt$output, "_RM_lib_5LTR.fasta")) | 234 writeXStringSet(ltr5, paste0(opt$output, "_RM_lib_5LTR.fasta")) |
192 writeXStringSet(ltr3, paste0(opt$output, "_RM_lib_3LTR.fasta")) | 235 writeXStringSet(ltr3, paste0(opt$output, "_RM_lib_3LTR.fasta")) |
193 | 236 |
194 export(gff_out, paste0(opt$output, "_clean.gff3"), format = "gff3") | 237 export(gff_out, paste0(opt$output, "_clean.gff3"), format = "gff3") |
195 | 238 |
196 lv <- sort(unique(gff_te$Final_Classification)) | 239 lv <- sort(unique(gff_te_with_ltr$Final_Classification)) |
197 te_count <- table(factor(gff_te$Final_Classification, levels=lv)) | 240 te_count <- table(factor(gff_te_with_ltr$Final_Classification, levels=lv)) |
198 | 241 |
199 pdf(paste0(opt$output, "_summary.pdf"), width = 13, height=8, pointsize = 10) | 242 pdf(paste0(opt$output, "_summary.pdf"), width = 13, height=8, pointsize = 10) |
200 par(mfrow=c(1,2), mar=c(5,7,2,0), las=1) | 243 par(mfrow=c(1,2), mar=c(5,7,2,0), las=1) |
201 boxplot(width(gff_te) ~ factor(gff_te$Final_Classification, levels=lv), | 244 boxplot(width(gff_te_with_ltr) ~ factor(gff_te_with_ltr$Final_Classification, levels=lv), |
202 horizontal = TRUE, xlab="length[bp]", ylab="", | 245 horizontal = TRUE, xlab="length[bp]", ylab="", |
203 names = paste0(gsub("^.+[|]", "", lv), " (", te_count, ")"), | 246 names = paste0(gsub("^.+[|]", "", lv), " (", te_count, ")"), |
204 main = 'Full TE', at = seq_along(lv)*4 | 247 main = 'Full TE', at = seq_along(lv)*4 |
205 ) | 248 ) |
206 boxplot(width(gff_te[inc]) ~ factor(gff_te$Final_Classification[inc], levels=lv), | 249 boxplot(width(gff_te_with_ltr[inc]) ~ factor(gff_te_with_ltr$Final_Classification[inc], levels=lv), |
207 horizontal = TRUE, xlab="length[bp]", ylab="", | 250 horizontal = TRUE, xlab="length[bp]", ylab="", |
208 names = rep("", length(lv)), | 251 names = rep("", length(lv)), |
209 main = 'Full TE', at = seq_along(lv)*4-1, add=TRUE, col=2 | 252 main = 'Full TE', at = seq_along(lv)*4-1, add=TRUE, col=2 |
210 ) | 253 ) |
211 par(mar=c(5,0,2,7)) | 254 par(mar=c(5,0,2,7)) |