comparison extract_putative_ltr.R @ 2:f131886ea194 draft

"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
author petr-novak
date Tue, 12 Apr 2022 12:55:32 +0000
parents 7b0bbe7477c4
children c33d6583e548
comparison
equal deleted inserted replaced
1:c1498f679b50 2:f131886ea194
70 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data 70 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data
71 /DANTE_Vfaba_chr5.gff3") 71 /DANTE_Vfaba_chr5.gff3")
72 s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910 72 s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910
73 /fasta_parts/211010_Vfaba_chr5.fasta") 73 /fasta_parts/211010_Vfaba_chr5.fasta")
74 74
75 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data//Cocoa_theobroma_DANTE_filtered.gff3")
76 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/Cocoa_theobroma_chr1.fasta.gz")
77
78 source("R/ltr_utils.R")
79
75 g <- rtracklayer::import("./test_data/sample_DANTE.gff3") 80 g <- rtracklayer::import("./test_data/sample_DANTE.gff3")
76 s <- readDNAStringSet("./test_data/sample_genome.fasta") 81 s <- readDNAStringSet("./test_data/sample_genome.fasta")
77 outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3" 82 outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3"
78 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header = 83 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header =
79 TRUE, as.is = TRUE) 84 TRUE, as.is = TRUE)
157 gr[x], 162 gr[x],
158 grL[x], 163 grL[x],
159 grR[x]), 164 grR[x]),
160 mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE 165 mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE
161 ) 166 )
167
162 cat('done.\n') 168 cat('done.\n')
163 169
164 good_TE <- TE[!sapply(TE, is.null)] 170 good_TE <- TE[!sapply(TE, is.null)]
165 cat('Number of putative TE with identified LTR :', length(good_TE), '\n') 171 cat('Number of putative TE with identified LTR :', length(good_TE), '\n')
166 172
175 181
176 # define new source 182 # define new source
177 src <- as.character(gff3_out$source) 183 src <- as.character(gff3_out$source)
178 src[is.na(src)] <- "dante_ltr" 184 src[is.na(src)] <- "dante_ltr"
179 gff3_out$source <- src 185 gff3_out$source <- src
180 186 gff3_out$Rank <- get_te_rank(gff3_out)
181 # TODO export all files to single directory 187 # TODO add attribute specifying individual groups DL, DLT, DLP, DLPT gff3
182 # TODO export individual groups DL, DLT, DLP, DLPT gff3
183 188
184 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3') 189 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3')
185 # summary statistics 190
186 all_tbl <- get_te_statistics(gff3_out, RT) 191 all_tbl <- get_te_statistics(gff3_out, RT)
187 write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE) 192 write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = TRUE)
188 # export fasta files: 193 # export fasta files:
189 s_te <- get_te_sequences(gff3_out, s) 194 s_te <- get_te_sequences(gff3_out, s)
190 195
191 for (i in seq_along(s_te)) { 196 for (i in seq_along(s_te)) {
192 outname <- paste0(outfile, "_", names(s_te)[i], ".fasta") 197 outname <- paste0(outfile, "_", names(s_te)[i], ".fasta")