Mercurial > repos > petr-novak > dante_ltr
comparison extract_putative_ltr.R @ 2:f131886ea194 draft
"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
author | petr-novak |
---|---|
date | Tue, 12 Apr 2022 12:55:32 +0000 |
parents | 7b0bbe7477c4 |
children | c33d6583e548 |
comparison
equal
deleted
inserted
replaced
1:c1498f679b50 | 2:f131886ea194 |
---|---|
70 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data | 70 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data |
71 /DANTE_Vfaba_chr5.gff3") | 71 /DANTE_Vfaba_chr5.gff3") |
72 s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910 | 72 s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910 |
73 /fasta_parts/211010_Vfaba_chr5.fasta") | 73 /fasta_parts/211010_Vfaba_chr5.fasta") |
74 | 74 |
75 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data//Cocoa_theobroma_DANTE_filtered.gff3") | |
76 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/Cocoa_theobroma_chr1.fasta.gz") | |
77 | |
78 source("R/ltr_utils.R") | |
79 | |
75 g <- rtracklayer::import("./test_data/sample_DANTE.gff3") | 80 g <- rtracklayer::import("./test_data/sample_DANTE.gff3") |
76 s <- readDNAStringSet("./test_data/sample_genome.fasta") | 81 s <- readDNAStringSet("./test_data/sample_genome.fasta") |
77 outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3" | 82 outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3" |
78 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header = | 83 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header = |
79 TRUE, as.is = TRUE) | 84 TRUE, as.is = TRUE) |
157 gr[x], | 162 gr[x], |
158 grL[x], | 163 grL[x], |
159 grR[x]), | 164 grR[x]), |
160 mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE | 165 mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE |
161 ) | 166 ) |
167 | |
162 cat('done.\n') | 168 cat('done.\n') |
163 | 169 |
164 good_TE <- TE[!sapply(TE, is.null)] | 170 good_TE <- TE[!sapply(TE, is.null)] |
165 cat('Number of putative TE with identified LTR :', length(good_TE), '\n') | 171 cat('Number of putative TE with identified LTR :', length(good_TE), '\n') |
166 | 172 |
175 | 181 |
176 # define new source | 182 # define new source |
177 src <- as.character(gff3_out$source) | 183 src <- as.character(gff3_out$source) |
178 src[is.na(src)] <- "dante_ltr" | 184 src[is.na(src)] <- "dante_ltr" |
179 gff3_out$source <- src | 185 gff3_out$source <- src |
180 | 186 gff3_out$Rank <- get_te_rank(gff3_out) |
181 # TODO export all files to single directory | 187 # TODO add attribute specifying individual groups DL, DLT, DLP DLPT gff3 |
182 # TODO export individual groups DL, DLT, DLP DLPT gff3 | |
183 | 188 |
184 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3') | 189 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3') |
185 # summary statistics | 190 |
186 all_tbl <- get_te_statistics(gff3_out, RT) | 191 all_tbl <- get_te_statistics(gff3_out, RT) |
187 write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE) | 192 write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = TRUE) |
188 # export fasta files: | 193 # export fasta files: |
189 s_te <- get_te_sequences(gff3_out, s) | 194 s_te <- get_te_sequences(gff3_out, s) |
190 | 195 |
191 for (i in seq_along(s_te)) { | 196 for (i in seq_along(s_te)) { |
192 outname <- paste0(outfile, "_", names(s_te)[i], ".fasta") | 197 outname <- paste0(outfile, "_", names(s_te)[i], ".fasta") |