diff extract_putative_ltr.R @ 8:9de392f2fc02 draft

"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
author petr-novak
date Tue, 28 Jun 2022 12:33:22 +0000
parents c33d6583e548
children 1aa578e6c8b3
line wrap: on
line diff
--- a/extract_putative_ltr.R	Fri Jun 24 14:19:48 2022 +0000
+++ b/extract_putative_ltr.R	Tue Jun 28 12:33:22 2022 +0000
@@ -17,9 +17,11 @@
               help = "Number of cpu to use [default %default]", metavar = "number"),
   make_option(c("-M", "--max_missing_domains"), type = "integer", default = 0,
               help = "Maximum number of missing domains is retrotransposon [default %default]",
+              metavar = "number"),
+  make_option(c("-L", "--min_relative_length"), type = "numeric", default = 0.6,
+              help = "Minimum relative length of protein domain to be considered for retrostransposon detection [default %default]",
               metavar = "number")
 
-
 )
 description <- paste(strwrap(""))
 
@@ -71,6 +73,7 @@
   s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa")
   lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE)
 
+  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/sample_DANTE_unfiltered.gff3")
   g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/DANTE_filtered_part.gff3")
   s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/Rbp_part.fa")
 
@@ -122,6 +125,8 @@
   paste(strsplit(d, " ")[[1]], ":", l, sep = "")
 }, d = lineage_domain, l = names(lineage_domain)))
 
+# filter g gff3
+g <- dante_filtering(g, Relative_Length = opt$min_relative_length) # default
 
 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))]
 g <- add_coordinates_of_closest_neighbor(g)
@@ -156,7 +161,7 @@
 )
 g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster)
 g$Parent <- paste0("TE_partial_", g$Cluster)
-g$Rank="D"
+g$Rank <- "D"
 
 # keep only partial TE with more than one domain
 TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1]