# HG changeset patch
# User petr-novak
# Date 1634024634 0
# Node ID e955b40ad3a4aeeadfce6862d38480e2786863a4
# Parent  3f8ae272f4f38bf7b70f363011310c4299b115df
Uploaded

diff -r 3f8ae272f4f3 -r e955b40ad3a4 README.org
--- a/README.org	Thu Oct 07 07:29:59 2021 +0000
+++ b/README.org	Tue Oct 12 07:43:54 2021 +0000
@@ -4,7 +4,7 @@
 **  Extract Repeat Library from RepeatExplorer Archive
 (=extract_re_contigs.xml=)
 
-This toll  will extract library of repeats  based on RepeatExplorer2 analysis. Library is available as fasta file.
+This tool will extract a library of repeats based on RepeatExplorer2 analysis. The library is available as a fasta file. The tool also filters out all contig parts which have read depth and length below the threshold. Parts of contigs with read depth below the threshold are hardmasked. Contigs that are fully hardmasked are removed completely.
 
 ** Format repeat library
 (=format_repeat_library.xml=)
@@ -18,8 +18,9 @@
 ** Repeat Annotation
 (=repeat_annotate_custom.xml=)
 
- Internally annotation is performed using RepeatMasker search. Output from RepeatMasker is parsed to remove duplicated and overlaping annotations, Conflicts in annotations are resolved using hierarchical classification of repeats provided in custom database
-
+ Internally annotation is performed using RepeatMasker search. Output from RepeatMasker is parsed to remove duplicated and overlapping annotations. Conflicts in annotations are resolved using the hierarchical classification of repeats provided in the custom database.
+** TODO Summarize Annotation
+This tool will create a summary table from a GFF annotation.
 * test data
 
 - ~test_assembly_1.fasta~ with ~test_db_1_satellites.fasta~ (include CLASS followed by double underscore - syntax 1)
diff -r 3f8ae272f4f3 -r e955b40ad3a4 clean_rm_output.R
--- a/clean_rm_output.R	Thu Oct 07 07:29:59 2021 +0000
+++ b/clean_rm_output.R	Tue Oct 12 07:43:54 2021 +0000
@@ -8,16 +8,43 @@
   gff_names = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)gff$Name[x], mc.cores = 8)
   gff_strands = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)strand(gff[x]), mc.cores = 8)
   new_annot = sapply(sapply(gff_names, unique), paste, collapse="|")
+  new_annot_uniq = unique(new_annot)
+  lca_annot = sapply(strsplit(new_annot_uniq, "|", fixed = TRUE), resolve_name)
+  names(lca_annot) = new_annot_uniq
+  new_annot_lca = lca_annot[new_annot] 
+  #new_annot_lca = sapply(sapply(gff_names, unique), resolve_name)
   strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|")
   gff_disjoin$strands=strand_attribute
   gff_disjoin$source="RM"
   gff_disjoin$type="repeat"
   gff_disjoin$score=NA
   gff_disjoin$phase=NA
-  gff_disjoin$Name=new_annot
+  gff_disjoin$Name=new_annot_lca
+  gff_disjoin$Original_names=new_annot
   gff_disjoin$revmap=NULL
   return(gff_disjoin)
 }
+
+## Resolve conflicting classifications to their lowest common ancestor (LCA).
+## x: character vector of "level1/level2/..." names. Returns the leading path
+## shared by all names, or "Unknown" if they differ already at the first level.
+resolve_name=function(x){
+  if (length(x)==1){
+    # no conflict
+    return(x)
+  }
+  y = strsplit(x, split="/", fixed = TRUE)
+  # compare only the levels every name has; no names means nothing is shared
+  n = if (length(y)>0) min(lengths(y)) else 0
+  same = vapply(seq_len(n), function(i)length(unique(vapply(y, "[", "", i)))==1, logical(1))
+  # a level counts only while all levels above it agree too (contiguous prefix)
+  r = sum(cumprod(same))
+  if (r==0){
+    return("Unknown")
+  }
+  paste(y[[1]][seq_len(r)], collapse="/")
+}
+
 infile = commandArgs(T)[1]
 outfile = commandArgs(T)[2]
 
@@ -44,12 +71,15 @@
 
 ## join neighbors with the same annotation, disregard strand!
 result <- unlist(reduce(split(gff, gff$Name)))
+
 result$Name <- names(result)
 
+result_clean = gff_cleanup(result)
+
 ## TODO
 ## identify conflicting annotation, replace by LCA but keep origin list of classifications
 
-gff_out = sortSeqlevels(result)
+gff_out = sortSeqlevels(result_clean)
 gff_out = sort(gff_out)
 gff_out$type = "repeat_region"
 gff_out$source = "RepeatMasker_parsed"
diff -r 3f8ae272f4f3 -r e955b40ad3a4 repeat_annotate_custom.xml
--- a/repeat_annotate_custom.xml	Thu Oct 07 07:29:59 2021 +0000
+++ b/repeat_annotate_custom.xml	Tue Oct 12 07:43:54 2021 +0000
@@ -1,4 +1,4 @@
-<tool id="repeat_annotate" name="RepeatExplorer Based Assembly Annotation" version="0.1.0" python_template_version="3.5">
+<tool id="repeat_annotate" name="RepeatExplorer Based Assembly Annotation" version="0.1.1" python_template_version="3.5">
     <requirements>
         <requirement type="package">repeatmasker</requirement>
         <requirement type="package">bioconductor-rtracklayer</requirement>
@@ -20,13 +20,13 @@
     </inputs>
     <outputs>
         <data name="output1" format="gff3"  label="Repeat Annotation on ${on_string}, cleaned gff"/>
-        <data name="output2" format="tabular" label="RepeatMasker on ${on_string}, original output" />
+        <data name="output2" format="tabular" label="Raw output from RepeatMasker on ${on_string}" />
     </outputs>
     <help><![CDATA[
         This tools uses RepeatMasker to annotate repetitive sequences in the genome assemblie using custom library of repeats created from RepeatExplorer output.
-        Library of repeats created from RepeatExplorer ouput are contigs and TAREAN consensus sequences in fasta format where sequence header containg information about classification of repeats as **>sequence_id#classification_level1/classification_level2/...**
+        Library of repeats can be created from RepeatExplorer output from contigs and TAREAN consensus sequences. Fasta formatted library of repeats must contain header containing information about classification of repeats as **>sequence_id#classification_level1/classification_level2/...**
 
-        Classification in RepeatExplorer based library follows predetermined classification levels. User can however specify additional classification levels or ciompletelly custom classifications. Conflicts in annotations are resolved based on classification hierarchy.
+        Classification in RepeatExplorer based library follows predetermined classification levels. User can however specify additional classification levels or completely custom classifications. Conflicts in annotations are resolved based on classification hierarchy.
     ]]></help>
 </tool>