changeset 12:ddc6bab20889 draft

Uploaded
author petr-novak
date Thu, 15 Aug 2019 07:20:12 -0400
parents 1d5883a9ec3a
children ab56b34d67d8
files dante_gff_to_tabular.xml fasta2database.R summarize_gff.R summarize_gff.xml
diffstat 4 files changed, 83 insertions(+), 28 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dante_gff_to_tabular.xml	Thu Aug 15 07:20:12 2019 -0400
@@ -0,0 +1,17 @@
+<tool id="gff_to_tabular" name="Convert dante gff3 to tab delimited file" version="0.1.0" python_template_version="3.5">
+    <requirements>
+        <requirement type="package">R</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        Rscript ${__tool_directory__}/summarize_gff.R '$inputgff' '$output'
+    ]]></command>
+    <inputs>
+      <param type="data" name="inputgff" format="gff3" />
+    </inputs>
+    <outputs>
+        <data name="output" format="tabular" />
+    </outputs>
+    <help><![CDATA[
+        TODO: Fill in help.
+    ]]></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta2database.R	Thu Aug 15 07:20:12 2019 -0400
@@ -0,0 +1,14 @@
+library(Biostrings)
+input_fasta = commandArgs(T)[1]
+## for testing input_fasta="/mnt/raid/454_data/RE2_benchmark/REPET_annotation/Prunus_persica/DANTE_proteins_filtered.fasta"
+s = readAAStringSet(input_fasta)
+names_table = do.call("rbind", strsplit(names(s)," "))
+head(names_table)
+classification_table = paste(names_table[,1], gsub("|","\t",names_table[,3], fixed = TRUE), sep="\t")
+cat(unique(classification_table), sep="\n", file = paste(input_fasta, ".classification", sep = ""))
+
+new_fasta_names = paste("NA-", names_table[,2], "__", names_table[,1], sep="")
+
+names(s) = new_fasta_names
+
+writeXStringSet(s, filepath = paste(input_fasta, ".db",sep=''))
--- a/summarize_gff.R	Wed Aug 14 11:24:42 2019 -0400
+++ b/summarize_gff.R	Thu Aug 15 07:20:12 2019 -0400
@@ -1,7 +1,11 @@
 ## summarize hits
 output = commandArgs(T)[2] ## output table
 filepath = commandArgs(T)[1]  ## input dante gff3
-summarized_by = commandArgs(T)[-(1:2)]
+if (length(commandArgs(T))==2){
+  summarized_by = NA
+}else{
+  summarized_by = strsplit(commandArgs(T)[-(1:2)], split = ",")[[1]]
+}
 
 readGFF3fromDante = function(filepath){
   dfraw=read.table(filepath, as.is = TRUE)
@@ -9,29 +13,49 @@
   colnames(gff_df) = c("seqid", "source", "type", "start", "end", "score",
                     "strand", "phase")
   ## assume same order, same attributes names
-  gffattr = do.call(rbind,
-                    lapply(
-                      strsplit(dfraw[,9],split=c("=|;")),
-                      function(x)x[c(FALSE,TRUE)]
-                    )
-                    )
-  gff_df$Name = gffattr[,1]
-  gff_df$Final_Classification = gffattr[,2]
-  gff_df$Region_Hits_Classifications = gffattr[,3]
-  gff_df$Best_Hit = gffattr[,4]
-  gff_df$Best_Hit_DB_Pos = gffattr[,5]
-  gff_df$DB_Seq = gffattr[,6]
-  gff_df$Query_Seq = gffattr[,7]
-  gff_df$Identity = as.numeric(gffattr[,8])
-  gff_df$Similarity = as.numeric(gffattr[,9])
-  gff_df$Relat_Length = as.numeric(gffattr[,10])
-  gff_df$Relat_Interruptions = as.numeric(gffattr[,11])
-  gff_df$Hit_to_DB_Length = as.numeric(gffattr[,12])
+  ## TODO make ti more robust - order can change!
+  gffattr_list = lapply(
+    strsplit(dfraw[,9],split=c("=|;")),
+    function(x)x[c(FALSE,TRUE)]
+  )
+  ## some rows are not complete - in case of ambiguous domains
+  L = sapply(gffattr_list, length)
+  short = L  < max(L)
+  if (any(short)){
+    gffattr_list[short] = lapply(gffattr_list[short],function(x) c(x, rep(NA, 13 - length(x))))
+  }
+  gffattr = as.data.frame(do.call(rbind, gffattr_list))
+
+  ## get attributes names
+  attrnames =  strsplit(dfraw[1,9],split=c("=|;"))[[1]][c(TRUE,FALSE)]
+  colnames(gffattr) = attrnames
+
+  gff_df$Final_Classification = gffattr$Final_Classification
+  gff_df$Name = gffattr$Name
+  gff_df$Region_Hits_Classifications = gffattr$Region_Hits_Classifications
+  gff_df$Best_Hit = gffattr$Best_Hit
+  gff_df$Best_Hit_DB_Pos = gffattr$Best_Hir_DB_Pos
+  gff_df$DB_Seq = gffattr$DB_Seq
+  gff_df$Query_Seq = gffattr$Query_Seq
+  gff_df$Region_Seq = gffattr$Region_Seq
+  gff_df$Identity = as.numeric(gffattr$Identity)
+  gff_df$Similarity = as.numeric(gffattr$Similarity)
+  gff_df$Relat_Length = as.numeric(gffattr$Relat_Length)
+  gff_df$Relat_Interruptions = as.numeric(gffattr$Relat_Interruptions)
+  gff_df$Hit_to_DB_Length = as.numeric(gffattr$Hit_to_DB_Length)
   return(gff_df)
 }
 
 gff = readGFF3fromDante(filepath)
-# summarize_by = c("Final_Classification", "Name", "seqid")
+# summarized_by = c("Final_Classification", "Name", "seqid")
+# summarized_by = c("Final_Classification")
+
 
-tbl = data.frame(table(gff[,summarize_by]))
-write.table(tbl, file = filepath, row.names =FALSE, quote = FALSE)
+if (is.na(summarized_by)){
+  ## export complete table
+  write.table(gff, file = output, row.names = FALSE, quote = FALSE, sep = "\t")
+}else{
+  ## export summary
+  tbl = data.frame(table(gff[, summarized_by]))
+  write.table(tbl, file = output, row.names = FALSE, quote = FALSE, sep = "\t")
+}
--- a/summarize_gff.xml	Wed Aug 14 11:24:42 2019 -0400
+++ b/summarize_gff.xml	Thu Aug 15 07:20:12 2019 -0400
@@ -3,18 +3,18 @@
         <requirement type="package">R</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
-        summarize_gff.R '$inputgff' '$output'
+        Rscript ${__tool_directory__}/summarize_gff.R '$inputgff' '$output' '$group'
     ]]></command>
     <inputs>
       <param type="data" name="inputgff" format="gff3" />
-      <param name="group" type="select" label="select categories to summarize" multiple="true">
-            <option value="Name">Numerical sort</option>
-            <option value="Final_Classification">General numeric sort</option>
-            <option value="seqid">Alphabetical sort</option>
+      <param name="group" type="select" label="select categories to summarize" multiple="true" optional="false">
+            <option value="Name">protein domain name</option>
+            <option value="Final_Classification">Classification</option>
+            <option value="seqid">Sequence ID</option>
       </param>
     </inputs>
     <outputs>
-        <data name="output" format="csv" />
+        <data name="output" format="tabular" />
     </outputs>
     <help><![CDATA[
         TODO: Fill in help.