diff summarize_gff.R @ 10:d0431a839606 draft

Uploaded
author petr-novak
date Wed, 14 Aug 2019 11:24:15 -0400
parents
children ddc6bab20889
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/summarize_gff.R	Wed Aug 14 11:24:15 2019 -0400
@@ -0,0 +1,37 @@
+## summarize hits
+## Command line arguments (in order):
+##   1      input dante gff3
+##   2      output table
+##   3...   names of the columns the hits are counted by
+args = commandArgs(TRUE)
+if (length(args) < 3) {
+  stop("expected arguments: <input gff3> <output table> <column> [<column> ...]",
+       call. = FALSE)
+}
+filepath = args[1]  ## input dante gff3
+output = args[2] ## output table
+## must be called summarize_by - that is the name the summarizing code below
+## looks up (the previous name, summarized_by, was never read anywhere)
+summarize_by = args[-(1:2)]
+
+## Read a GFF3 file produced by DANTE into a data.frame.
+##
+## filepath: path to the GFF3 file
+## returns:  data.frame with the eight standard GFF columns (seqid, source,
+##           type, start, end, score, strand, phase) followed by one column
+##           per attribute; Identity, Similarity, Relat_Length,
+##           Relat_Interruptions and Hit_to_DB_Length are converted to numeric
+## stops with an error if the file has fewer than nine columns or if any
+## record carries fewer attributes than expected
+readGFF3fromDante = function(filepath){
+  attr_names = c("Name", "Final_Classification",
+                 "Region_Hits_Classifications", "Best_Hit",
+                 "Best_Hit_DB_Pos", "DB_Seq", "Query_Seq", "Identity",
+                 "Similarity", "Relat_Length", "Relat_Interruptions",
+                 "Hit_to_DB_Length")
+  numeric_attrs = c("Identity", "Similarity", "Relat_Length",
+                    "Relat_Interruptions", "Hit_to_DB_Length")
+  ## GFF3 is strictly tab-delimited and attribute values may contain spaces
+  ## or apostrophes, so split on tabs only and switch quote handling off;
+  ## lines starting with '#' are still skipped as comments
+  dfraw = read.table(filepath, as.is = TRUE, sep = "\t", quote = "")
+  if (ncol(dfraw) < 9) {
+    stop("expected 9 tab-separated columns in ", filepath, ", found ",
+         ncol(dfraw), call. = FALSE)
+  }
+  gff_df = dfraw[, 1:8]
+  colnames(gff_df) = c("seqid", "source", "type", "start", "end", "score",
+                       "strand", "phase")
+  ## assume same order, same attributes names:
+  ## "key=value;key=value" splits into key, value, key, value, ... and only
+  ## the values (every second element) are kept
+  attr_values = lapply(
+    strsplit(as.character(dfraw[, 9]), split = "=|;"),
+    function(x) x[c(FALSE, TRUE)]
+  )
+  ## rbind() would silently recycle short records into the wrong columns
+  too_short = which(lengths(attr_values) < length(attr_names))
+  if (length(too_short) > 0) {
+    stop("record ", too_short[1], " of ", filepath, " has fewer than ",
+         length(attr_names), " attributes", call. = FALSE)
+  }
+  gffattr = do.call(rbind, attr_values)
+  for (i in seq_along(attr_names)) {
+    gff_df[[attr_names[i]]] = gffattr[, i]
+  }
+  gff_df[numeric_attrs] = lapply(gff_df[numeric_attrs], as.numeric)
+  return(gff_df)
+}
+
+gff = readGFF3fromDante(filepath)
+# summarize_by = c("Final_Classification", "Name", "seqid")
+
+## count the hits for every combination of values in the summarize_by columns;
+## drop = FALSE keeps a data.frame even for a single column, so the column
+## keeps its real name in the table instead of becoming "Var1"
+tbl = data.frame(table(gff[, summarize_by, drop = FALSE]))
+## the table goes to the output path - writing to filepath would overwrite
+## the input gff3
+write.table(tbl, file = output, row.names = FALSE, quote = FALSE)