Mercurial > repos > petr-novak > dante
diff summarize_gff.R @ 10:d0431a839606 draft
Uploaded
author | petr-novak |
---|---|
date | Wed, 14 Aug 2019 11:24:15 -0400 |
parents | |
children | ddc6bab20889 |
line wrap: on
line diff
## summarize_gff.R -- summarize hits
##
## Counts the records of a DANTE GFF3 file for every combination of the
## requested columns and writes the counts as a table.
##
## Usage:
##   Rscript summarize_gff.R <input.gff3> <output> <column> [<column> ...]
##
##   <input.gff3>  input dante gff3
##   <output>      output table
##   <column>      one or more columns of the parsed table to group by,
##                 e.g. Final_Classification Name seqid

## Read a DANTE GFF3 file into a data frame.
##
## The first eight GFF3 columns are kept under their standard names. The
## attribute column (9th) is split on "=" and ";" and the attribute VALUES are
## assigned by position, i.e. every record is assumed to list the same
## attributes in the same order.
##
## filepath: path to the GFF3 file.
## Returns a data frame with the eight GFF3 columns followed by one column per
## attribute; the last five attributes are converted to numeric.
## Stops with an error when the file has fewer than nine columns or a record
## carries fewer attribute values than expected.
readGFF3fromDante <- function(filepath) {
  gff_names <- c("seqid", "source", "type", "start", "end", "score",
                 "strand", "phase")
  attr_names <- c("Name", "Final_Classification",
                  "Region_Hits_Classifications", "Best_Hit",
                  "Best_Hit_DB_Pos", "DB_Seq", "Query_Seq", "Identity",
                  "Similarity", "Relat_Length", "Relat_Interruptions",
                  "Hit_to_DB_Length")
  numeric_attrs <- c("Identity", "Similarity", "Relat_Length",
                     "Relat_Interruptions", "Hit_to_DB_Length")

  ## GFF3 is tab-delimited; disable quote handling so that spaces or
  ## apostrophes inside a field cannot shift or merge columns.
  dfraw <- read.table(filepath, as.is = TRUE, sep = "\t", quote = "",
                      comment.char = "#")
  if (ncol(dfraw) < 9L) {
    stop("expected at least 9 tab-separated columns in '", filepath,
         "', found ", ncol(dfraw), call. = FALSE)
  }

  gff_df <- dfraw[, seq_along(gff_names)]
  colnames(gff_df) <- gff_names

  ## After splitting "key=value;key=value" on "=" or ";", the values sit at
  ## the even positions.
  values <- lapply(
    strsplit(as.character(dfraw[[9]]), split = "=|;"),
    function(x) x[c(FALSE, TRUE)]
  )
  too_short <- which(lengths(values) < length(attr_names))
  if (length(too_short) > 0L) {
    ## Without this check rbind() would silently recycle short records.
    stop("record ", too_short[1], " of '", filepath, "' has fewer than ",
         length(attr_names), " attributes", call. = FALSE)
  }
  gffattr <- do.call(rbind, lapply(values, `[`, seq_along(attr_names)))

  for (i in seq_along(attr_names)) {
    gff_df[[attr_names[i]]] <- gffattr[, i]
  }
  gff_df[numeric_attrs] <- lapply(gff_df[numeric_attrs], as.numeric)
  gff_df
}

## Count records for every combination of the given columns.
##
## gff:          data frame as returned by readGFF3fromDante().
## summarize_by: character vector of column names of `gff`.
## Returns a data frame with one column per grouping column plus `Freq`.
summarize_hits <- function(gff, summarize_by) {
  if (length(summarize_by) == 0L) {
    stop("at least one column to summarize by is required", call. = FALSE)
  }
  unknown <- setdiff(summarize_by, colnames(gff))
  if (length(unknown) > 0L) {
    stop("unknown column(s): ", paste(unknown, collapse = ", "),
         "; available: ", paste(colnames(gff), collapse = ", "),
         call. = FALSE)
  }
  ## drop = FALSE keeps the column name in the result when grouping by a
  ## single column (otherwise the header would read "Var1").
  as.data.frame(table(gff[, summarize_by, drop = FALSE]))
}

## Command-line entry point.
##
## args: character vector: input path, output path, then grouping columns.
## Writes the summary table to the output path and returns it invisibly.
main <- function(args = commandArgs(trailingOnly = TRUE)) {
  if (length(args) < 3L) {
    stop("Usage: Rscript summarize_gff.R <input.gff3> <output> ",
         "<column> [<column> ...]", call. = FALSE)
  }
  filepath <- args[1]       ## input dante gff3
  output <- args[2]         ## output table
  summarized_by <- args[-(1:2)]

  gff <- readGFF3fromDante(filepath)
  tbl <- summarize_hits(gff, summarized_by)
  ## Write to the OUTPUT path; the input file must never be overwritten.
  write.table(tbl, file = output, row.names = FALSE, quote = FALSE)
  invisible(tbl)
}

## Run only when executed as a script (e.g. via Rscript), not when source()d.
if (sys.nframe() == 0L) {
  main()
}