Mercurial > repos > petr-novak > dante
view summarize_gff.R @ 10:d0431a839606 draft
Uploaded
author | petr-novak |
---|---|
date | Wed, 14 Aug 2019 11:24:15 -0400 |
parents | |
children | ddc6bab20889 |
line wrap: on
line source
## Summarize hits from a DANTE GFF3 file.
##
## Usage:
##   Rscript summarize_gff.R <input_gff3> <output_table> <column> [<column> ...]
##
## Arguments (positional):
##   1    input DANTE GFF3 file
##   2    output table path
##   3+   names of the columns to cross-tabulate (any column of the data frame
##        returned by readGFF3fromDante(), e.g. Final_Classification, Name,
##        seqid)
##
## The output is the frequency table of the selected columns, written as a
## space-separated text file with a header and an added `Freq` column.

args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 3) {
  stop(
    "usage: Rscript summarize_gff.R <input_gff3> <output_table> ",
    "<column> [<column> ...]",
    call. = FALSE
  )
}
filepath <- args[1]            ## input dante gff3
output <- args[2]              ## output table
summarized_by <- args[-(1:2)]  ## columns to tabulate

## Read a DANTE GFF3 file into a data frame.
##
## The first eight columns get the standard GFF3 names. Column 9 (attributes)
## is split on "=" and ";" and only the values (every second token) are kept;
## the attribute keys are not inspected, so values are assigned to columns
## purely by position. The file must therefore list at least twelve
## attributes per record, in the order given by `text_fields` followed by
## `numeric_fields` below.
##
## filepath: path to the GFF3 file (read with read.table() defaults, so lines
##           starting with "#" are skipped).
## Returns:  data frame with one row per GFF3 record; the five
##           `numeric_fields` columns are converted with as.numeric(), the
##           other attribute columns stay character.
readGFF3fromDante <- function(filepath) {
  text_fields <- c(
    "Name", "Final_Classification", "Region_Hits_Classifications",
    "Best_Hit", "Best_Hit_DB_Pos", "DB_Seq", "Query_Seq"
  )
  numeric_fields <- c(
    "Identity", "Similarity", "Relat_Length", "Relat_Interruptions",
    "Hit_to_DB_Length"
  )
  n_fields <- length(text_fields) + length(numeric_fields)

  dfraw <- read.table(filepath, as.is = TRUE)
  if (ncol(dfraw) < 9) {
    stop("expected at least 9 columns in GFF3 file: ", filepath, call. = FALSE)
  }

  gff_df <- dfraw[, 1:8]
  colnames(gff_df) <- c(
    "seqid", "source", "type", "start", "end", "score", "strand", "phase"
  )

  ## Tokens alternate key, value, key, value, ...; keep the values only.
  attr_values <- lapply(
    strsplit(as.character(dfraw[, 9]), split = "=|;"),
    function(x) x[c(FALSE, TRUE)]
  )
  ## rbind() would silently recycle short rows and misalign every field, so
  ## refuse records that do not carry all expected attributes.
  n_values <- vapply(attr_values, length, integer(1))
  if (any(n_values < n_fields)) {
    stop(
      "GFF3 record(s) with fewer than ", n_fields, " attributes, first at ",
      "data row ", which(n_values < n_fields)[1],
      call. = FALSE
    )
  }
  gffattr <- do.call(
    rbind,
    lapply(attr_values, function(x) x[seq_len(n_fields)])
  )

  for (i in seq_along(text_fields)) {
    gff_df[[text_fields[i]]] <- gffattr[, i]
  }
  for (i in seq_along(numeric_fields)) {
    gff_df[[numeric_fields[i]]] <-
      as.numeric(gffattr[, length(text_fields) + i])
  }
  gff_df
}

gff <- readGFF3fromDante(filepath)

unknown_columns <- setdiff(summarized_by, colnames(gff))
if (length(unknown_columns) > 0) {
  stop(
    "unknown column(s): ", paste(unknown_columns, collapse = ", "),
    "; available: ", paste(colnames(gff), collapse = ", "),
    call. = FALSE
  )
}

## drop = FALSE keeps a data frame even for a single column, so the output
## header carries the real column name instead of a generic "Var1".
tbl <- data.frame(table(gff[, summarized_by, drop = FALSE]))

## Write to the requested output path, never back over the input file.
write.table(tbl, file = output, row.names = FALSE, quote = FALSE)