Mercurial > repos > petr-novak > dante
diff summarize_gff.R @ 12:ddc6bab20889 draft
Uploaded
author | petr-novak |
---|---|
date | Thu, 15 Aug 2019 07:20:12 -0400 |
parents | d0431a839606 |
children | ab56b34d67d8 |
line wrap: on
line diff
--- a/summarize_gff.R Wed Aug 14 11:24:42 2019 -0400 +++ b/summarize_gff.R Thu Aug 15 07:20:12 2019 -0400 @@ -1,7 +1,11 @@ ## summarize hits output = commandArgs(T)[2] ## output table filepath = commandArgs(T)[1] ## input dante gff3 -summarized_by = commandArgs(T)[-(1:2)] +if (length(commandArgs(T))==2){ + summarized_by = NA +}else{ + summarized_by = strsplit(commandArgs(T)[-(1:2)], split = ",")[[1]] +} readGFF3fromDante = function(filepath){ dfraw=read.table(filepath, as.is = TRUE) @@ -9,29 +13,49 @@ colnames(gff_df) = c("seqid", "source", "type", "start", "end", "score", "strand", "phase") ## assume same order, same attributes names - gffattr = do.call(rbind, - lapply( - strsplit(dfraw[,9],split=c("=|;")), - function(x)x[c(FALSE,TRUE)] - ) - ) - gff_df$Name = gffattr[,1] - gff_df$Final_Classification = gffattr[,2] - gff_df$Region_Hits_Classifications = gffattr[,3] - gff_df$Best_Hit = gffattr[,4] - gff_df$Best_Hit_DB_Pos = gffattr[,5] - gff_df$DB_Seq = gffattr[,6] - gff_df$Query_Seq = gffattr[,7] - gff_df$Identity = as.numeric(gffattr[,8]) - gff_df$Similarity = as.numeric(gffattr[,9]) - gff_df$Relat_Length = as.numeric(gffattr[,10]) - gff_df$Relat_Interruptions = as.numeric(gffattr[,11]) - gff_df$Hit_to_DB_Length = as.numeric(gffattr[,12]) + ## TODO make ti more robust - order can change! + gffattr_list = lapply( + strsplit(dfraw[,9],split=c("=|;")), + function(x)x[c(FALSE,TRUE)] + ) + ## some rows are not complete - in case of ambiguous domains + L = sapply(gffattr_list, length) + short = L < max(L) + if (any(short)){ + gffattr_list[short] = lapply(gffattr_list[short],function(x) c(x, rep(NA, 13 - length(x)))) + } + gffattr = as.data.frame(do.call(rbind, gffattr_list)) + + ## get attributes names + attrnames = strsplit(dfraw[1,9],split=c("=|;"))[[1]][c(TRUE,FALSE)] + colnames(gffattr) = attrnames + + gff_df$Final_Classification = gffattr$Final_Classification + gff_df$Name = gffattr$Name + gff_df$Region_Hits_Classifications = gffattr$Region_Hits_Classifications + gff_df$Best_Hit = gffattr$Best_Hit + gff_df$Best_Hit_DB_Pos = gffattr$Best_Hir_DB_Pos + gff_df$DB_Seq = gffattr$DB_Seq + gff_df$Query_Seq = gffattr$Query_Seq + gff_df$Region_Seq = gffattr$Region_Seq + gff_df$Identity = as.numeric(gffattr$Identity) + gff_df$Similarity = as.numeric(gffattr$Similarity) + gff_df$Relat_Length = as.numeric(gffattr$Relat_Length) + gff_df$Relat_Interruptions = as.numeric(gffattr$Relat_Interruptions) + gff_df$Hit_to_DB_Length = as.numeric(gffattr$Hit_to_DB_Length) return(gff_df) } gff = readGFF3fromDante(filepath) -# summarize_by = c("Final_Classification", "Name", "seqid") +# summarized_by = c("Final_Classification", "Name", "seqid") +# summarized_by = c("Final_Classification") + -tbl = data.frame(table(gff[,summarize_by])) -write.table(tbl, file = filepath, row.names =FALSE, quote = FALSE) +if (is.na(summarized_by)){ + ## export complete table + write.table(gff, file = output, row.names = FALSE, quote = FALSE, sep = "\t") +}else{ + ## export summary + tbl = data.frame(table(gff[, summarized_by])) + write.table(tbl, file = output, row.names = FALSE, quote = FALSE, sep = "\t") +}