10
|
## summarize hits
##
## Command-line usage (positional arguments after the script name):
##   1: input DANTE GFF3 file
##   2: output table
##   3+: names of the columns to summarize (count) by

# Parse the trailing arguments once; spell out TRUE because `T` is an
# ordinary variable that can be reassigned.
cli_args <- commandArgs(trailingOnly = TRUE)

filepath <- cli_args[1] ## input dante gff3
output <- cli_args[2] ## output table
# Everything after the two paths is treated as a grouping column name.
summarized_by <- cli_args[-(1:2)]
|
|
5
|
|
# Read a DANTE GFF3 file into a data frame.
#
# The first eight columns are returned under the standard GFF names
# (seqid, source, type, start, end, score, strand, phase). Column 9 is
# split on "=" and ";" and the attribute VALUES are taken by position, so
# every record must list the attributes in the same order:
#   Name, Final_Classification, Region_Hits_Classifications, Best_Hit,
#   Best_Hit_DB_Pos, DB_Seq, Query_Seq, Identity, Similarity,
#   Relat_Length, Relat_Interruptions, Hit_to_DB_Length
# The last five are converted to numeric.
#
# Args:
#   filepath: path to the GFF3 file, read with read.table().
# Returns:
#   A data frame with the 8 GFF columns plus the 12 attribute columns.
# Raises:
#   An error if any record has fewer than 12 attribute values. Without this
#   check rbind() would recycle short records and silently misalign columns.
readGFF3fromDante <- function(filepath) {
  raw_tbl <- read.table(filepath, as.is = TRUE)

  gff_df <- raw_tbl[, 1:8]
  colnames(gff_df) <- c(
    "seqid", "source", "type", "start", "end", "score", "strand", "phase"
  )

  attr_names <- c(
    "Name", "Final_Classification", "Region_Hits_Classifications",
    "Best_Hit", "Best_Hit_DB_Pos", "DB_Seq", "Query_Seq",
    "Identity", "Similarity", "Relat_Length", "Relat_Interruptions",
    "Hit_to_DB_Length"
  )
  numeric_attrs <- c(
    "Identity", "Similarity", "Relat_Length", "Relat_Interruptions",
    "Hit_to_DB_Length"
  )

  # Splitting "k1=v1;k2=v2" on "=" or ";" yields k1, v1, k2, v2, ...;
  # the values are the even-positioned tokens.
  tokens <- strsplit(raw_tbl[, 9], split = "=|;")
  values <- lapply(tokens, function(x) x[c(FALSE, TRUE)])

  too_short <- lengths(values) < length(attr_names)
  if (any(too_short)) {
    stop(
      "GFF3 attribute column has fewer than ", length(attr_names),
      " values in record(s): ",
      paste(which(too_short), collapse = ", "),
      call. = FALSE
    )
  }

  # Keep only the expected leading attributes so the matrix is rectangular
  # even if some records carry extra trailing attributes.
  attr_mat <- do.call(rbind, lapply(values, `[`, seq_along(attr_names)))

  for (j in seq_along(attr_names)) {
    nm <- attr_names[j]
    column <- attr_mat[, j]
    gff_df[[nm]] <- if (nm %in% numeric_attrs) as.numeric(column) else column
  }

  gff_df
}
|
|
32
|
|
gff <- readGFF3fromDante(filepath)
# Example grouping:
# summarized_by <- c("Final_Classification", "Name", "seqid")

# Fail with a readable message when no grouping column, or an unknown one,
# was given on the command line.
unknown_cols <- setdiff(summarized_by, colnames(gff))
if (length(summarized_by) == 0 || length(unknown_cols) > 0) {
  stop(
    "summarize-by columns must be one or more of: ",
    paste(colnames(gff), collapse = ", "),
    call. = FALSE
  )
}

# Count hits for every combination of the requested columns.
# drop = FALSE keeps a one-column selection as a data frame, so the output
# header carries the column name instead of "Var1".
tbl <- data.frame(table(gff[, summarized_by, drop = FALSE]))

# Write to the output path; writing to `filepath` would overwrite the
# input GFF3.
write.table(tbl, file = output, row.names = FALSE, quote = FALSE)
|