comparison summarize_gff.R @ 12:ddc6bab20889 draft

Uploaded
author petr-novak
date Thu, 15 Aug 2019 07:20:12 -0400
parents d0431a839606
children ab56b34d67d8
comparison
equal deleted inserted replaced
11:1d5883a9ec3a 12:ddc6bab20889
1 ## summarize hits 1 ## summarize hits
2 output = commandArgs(T)[2] ## output table 2 output = commandArgs(T)[2] ## output table
3 filepath = commandArgs(T)[1] ## input dante gff3 3 filepath = commandArgs(T)[1] ## input dante gff3
4 summarized_by = commandArgs(T)[-(1:2)] 4 if (length(commandArgs(T))==2){
5 summarized_by = NA
6 }else{
7 summarized_by = strsplit(commandArgs(T)[-(1:2)], split = ",")[[1]]
8 }
5 9
6 readGFF3fromDante = function(filepath){ 10 readGFF3fromDante = function(filepath){
7 dfraw=read.table(filepath, as.is = TRUE) 11 dfraw=read.table(filepath, as.is = TRUE)
8 gff_df = dfraw[,1:8] 12 gff_df = dfraw[,1:8]
9 colnames(gff_df) = c("seqid", "source", "type", "start", "end", "score", 13 colnames(gff_df) = c("seqid", "source", "type", "start", "end", "score",
10 "strand", "phase") 14 "strand", "phase")
11 ## assume same order, same attributes names 15 ## assume same order, same attributes names
12 gffattr = do.call(rbind, 16 ## TODO make ti more robust - order can change!
13 lapply( 17 gffattr_list = lapply(
14 strsplit(dfraw[,9],split=c("=|;")), 18 strsplit(dfraw[,9],split=c("=|;")),
15 function(x)x[c(FALSE,TRUE)] 19 function(x)x[c(FALSE,TRUE)]
16 ) 20 )
17 ) 21 ## some rows are not complete - in case of ambiguous domains
18 gff_df$Name = gffattr[,1] 22 L = sapply(gffattr_list, length)
19 gff_df$Final_Classification = gffattr[,2] 23 short = L < max(L)
20 gff_df$Region_Hits_Classifications = gffattr[,3] 24 if (any(short)){
21 gff_df$Best_Hit = gffattr[,4] 25 gffattr_list[short] = lapply(gffattr_list[short],function(x) c(x, rep(NA, 13 - length(x))))
22 gff_df$Best_Hit_DB_Pos = gffattr[,5] 26 }
23 gff_df$DB_Seq = gffattr[,6] 27 gffattr = as.data.frame(do.call(rbind, gffattr_list))
24 gff_df$Query_Seq = gffattr[,7] 28
25 gff_df$Identity = as.numeric(gffattr[,8]) 29 ## get attributes names
26 gff_df$Similarity = as.numeric(gffattr[,9]) 30 attrnames = strsplit(dfraw[1,9],split=c("=|;"))[[1]][c(TRUE,FALSE)]
27 gff_df$Relat_Length = as.numeric(gffattr[,10]) 31 colnames(gffattr) = attrnames
28 gff_df$Relat_Interruptions = as.numeric(gffattr[,11]) 32
29 gff_df$Hit_to_DB_Length = as.numeric(gffattr[,12]) 33 gff_df$Final_Classification = gffattr$Final_Classification
34 gff_df$Name = gffattr$Name
35 gff_df$Region_Hits_Classifications = gffattr$Region_Hits_Classifications
36 gff_df$Best_Hit = gffattr$Best_Hit
37 gff_df$Best_Hit_DB_Pos = gffattr$Best_Hir_DB_Pos
38 gff_df$DB_Seq = gffattr$DB_Seq
39 gff_df$Query_Seq = gffattr$Query_Seq
40 gff_df$Region_Seq = gffattr$Region_Seq
41 gff_df$Identity = as.numeric(gffattr$Identity)
42 gff_df$Similarity = as.numeric(gffattr$Similarity)
43 gff_df$Relat_Length = as.numeric(gffattr$Relat_Length)
44 gff_df$Relat_Interruptions = as.numeric(gffattr$Relat_Interruptions)
45 gff_df$Hit_to_DB_Length = as.numeric(gffattr$Hit_to_DB_Length)
30 return(gff_df) 46 return(gff_df)
31 } 47 }
32 48
33 gff = readGFF3fromDante(filepath) 49 gff = readGFF3fromDante(filepath)
34 # summarize_by = c("Final_Classification", "Name", "seqid") 50 # summarized_by = c("Final_Classification", "Name", "seqid")
51 # summarized_by = c("Final_Classification")
35 52
36 tbl = data.frame(table(gff[,summarize_by])) 53
37 write.table(tbl, file = filepath, row.names =FALSE, quote = FALSE) 54 if (is.na(summarized_by)){
55 ## export complete table
56 write.table(gff, file = output, row.names = FALSE, quote = FALSE, sep = "\t")
57 }else{
58 ## export summary
59 tbl = data.frame(table(gff[, summarized_by]))
60 write.table(tbl, file = output, row.names = FALSE, quote = FALSE, sep = "\t")
61 }