Mercurial > repos > petr-novak > dante
comparison summarize_gff.R @ 12:ddc6bab20889 draft
Uploaded
author | petr-novak |
---|---|
date | Thu, 15 Aug 2019 07:20:12 -0400 |
parents | d0431a839606 |
children | ab56b34d67d8 |
comparison
equal
deleted
inserted
replaced
11:1d5883a9ec3a | 12:ddc6bab20889 |
---|---|
1 ## summarize hits | 1 ## summarize hits |
2 output = commandArgs(T)[2] ## output table | 2 output = commandArgs(T)[2] ## output table |
3 filepath = commandArgs(T)[1] ## input dante gff3 | 3 filepath = commandArgs(T)[1] ## input dante gff3 |
4 summarized_by = commandArgs(T)[-(1:2)] | 4 if (length(commandArgs(T))==2){ |
5 summarized_by = NA | |
6 }else{ | |
7 summarized_by = strsplit(commandArgs(T)[-(1:2)], split = ",")[[1]] | |
8 } | |
5 | 9 |
6 readGFF3fromDante = function(filepath){ | 10 readGFF3fromDante = function(filepath){ |
7 dfraw=read.table(filepath, as.is = TRUE) | 11 dfraw=read.table(filepath, as.is = TRUE) |
8 gff_df = dfraw[,1:8] | 12 gff_df = dfraw[,1:8] |
9 colnames(gff_df) = c("seqid", "source", "type", "start", "end", "score", | 13 colnames(gff_df) = c("seqid", "source", "type", "start", "end", "score", |
10 "strand", "phase") | 14 "strand", "phase") |
11 ## assume same order, same attributes names | 15 ## assume same order, same attributes names |
12 gffattr = do.call(rbind, | 16 ## TODO make ti more robust - order can change! |
13 lapply( | 17 gffattr_list = lapply( |
14 strsplit(dfraw[,9],split=c("=|;")), | 18 strsplit(dfraw[,9],split=c("=|;")), |
15 function(x)x[c(FALSE,TRUE)] | 19 function(x)x[c(FALSE,TRUE)] |
16 ) | 20 ) |
17 ) | 21 ## some rows are not complete - in case of ambiguous domains |
18 gff_df$Name = gffattr[,1] | 22 L = sapply(gffattr_list, length) |
19 gff_df$Final_Classification = gffattr[,2] | 23 short = L < max(L) |
20 gff_df$Region_Hits_Classifications = gffattr[,3] | 24 if (any(short)){ |
21 gff_df$Best_Hit = gffattr[,4] | 25 gffattr_list[short] = lapply(gffattr_list[short],function(x) c(x, rep(NA, 13 - length(x)))) |
22 gff_df$Best_Hit_DB_Pos = gffattr[,5] | 26 } |
23 gff_df$DB_Seq = gffattr[,6] | 27 gffattr = as.data.frame(do.call(rbind, gffattr_list)) |
24 gff_df$Query_Seq = gffattr[,7] | 28 |
25 gff_df$Identity = as.numeric(gffattr[,8]) | 29 ## get attributes names |
26 gff_df$Similarity = as.numeric(gffattr[,9]) | 30 attrnames = strsplit(dfraw[1,9],split=c("=|;"))[[1]][c(TRUE,FALSE)] |
27 gff_df$Relat_Length = as.numeric(gffattr[,10]) | 31 colnames(gffattr) = attrnames |
28 gff_df$Relat_Interruptions = as.numeric(gffattr[,11]) | 32 |
29 gff_df$Hit_to_DB_Length = as.numeric(gffattr[,12]) | 33 gff_df$Final_Classification = gffattr$Final_Classification |
34 gff_df$Name = gffattr$Name | |
35 gff_df$Region_Hits_Classifications = gffattr$Region_Hits_Classifications | |
36 gff_df$Best_Hit = gffattr$Best_Hit | |
37 gff_df$Best_Hit_DB_Pos = gffattr$Best_Hir_DB_Pos | |
38 gff_df$DB_Seq = gffattr$DB_Seq | |
39 gff_df$Query_Seq = gffattr$Query_Seq | |
40 gff_df$Region_Seq = gffattr$Region_Seq | |
41 gff_df$Identity = as.numeric(gffattr$Identity) | |
42 gff_df$Similarity = as.numeric(gffattr$Similarity) | |
43 gff_df$Relat_Length = as.numeric(gffattr$Relat_Length) | |
44 gff_df$Relat_Interruptions = as.numeric(gffattr$Relat_Interruptions) | |
45 gff_df$Hit_to_DB_Length = as.numeric(gffattr$Hit_to_DB_Length) | |
30 return(gff_df) | 46 return(gff_df) |
31 } | 47 } |
32 | 48 |
33 gff = readGFF3fromDante(filepath) | 49 gff = readGFF3fromDante(filepath) |
34 # summarize_by = c("Final_Classification", "Name", "seqid") | 50 # summarized_by = c("Final_Classification", "Name", "seqid") |
51 # summarized_by = c("Final_Classification") | |
35 | 52 |
36 tbl = data.frame(table(gff[,summarize_by])) | 53 |
37 write.table(tbl, file = filepath, row.names =FALSE, quote = FALSE) | 54 if (is.na(summarized_by)){ |
55 ## export complete table | |
56 write.table(gff, file = output, row.names = FALSE, quote = FALSE, sep = "\t") | |
57 }else{ | |
58 ## export summary | |
59 tbl = data.frame(table(gff[, summarized_by])) | |
60 write.table(tbl, file = output, row.names = FALSE, quote = FALSE, sep = "\t") | |
61 } |