annotate summarize_gff.R @ 13:ab56b34d67d8 draft

Uploaded
author petr-novak
date Thu, 15 Aug 2019 07:48:49 -0400
parents ddc6bab20889
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10
d0431a839606 Uploaded
petr-novak
parents:
diff changeset
1 ## summarize hits
d0431a839606 Uploaded
petr-novak
parents:
diff changeset
2 output = commandArgs(T)[2] ## output table
d0431a839606 Uploaded
petr-novak
parents:
diff changeset
3 filepath = commandArgs(T)[1] ## input dante gff3
12
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
4 if (length(commandArgs(T))==2){
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
5 summarized_by = NA
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
6 }else{
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
7 summarized_by = strsplit(commandArgs(T)[-(1:2)], split = ",")[[1]]
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
8 }
10
d0431a839606 Uploaded
petr-novak
parents:
diff changeset
9
d0431a839606 Uploaded
petr-novak
parents:
diff changeset
10 readGFF3fromDante = function(filepath){
d0431a839606 Uploaded
petr-novak
parents:
diff changeset
11 dfraw=read.table(filepath, as.is = TRUE)
d0431a839606 Uploaded
petr-novak
parents:
diff changeset
12 gff_df = dfraw[,1:8]
d0431a839606 Uploaded
petr-novak
parents:
diff changeset
13 colnames(gff_df) = c("seqid", "source", "type", "start", "end", "score",
d0431a839606 Uploaded
petr-novak
parents:
diff changeset
14 "strand", "phase")
d0431a839606 Uploaded
petr-novak
parents:
diff changeset
15 ## assume same order, same attributes names
12
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
16 ## TODO make ti more robust - order can change!
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
17 gffattr_list = lapply(
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
18 strsplit(dfraw[,9],split=c("=|;")),
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
19 function(x)x[c(FALSE,TRUE)]
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
20 )
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
21 ## some rows are not complete - in case of ambiguous domains
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
22 L = sapply(gffattr_list, length)
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
23 short = L < max(L)
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
24 if (any(short)){
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
25 gffattr_list[short] = lapply(gffattr_list[short],function(x) c(x, rep(NA, 13 - length(x))))
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
26 }
13
ab56b34d67d8 Uploaded
petr-novak
parents: 12
diff changeset
27 gffattr = as.data.frame(do.call(rbind, gffattr_list), stringsAsFactors = FALSE)
12
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
28
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
29 ## get attributes names
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
30 attrnames = strsplit(dfraw[1,9],split=c("=|;"))[[1]][c(TRUE,FALSE)]
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
31 colnames(gffattr) = attrnames
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
32
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
33 gff_df$Final_Classification = gffattr$Final_Classification
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
34 gff_df$Name = gffattr$Name
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
35 gff_df$Region_Hits_Classifications = gffattr$Region_Hits_Classifications
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
36 gff_df$Best_Hit = gffattr$Best_Hit
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
37 gff_df$Best_Hit_DB_Pos = gffattr$Best_Hir_DB_Pos
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
38 gff_df$DB_Seq = gffattr$DB_Seq
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
39 gff_df$Query_Seq = gffattr$Query_Seq
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
40 gff_df$Region_Seq = gffattr$Region_Seq
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
41 gff_df$Identity = as.numeric(gffattr$Identity)
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
42 gff_df$Similarity = as.numeric(gffattr$Similarity)
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
43 gff_df$Relat_Length = as.numeric(gffattr$Relat_Length)
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
44 gff_df$Relat_Interruptions = as.numeric(gffattr$Relat_Interruptions)
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
45 gff_df$Hit_to_DB_Length = as.numeric(gffattr$Hit_to_DB_Length)
10
d0431a839606 Uploaded
petr-novak
parents:
diff changeset
46 return(gff_df)
d0431a839606 Uploaded
petr-novak
parents:
diff changeset
47 }
d0431a839606 Uploaded
petr-novak
parents:
diff changeset
48
d0431a839606 Uploaded
petr-novak
parents:
diff changeset
49 gff = readGFF3fromDante(filepath)
12
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
50 # summarized_by = c("Final_Classification", "Name", "seqid")
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
51 # summarized_by = c("Final_Classification")
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
52
10
d0431a839606 Uploaded
petr-novak
parents:
diff changeset
53
12
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
54 if (is.na(summarized_by)){
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
55 ## export complete table
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
56 write.table(gff, file = output, row.names = FALSE, quote = FALSE, sep = "\t")
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
57 }else{
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
58 ## export summary
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
59 tbl = data.frame(table(gff[, summarized_by]))
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
60 write.table(tbl, file = output, row.names = FALSE, quote = FALSE, sep = "\t")
ddc6bab20889 Uploaded
petr-novak
parents: 10
diff changeset
61 }