# HG changeset patch
# User petr-novak
# Date 1565868012 14400
# Node ID ddc6bab2088918ff9daca037d1c83f8587631eba
# Parent 1d5883a9ec3a562053a37fc4ac887ff5215de6fc
Uploaded
diff -r 1d5883a9ec3a -r ddc6bab20889 dante_gff_to_tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/dante_gff_to_tabular.xml Thu Aug 15 07:20:12 2019 -0400
@@ -0,0 +1,17 @@
+
+
+ R
+
+
+
+
+
+
+
+
+
+
diff -r 1d5883a9ec3a -r ddc6bab20889 fasta2database.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta2database.R Thu Aug 15 07:20:12 2019 -0400
@@ -0,0 +1,14 @@
+library(Biostrings)
+input_fasta = commandArgs(T)[1]
+## for testing input_fasta="/mnt/raid/454_data/RE2_benchmark/REPET_annotation/Prunus_persica/DANTE_proteins_filtered.fasta"
+s = readAAStringSet(input_fasta)
+names_table = do.call("rbind", strsplit(names(s)," "))
+head(names_table)
+classification_table = paste(names_table[,1], gsub("|","\t",names_table[,3], fixed = TRUE), sep="\t")
+cat(unique(classification_table), sep="\n", file = paste(input_fasta, ".classification", sep = ""))
+
+new_fasta_names = paste("NA-", names_table[,2], "__", names_table[,1], sep="")
+
+names(s) = new_fasta_names
+
+writeXStringSet(s, filepath = paste(input_fasta, ".db",sep=''))
diff -r 1d5883a9ec3a -r ddc6bab20889 summarize_gff.R
--- a/summarize_gff.R Wed Aug 14 11:24:42 2019 -0400
+++ b/summarize_gff.R Thu Aug 15 07:20:12 2019 -0400
@@ -1,7 +1,11 @@
## summarize hits
output = commandArgs(T)[2] ## output table
filepath = commandArgs(T)[1] ## input dante gff3
-summarized_by = commandArgs(T)[-(1:2)]
+if (length(commandArgs(T))==2){
+ summarized_by = NA
+}else{
+ summarized_by = strsplit(commandArgs(T)[-(1:2)], split = ",")[[1]]
+}
readGFF3fromDante = function(filepath){
dfraw=read.table(filepath, as.is = TRUE)
@@ -9,29 +13,49 @@
colnames(gff_df) = c("seqid", "source", "type", "start", "end", "score",
"strand", "phase")
## assume same order, same attributes names
- gffattr = do.call(rbind,
- lapply(
- strsplit(dfraw[,9],split=c("=|;")),
- function(x)x[c(FALSE,TRUE)]
- )
- )
- gff_df$Name = gffattr[,1]
- gff_df$Final_Classification = gffattr[,2]
- gff_df$Region_Hits_Classifications = gffattr[,3]
- gff_df$Best_Hit = gffattr[,4]
- gff_df$Best_Hit_DB_Pos = gffattr[,5]
- gff_df$DB_Seq = gffattr[,6]
- gff_df$Query_Seq = gffattr[,7]
- gff_df$Identity = as.numeric(gffattr[,8])
- gff_df$Similarity = as.numeric(gffattr[,9])
- gff_df$Relat_Length = as.numeric(gffattr[,10])
- gff_df$Relat_Interruptions = as.numeric(gffattr[,11])
- gff_df$Hit_to_DB_Length = as.numeric(gffattr[,12])
+ ## TODO make ti more robust - order can change!
+ gffattr_list = lapply(
+ strsplit(dfraw[,9],split=c("=|;")),
+ function(x)x[c(FALSE,TRUE)]
+ )
+ ## some rows are not complete - in case of ambiguous domains
+ L = sapply(gffattr_list, length)
+ short = L < max(L)
+ if (any(short)){
+ gffattr_list[short] = lapply(gffattr_list[short],function(x) c(x, rep(NA, 13 - length(x))))
+ }
+ gffattr = as.data.frame(do.call(rbind, gffattr_list))
+
+ ## get attributes names
+ attrnames = strsplit(dfraw[1,9],split=c("=|;"))[[1]][c(TRUE,FALSE)]
+ colnames(gffattr) = attrnames
+
+ gff_df$Final_Classification = gffattr$Final_Classification
+ gff_df$Name = gffattr$Name
+ gff_df$Region_Hits_Classifications = gffattr$Region_Hits_Classifications
+ gff_df$Best_Hit = gffattr$Best_Hit
+ gff_df$Best_Hit_DB_Pos = gffattr$Best_Hir_DB_Pos
+ gff_df$DB_Seq = gffattr$DB_Seq
+ gff_df$Query_Seq = gffattr$Query_Seq
+ gff_df$Region_Seq = gffattr$Region_Seq
+ gff_df$Identity = as.numeric(gffattr$Identity)
+ gff_df$Similarity = as.numeric(gffattr$Similarity)
+ gff_df$Relat_Length = as.numeric(gffattr$Relat_Length)
+ gff_df$Relat_Interruptions = as.numeric(gffattr$Relat_Interruptions)
+ gff_df$Hit_to_DB_Length = as.numeric(gffattr$Hit_to_DB_Length)
return(gff_df)
}
gff = readGFF3fromDante(filepath)
-# summarize_by = c("Final_Classification", "Name", "seqid")
+# summarized_by = c("Final_Classification", "Name", "seqid")
+# summarized_by = c("Final_Classification")
+
-tbl = data.frame(table(gff[,summarize_by]))
-write.table(tbl, file = filepath, row.names =FALSE, quote = FALSE)
+if (is.na(summarized_by)){
+ ## export complete table
+ write.table(gff, file = output, row.names = FALSE, quote = FALSE, sep = "\t")
+}else{
+ ## export summary
+ tbl = data.frame(table(gff[, summarized_by]))
+ write.table(tbl, file = output, row.names = FALSE, quote = FALSE, sep = "\t")
+}
diff -r 1d5883a9ec3a -r ddc6bab20889 summarize_gff.xml
--- a/summarize_gff.xml Wed Aug 14 11:24:42 2019 -0400
+++ b/summarize_gff.xml Thu Aug 15 07:20:12 2019 -0400
@@ -3,18 +3,18 @@
R
-
-
-
-
+
+
+
+
-
+