# HG changeset patch # User petr-novak # Date 1565868012 14400 # Node ID ddc6bab2088918ff9daca037d1c83f8587631eba # Parent 1d5883a9ec3a562053a37fc4ac887ff5215de6fc Uploaded diff -r 1d5883a9ec3a -r ddc6bab20889 dante_gff_to_tabular.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dante_gff_to_tabular.xml Thu Aug 15 07:20:12 2019 -0400 @@ -0,0 +1,17 @@ + + + R + + + + + + + + + + diff -r 1d5883a9ec3a -r ddc6bab20889 fasta2database.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta2database.R Thu Aug 15 07:20:12 2019 -0400 @@ -0,0 +1,14 @@ +library(Biostrings) +input_fasta = commandArgs(T)[1] +## for testing input_fasta="/mnt/raid/454_data/RE2_benchmark/REPET_annotation/Prunus_persica/DANTE_proteins_filtered.fasta" +s = readAAStringSet(input_fasta) +names_table = do.call("rbind", strsplit(names(s)," ")) +head(names_table) +classification_table = paste(names_table[,1], gsub("|","\t",names_table[,3], fixed = TRUE), sep="\t") +cat(unique(classification_table), sep="\n", file = paste(input_fasta, ".classification", sep = "")) + +new_fasta_names = paste("NA-", names_table[,2], "__", names_table[,1], sep="") + +names(s) = new_fasta_names + +writeXStringSet(s, filepath = paste(input_fasta, ".db",sep='')) diff -r 1d5883a9ec3a -r ddc6bab20889 summarize_gff.R --- a/summarize_gff.R Wed Aug 14 11:24:42 2019 -0400 +++ b/summarize_gff.R Thu Aug 15 07:20:12 2019 -0400 @@ -1,7 +1,11 @@ ## summarize hits output = commandArgs(T)[2] ## output table filepath = commandArgs(T)[1] ## input dante gff3 -summarized_by = commandArgs(T)[-(1:2)] +if (length(commandArgs(T))==2){ + summarized_by = NA +}else{ + summarized_by = strsplit(commandArgs(T)[-(1:2)], split = ",")[[1]] +} readGFF3fromDante = function(filepath){ dfraw=read.table(filepath, as.is = TRUE) @@ -9,29 +13,49 @@ colnames(gff_df) = c("seqid", "source", "type", "start", "end", "score", "strand", "phase") ## assume same order, same attributes names - gffattr = do.call(rbind, - lapply( - strsplit(dfraw[,9],split=c("=|;")), - function(x)x[c(FALSE,TRUE)] - ) - ) - gff_df$Name = gffattr[,1] - gff_df$Final_Classification = gffattr[,2] - gff_df$Region_Hits_Classifications = gffattr[,3] - gff_df$Best_Hit = gffattr[,4] - gff_df$Best_Hit_DB_Pos = gffattr[,5] - gff_df$DB_Seq = gffattr[,6] - gff_df$Query_Seq = gffattr[,7] - gff_df$Identity = as.numeric(gffattr[,8]) - gff_df$Similarity = as.numeric(gffattr[,9]) - gff_df$Relat_Length = as.numeric(gffattr[,10]) - gff_df$Relat_Interruptions = as.numeric(gffattr[,11]) - gff_df$Hit_to_DB_Length = as.numeric(gffattr[,12]) + ## TODO make ti more robust - order can change! + gffattr_list = lapply( + strsplit(dfraw[,9],split=c("=|;")), + function(x)x[c(FALSE,TRUE)] + ) + ## some rows are not complete - in case of ambiguous domains + L = sapply(gffattr_list, length) + short = L < max(L) + if (any(short)){ + gffattr_list[short] = lapply(gffattr_list[short],function(x) c(x, rep(NA, 13 - length(x)))) + } + gffattr = as.data.frame(do.call(rbind, gffattr_list)) + + ## get attributes names + attrnames = strsplit(dfraw[1,9],split=c("=|;"))[[1]][c(TRUE,FALSE)] + colnames(gffattr) = attrnames + + gff_df$Final_Classification = gffattr$Final_Classification + gff_df$Name = gffattr$Name + gff_df$Region_Hits_Classifications = gffattr$Region_Hits_Classifications + gff_df$Best_Hit = gffattr$Best_Hit + gff_df$Best_Hit_DB_Pos = gffattr$Best_Hir_DB_Pos + gff_df$DB_Seq = gffattr$DB_Seq + gff_df$Query_Seq = gffattr$Query_Seq + gff_df$Region_Seq = gffattr$Region_Seq + gff_df$Identity = as.numeric(gffattr$Identity) + gff_df$Similarity = as.numeric(gffattr$Similarity) + gff_df$Relat_Length = as.numeric(gffattr$Relat_Length) + gff_df$Relat_Interruptions = as.numeric(gffattr$Relat_Interruptions) + gff_df$Hit_to_DB_Length = as.numeric(gffattr$Hit_to_DB_Length) return(gff_df) } gff = readGFF3fromDante(filepath) -# summarize_by = c("Final_Classification", "Name", "seqid") +# summarized_by = c("Final_Classification", "Name", "seqid") +# summarized_by = c("Final_Classification") + -tbl = data.frame(table(gff[,summarize_by])) -write.table(tbl, file = filepath, row.names =FALSE, quote = FALSE) +if (is.na(summarized_by)){ + ## export complete table + write.table(gff, file = output, row.names = FALSE, quote = FALSE, sep = "\t") +}else{ + ## export summary + tbl = data.frame(table(gff[, summarized_by])) + write.table(tbl, file = output, row.names = FALSE, quote = FALSE, sep = "\t") +} diff -r 1d5883a9ec3a -r ddc6bab20889 summarize_gff.xml --- a/summarize_gff.xml Wed Aug 14 11:24:42 2019 -0400 +++ b/summarize_gff.xml Thu Aug 15 07:20:12 2019 -0400 @@ -3,18 +3,18 @@ R - - - - + + + + - +