Mercurial > repos > petr-novak > dante
changeset 12:ddc6bab20889 draft
Uploaded
author | petr-novak |
---|---|
date | Thu, 15 Aug 2019 07:20:12 -0400 |
parents | 1d5883a9ec3a |
children | ab56b34d67d8 |
files | dante_gff_to_tabular.xml fasta2database.R summarize_gff.R summarize_gff.xml |
diffstat | 4 files changed, 83 insertions(+), 28 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dante_gff_to_tabular.xml Thu Aug 15 07:20:12 2019 -0400 @@ -0,0 +1,17 @@ +<tool id="gff_to_tabular" name="Convert dante gff3 to tab delimited file" version="0.1.0" python_template_version="3.5"> + <requirements> + <requirement type="package">R</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + Rscript ${__tool_directory__}/summarize_gff.R '$inputgff' '$output' + ]]></command> + <inputs> + <param type="data" name="inputgff" format="gff3" /> + </inputs> + <outputs> + <data name="output" format="tabular" /> + </outputs> + <help><![CDATA[ + TODO: Fill in help. + ]]></help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta2database.R Thu Aug 15 07:20:12 2019 -0400 @@ -0,0 +1,14 @@ +library(Biostrings) +input_fasta = commandArgs(T)[1] +## for testing input_fasta="/mnt/raid/454_data/RE2_benchmark/REPET_annotation/Prunus_persica/DANTE_proteins_filtered.fasta" +s = readAAStringSet(input_fasta) +names_table = do.call("rbind", strsplit(names(s)," ")) +head(names_table) +classification_table = paste(names_table[,1], gsub("|","\t",names_table[,3], fixed = TRUE), sep="\t") +cat(unique(classification_table), sep="\n", file = paste(input_fasta, ".classification", sep = "")) + +new_fasta_names = paste("NA-", names_table[,2], "__", names_table[,1], sep="") + +names(s) = new_fasta_names + +writeXStringSet(s, filepath = paste(input_fasta, ".db",sep=''))
--- a/summarize_gff.R Wed Aug 14 11:24:42 2019 -0400 +++ b/summarize_gff.R Thu Aug 15 07:20:12 2019 -0400 @@ -1,7 +1,11 @@ ## summarize hits output = commandArgs(T)[2] ## output table filepath = commandArgs(T)[1] ## input dante gff3 -summarized_by = commandArgs(T)[-(1:2)] +if (length(commandArgs(T))==2){ + summarized_by = NA +}else{ + summarized_by = strsplit(commandArgs(T)[-(1:2)], split = ",")[[1]] +} readGFF3fromDante = function(filepath){ dfraw=read.table(filepath, as.is = TRUE) @@ -9,29 +13,49 @@ colnames(gff_df) = c("seqid", "source", "type", "start", "end", "score", "strand", "phase") ## assume same order, same attributes names - gffattr = do.call(rbind, - lapply( - strsplit(dfraw[,9],split=c("=|;")), - function(x)x[c(FALSE,TRUE)] - ) - ) - gff_df$Name = gffattr[,1] - gff_df$Final_Classification = gffattr[,2] - gff_df$Region_Hits_Classifications = gffattr[,3] - gff_df$Best_Hit = gffattr[,4] - gff_df$Best_Hit_DB_Pos = gffattr[,5] - gff_df$DB_Seq = gffattr[,6] - gff_df$Query_Seq = gffattr[,7] - gff_df$Identity = as.numeric(gffattr[,8]) - gff_df$Similarity = as.numeric(gffattr[,9]) - gff_df$Relat_Length = as.numeric(gffattr[,10]) - gff_df$Relat_Interruptions = as.numeric(gffattr[,11]) - gff_df$Hit_to_DB_Length = as.numeric(gffattr[,12]) + ## TODO make ti more robust - order can change! + gffattr_list = lapply( + strsplit(dfraw[,9],split=c("=|;")), + function(x)x[c(FALSE,TRUE)] + ) + ## some rows are not complete - in case of ambiguous domains + L = sapply(gffattr_list, length) + short = L < max(L) + if (any(short)){ + gffattr_list[short] = lapply(gffattr_list[short],function(x) c(x, rep(NA, 13 - length(x)))) + } + gffattr = as.data.frame(do.call(rbind, gffattr_list)) + + ## get attributes names + attrnames = strsplit(dfraw[1,9],split=c("=|;"))[[1]][c(TRUE,FALSE)] + colnames(gffattr) = attrnames + + gff_df$Final_Classification = gffattr$Final_Classification + gff_df$Name = gffattr$Name + gff_df$Region_Hits_Classifications = gffattr$Region_Hits_Classifications + gff_df$Best_Hit = gffattr$Best_Hit + gff_df$Best_Hit_DB_Pos = gffattr$Best_Hir_DB_Pos + gff_df$DB_Seq = gffattr$DB_Seq + gff_df$Query_Seq = gffattr$Query_Seq + gff_df$Region_Seq = gffattr$Region_Seq + gff_df$Identity = as.numeric(gffattr$Identity) + gff_df$Similarity = as.numeric(gffattr$Similarity) + gff_df$Relat_Length = as.numeric(gffattr$Relat_Length) + gff_df$Relat_Interruptions = as.numeric(gffattr$Relat_Interruptions) + gff_df$Hit_to_DB_Length = as.numeric(gffattr$Hit_to_DB_Length) return(gff_df) } gff = readGFF3fromDante(filepath) -# summarize_by = c("Final_Classification", "Name", "seqid") +# summarized_by = c("Final_Classification", "Name", "seqid") +# summarized_by = c("Final_Classification") + -tbl = data.frame(table(gff[,summarize_by])) -write.table(tbl, file = filepath, row.names =FALSE, quote = FALSE) +if (is.na(summarized_by)){ + ## export complete table + write.table(gff, file = output, row.names = FALSE, quote = FALSE, sep = "\t") +}else{ + ## export summary + tbl = data.frame(table(gff[, summarized_by])) + write.table(tbl, file = output, row.names = FALSE, quote = FALSE, sep = "\t") +}
--- a/summarize_gff.xml Wed Aug 14 11:24:42 2019 -0400 +++ b/summarize_gff.xml Thu Aug 15 07:20:12 2019 -0400 @@ -3,18 +3,18 @@ <requirement type="package">R</requirement> </requirements> <command detect_errors="exit_code"><![CDATA[ - summarize_gff.R '$inputgff' '$output' + Rscript ${__tool_directory__}/summarize_gff.R '$inputgff' '$output' '$group' ]]></command> <inputs> <param type="data" name="inputgff" format="gff3" /> - <param name="group" type="select" label="select categories to summarize" multiple="true"> - <option value="Name">Numerical sort</option> - <option value="Final_Classification">General numeric sort</option> - <option value="seqid">Alphabetical sort</option> + <param name="group" type="select" label="select categories to summarize" multiple="true" optional="false"> + <option value="Name">protein domain name</option> + <option value="Final_Classification">Classification</option> + <option value="seqid">Sequence ID</option> </param> </inputs> <outputs> - <data name="output" format="csv" /> + <data name="output" format="tabular" /> </outputs> <help><![CDATA[ TODO: Fill in help.