Mercurial > repos > lionelguy > spades
diff tools/spades_2_5/plot_spades_stats.xml @ 2:b5ce24f34dd7 draft
Uploaded
author | lionelguy |
---|---|
date | Thu, 05 Sep 2013 07:43:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/spades_2_5/plot_spades_stats.xml Thu Sep 05 07:43:48 2013 -0400 @@ -0,0 +1,80 @@ +<tool id="plot_spades_stats" name="SPAdes stats" version="0.1"> + <description>coverage vs. length plot</description> + <requirements> + <requirement type="package">R</requirement> + </requirements> + <command interpreter="bash">r_wrapper.sh $script_file</command> + + <inputs> + <param name="input_scaffolds" type="data" format="tabular" label="Scaffold stats"/> + <param name="input_contigs" type="data" format="tabular" label="Contig stats"/> + <param name="length_co" type="integer" value="1000" min="0" label="Length cut-off" help="Contigs with length under that value are shown in red"/> + <param name="coverage_co" type="integer" value="10" min="0" label="Coverage cut-off" help="Contigs with length under that value are shown in red"/> + </inputs> + <configfiles> + <configfile name="script_file"> +## Setup R error handling to go to stderr +options( show.error.messages=F, + error = function () { + cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) +} ) +files = c("${input_contigs}", "${input_scaffolds}") +types = c("Contigs", "Scaffolds") + +## Start plotting device +png("${out_file}", w=500, h=1000) +par(mfrow=c(2,1)) + +## Loop over the two files +for (i in 1:length(types)){ + seqs = read.table(files[i], header=FALSE, comment.char="#") + colnames = c("name", "length", "coverage") + names(seqs) = colnames + + ## Stats over all sequences + sl_all = sort(seqs\$length, decreasing=TRUE) + cs_all = cumsum(sl_all) + s_all = sum(seqs\$length) + n50_idx_all = which.min(sl_all[cs_all < 0.5*s_all]) + n90_idx_all = which.min(sl_all[cs_all < 0.9*s_all]) + n50_all = sl_all[n50_idx_all] + n90_all = sl_all[n90_idx_all] + + ## Filter short seqs, redo stats + seqs_filt = seqs[seqs\$length >= ${length_co} & seqs\$coverage >= ${coverage_co},] + if (nrow(seqs_filt) > 0){ + sl_filt = sort(seqs_filt\$length, decreasing=TRUE) + cs_filt = cumsum(sl_filt) + s_filt = sum(seqs_filt\$length) + n50_idx_filt = which.min(sl_filt[cs_filt < 0.5*s_filt]) + n90_idx_filt = which.min(sl_filt[cs_filt < 0.9*s_filt]) + n50_filt = sl_filt[n50_idx_filt] + n90_filt = sl_filt[n90_idx_filt] + } + seqs_bad = seqs[seqs\$length < ${length_co} | seqs\$coverage < ${coverage_co},] + + ## Length vs coverage + plot(length~coverage, data=seqs, log="xy", type="n", main=paste(types[i], ": coverage vs. length", sep=""), xlab="Coverage", ylab="Length") + if (nrow(seqs_bad) > 0){ + points(length~coverage, data=seqs_bad, cex=0.5, col="red") + } + if (nrow(seqs_filt) > 0){ + points(length~coverage, data=seqs_filt, cex=0.5, col="black") + } + abline(v=${coverage_co}, h=${length_co}, lty=2, col=grey(0.3)) + legend(x="topleft", legend=c("Before/after filtering", paste(c("N50: ", "N90: ", "Median cov.: "), c(n50_all, n90_all, round(median(seqs\$coverage))), rep("/", 3), c(n50_filt, n90_filt, round(median(seqs_filt\$coverage))), sep="")), cex=0.8) +} +dev.off() + </configfile> + </configfiles> + <outputs> + <data format="png" name="out_file" /> + </outputs> + <help> +**What it does** + +Using the output of SPAdes (a pair of fasta file and stat file for each of the contigs and scaffolds), it produces a coverage vs. contig plot. Each dot represent a contig/scaffold. Given a coverage and a length cutoff, sequences that do not meet those criteria are shown in red. Some statistics are also given (N50, N90, median contig/scaffold length) both before and after filtering. + +Use the "filter SPAdes output" tool to actually filter sequences. + </help> +</tool> \ No newline at end of file