# HG changeset patch # User lionelguy # Date 1391595543 18000 # Node ID ff058438080a162297d54ba824a9042e67a1583a # Parent 95ddc23801300f7570b20fde242876993b77c361 Version 0.8, supports SPAdes 3.0.0 diff -r 95ddc2380130 -r ff058438080a tools/spades_2_5/filter_spades_output.pl --- a/tools/spades_2_5/filter_spades_output.pl Thu Nov 28 05:29:32 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ -#!/usr/bin/perl -w - -=head1 SYNOPSIS - -filter_spades_output.pl - Filters contigs or scaffolds based on contig length and coverage. - -=head1 USAGE - -filter_spades_output.pl [-c|--coverage-cutoff] [-l|--length-cutoff] [-o|--filtered-out out.fasta] -t|--tab stats.tab seqs.fasta - -=head1 INPUT - -=head2 [-c|--coverage-cutoff] - -Mininum coverage. Contigs with lower coverage will be discarded. Default 10. - -=head2 [-l|--length-cutoff] - -Mininum coverage. Smaller ontigs will be discarded. Default 500. - -=head2 -t|--tab stats.tab - -A tabular file, with three columns: contig name, length, and coverage: - -NODE_1 31438 24.5116 -NODE_2 31354 2316.96 -NODE_3 26948 82.3294 - -Such a file is produced by spades.xml. Contigs should be in the same order as in the fasta file. - -=head2 [-o|--filtered-out out.fasta] - -If specified, filtered out sequences will be written to this file. - -=head2 seqs.fasta - -Sequences in fasta format. Start of IDs must match ids in the tabular file. - -=head1 OUTPUT - -A fasta file on stdout. - -=head1 AUTHOR - -Lionel Guy (lionel.guy@icm.uu.se) - -=head1 DATE - -Thu Aug 29 13:51:13 CEST 2013 - -=cut - -# libraries -use strict; -use Getopt::Long; -use Bio::SeqIO; - -my $coverage_co = 10; -my $length_co = 500; -my $out_filtered; -my $tab_file; - -GetOptions( - 'c|coverage-cutoff=s' => \$coverage_co, - 'l|length-cutoff=s' => \$length_co, - 'o|filtered-out=s' => \$out_filtered, - 't|tab=s' => \$tab_file, -); -my $fasta_file = shift(@ARGV); -die ("No tab file specified") unless ($tab_file); -die ("No fasta file specified") unless ($fasta_file); - -## Read tab file, discard rows with comments -open TAB, '<', $tab_file or die "$?"; -my @stats; -while (){ - chomp; - push @stats, $_ unless (/^#/); -} - -## Read fasta -my $seq_in = Bio::SeqIO->new(-file => $fasta_file, - -format => 'fasta'); -my $seq_out = Bio::SeqIO->new(-fh => \*STDOUT, - -format => 'fasta'); -my $seq_out_filt = Bio::SeqIO->new(-file => ">$out_filtered", - -format => 'fasta') if ($out_filtered); -while (my $seq = $seq_in->next_seq){ - my $stat = shift @stats; - die "Less rows in tab than sequences in seq file" unless $stat; - my ($id_tab, $length, $coverage) = split(/\t+/, $stat); - die "id, length or coverate not defined at $stat\n" - unless ($id_tab && $length && $coverage); - my $id_seq = $seq->id; - die "Unmatched ids $id_seq and $id_tab\n" unless ($id_seq =~ /^$id_tab/i); - if ($length >= $length_co && $coverage >= $coverage_co){ - $seq_out->write_seq($seq); - } elsif ($out_filtered){ - $seq_out_filt->write_seq($seq); - } else { - # do nothing - } -} -die "More rows in tab than sequences in seq file" if (scalar(@stats) > 0); -exit 0; - diff -r 95ddc2380130 -r ff058438080a tools/spades_2_5/filter_spades_output.xml --- a/tools/spades_2_5/filter_spades_output.xml Thu Nov 28 05:29:32 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ - - remove low coverage and short contigs/scaffolds - filter_spades_output.pl - --coverage-cutoff $coverage_co - --length-cutoff $length_co - #if $keep_leftover - --filtered-out $filtered_out - #end if - --tab $stats_in - $fasta_in > $fasta_output - - - - - - - - - - - - - keep_leftover == "true" - - - -**What it does** - -Using the output of SPAdes (a fasta and a stats file, either from contigs or scaffolds), it filters the fasta files, discarding all sequences that are under a given length or under a given coverage. - -Typically, this is used to discard short contigs, or contaminations. To display a coverage vs. length plot, use the "SPAdes stats" tool in the same package. - - diff -r 95ddc2380130 -r ff058438080a tools/spades_2_5/plot_spades_stats.xml --- a/tools/spades_2_5/plot_spades_stats.xml Thu Nov 28 05:29:32 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,80 +0,0 @@ - - coverage vs. length plot - - R - - r_wrapper.sh $script_file - - - - - - - - - -## Setup R error handling to go to stderr -options( show.error.messages=F, - error = function () { - cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) -} ) -files = c("${input_contigs}", "${input_scaffolds}") -types = c("Contigs", "Scaffolds") - -## Start plotting device -png("${out_file}", w=500, h=1000) -par(mfrow=c(2,1)) - -## Loop over the two files -for (i in 1:length(types)){ - seqs = read.table(files[i], header=FALSE, comment.char="#") - colnames = c("name", "length", "coverage") - names(seqs) = colnames - - ## Stats over all sequences - sl_all = sort(seqs\$length, decreasing=TRUE) - cs_all = cumsum(sl_all) - s_all = sum(seqs\$length) - n50_idx_all = which.min(sl_all[cs_all < 0.5*s_all]) - n90_idx_all = which.min(sl_all[cs_all < 0.9*s_all]) - n50_all = sl_all[n50_idx_all] - n90_all = sl_all[n90_idx_all] - - ## Filter short seqs, redo stats - seqs_filt = seqs[seqs\$length >= ${length_co} & seqs\$coverage >= ${coverage_co},] - if (nrow(seqs_filt) > 0){ - sl_filt = sort(seqs_filt\$length, decreasing=TRUE) - cs_filt = cumsum(sl_filt) - s_filt = sum(seqs_filt\$length) - n50_idx_filt = which.min(sl_filt[cs_filt < 0.5*s_filt]) - n90_idx_filt = which.min(sl_filt[cs_filt < 0.9*s_filt]) - n50_filt = sl_filt[n50_idx_filt] - n90_filt = sl_filt[n90_idx_filt] - } - seqs_bad = seqs[seqs\$length < ${length_co} | seqs\$coverage < ${coverage_co},] - - ## Length vs coverage - plot(length~coverage, data=seqs, log="xy", type="n", main=paste(types[i], ": coverage vs. length", sep=""), xlab="Coverage", ylab="Length") - if (nrow(seqs_bad) > 0){ - points(length~coverage, data=seqs_bad, cex=0.5, col="red") - } - if (nrow(seqs_filt) > 0){ - points(length~coverage, data=seqs_filt, cex=0.5, col="black") - } - abline(v=${coverage_co}, h=${length_co}, lty=2, col=grey(0.3)) - legend(x="topleft", legend=c("Before/after filtering", paste(c("N50: ", "N90: ", "Median cov.: "), c(n50_all, n90_all, round(median(seqs\$coverage))), rep("/", 3), c(n50_filt, n90_filt, round(median(seqs_filt\$coverage))), sep="")), cex=0.8) -} -dev.off() - - - - - - -**What it does** - -Using the output of SPAdes (a pair of fasta file and stat file for each of the contigs and scaffolds), it produces a coverage vs. contig plot. Each dot represent a contig/scaffold. Given a coverage and a length cutoff, sequences that do not meet those criteria are shown in red. Some statistics are also given (N50, N90, median contig/scaffold length) both before and after filtering. - -Use the "filter SPAdes output" tool to actually filter sequences. - - \ No newline at end of file diff -r 95ddc2380130 -r ff058438080a tools/spades_2_5/r_wrapper.sh --- a/tools/spades_2_5/r_wrapper.sh Thu Nov 28 05:29:32 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -#!/bin/sh - -### Run R providing the R script in $1 as standard input and passing -### the remaining arguments on the command line - -# Function that writes a message to stderr and exits -function fail -{ - echo "$@" >&2 - exit 1 -} - -# Ensure R executable is found -which R > /dev/null || fail "'R' is required by this tool but was not found on path" - -# Extract first argument -infile=$1; shift - -# Ensure the file exists -test -f $infile || fail "R input file '$infile' does not exist" - -# Invoke R passing file named by first argument to stdin -R --vanilla --slave $* < $infile diff -r 95ddc2380130 -r ff058438080a tools/spades_2_5/spades.pl --- a/tools/spades_2_5/spades.pl Thu Nov 28 05:29:32 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ -#!/usr/bin/env perl -## A wrapper script to call spades.py and collect its output -use strict; -use warnings; -use File::Temp qw/ tempfile tempdir /; -use File::Copy; -use Getopt::Long; - -# Parse arguments -my ($out_contigs_file, - $out_contigs_stats, - $out_scaffolds_file, - $out_scaffolds_stats, - $out_log_file, - @sysargs) = @ARGV; - -## GetOptions not compatible with parsing the rest of the arguments in an array. -## Keeping the not-so-nice parse-in-one-go method, without named arguments. -# GetOptions( -# 'contigs-file=s' => \$out_contigs_file, -# 'contigs-stats=s' => \$out_contigs_stats, -# 'scaffolds-file=s' => \$out_scaffolds_file, -# 'scaffolds-stats=s' => \$out_scaffolds_stats, -# 'out_log_file=s' => \$out_log_file, -# ); - -# my @sysargs = @ARGV; - -# Create temporary folder to store files, delete after use -#my $output_dir = tempdir( CLEANUP => 0 ); -my $output_dir = 'output_dir'; -# Link "dat" files as fastq, otherwise spades complains about file format - -# Create log handle -open my $log, '>', $out_log_file or die "Cannot write to $out_log_file: $?\n"; - -# Run program -# To do: record time -&runSpades(@sysargs); -&collectOutput(); -&extractCoverageLength($out_contigs_file, $out_contigs_stats); -&extractCoverageLength($out_scaffolds_file, $out_scaffolds_stats); -print $log "Done\n"; -close $log; -exit 0; - -# Run spades -sub runSpades { - my $cmd = join(" ", @_) . " -o $output_dir"; - my $return_code = system($cmd); - if ($return_code) { - print $log "Failed with code $return_code\nCommand $cmd\nMessage: $?\n"; - die "Failed with code $return_code\nCommand $cmd\nMessage: $?\n"; - } - return 0; -} - -# Collect output -sub collectOutput{ - # To do: check that the files are there - # Collects output - move "$output_dir/contigs.fasta", $out_contigs_file; - move "$output_dir/scaffolds.fasta", $out_scaffolds_file; - open LOG, '<', "$output_dir/spades.log" - or die "Cannot open log file $output_dir/spades.log: $?"; - print $log $_ while (); - return 0; -} - -# Extract -sub extractCoverageLength{ - my ($in, $out) = @_; - open FASTA, '<', $in or die $!; - open TAB, '>', $out or die $!; - print TAB "#name\tlength\tcoverage\n"; - while (){ - next unless /^>/; - chomp; - die "Not all elements found in $_\n" if (! m/^>NODE_(\d+)_length_(\d+)_cov_(\d+\.*\d*)_/); - my ($n, $l, $cov) = ($1, $2, $3); - print TAB "NODE_$n\t$l\t$cov\n"; - } - close TAB; -} diff -r 95ddc2380130 -r ff058438080a tools/spades_2_5/spades.xml --- a/tools/spades_2_5/spades.xml Thu Nov 28 05:29:32 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,133 +0,0 @@ - - SPAdes genome assembler for regular and single-cell projects - - spades - - spades.pl - $out_contigs - $out_contig_stats - $out_scaffolds - $out_scaffold_stats - $out_log - ## A real command looks like: spades.py -k 21,33,55,77,99,127 --careful -1 Y.fastq.gz -2 X.fastq.gz -t 24 -o output - spades.py - ## Forces unzipped output, faster - --disable-gzip-output - $sc - $onlyassembler - $careful - ##$rectangles - -t \${GALAXY_SLOTS:-16} - -k "$kmers" - ##-i $iterations - ##--phred-offset - ## Sequence files - #for $i, $library in enumerate( $libraries ) - #set num=$i+1 - #if str( $library.lib_type ) == "paired_end": - #set prefix = 'pe' - #else: - #set prefix = 'mp' - #end if - --$prefix$num-$library.orientation - #for $file in $library.files - #if $file.file_type.type == "separate" - --$prefix$num-1 fastq:$file.file_type.fwd_reads - --$prefix$num-2 fastq:$file.file_type.rev_reads - #elif $file.file_type.type == "interleaved" - --$prefix$num-12 fastq:$file.file_type.interleaved_reads - #elif $file.file_type.type == "unpaired" - --$prefix$num-s fastq:$file.file_type.unpaired_reads - #end if - #end for - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -SPAdes – St. Petersburg genome assembler – is intended for both standard isolates and single-cell MDA bacteria assemblies. See http://bioinf.spbau.ru/en/spades for more details on SPAdes. - -This wrapper runs SPAdes 2.5.1, collects the output, and throws away all the temporary files. It also produces a tab file with contig names, length and coverage. - -**SPAdes citation** - -Anton Bankevich, Sergey Nurk, Dmitry Antipov, Alexey A. Gurevich, Mikhail Dvorkin, Alexander S. Kulikov, Valery M. Lesin, Sergey I. Nikolenko, Son Pham, Andrey D. Prjibelski, Alexey V. Pyshkin, Alexander V. Sirotkin, Nikolay Vyahhi, Glenn Tesler, Max A. Alekseyev, and Pavel A. Pevzner. Journal of Computational Biology. May 2012, 19(5): 455-477. doi:10.1089/cmb.2012.0021. - -**License** - -SPAdes is developed by and copyrighted to Saint-Petersburg Academic University, and is released under GPLv2. - -This wrapper is copyrighted by Lionel Guy, and is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/. - -** Acknowledgments ** - -Anton Korobeynikov greatlty helped understanding how SPAdes work, and integrated handy features into SPAdes. - -Nicola Soranzo fixed bugs in the 0.6 version. - - diff -r 95ddc2380130 -r ff058438080a tools/spades_2_5/tool_dependencies.xml --- a/tools/spades_2_5/tool_dependencies.xml Thu Nov 28 05:29:32 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ - - - - - - http://spades.bioinf.spbau.ru/release2.5.1/SPAdes-2.5.1-Linux.tar.gz - - $INSTALL_DIR/bin - $INSTALL_DIR/share - - bin - $INSTALL_DIR/bin - - - share - $INSTALL_DIR/share - - - - - - - - - -This installs SPAdes 2.5.1. - -See manual here http://spades.bioinf.spbau.ru/release2.5.1/manual.html -See also here http://bioinf.spbau.ru/en/spades - - - - diff -r 95ddc2380130 -r ff058438080a tools/spades_3_0/filter_spades_output.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/spades_3_0/filter_spades_output.pl Wed Feb 05 05:19:03 2014 -0500 @@ -0,0 +1,106 @@ +#!/usr/bin/perl -w + +=head1 SYNOPSIS + +filter_spades_output.pl - Filters contigs or scaffolds based on contig length and coverage. + +=head1 USAGE + +filter_spades_output.pl [-c|--coverage-cutoff] [-l|--length-cutoff] [-o|--filtered-out out.fasta] -t|--tab stats.tab seqs.fasta + +=head1 INPUT + +=head2 [-c|--coverage-cutoff] + +Mininum coverage. Contigs with lower coverage will be discarded. Default 10. + +=head2 [-l|--length-cutoff] + +Mininum coverage. Smaller ontigs will be discarded. Default 500. + +=head2 -t|--tab stats.tab + +A tabular file, with three columns: contig name, length, and coverage: + +NODE_1 31438 24.5116 +NODE_2 31354 2316.96 +NODE_3 26948 82.3294 + +Such a file is produced by spades.xml. Contigs should be in the same order as in the fasta file. + +=head2 [-o|--filtered-out out.fasta] + +If specified, filtered out sequences will be written to this file. + +=head2 seqs.fasta + +Sequences in fasta format. Start of IDs must match ids in the tabular file. + +=head1 OUTPUT + +A fasta file on stdout. + +=head1 AUTHOR + +Lionel Guy (lionel.guy@icm.uu.se) + +=head1 DATE + +Thu Aug 29 13:51:13 CEST 2013 + +=cut + +# libraries +use strict; +use Getopt::Long; +use Bio::SeqIO; + +my $coverage_co = 10; +my $length_co = 500; +my $out_filtered; +my $tab_file; + +GetOptions( + 'c|coverage-cutoff=s' => \$coverage_co, + 'l|length-cutoff=s' => \$length_co, + 'o|filtered-out=s' => \$out_filtered, + 't|tab=s' => \$tab_file, +); +my $fasta_file = shift(@ARGV); +die ("No tab file specified") unless ($tab_file); +die ("No fasta file specified") unless ($fasta_file); + +## Read tab file, discard rows with comments +open TAB, '<', $tab_file or die "$?"; +my @stats; +while (){ + chomp; + push @stats, $_ unless (/^#/); +} + +## Read fasta +my $seq_in = Bio::SeqIO->new(-file => $fasta_file, + -format => 'fasta'); +my $seq_out = Bio::SeqIO->new(-fh => \*STDOUT, + -format => 'fasta'); +my $seq_out_filt = Bio::SeqIO->new(-file => ">$out_filtered", + -format => 'fasta') if ($out_filtered); +while (my $seq = $seq_in->next_seq){ + my $stat = shift @stats; + die "Less rows in tab than sequences in seq file" unless $stat; + my ($id_tab, $length, $coverage) = split(/\t+/, $stat); + die "id, length or coverate not defined at $stat\n" + unless ($id_tab && $length && $coverage); + my $id_seq = $seq->id; + die "Unmatched ids $id_seq and $id_tab\n" unless ($id_seq =~ /^$id_tab/i); + if ($length >= $length_co && $coverage >= $coverage_co){ + $seq_out->write_seq($seq); + } elsif ($out_filtered){ + $seq_out_filt->write_seq($seq); + } else { + # do nothing + } +} +die "More rows in tab than sequences in seq file" if (scalar(@stats) > 0); +exit 0; + diff -r 95ddc2380130 -r ff058438080a tools/spades_3_0/filter_spades_output.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/spades_3_0/filter_spades_output.xml Wed Feb 05 05:19:03 2014 -0500 @@ -0,0 +1,33 @@ + + remove low coverage and short contigs/scaffolds + filter_spades_output.pl + --coverage-cutoff $coverage_co + --length-cutoff $length_co + #if $keep_leftover + --filtered-out $filtered_out + #end if + --tab $stats_in + $fasta_in > $fasta_output + + + + + + + + + + + + + keep_leftover == "true" + + + +**What it does** + +Using the output of SPAdes (a fasta and a stats file, either from contigs or scaffolds), it filters the fasta files, discarding all sequences that are under a given length or under a given coverage. + +Typically, this is used to discard short contigs, or contaminations. To display a coverage vs. length plot, use the "SPAdes stats" tool in the same package. + + diff -r 95ddc2380130 -r ff058438080a tools/spades_3_0/plot_spades_stats.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/spades_3_0/plot_spades_stats.xml Wed Feb 05 05:19:03 2014 -0500 @@ -0,0 +1,80 @@ + + coverage vs. length plot + + R + + r_wrapper.sh $script_file + + + + + + + + + +## Setup R error handling to go to stderr +options( show.error.messages=F, + error = function () { + cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) +} ) +files = c("${input_contigs}", "${input_scaffolds}") +types = c("Contigs", "Scaffolds") + +## Start plotting device +png("${out_file}", w=500, h=1000) +par(mfrow=c(2,1)) + +## Loop over the two files +for (i in 1:length(types)){ + seqs = read.table(files[i], header=FALSE, comment.char="#") + colnames = c("name", "length", "coverage") + names(seqs) = colnames + + ## Stats over all sequences + sl_all = sort(seqs\$length, decreasing=TRUE) + cs_all = cumsum(sl_all) + s_all = sum(seqs\$length) + n50_idx_all = which.min(sl_all[cs_all < 0.5*s_all]) + n90_idx_all = which.min(sl_all[cs_all < 0.9*s_all]) + n50_all = sl_all[n50_idx_all] + n90_all = sl_all[n90_idx_all] + + ## Filter short seqs, redo stats + seqs_filt = seqs[seqs\$length >= ${length_co} & seqs\$coverage >= ${coverage_co},] + if (nrow(seqs_filt) > 0){ + sl_filt = sort(seqs_filt\$length, decreasing=TRUE) + cs_filt = cumsum(sl_filt) + s_filt = sum(seqs_filt\$length) + n50_idx_filt = which.min(sl_filt[cs_filt < 0.5*s_filt]) + n90_idx_filt = which.min(sl_filt[cs_filt < 0.9*s_filt]) + n50_filt = sl_filt[n50_idx_filt] + n90_filt = sl_filt[n90_idx_filt] + } + seqs_bad = seqs[seqs\$length < ${length_co} | seqs\$coverage < ${coverage_co},] + + ## Length vs coverage + plot(length~coverage, data=seqs, log="xy", type="n", main=paste(types[i], ": coverage vs. length", sep=""), xlab="Coverage", ylab="Length") + if (nrow(seqs_bad) > 0){ + points(length~coverage, data=seqs_bad, cex=0.5, col="red") + } + if (nrow(seqs_filt) > 0){ + points(length~coverage, data=seqs_filt, cex=0.5, col="black") + } + abline(v=${coverage_co}, h=${length_co}, lty=2, col=grey(0.3)) + legend(x="topleft", legend=c("Before/after filtering", paste(c("N50: ", "N90: ", "Median cov.: "), c(n50_all, n90_all, round(median(seqs\$coverage))), rep("/", 3), c(n50_filt, n90_filt, round(median(seqs_filt\$coverage))), sep="")), cex=0.8) +} +dev.off() + + + + + + +**What it does** + +Using the output of SPAdes (a pair of fasta file and stat file for each of the contigs and scaffolds), it produces a coverage vs. contig plot. Each dot represent a contig/scaffold. Given a coverage and a length cutoff, sequences that do not meet those criteria are shown in red. Some statistics are also given (N50, N90, median contig/scaffold length) both before and after filtering. + +Use the "filter SPAdes output" tool to actually filter sequences. + + \ No newline at end of file diff -r 95ddc2380130 -r ff058438080a tools/spades_3_0/r_wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/spades_3_0/r_wrapper.sh Wed Feb 05 05:19:03 2014 -0500 @@ -0,0 +1,23 @@ +#!/bin/sh + +### Run R providing the R script in $1 as standard input and passing +### the remaining arguments on the command line + +# Function that writes a message to stderr and exits +function fail +{ + echo "$@" >&2 + exit 1 +} + +# Ensure R executable is found +which R > /dev/null || fail "'R' is required by this tool but was not found on path" + +# Extract first argument +infile=$1; shift + +# Ensure the file exists +test -f $infile || fail "R input file '$infile' does not exist" + +# Invoke R passing file named by first argument to stdin +R --vanilla --slave $* < $infile diff -r 95ddc2380130 -r ff058438080a tools/spades_3_0/spades.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/spades_3_0/spades.pl Wed Feb 05 05:19:03 2014 -0500 @@ -0,0 +1,84 @@ +#!/usr/bin/env perl +## A wrapper script to call spades.py and collect its output +use strict; +use warnings; +use File::Temp qw/ tempfile tempdir /; +use File::Copy; +use Getopt::Long; + +# Parse arguments +my ($out_contigs_file, + $out_contigs_stats, + $out_scaffolds_file, + $out_scaffolds_stats, + $out_log_file, + @sysargs) = @ARGV; + +## GetOptions not compatible with parsing the rest of the arguments in an array. +## Keeping the not-so-nice parse-in-one-go method, without named arguments. +# GetOptions( +# 'contigs-file=s' => \$out_contigs_file, +# 'contigs-stats=s' => \$out_contigs_stats, +# 'scaffolds-file=s' => \$out_scaffolds_file, +# 'scaffolds-stats=s' => \$out_scaffolds_stats, +# 'out_log_file=s' => \$out_log_file, +# ); + +# my @sysargs = @ARGV; + +# Create temporary folder to store files, delete after use +#my $output_dir = tempdir( CLEANUP => 0 ); +my $output_dir = 'output_dir'; +# Link "dat" files as fastq, otherwise spades complains about file format + +# Create log handle +open my $log, '>', $out_log_file or die "Cannot write to $out_log_file: $?\n"; + +# Run program +# To do: record time +&runSpades(@sysargs); +&collectOutput(); +&extractCoverageLength($out_contigs_file, $out_contigs_stats); +&extractCoverageLength($out_scaffolds_file, $out_scaffolds_stats); +print $log "Done\n"; +close $log; +exit 0; + +# Run spades +sub runSpades { + my $cmd = join(" ", @_) . " -o $output_dir"; + my $return_code = system($cmd); + if ($return_code) { + print $log "Failed with code $return_code\nCommand $cmd\nMessage: $?\n"; + die "Failed with code $return_code\nCommand $cmd\nMessage: $?\n"; + } + return 0; +} + +# Collect output +sub collectOutput{ + # To do: check that the files are there + # Collects output + move "$output_dir/contigs.fasta", $out_contigs_file; + move "$output_dir/scaffolds.fasta", $out_scaffolds_file; + open LOG, '<', "$output_dir/spades.log" + or die "Cannot open log file $output_dir/spades.log: $?"; + print $log $_ while (); + return 0; +} + +# Extract +sub extractCoverageLength{ + my ($in, $out) = @_; + open FASTA, '<', $in or die $!; + open TAB, '>', $out or die $!; + print TAB "#name\tlength\tcoverage\n"; + while (){ + next unless /^>/; + chomp; + die "Not all elements found in $_\n" if (! m/^>NODE_(\d+)_length_(\d+)_cov_(\d+\.*\d*)/); + my ($n, $l, $cov) = ($1, $2, $3); + print TAB "NODE_$n\t$l\t$cov\n"; + } + close TAB; +} diff -r 95ddc2380130 -r ff058438080a tools/spades_3_0/spades.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/spades_3_0/spades.xml Wed Feb 05 05:19:03 2014 -0500 @@ -0,0 +1,193 @@ + + SPAdes genome assembler for regular and single-cell projects + + spades + + spades.pl + $out_contigs + $out_contig_stats + $out_scaffolds + $out_scaffold_stats + $out_log + ## A real command looks like: spades.py -k 21,33,55,77,99,127 --careful -1 Y.fastq.gz -2 X.fastq.gz -t 24 -o output + spades.py + ## Forces unzipped output, faster + --disable-gzip-output + $sc + $onlyassembler + $careful + -t \${GALAXY_SLOTS:-16} + -k "$kmers" + $iontorrent + ## Sequence files, libraries + #for $i, $library in enumerate( $libraries ) + #set num=$i+1 + #if str( $library.lib_type ) == "paired_end": + #set prefix = 'pe' + #else: + #set prefix = 'mp' + #end if + --$prefix$num-$library.orientation + #for $file in $library.files + #if $file.file_type.type == "separate" + --$prefix$num-1 fastq:$file.file_type.fwd_reads + --$prefix$num-2 fastq:$file.file_type.rev_reads + #elif $file.file_type.type == "interleaved" + --$prefix$num-12 fastq:$file.file_type.interleaved_reads + #elif $file.file_type.type == "unpaired" + --$prefix$num-s fastq:$file.file_type.unpaired_reads + #end if + #end for + #end for + ## PacBio reads + #for $i, $pacbiolib in enumerate( $pacbio ) + --pacbio fastq:$pacbiolib.pacbio_reads + #end for + ## Sanger + #for $i, $sangerlib in enumerate( $sanger ) + --sanger $sangerlib.file_type.type:$sangerlib.file_type.sanger_reads + #end for + ## Contigs + #for $i, $trustedcontigs in enumerate( $trustedcontigs ) + --trusted-contigs $trustedcontigs.file_type.type:$trustedcontigs.file_type.trusted_contigs + #end for + #for $i, $untrustedcontigs in enumerate( $untrustedcontigs ) + --untrusted-contigs $untrustedcontigs.file_type.type:$untrustedcontigs.file_type.untrusted_contigs + #end for + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +SPAdes – St. Petersburg genome assembler – is intended for both standard isolates and single-cell MDA bacteria assemblies. See http://bioinf.spbau.ru/en/spades for more details on SPAdes. + +This wrapper runs SPAdes 3.0.0, collects the output, and throws away all the temporary files. It also produces a tab file with contig names, length and coverage. + +**SPAdes citation** + +Anton Bankevich, Sergey Nurk, Dmitry Antipov, Alexey A. Gurevich, Mikhail Dvorkin, Alexander S. Kulikov, Valery M. Lesin, Sergey I. Nikolenko, Son Pham, Andrey D. Prjibelski, Alexey V. Pyshkin, Alexander V. Sirotkin, Nikolay Vyahhi, Glenn Tesler, Max A. Alekseyev, and Pavel A. Pevzner. Journal of Computational Biology. May 2012, 19(5): 455-477. doi:10.1089/cmb.2012.0021. + +**License** + +SPAdes is developed by and copyrighted to Saint-Petersburg Academic University, and is released under GPLv2. + +This wrapper is copyrighted by Lionel Guy, and is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/. + +** Acknowledgments ** + +Anton Korobeynikov greatlty helped understanding how SPAdes work, and integrated handy features into SPAdes. + +Nicola Soranzo fixed various bugs. + + diff -r 95ddc2380130 -r ff058438080a tools/spades_3_0/tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/spades_3_0/tool_dependencies.xml Wed Feb 05 05:19:03 2014 -0500 @@ -0,0 +1,33 @@ + + + + + + http://spades.bioinf.spbau.ru/release3.0.0/SPAdes-3.0.0-Linux.tar.gz + + $INSTALL_DIR/bin + $INSTALL_DIR/share + + bin + $INSTALL_DIR/bin + + + share + $INSTALL_DIR/share + + + + + + + + + +This installs SPAdes 3.0.0. + +See manual here http://spades.bioinf.spbau.ru/release3.0.0/manual.html +See also here http://bioinf.spbau.ru/en/spades + + + +