# HG changeset patch # User takadonet # Date 1418752660 18000 # Node ID 1027f26f52d51545627c4b3f0a505b23f0497a55 # Parent 7d72faef9af0d92d76acce5115bb4cbf816594c3 Deleted selected files diff -r 7d72faef9af0 -r 1027f26f52d5 tools/spades_3_1_1/filter_spades_output.pl --- a/tools/spades_3_1_1/filter_spades_output.pl Fri Oct 10 14:40:43 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ -#!/usr/bin/env perl -w - -=head1 SYNOPSIS - -filter_spades_output.pl - Filters contigs or scaffolds based on contig length and coverage. - -=head1 USAGE - -filter_spades_output.pl [-c|--coverage-cutoff] [-l|--length-cutoff] [-o|--filtered-out out.fasta] -t|--tab stats.tab seqs.fasta - -=head1 INPUT - -=head2 [-c|--coverage-cutoff] - -Mininum coverage. Contigs with lower coverage will be discarded. Default 10. - -=head2 [-l|--length-cutoff] - -Mininum coverage. Smaller ontigs will be discarded. Default 500. - -=head2 -t|--tab stats.tab - -A tabular file, with three columns: contig name, length, and coverage: - -NODE_1 31438 24.5116 -NODE_2 31354 2316.96 -NODE_3 26948 82.3294 - -Such a file is produced by spades.xml. Contigs should be in the same order as in the fasta file. - -=head2 [-o|--filtered-out out.fasta] - -If specified, filtered out sequences will be written to this file. - -=head2 seqs.fasta - -Sequences in fasta format. Start of IDs must match ids in the tabular file. - -=head1 OUTPUT - -A fasta file on stdout. - -=head1 AUTHOR - -Lionel Guy (lionel.guy@icm.uu.se) - -=head1 DATE - -Thu Aug 29 13:51:13 CEST 2013 - -=cut - -# libraries -use strict; -use Getopt::Long; -use Bio::SeqIO; - -my $coverage_co = 10; -my $length_co = 500; -my $out_filtered; -my $tab_file; - -GetOptions( - 'c|coverage-cutoff=s' => \$coverage_co, - 'l|length-cutoff=s' => \$length_co, - 'o|filtered-out=s' => \$out_filtered, - 't|tab=s' => \$tab_file, -); -my $fasta_file = shift(@ARGV); -die ("No tab file specified") unless ($tab_file); -die ("No fasta file specified") unless ($fasta_file); - -## Read tab file, discard rows with comments -open TAB, '<', $tab_file or die "$?"; -my @stats; -while (){ - chomp; - push @stats, $_ unless (/^#/); -} - -## Read fasta -my $seq_in = Bio::SeqIO->new(-file => $fasta_file, - -format => 'fasta'); -my $seq_out = Bio::SeqIO->new(-fh => \*STDOUT, - -format => 'fasta'); -my $seq_out_filt = Bio::SeqIO->new(-file => ">$out_filtered", - -format => 'fasta') if ($out_filtered); -while (my $seq = $seq_in->next_seq){ - my $stat = shift @stats; - die "Less rows in tab than sequences in seq file" unless $stat; - my ($id_tab, $length, $coverage) = split(/\t+/, $stat); - die "id, length or coverate not defined at $stat\n" - unless ($id_tab && $length && $coverage); - my $id_seq = $seq->id; - die "Unmatched ids $id_seq and $id_tab\n" unless ($id_seq =~ /^$id_tab/i); - if ($length >= $length_co && $coverage >= $coverage_co){ - $seq_out->write_seq($seq); - } elsif ($out_filtered){ - $seq_out_filt->write_seq($seq); - } else { - # do nothing - } -} -die "More rows in tab than sequences in seq file" if (scalar(@stats) > 0); -exit 0; - diff -r 7d72faef9af0 -r 1027f26f52d5 tools/spades_3_1_1/filter_spades_output.xml --- a/tools/spades_3_1_1/filter_spades_output.xml Fri Oct 10 14:40:43 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ - - remove low coverage and short contigs/scaffolds - filter_spades_output.pl - --coverage-cutoff $coverage_co - --length-cutoff $length_co - #if $keep_leftover - --filtered-out $filtered_out - #end if - --tab $stats_in - $fasta_in > $fasta_output - - - - - - - - - - - - - keep_leftover == "true" - - - -**What it does** - -Using the output of SPAdes (a fasta and a stats file, either from contigs or scaffolds), it filters the fasta files, discarding all sequences that are under a given length or under a given coverage. - -Typically, this is used to discard short contigs, or contaminations. To display a coverage vs. length plot, use the "SPAdes stats" tool in the same package. - - diff -r 7d72faef9af0 -r 1027f26f52d5 tools/spades_3_1_1/plot_spades_stats.xml --- a/tools/spades_3_1_1/plot_spades_stats.xml Fri Oct 10 14:40:43 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,80 +0,0 @@ - - coverage vs. length plot - - R - - r_wrapper.sh $script_file - - - - - - - - - -## Setup R error handling to go to stderr -options( show.error.messages=F, - error = function () { - cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) -} ) -files = c("${input_contigs}", "${input_scaffolds}") -types = c("Contigs", "Scaffolds") - -## Start plotting device -png("${out_file}", w=500, h=1000) -par(mfrow=c(2,1)) - -## Loop over the two files -for (i in 1:length(types)){ - seqs = read.table(files[i], header=FALSE, comment.char="#") - colnames = c("name", "length", "coverage") - names(seqs) = colnames - - ## Stats over all sequences - sl_all = sort(seqs\$length, decreasing=TRUE) - cs_all = cumsum(sl_all) - s_all = sum(seqs\$length) - n50_idx_all = which.min(sl_all[cs_all < 0.5*s_all]) - n90_idx_all = which.min(sl_all[cs_all < 0.9*s_all]) - n50_all = sl_all[n50_idx_all] - n90_all = sl_all[n90_idx_all] - - ## Filter short seqs, redo stats - seqs_filt = seqs[seqs\$length >= ${length_co} & seqs\$coverage >= ${coverage_co},] - if (nrow(seqs_filt) > 0){ - sl_filt = sort(seqs_filt\$length, decreasing=TRUE) - cs_filt = cumsum(sl_filt) - s_filt = sum(seqs_filt\$length) - n50_idx_filt = which.min(sl_filt[cs_filt < 0.5*s_filt]) - n90_idx_filt = which.min(sl_filt[cs_filt < 0.9*s_filt]) - n50_filt = sl_filt[n50_idx_filt] - n90_filt = sl_filt[n90_idx_filt] - } - seqs_bad = seqs[seqs\$length < ${length_co} | seqs\$coverage < ${coverage_co},] - - ## Length vs coverage - plot(length~coverage, data=seqs, log="xy", type="n", main=paste(types[i], ": coverage vs. length", sep=""), xlab="Coverage", ylab="Length") - if (nrow(seqs_bad) > 0){ - points(length~coverage, data=seqs_bad, cex=0.5, col="red") - } - if (nrow(seqs_filt) > 0){ - points(length~coverage, data=seqs_filt, cex=0.5, col="black") - } - abline(v=${coverage_co}, h=${length_co}, lty=2, col=grey(0.3)) - legend(x="topleft", legend=c("Before/after filtering", paste(c("N50: ", "N90: ", "Median cov.: "), c(n50_all, n90_all, round(median(seqs\$coverage))), rep("/", 3), c(n50_filt, n90_filt, round(median(seqs_filt\$coverage))), sep="")), cex=0.8) -} -dev.off() - - - - - - -**What it does** - -Using the output of SPAdes (a pair of fasta file and stat file for each of the contigs and scaffolds), it produces a coverage vs. contig plot. Each dot represent a contig/scaffold. Given a coverage and a length cutoff, sequences that do not meet those criteria are shown in red. Some statistics are also given (N50, N90, median contig/scaffold length) both before and after filtering. - -Use the "filter SPAdes output" tool to actually filter sequences. - - \ No newline at end of file diff -r 7d72faef9af0 -r 1027f26f52d5 tools/spades_3_1_1/r_wrapper.sh --- a/tools/spades_3_1_1/r_wrapper.sh Fri Oct 10 14:40:43 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -#!/bin/sh - -### Run R providing the R script in $1 as standard input and passing -### the remaining arguments on the command line - -# Function that writes a message to stderr and exits -function fail -{ - echo "$@" >&2 - exit 1 -} - -# Ensure R executable is found -which R > /dev/null || fail "'R' is required by this tool but was not found on path" - -# Extract first argument -infile=$1; shift - -# Ensure the file exists -test -f $infile || fail "R input file '$infile' does not exist" - -# Invoke R passing file named by first argument to stdin -R --vanilla --slave $* < $infile diff -r 7d72faef9af0 -r 1027f26f52d5 tools/spades_3_1_1/spades.pl --- a/tools/spades_3_1_1/spades.pl Fri Oct 10 14:40:43 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,139 +0,0 @@ -#!/usr/bin/env perl -## A wrapper script to call spades.py and collect its output -use strict; -use warnings; -use File::Temp qw/ tempfile tempdir /; -use File::Copy; -use Getopt::Long; - -# Parse arguments -my ($out_contigs_file, - $out_contigs_stats, - $out_scaffolds_file, - $out_scaffolds_stats, - $out_log_file, - $new_name, - @sysargs) = @ARGV; - -## GetOptions not compatible with parsing the rest of the arguments in an array. -## Keeping the not-so-nice parse-in-one-go method, without named arguments. -# GetOptions( -# 'contigs-file=s' => \$out_contigs_file, -# 'contigs-stats=s' => \$out_contigs_stats, -# 'scaffolds-file=s' => \$out_scaffolds_file, -# 'scaffolds-stats=s' => \$out_scaffolds_stats, -# 'out_log_file=s' => \$out_log_file, -# ); - -# my @sysargs = @ARGV; - -# Create temporary folder to store files, delete after use -#my $output_dir = tempdir( CLEANUP => 0 ); -my $output_dir = 'output_dir'; -# Link "dat" files as fastq, otherwise spades complains about file format - -# Create log handle -open my $log, '>', $out_log_file or die "Cannot write to $out_log_file: $?\n"; - -# Run program -# To do: record time -runSpades(@sysargs); -collectOutput($new_name); -extractCoverageLength($out_contigs_file, $out_contigs_stats); -extractCoverageLength($out_scaffolds_file, $out_scaffolds_stats); -print $log "Done\n"; -close $log; -exit 0; - -# Run spades -sub runSpades { - my $cmd = join(" ", @_) . " -o $output_dir"; - my $return_code = system($cmd); - if ($return_code) { - print $log "Failed with code $return_code\nCommand $cmd\nMessage: $?\n"; - die "Failed with code $return_code\nCommand $cmd\nMessage: $?\n"; - } - return 0; -} - -# Collect output -sub collectOutput{ - my ($new_name) = @_; - - # To do: check that the files are there - # Collects output - if ( not -e "$output_dir/contigs.fasta") { - die "Could not find contigs.fasta file\n"; - } - if ( not -e "$output_dir/scaffolds.fasta") { - die "Could not find scaffolds.fasta file\n"; - } - - #if a new name is given for the contigs and scaffolds, change them before moving them - if ( $new_name ne 'NODE') { - renameContigs($new_name); - } - else { - move "$output_dir/contigs.fasta", $out_contigs_file; - move "$output_dir/scaffolds.fasta", $out_scaffolds_file; - } - - - - open LOG, '<', "$output_dir/spades.log" - or die "Cannot open log file $output_dir/spades.log: $?"; - print $log $_ while (); - return 0; -} - -#Change name in contig and scaffolds file -sub renameContigs{ - my ($name) = @_; - - open my $in, '<',"$output_dir/contigs.fasta" or die $!; - open my $out,'>', $out_contigs_file; - - while ( my $line = <$in>) { - #remove the NODE_ so we can rebuilt the display_id with our contig name with the contig number. - #also move the remainder of the length - if ( $line =~ />NODE_(\d+)_(.+)/) { - $line = ">$name" . "_$1 $2\n"; - } - print $out $line; - } - close $in; - close $out; - - - open $in, '<',"$output_dir/scaffolds.fasta" or die $!; - open $out,'>', $out_scaffolds_file; - - while ( my $line = <$in>) { - #remove the NODE_ so we can rebuilt the display_id with our contig name with the contig number. - #also move the remainder of the length - if ( $line =~ />NODE_(\d+)_(.+)/) { - $line = ">$name" . "_$1 $2\n"; - } - print $out $line; - } - close $in; - close $out; - -} - - -# Extract -sub extractCoverageLength{ - my ($in, $out) = @_; - open FASTA, '<', $in or die $!; - open TAB, '>', $out or die $!; - print TAB "#name\tlength\tcoverage\n"; - while (){ - next unless /^>/; - chomp; - die "Not all elements found in $_\n" if (! m/^>(NODE|\S+)_(\d+)(?:_|\s)length_(\d+)_cov_(\d+\.*\d*)/); - my ($name,$n, $l, $cov) = ($1,$2, $3, $4); - print TAB "$name" . "_$n\t$l\t$cov\n"; - } - close TAB; -} diff -r 7d72faef9af0 -r 1027f26f52d5 tools/spades_3_1_1/spades.xml --- a/tools/spades_3_1_1/spades.xml Fri Oct 10 14:40:43 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,210 +0,0 @@ - - SPAdes genome assembler for regular and single-cell projects - - spades - - spades.pl - $out_contigs - $out_contig_stats - $out_scaffolds - $out_scaffold_stats - $out_log - - ## if the first library file is a paired-collection, use the key as the name - #if $libraries[0].files[0].file_type.type == "paired-collection": - $libraries[0].files[0].file_type.fastq_collection.name - #else: - NODE - #end if - ## A real command looks like: spades.py -k 21,33,55,77,99,127 --careful -1 Y.fastq.gz -2 X.fastq.gz -t 24 -o output - spades.py - ## Forces unzipped output, faster - --disable-gzip-output - $sc - $onlyassembler - $careful - -t \${GALAXY_SLOTS:-16} - -k "$kmers" - $iontorrent - ## Sequence files, libraries - #for $i, $library in enumerate( $libraries ) - #set num=$i+1 - #if str( $library.lib_type ) == "paired_end": - #set prefix = 'pe' - #elif str( $library.lib_type ) == "mate_paired": - #set prefix = 'mp' - #else: - $set prefix = 'hqmp' - #end if - --$prefix$num-$library.orientation - #for $file in $library.files - #if $file.file_type.type == "separate" - --$prefix$num-1 fastq:$file.file_type.fwd_reads - --$prefix$num-2 fastq:$file.file_type.rev_reads - #elif $file.file_type.type == "interleaved" - --$prefix$num-12 fastq:$file.file_type.interleaved_reads - #elif $file.file_type.type == "unpaired" - --$prefix$num-s fastq:$file.file_type.unpaired_reads - #elif $file.file_type.type == "paired-collection" - --$prefix$num-1 fastq:$file.file_type.fastq_collection.forward - --$prefix$num-2 fastq:$file.file_type.fastq_collection.reverse - #end if - #end for - #end for - ## PacBio reads - #for $i, $pacbiolib in enumerate( $pacbio ) - --pacbio fastq:$pacbiolib.pacbio_reads - #end for - ## Sanger - #for $i, $sangerlib in enumerate( $sanger ) - --sanger $sangerlib.file_type.type:$sangerlib.file_type.sanger_reads - #end for - ## Contigs - #for $i, $trustedcontigs in enumerate( $trustedcontigs ) - --trusted-contigs $trustedcontigs.file_type.type:$trustedcontigs.file_type.trusted_contigs - #end for - #for $i, $untrustedcontigs in enumerate( $untrustedcontigs ) - --untrusted-contigs $untrustedcontigs.file_type.type:$untrustedcontigs.file_type.untrusted_contigs - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -SPAdes – St. Petersburg genome assembler – is intended for both standard isolates and single-cell MDA bacteria assemblies. See http://bioinf.spbau.ru/en/spades for more details on SPAdes. - -This wrapper runs SPAdes 3.1.1, collects the output, and throws away all the temporary files. It also produces a tab file with contig names, length and coverage. - -**SPAdes citation** - -Anton Bankevich, Sergey Nurk, Dmitry Antipov, Alexey A. Gurevich, Mikhail Dvorkin, Alexander S. Kulikov, Valery M. Lesin, Sergey I. Nikolenko, Son Pham, Andrey D. Prjibelski, Alexey V. Pyshkin, Alexander V. Sirotkin, Nikolay Vyahhi, Glenn Tesler, Max A. Alekseyev, and Pavel A. Pevzner. Journal of Computational Biology. May 2012, 19(5): 455-477. doi:10.1089/cmb.2012.0021. - -**License** - -SPAdes is developed by and copyrighted to Saint-Petersburg Academic University, and is released under GPLv2. - -This wrapper is copyrighted by Lionel Guy,Philip Mabon and is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/. - -** Acknowledgments ** - -Anton Korobeynikov greatlty helped understanding how SPAdes work, and integrated handy features into SPAdes. - -Nicola Soranzo fixed various bugs. - - diff -r 7d72faef9af0 -r 1027f26f52d5 tools/spades_3_1_1/tool_dependencies.xml --- a/tools/spades_3_1_1/tool_dependencies.xml Fri Oct 10 14:40:43 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ - - - - - - http://spades.bioinf.spbau.ru/release3.1.1/SPAdes-3.1.1-Linux.tar.gz - - $INSTALL_DIR/bin - $INSTALL_DIR/share - - bin - $INSTALL_DIR/bin - - - share - $INSTALL_DIR/share - - - - - - - - - -This installs SPAdes 3.1.1. - -See manual here http://spades.bioinf.spbau.ru/release3.1.1/manual.html -See also here http://bioinf.spbau.ru/en/spades - - - -