changeset 2:b5ce24f34dd7 draft

Uploaded
author lionelguy
date Thu, 05 Sep 2013 07:43:48 -0400
parents 0f8b2da62d7d
children d82f18c76309
files tools/spades_2_5/filter_spades_output.pl tools/spades_2_5/filter_spades_output.xml tools/spades_2_5/plot_spades_stats.xml tools/spades_2_5/r_wrapper.sh tools/spades_2_5/spades.pl tools/spades_2_5/spades.xml
diffstat 6 files changed, 304 insertions(+), 52 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/spades_2_5/filter_spades_output.pl	Thu Sep 05 07:43:48 2013 -0400
@@ -0,0 +1,106 @@
+#!/usr/bin/perl -w
+
+=head1 SYNOPSIS
+
+filter_spades_output.pl - Filters contigs or scaffolds based on contig length and coverage.
+
+=head1 USAGE
+
+filter_spades_output.pl [-c|--coverage-cutoff] [-l|--length-cutoff] [-o|--filtered-out out.fasta] -t|--tab stats.tab seqs.fasta
+
+=head1 INPUT
+
+=head2 [-c|--coverage-cutoff]
+
+Mininum coverage. Contigs with lower coverage will be discarded. Default 10.
+
+=head2 [-l|--length-cutoff]
+
+Mininum coverage. Smaller ontigs will be discarded. Default 500.
+
+=head2 -t|--tab stats.tab
+
+A tabular file, with three columns: contig name, length, and coverage:
+
+NODE_1	31438	24.5116
+NODE_2	31354	2316.96
+NODE_3	26948	82.3294
+
+Such a file is produced by spades.xml. Contigs should be in the same order as in the fasta file.
+
+=head2 [-o|--filtered-out out.fasta]
+
+If specified, filtered out sequences will be written to this file.
+
+=head2 seqs.fasta
+
+Sequences in fasta format. Start of IDs must match ids in the tabular file. 
+
+=head1 OUTPUT
+
+A fasta file on stdout.
+
+=head1 AUTHOR
+
+Lionel Guy (lionel.guy@icm.uu.se)
+
+=head1 DATE
+
+Thu Aug 29 13:51:13 CEST 2013
+
+=cut
+
+# libraries
+use strict;
+use Getopt::Long;
+use Bio::SeqIO;
+
+my $coverage_co = 10;
+my $length_co = 500;
+my $out_filtered;
+my $tab_file;
+
+GetOptions(
+    'c|coverage-cutoff=s' => \$coverage_co,
+    'l|length-cutoff=s' => \$length_co,
+    'o|filtered-out=s' => \$out_filtered,
+    't|tab=s' => \$tab_file,
+);
+my $fasta_file = shift(@ARGV);
+die ("No tab file specified") unless ($tab_file);
+die ("No fasta file specified") unless ($fasta_file);
+
+## Read tab file, discard rows with comments
+open TAB, '<', $tab_file or die "$?";
+my @stats; 
+while (<TAB>){
+    chomp;
+    push @stats, $_ unless (/^#/);
+}
+
+## Read fasta
+my $seq_in = Bio::SeqIO->new(-file => $fasta_file, 
+			     -format => 'fasta');
+my $seq_out = Bio::SeqIO->new(-fh => \*STDOUT, 
+			      -format => 'fasta');
+my $seq_out_filt = Bio::SeqIO->new(-file => ">$out_filtered", 
+				   -format => 'fasta') if ($out_filtered);
+while (my $seq = $seq_in->next_seq){
+    my $stat = shift @stats;
+    die "Less rows in tab than sequences in seq file" unless $stat;
+    my ($id_tab, $length, $coverage) = split(/\t+/, $stat);
+    die "id, length or coverate not defined at $stat\n" 
+	unless ($id_tab && $length && $coverage);
+    my $id_seq = $seq->id;
+    die "Unmatched ids $id_seq and $id_tab\n" unless ($id_seq =~ /^$id_tab/i);
+    if ($length >= $length_co && $coverage >= $coverage_co){
+	$seq_out->write_seq($seq);
+    } elsif ($out_filtered){
+	$seq_out_filt->write_seq($seq);
+    } else {
+	# do nothing
+    }
+}
+die "More rows in tab than sequences in seq file" if (scalar(@stats) > 0);
+exit 0;
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/spades_2_5/filter_spades_output.xml	Thu Sep 05 07:43:48 2013 -0400
@@ -0,0 +1,33 @@
+<tool id="filter_spades_output" name="Filter SPAdes output" version="0.1">
+  <description>remove low coverage and short contigs/scaffolds</description>
+  <command interpreter="perl">filter_spades_output.pl 
+  --coverage-cutoff $coverage_co
+  --length-cutoff $length_co
+  #if $keep_leftover
+    --filtered-out $filtered_out
+  #end if
+  --tab $stats_in
+  $fasta_in > $fasta_output
+  </command>
+  
+  <inputs>
+    <param name="fasta_in" type="data" format="fasta" label="Sequences" help="Contigs or scaffolds. Make sure you input the corresponding stat file" />
+    <param name="stats_in" type="data" format="tabular" label="Contig stats" />
+    <param name="length_co" type="integer" value="1000" min="0" label="Length cut-off" help="Contigs with length under that value are shown in red" />
+    <param name="coverage_co" type="integer" value="10" min="0" label="Coverage cut-off" help="Contigs with length under that value are shown in red" />
+    <param name="keep_leftover" type="boolean" checked="false" label="Save filtered-out sequences?" />
+  </inputs>
+  <outputs>
+    <data format="fasta" name="fasta_output" label="Filtered sequences" />
+    <data format="fasta" name="filtered_out" label="Discarded sequences"> 
+      <filter>keep_leftover == "true"</filter>
+    </data>
+  </outputs>
+  <help>
+**What it does**
+
+Using the output of SPAdes (a fasta and stats file, either from contigs or scaffolds), it filters the fasta files, discarding all sequences that are under a given length or under a given coverage.
+
+Typically, this is used to discard short contigs, or contaminations. To dispaly a coverage vs. length plot, use the "SPAdes stats" tool in the same package.
+  </help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/spades_2_5/plot_spades_stats.xml	Thu Sep 05 07:43:48 2013 -0400
@@ -0,0 +1,80 @@
+<tool id="plot_spades_stats" name="SPAdes stats" version="0.1">
+  <description>coverage vs. length plot</description>
+  <requirements>
+    <requirement type="package">R</requirement>  
+  </requirements>
+  <command interpreter="bash">r_wrapper.sh $script_file</command>
+
+  <inputs>
+    <param name="input_scaffolds" type="data" format="tabular" label="Scaffold stats"/>
+    <param name="input_contigs" type="data" format="tabular" label="Contig stats"/>
+    <param name="length_co" type="integer" value="1000" min="0" label="Length cut-off" help="Contigs with length under that value are shown in red"/>
+    <param name="coverage_co" type="integer" value="10" min="0" label="Coverage cut-off" help="Contigs with length under that value are shown in red"/>
+  </inputs>
+  <configfiles>
+    <configfile name="script_file">
+## Setup R error handling to go to stderr
+options( show.error.messages=F, 
+  error = function () { 
+    cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) 
+} )
+files = c("${input_contigs}", "${input_scaffolds}")
+types = c("Contigs", "Scaffolds")
+
+## Start plotting device
+png("${out_file}", w=500, h=1000)
+par(mfrow=c(2,1))
+
+## Loop over the two files
+for (i in 1:length(types)){
+  seqs = read.table(files[i], header=FALSE, comment.char="#")
+  colnames = c("name", "length", "coverage")
+  names(seqs) = colnames
+
+  ## Stats over all sequences 
+  sl_all = sort(seqs\$length, decreasing=TRUE)
+  cs_all = cumsum(sl_all)
+  s_all = sum(seqs\$length)
+  n50_idx_all = which.min(sl_all[cs_all &lt; 0.5*s_all])
+  n90_idx_all = which.min(sl_all[cs_all &lt; 0.9*s_all])
+  n50_all = sl_all[n50_idx_all]
+  n90_all = sl_all[n90_idx_all]
+  
+  ## Filter short seqs, redo stats
+  seqs_filt = seqs[seqs\$length >= ${length_co} &amp; seqs\$coverage >= ${coverage_co},]
+  if (nrow(seqs_filt) > 0){
+    sl_filt = sort(seqs_filt\$length, decreasing=TRUE)
+    cs_filt = cumsum(sl_filt)
+    s_filt = sum(seqs_filt\$length)
+    n50_idx_filt = which.min(sl_filt[cs_filt &lt; 0.5*s_filt])
+    n90_idx_filt = which.min(sl_filt[cs_filt &lt; 0.9*s_filt])
+    n50_filt = sl_filt[n50_idx_filt]
+    n90_filt = sl_filt[n90_idx_filt]
+  }
+  seqs_bad = seqs[seqs\$length &lt; ${length_co} | seqs\$coverage &lt; ${coverage_co},]
+
+  ## Length vs coverage
+  plot(length~coverage, data=seqs, log="xy", type="n", main=paste(types[i], ": coverage vs. length", sep=""), xlab="Coverage", ylab="Length")
+  if (nrow(seqs_bad) > 0){
+    points(length~coverage, data=seqs_bad, cex=0.5, col="red")
+  }
+  if (nrow(seqs_filt) > 0){
+    points(length~coverage, data=seqs_filt, cex=0.5, col="black")
+  }
+  abline(v=${coverage_co}, h=${length_co}, lty=2, col=grey(0.3))
+  legend(x="topleft", legend=c("Before/after filtering", paste(c("N50: ", "N90: ", "Median cov.: "), c(n50_all, n90_all, round(median(seqs\$coverage))), rep("/", 3), c(n50_filt, n90_filt, round(median(seqs_filt\$coverage))), sep="")), cex=0.8)
+}
+dev.off()
+    </configfile>
+  </configfiles>
+  <outputs>
+    <data format="png" name="out_file" />
+  </outputs>
+  <help>
+**What it does**
+
+Using the output of SPAdes (a pair of fasta file and stat file for each of the contigs and scaffolds), it produces a coverage vs. contig plot. Each dot represent a contig/scaffold. Given a coverage and a length cutoff, sequences that do not meet those criteria are shown in red. Some statistics are also given (N50, N90, median contig/scaffold length) both before and after filtering.
+
+Use the "filter SPAdes output" tool to actually filter sequences.
+  </help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/spades_2_5/r_wrapper.sh	Thu Sep 05 07:43:48 2013 -0400
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+### Run R providing the R script in $1 as standard input and passing 
+### the remaining arguments on the command line
+
+# Function that writes a message to stderr and exits
+function fail
+{
+    echo "$@" >&2
+    exit 1
+}
+
+# Ensure R executable is found
+which R > /dev/null || fail "'R' is required by this tool but was not found on path" 
+
+# Extract first argument
+infile=$1; shift
+
+# Ensure the file exists
+test -f $infile || fail "R input file '$infile' does not exist"
+
+# Invoke R passing file named by first argument to stdin
+R --vanilla --slave $* < $infile
--- a/tools/spades_2_5/spades.pl	Mon Aug 19 09:06:17 2013 -0400
+++ b/tools/spades_2_5/spades.pl	Thu Sep 05 07:43:48 2013 -0400
@@ -28,7 +28,7 @@
 
 # Create temporary folder to store files, delete after use
 #my $output_dir = tempdir( CLEANUP => 0 );
-my $output_dir = tempdir( CLEANUP => 1 );
+my $output_dir = 'output_dir';
 # Link "dat" files as fastq, otherwise spades complains about file format
 
 # Create log handle
@@ -72,13 +72,14 @@
     my ($in, $out) = @_;
     open FASTA, '<', $in or die $!;
     open TAB, '>', $out or die $!;
+    print TAB "#name\tlength\tcoverage\n";
     while (<FASTA>){
 	next unless /^>/;
 	chomp;
 	my @a = split(/\s/, $_);
 	my ($NODE, $n, $LENGTH, $l, $COV, $cov) = split(/_/, $a[0]);
 	die "Not all elements found in $_\n" unless ($n && $l && $cov);
-	print TAB "$n\t$l\t$cov\n";
+	print TAB "NODE_$n\t$l\t$cov\n";
     }
     close TAB;
 }
--- a/tools/spades_2_5/spades.xml	Mon Aug 19 09:06:17 2013 -0400
+++ b/tools/spades_2_5/spades.xml	Thu Sep 05 07:43:48 2013 -0400
@@ -1,4 +1,4 @@
-<tool id="spades" name="spades" version="0.4">
+<tool id="spades" name="spades" version="0.5">
   <description>SPAdes genome assembler for regular and single-cell projects</description>
   <requirements>
     <requirement type="package" version="2.5.0">spades</requirement>
@@ -11,70 +11,79 @@
      $out_log 
     ## A real command looks like: spades.py -k 21,33,55,77,99,127 --careful -1 Y.fastq.gz -2 X.fastq.gz -t 24 -o output
     spades.py
-    ## TODO: kmers, threads, other options (-sc for single-cell)
-    #if $sc == "true":
-      --sc 
-    #end if
-    #if $careful == "true":
-      --careful 
-    #end if
-    #if $rectangle == "true" 
-      --rectangle 
-    #end if 
+    $sc
+    $onlyassembler
+    $careful
+    $rectangle
     -t $threads 
     -k $kmers 
     -i $iterations 
     ##--phred-offset
     ## Sequence files
-    #for $i, $s in enumerate( $reads )
-      #if $s.read_type.type == "pairedend"
-      -1 $s.read_type.fwd_reads
-      -2 $s.read_type.rev_reads
-      #elif $s.read_type.type == "interleaved"
-      --12 $s.read_type.interleaved_reads
-      #elif $s.read_type.type == "unpaired"
-      -s $s.read_type.unpaired_reads
+    #for $i, $library in enumerate( $libraries )
+      #set num=$i+1
+      #if str( $library.lib_type ) == "paired_end":
+        #set prefix = 'pe'
+      #else:
+        #set prefix = 'mp'
       #end if
+      --$prefix$num-$library.orientation    
+      #for $file in $library.files
+	#if $file.file_type.type == "separate"
+          --$prefix$num-1 $file.file_type.fwd_reads
+          --$prefix$num-2 $file.file_type.rev_reads
+        #elif $file.file_type.type == "interleaved"
+          --$prefix$num-12 $file.file_type.interleaved_reads
+        #elif $file.file_type.type == "unpaired"
+          --$prefix$num-s $file.file_type.unpaired_reads
+        #end if
+      #end for
     #end for
   </command>
   <inputs>
-    <param name="sc" type="select" label="Single-cell?" help="This flag is required for MDA (single-cell) data.">
+    <param name="sc" type="boolean" truevalue="--sc" falsevalue="" label="Single-cell?" help="This option is required for MDA (single-cell) data.">
       <option value="false">No</option>
       <option value="true">Yes</option>
     </param>
-    <param name="careful" type="select" label="Careful correction?" help="Tries to reduce number of mismatches and short indels. Also runs MismatchCorrector – a post processing tool, which uses BWA tool (comes with SPAdes).">
-      <option value="false">No</option>
-      <option value="true" selected="true">Yes</option>
-    </param>
-    <param name="rectangle" type="select" label="Use rectangle correction for repeat resolution?" help="Uses rectangle graph algorithm for repeat resolution stage instead of usual SPAdes repeat resolution module (experimental).">
-      <option value="false" selected="true">No</option>
-      <option value="true">Yes</option>
-    </param>
+    <param name="onlyassembler" type="boolean" truevalue="--only-assembler" falsevalue="" checked="False" label="Run only assembly? (without read error correction)" />
+    <param name="careful" type="boolean" truevalue="--careful" falsevalue="" checked="True" label="Careful correction?" help="Tries to reduce number of mismatches and short indels. Also runs MismatchCorrector – a post processing tool, which uses BWA tool (comes with SPAdes)." />
+    <param name="rectangle" type="boolean" truevalue="--rectangle" falsevalue="" checked="False" label="Use rectangle correction for repeat resolution?" help="Uses rectangle graph algorithm for repeat resolution stage instead of usual SPAdes repeat resolution module (experimental, use at your own risks)." />
     <param name="threads" type="integer" label="Number of threads to use" value="16">
     </param>
     <param name="iterations" type="integer" label="Number of iterations for read error correction." value="1">
     </param>    
-    <param name="kmers" type="text" label="K-mers to use, separated by commas" value="21,33,55" help="Comma-separated list of k-mer sizes to be used (all values must be odd, less than 128 and listed in ascending order). The default value is 21,33,55." >
+    <param name="kmers" type="text" label="K-mers to use, separated by commas" value="21,33,55" help="Comma-separated list of k-mer sizes to be used (all values must be odd, less than 128, listed in ascending order, and smaller than the read length). The default value is 21,33,55." >
     </param>
     <!-- Reads -->
-    <repeat name="reads" title="Reads">
-      <conditional name="read_type">
-	<param name="type" type="select" label="Select type of reads">
-	  <option value="pairedend">Paired-end, separate inputs</option>
-	  <option value="interleaved">Paired-end, interleaved</option>
-	  <option value="unpaired">Unpaired reads</option>
-	</param>
-	<when value="pairedend">
-	  <param name="fwd_reads" type="data" format="fastq" label="Forward reads" help="FASTQ format" />
-	  <param name="rev_reads" type="data" format="fastq" label="Reverse reads" help="FASTQ format" />
-	</when>
-	<when value="interleaved">
-	  <param name="interleaved_reads" type="data" format="fastq" label="Interleaved paired reads" help="FASTQ format" />
-	</when>
-	<when value="unpaired">
-	  <param name="unpaired_reads" type="data" format="fastq" label="Unpaired reads" help="FASTQ format" />
-	</when>
-      </conditional>
+    <repeat name="libraries" title="Libraries">
+      <param name="lib_type" type="select" label="Library type">
+	<option value="paired_end">Paired end / Single reads</option>
+	<option value="mate_paired">Mate pairs</option>
+      </param>
+      <param name="orientation" type="select" label="Orientation">
+	<option value="fr" selected="true">-> &lt;- (fr)</option>
+	<option value="rf">&lt;- -> (rf)</option>
+	<option value="ff">-> -> (ff)</option>
+      </param>
+      <repeat name="files" title="Files">
+	<conditional name="file_type">
+	  <param name="type" type="select" label="Select file format">
+	    <option value="separate">Separate input files</option>
+	    <option value="interleaved">Interleaved files</option>
+	    <option value="unpaired">Unpaired/Single reads</option>
+	  </param>
+	  <when value="separate">
+	    <param name="fwd_reads" type="data" format="fastq" label="Forward reads" help="FASTQ format" />
+	    <param name="rev_reads" type="data" format="fastq" label="Reverse reads" help="FASTQ format" />
+	  </when>
+	  <when value="interleaved">
+	    <param name="interleaved_reads" type="data" format="fastq" label="Interleaved paired reads" help="FASTQ format" />
+	  </when>
+	  <when value="unpaired">
+	    <param name="unpaired_reads" type="data" format="fastq" label="Unpaired reads" help="FASTQ format" />
+	  </when>
+	</conditional>
+      </repeat>
     </repeat>
   </inputs>
   <outputs>
@@ -84,9 +93,9 @@
     <data name="out_scaffold_stats" format="tabular" label="SPAdes scaffold stats" />
     <data name="out_log" format="txt" label="SPAdes log" />
   </outputs>
-  <tests>
+  <!--  <tests>
     <test>
-      <!-- Based on the tests coming along with SPAdes -->
+      
       <param name="sc" value="false" />
       <param name="careful" value="false" />
       <param name="rectangle" value="false" />
@@ -97,7 +106,7 @@
       <param name="rev_reads" value="ecoli_1K_2.fq" ftype="fastq" />
       <output name="out_contigs" file="reference_1K.fa" ftype="fasta" compare="re_match" lines_diff="1" />
     </test>
-  </tests>
+  </tests> -->
   <help>
 **What it does**