changeset 1:0f8b2da62d7d draft

Support for SPAdes 2.5.0. Added a tab-separated output with coverage vs. length info for each contig.
author lionelguy
date Mon, 19 Aug 2013 09:06:17 -0400
parents a407a6ed437d
children b5ce24f34dd7
files tools/spades_2_4/spades.pl tools/spades_2_4/spades.xml tools/spades_2_4/tool_dependencies.xml tools/spades_2_5/spades.pl tools/spades_2_5/spades.xml tools/spades_2_5/tool_dependencies.xml
diffstat 6 files changed, 228 insertions(+), 184 deletions(-) [+]
line wrap: on
line diff
--- a/tools/spades_2_4/spades.pl	Wed Jul 17 06:07:29 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,47 +0,0 @@
-#!/usr/bin/env perl
-## A wrapper script to call spades.py and collect its output
-use strict;
-use warnings;
-use File::Temp qw/ tempfile tempdir /;
-use File::Copy;
-
-# Parse arguments
-my ($out_contigs_file, $out_scaffolds_file, $out_log_file, @sysargs) = @ARGV;
-
-# Create temporary folder to store files, delete after use
-#my $output_dir = tempdir( CLEANUP => 0 );
-my $output_dir = tempdir( CLEANUP => 1 );
-# Link "dat" files as fastq, otherwise spades complains about file format
-
-# Create log handle
-open my $log, '>', $out_log_file or die "Cannot write to $out_log_file: $?\n";
-
-# Run program
-# To do: record time
-&runSpades(@sysargs);
-&collectOutput();
-print $log "Done\n";
-close $log;
-exit 0;
-
-# Run spades
-sub runSpades {
-    my $cmd = join(" ", @_) . " -o $output_dir";
-    my $return_code = system($cmd);
-    if ($return_code) {
-	print $log "Failed with code $return_code\nCommand $cmd\nMessage: $?\n";
-	die "Failed with code $return_code\nCommand $cmd\nMessage: $?\n";
-    }
-    return 0;
-}
-# Collect output
-sub collectOutput{
-    # To do: check that the files are there
-    # Collects output
-    move "$output_dir/contigs.fasta", $out_contigs_file;
-    move "$output_dir/scaffolds.fasta", $out_scaffolds_file;
-    open LOG, '<', "$output_dir/spades.log" 
-	or die "Cannot open log file $output_dir/spades.log: $?";
-    print $log $_ while (<LOG>);
-    return 0;
-}
--- a/tools/spades_2_4/spades.xml	Wed Jul 17 06:07:29 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,104 +0,0 @@
-<tool id="spades" name="spades" version="0.2">
-  <description>SPAdes genome assembler for regular and single-cell projects</description>
-  <requirements>
-    <requirement type="package" version="2.4.0">spades</requirement>
-  </requirements>
-  <command interpreter="perl">spades.pl $out_contigs $out_scaffolds $out_log 
-    ## A real command looks like: spades.py -k 21,33,55,77,99,127 --careful -1 Y.fastq.gz -2 X.fastq.gz -t 24 -o output
-    spades.py
-    ## TODO: kmers, threads, other options (-sc for single-cell)
-    #if $sc == "true":
-      --sc 
-    #end if
-    #if $careful == "true":
-      --careful 
-    #end if
-    #if $rectangle == "true" 
-      --rectangle 
-    #end if 
-    -t $threads 
-    -k $kmers 
-    -i $iterations 
-    ##--phred-offset
-    ## Sequence files
-    #for $i, $s in enumerate( $reads )
-      #if $s.read_type.type == "pairedend"
-      -1 $s.read_type.fwd_reads
-      -2 $s.read_type.rev_reads
-      #elif $s.read_type.type == "interleaved"
-      --12 $s.read_type.interleaved_reads
-      #elif $s.read_type.type == "unpaired"
-      -s $s.read_type.unpaired_reads
-      #end if
-    #end for
-  </command>
-  <inputs>
-    <param name="sc" type="select" label="Single-cell?" help="This flag is required for MDA (single-cell) data.">
-      <option value="false">No</option>
-      <option value="true">Yes</option>
-    </param>
-    <param name="careful" type="select" label="Careful correction?" help="Tries to reduce number of mismatches and short indels. Also runs MismatchCorrector – a post processing tool, which uses BWA tool (comes with SPAdes).">
-      <option value="false">No</option>
-      <option value="true" selected="true">Yes</option>
-    </param>
-    <param name="rectangle" type="select" label="Use rectangle correction for repeat resolution?" help="Uses rectangle graph algorithm for repeat resolution stage instead of usual SPAdes repeat resolution module (experimental).">
-      <option value="false" selected="true">No</option>
-      <option value="true">Yes</option>
-    </param>
-    <param name="threads" type="integer" label="Number of threads to use" value="16">
-    </param>
-    <param name="iterations" type="integer" label="Number of iterations for read error correction." value="1">
-    </param>    
-    <param name="kmers" type="text" label="K-mers to use, separated by commas" value="21,33,55" help="Comma-separated list of k-mer sizes to be used (all values must be odd, less than 128 and listed in ascending order). The default value is 21,33,55." >
-    </param>
-    <!-- Reads -->
-    <repeat name="reads" title="Reads">
-      <conditional name="read_type">
-	<param name="type" type="select" label="Select type of reads">
-	  <option value="pairedend">Paired-end, separate inputs</option>
-	  <option value="interleaved">Paired-end, interleaved</option>
-	  <option value="unpaired">Unpaired reads</option>
-	</param>
-	<when value="pairedend">
-	  <param name="fwd_reads" type="data" format="fastq" label="Forward reads" help="FASTQ format" />
-	  <param name="rev_reads" type="data" format="fastq" label="Reverse reads" help="FASTQ format" />
-	</when>
-	<when value="interleaved">
-	  <param name="interleaved_reads" type="data" format="fastq" label="Interleaved paired reads" help="FASTQ format" />
-	</when>
-	<when value="unpaired">
-	  <param name="unpaired_reads" type="data" format="fastq" label="Unpaired reads" help="FASTQ format" />
-	</when>
-      </conditional>
-    </repeat>
-  </inputs>
-  <outputs>
-    <data name="out_contigs" format="fasta" label="SPAdes contigs (fasta)" />
-    <data name="out_scaffolds" format="fasta" label="SPAdes scaffolds (fasta)" />
-    <data name="out_log" format="txt" label="SPAdes log" />
-  </outputs>
-  <tests>
-    <test>
-      <!-- Based on the tests coming along with SPAdes -->
-      <param name="sc" value="false" />
-      <param name="careful" value="false" />
-      <param name="rectangle" value="false" />
-      <param name="threads" value="16" />
-      <param name="kmers" value="33,55" />
-      <param name="type" value="pairedend" />
-      <param name="fwd_reads" value="ecoli_1K_1.fq" ftype="fastq" />
-      <param name="rev_reads" value="ecoli_1K_2.fq" ftype="fastq" />
-      <output name="out_contigs" file="reference_1K.fa" ftype="fasta" compare="re_match" lines_diff="1" />
-    </test>
-  </tests>
-  <help>
-**What it does**
-
-Runs SPAdes 2.4.0, collects the output, and throws away all the temporary files.
-
-**Citation**
-
-Anton Bankevich, Sergey Nurk, Dmitry Antipov, Alexey A. Gurevich, Mikhail Dvorkin, Alexander S. Kulikov, Valery M. Lesin, Sergey I. Nikolenko, Son Pham, Andrey D. Prjibelski, Alexey V. Pyshkin, Alexander V. Sirotkin, Nikolay Vyahhi, Glenn Tesler, Max A. Alekseyev, and Pavel A. Pevzner. Journal of Computational Biology. May 2012, 19(5): 455-477. doi:10.1089/cmb.2012.0021. 
-    
-  </help>
-</tool>
--- a/tools/spades_2_4/tool_dependencies.xml	Wed Jul 17 06:07:29 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,33 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="spades" version="2.4.0">
-        <install version="1.0">
-            <actions>
-                <action type="download_by_url">http://spades.bioinf.spbau.ru/release2.4.0/SPAdes-2.4.0-Linux.tar.gz</action>
-                <!-- Copying the whole bin folder -->
-		<action type="make_directory">$INSTALL_DIR/bin</action>
-		<action type="make_directory">$INSTALL_DIR/share</action>
-		<action type="move_directory_files">
-		  <source_directory>bin</source_directory>
-		  <destination_directory>$INSTALL_DIR/bin</destination_directory>
-		</action>
-                <action type="move_directory_files">
-		  <source_directory>share</source_directory>
-		  <destination_directory>$INSTALL_DIR/share</destination_directory>
-		</action>
-		<!-- This is required to have spades accept .dat files -->
-		<action type="shell_command">sed -i -e "s/\('\.fa', '\.fasta', '\.fq', '\.fastq', '\.gz'\)/\\1, '.dat'/" $INSTALL_DIR/bin/spades.py</action>
-                <action type="set_environment">
-                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
-                </action>
-            </actions>
-        </install>
-        <readme>
-This installs SPAdes 2.4.0.
-
-See manual here http://spades.bioinf.spbau.ru/release2.4.0/manual.html
-See also here http://bioinf.spbau.ru/en/spades
-        </readme>
-    </package>
-</tool_dependency>
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/spades_2_5/spades.pl	Mon Aug 19 09:06:17 2013 -0400
@@ -0,0 +1,84 @@
+#!/usr/bin/env perl
+## A wrapper script to call spades.py and collect its output
+use strict;
+use warnings;
+use File::Temp qw/ tempfile tempdir /;
+use File::Copy;
+use Getopt::Long;
+
+# Parse arguments: five output paths, followed by the complete spades.py command line
+my ($out_contigs_file,
+    $out_contigs_stats,
+    $out_scaffolds_file,
+    $out_scaffolds_stats,
+    $out_log_file,
+    @sysargs) = @ARGV;
+die "Usage: spades.pl contigs contig_stats scaffolds scaffold_stats log command...\n" unless @sysargs;
+## GetOptions not compatible with parsing the rest of the arguments in an array.
+## Keeping the not-so-nice parse-in-one-go method, without named arguments.
+# GetOptions(
+#     'contigs-file=s'    => \$out_contigs_file,
+#     'contigs-stats=s'   => \$out_contigs_stats,
+#     'scaffolds-file=s'  => \$out_scaffolds_file,
+#     'scaffolds-stats=s' => \$out_scaffolds_stats,
+#     'out_log_file=s'    => \$out_log_file,
+# );
+
+# my @sysargs = @ARGV;
+
+# Create temporary folder to store files, delete after use
+#my $output_dir = tempdir( CLEANUP => 0 );
+my $output_dir = tempdir( CLEANUP => 1 );
+# ".dat" inputs need no linking here: tool_dependencies.xml patches SPAdes to accept that extension
+
+# Create log handle; $! (not $?) carries the OS error of a failed open
+open my $log, '>', $out_log_file or die "Cannot write to $out_log_file: $!\n";
+
+# Run program: assemble, collect the FASTA outputs and the SPAdes log, then write the stats tables
+# To do: record time
+runSpades(@sysargs);
+collectOutput();
+extractCoverageLength($out_contigs_file, $out_contigs_stats);
+extractCoverageLength($out_scaffolds_file, $out_scaffolds_stats);
+print $log "Done\n";
+close $log or die "Cannot close $out_log_file: $!\n";
+exit 0;
+
+# Run spades: joins @_ into a shell command, appends "-o $output_dir", runs it; returns 0, or logs to $log and dies
+sub runSpades {
+    my $cmd = join(" ", @_) . " -o $output_dir";
+    my $return_code = system($cmd);
+    return 0 unless $return_code;
+    # system() gives -1 (reason in $!) when the command cannot start, otherwise the wait status: exit code in the high byte, signal in the low 7 bits
+    my $msg = "Failed with code $return_code\nCommand $cmd\nMessage: " . ($return_code == -1 ? "could not execute: $!" : "exit status " . ($return_code >> 8) . ", signal " . ($return_code & 127)) . "\n";
+    print $log $msg;
+    die $msg;
+}
+
+# Collect output: move the assembled FASTA files to their output paths and append spades.log to $log
+sub collectOutput{
+    # A failed move is reported in the log instead of being silently ignored; it is still not fatal
+    move("$output_dir/contigs.fasta", $out_contigs_file) or print $log "Cannot move contigs.fasta to $out_contigs_file: $!\n";
+    move("$output_dir/scaffolds.fasta", $out_scaffolds_file) or print $log "Cannot move scaffolds.fasta to $out_scaffolds_file: $!\n";
+    open my $spades_log, '<', "$output_dir/spades.log"
+	or die "Cannot open log file $output_dir/spades.log: $!";
+    print $log $_ while (<$spades_log>);
+    close $spades_log;
+    return 0;
+}
+
+# Extract: for each header of FASTA file $in (expected form ">NODE_<n>_length_<len>_cov_<cov>"), write "n<TAB>len<TAB>cov" to $out; dies on I/O or parse errors
+sub extractCoverageLength{
+    my ($in, $out) = @_;
+    open my $fasta, '<', $in or die "Cannot read $in: $!\n";
+    open my $tab, '>', $out or die "Cannot write to $out: $!\n";
+    while (my $line = <$fasta>){
+	next unless $line =~ /^>/;
+	chomp $line;
+	my (undef, $n, undef, $len, undef, $cov) = split(/_/, (split(/\s/, $line))[0]);
+	# Test for presence rather than truth, so that a legitimate value of 0 is accepted
+	die "Not all elements found in $line\n" unless 3 == grep { defined($_) && length($_) } ($n, $len, $cov);
+	print $tab "$n\t$len\t$cov\n";
+    }
+    close $tab or die "Cannot close $out: $!\n";    # buffered write errors surface here; $fasta closes at scope exit
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/spades_2_5/spades.xml	Mon Aug 19 09:06:17 2013 -0400
@@ -0,0 +1,111 @@
+<tool id="spades" name="spades" version="0.4">
+  <description>SPAdes genome assembler for regular and single-cell projects</description>
+  <requirements>
+    <requirement type="package" version="2.5.0">spades</requirement>
+  </requirements>
+  <command interpreter="perl">spades.pl 
+     $out_contigs 
+     $out_contig_stats 
+     $out_scaffolds 
+     $out_scaffold_stats 
+     $out_log 
+    ## A real command looks like: spades.py -k 21,33,55,77,99,127 --careful -1 Y.fastq.gz -2 X.fastq.gz -t 24 -o output
+    spades.py
+    ## TODO: kmers, threads, other options (--sc for single-cell)
+    #if $sc == "true":
+      --sc 
+    #end if
+    #if $careful == "true":
+      --careful 
+    #end if
+    #if $rectangle == "true" 
+      --rectangle 
+    #end if 
+    -t $threads 
+    -k $kmers 
+    -i $iterations 
+    ##--phred-offset
+    ## Sequence files
+    #for $i, $s in enumerate( $reads )
+      #if $s.read_type.type == "pairedend"
+      -1 $s.read_type.fwd_reads
+      -2 $s.read_type.rev_reads
+      #elif $s.read_type.type == "interleaved"
+      --12 $s.read_type.interleaved_reads
+      #elif $s.read_type.type == "unpaired"
+      -s $s.read_type.unpaired_reads
+      #end if
+    #end for
+  </command>
+  <inputs>
+    <param name="sc" type="select" label="Single-cell?" help="This flag is required for MDA (single-cell) data.">
+      <option value="false">No</option>
+      <option value="true">Yes</option>
+    </param>
+    <param name="careful" type="select" label="Careful correction?" help="Tries to reduce number of mismatches and short indels. Also runs MismatchCorrector – a post processing tool, which uses BWA tool (comes with SPAdes).">
+      <option value="false">No</option>
+      <option value="true" selected="true">Yes</option>
+    </param>
+    <param name="rectangle" type="select" label="Use rectangle correction for repeat resolution?" help="Uses rectangle graph algorithm for repeat resolution stage instead of usual SPAdes repeat resolution module (experimental).">
+      <option value="false" selected="true">No</option>
+      <option value="true">Yes</option>
+    </param>
+    <param name="threads" type="integer" label="Number of threads to use" value="16">
+    </param>
+    <param name="iterations" type="integer" label="Number of iterations for read error correction." value="1">
+    </param>    
+    <param name="kmers" type="text" label="K-mers to use, separated by commas" value="21,33,55" help="Comma-separated list of k-mer sizes to be used (all values must be odd, less than 128 and listed in ascending order). The default value is 21,33,55." >
+    </param>
+    <!-- Reads -->
+    <repeat name="reads" title="Reads">
+      <conditional name="read_type">
+	<param name="type" type="select" label="Select type of reads">
+	  <option value="pairedend">Paired-end, separate inputs</option>
+	  <option value="interleaved">Paired-end, interleaved</option>
+	  <option value="unpaired">Unpaired reads</option>
+	</param>
+	<when value="pairedend">
+	  <param name="fwd_reads" type="data" format="fastq" label="Forward reads" help="FASTQ format" />
+	  <param name="rev_reads" type="data" format="fastq" label="Reverse reads" help="FASTQ format" />
+	</when>
+	<when value="interleaved">
+	  <param name="interleaved_reads" type="data" format="fastq" label="Interleaved paired reads" help="FASTQ format" />
+	</when>
+	<when value="unpaired">
+	  <param name="unpaired_reads" type="data" format="fastq" label="Unpaired reads" help="FASTQ format" />
+	</when>
+      </conditional>
+    </repeat>
+  </inputs>
+  <outputs>
+    <data name="out_contigs" format="fasta" label="SPAdes contigs (fasta)" />
+    <data name="out_contig_stats" format="tabular" label="SPAdes contig stats" />
+    <data name="out_scaffolds" format="fasta" label="SPAdes scaffolds (fasta)" />
+    <data name="out_scaffold_stats" format="tabular" label="SPAdes scaffold stats" />
+    <data name="out_log" format="txt" label="SPAdes log" />
+  </outputs>
+  <tests>
+    <test>
+      <!-- Based on the tests coming along with SPAdes -->
+      <param name="sc" value="false" />
+      <param name="careful" value="false" />
+      <param name="rectangle" value="false" />
+      <param name="threads" value="16" />
+      <param name="kmers" value="33,55" />
+      <param name="type" value="pairedend" />
+      <param name="fwd_reads" value="ecoli_1K_1.fq" ftype="fastq" />
+      <param name="rev_reads" value="ecoli_1K_2.fq" ftype="fastq" />
+      <output name="out_contigs" file="reference_1K.fa" ftype="fasta" compare="re_match" lines_diff="1" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+Runs SPAdes 2.5.0, collects the output, and throws away all the temporary files. It also produces two tab-separated files (one for contigs, one for scaffolds) giving the node number, length and coverage of each sequence.
+
+**Citation**
+
+Anton Bankevich, Sergey Nurk, Dmitry Antipov, Alexey A. Gurevich, Mikhail Dvorkin, Alexander S. Kulikov, Valery M. Lesin, Sergey I. Nikolenko, Son Pham, Andrey D. Prjibelski, Alexey V. Pyshkin, Alexander V. Sirotkin, Nikolay Vyahhi, Glenn Tesler, Max A. Alekseyev, and Pavel A. Pevzner. Journal of Computational Biology. May 2012, 19(5): 455-477. doi:10.1089/cmb.2012.0021. 
+    
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/spades_2_5/tool_dependencies.xml	Mon Aug 19 09:06:17 2013 -0400
@@ -0,0 +1,33 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="spades" version="2.5.0">
+        <install version="1.0">
+            <actions>
+                <action type="download_by_url">http://spades.bioinf.spbau.ru/release2.5.0/SPAdes-2.5.0-Linux.tar.gz</action>
+                <!-- Copying the whole bin folder -->
+		<action type="make_directory">$INSTALL_DIR/bin</action>
+		<action type="make_directory">$INSTALL_DIR/share</action>
+		<action type="move_directory_files">
+		  <source_directory>bin</source_directory>
+		  <destination_directory>$INSTALL_DIR/bin</destination_directory>
+		</action>
+                <action type="move_directory_files">
+		  <source_directory>share</source_directory>
+		  <destination_directory>$INSTALL_DIR/share</destination_directory>
+		</action>
+		<!-- This is required to have spades accept .dat files -->
+		<action type="shell_command">sed -i -e "s/\('\.fa', '\.fasta', '\.fq', '\.fastq', '\.gz'\)/\\1, '.dat'/" $INSTALL_DIR/share/spades/spades_pipeline/support.py</action>
+                <action type="set_environment">
+                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
+                </action>
+            </actions>
+        </install>
+        <readme>
+This installs SPAdes 2.5.0.
+
+See manual here http://spades.bioinf.spbau.ru/release2.5.0/manual.html
+See also here http://bioinf.spbau.ru/en/spades
+        </readme>
+    </package>
+</tool_dependency>
+