changeset 0:58e1eb37fddc draft

Uploaded
author crs4
date Tue, 15 Oct 2013 11:15:28 -0400
parents
children ccadfae70b02
files COPYING sequel_wrapper.py sequel_wrapper.xml tool_dependencies.xml
diffstat 4 files changed, 345 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/COPYING	Tue Oct 15 11:15:28 2013 -0400
@@ -0,0 +1,23 @@
+Copyright © 2013 CRS4 Srl. http://www.crs4.it/
+Created by:
+Andrea Pinna <andrea.pinna@crs4.it>
+Nicola Soranzo <nicola.soranzo@crs4.it>
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sequel_wrapper.py	Tue Oct 15 11:15:28 2013 -0400
@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+"""
+SEQuel
+version 0.2 (andrea.pinna@crs4.it)
+"""
+
+import optparse
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+import uuid
+
+def external_insert_size(prep_file):
+    """ Retrieve average external insert-size from pre-processing log """
+    with open(prep_file, 'r') as f:
+        for line in f:
+            if line[0:30] == '\tAverage external insert-size:':
+                ext_int_size = line.split(':')[1][1:-1]
+                break
+        else:
+            sys.exit('Average external insert-size not found in %s' % prep_file)
+    return ext_int_size
+
+
+def __main__():
+    # load arguments
+    print 'Parsing SEQuel input options...'
+    parser = optparse.OptionParser()
+    parser.add_option('--sequel_jar_path', dest='sequel_jar_path', help='')
+    parser.add_option('--read1', dest='read1', help='')
+    parser.add_option('--read2', dest='read2', help='')
+    parser.add_option('--contigs', dest='contigs', help='')
+    parser.add_option('--bases_length', dest='bases_length', type='int', help='')
+    parser.add_option('-t', dest='n_threads', type='int', help='')
+    parser.add_option('-p', dest='max_threads', type='int', help='')
+    parser.add_option('-u', dest='min_threads', type='int', help='')
+    parser.add_option('--kmer_size', dest='kmer_size', type='int', help='')
+    parser.add_option('--max_positional_error', dest='max_positional_error', type='int', help='')
+    parser.add_option('--min_fraction', dest='min_fraction', type='float', help='')
+    parser.add_option('--min_aln_length', dest='min_aln_length', type='int', help='')
+    parser.add_option('--min_avg_coverage', dest='min_avg_coverage', type='float', help='')
+    parser.add_option('--discard_kmers', dest='discard_kmers', type='int', help='')
+    parser.add_option('--discard_positional', dest='discard_positional', type='int', help='')
+    parser.add_option('--min_aln_score', dest='min_aln_score', type='int', help='')
+    parser.add_option('--single_cell_mode', action='store_true', dest='single_cell_mode', help='')
+    parser.add_option('--report_changes', action='store_true', dest='report_changes', help='')
+    parser.add_option('--extend_contig', action='store_true', dest='extend_contig', help='')
+    parser.add_option('--reference_genome', dest='reference_genome', help='')
+    parser.add_option('--contigs_refined', dest='contigs_refined', help='')
+    parser.add_option('--logprep', dest='logprep', help='')
+    parser.add_option('--logseq', dest='logseq', help='')
+    parser.add_option('--logfile_prep', dest='logfile_prep', help='')
+    parser.add_option('--logfile_seq', dest='logfile_seq', help='')
+    (options, args) = parser.parse_args()
+    if len(args) > 0:
+        parser.error('Wrong number of arguments')
+
+    prep_directory = os.path.join(tempfile.gettempdir(), str(uuid.uuid4())) # prep.pl dies if the directory already exists, so just define a unique name
+    seq_directory = os.path.join(tempfile.gettempdir(), str(uuid.uuid4())) # SEQuel.jar would work in another directory (with '_1' suffix) if the directory already exists, so just define a unique name
+
+    # Build SEQuel (pre-processing) command to be executed
+    # basic preprocessing
+    prep_input = "-r1 %s -r2 %s -c %s" % (options.read1, options.read2, options.contigs)
+    bases_length = "-l %d" % (options.bases_length) if options.bases_length is not None else ''
+    n_threads = "-t %d" % (options.n_threads) if options.n_threads is not None else ''
+    cmd_prep_dir = "-o %s" % (prep_directory)
+    cmd_seq_dir = "-o %s" % (seq_directory)
+    # -i INT (external insert size of paired-end reads (from prep.log)
+    # -p INT
+    max_threads = "-p %d" % (options.max_threads) if options.max_threads is not None else ''
+    # -u INT
+    min_threads = "-u %d" % (options.min_threads) if options.min_threads is not None else ''
+    # -A DIR
+    input_directory = "-A %s" % (prep_directory)
+    # -k INT
+    kmer_size = "-k %d" % (options.kmer_size) if options.kmer_size is not None else ''
+    # -d INT
+    max_positional_error = "-d %d" % (options.max_positional_error) if options.max_positional_error is not None else ''
+    # -f FLOAT
+    min_fraction = "-f %s" % (options.min_fraction) if options.min_fraction is not None else ''
+    # -l INT
+    min_aln_length = "-l %d" % (options.min_aln_length) if options.min_aln_length is not None else ''
+    # -v FLOAT
+    min_avg_coverage = "-v %s" % (options.min_avg_coverage) if options.min_avg_coverage is not None else ''
+    # -m INT
+    discard_kmers = "-m %d" % (options.discard_kmers) if options.discard_kmers is not None else ''
+    # -n INT
+    discard_positional = "-n %d" % (options.discard_positional) if options.discard_positional is not None else ''
+    # -q INT
+    min_aln_score = "-q %d" % (options.min_aln_score) if options.min_aln_score is not None else ''
+    # -s
+    single_cell_mode = '-s' if options.single_cell_mode else ''
+    # -r
+    report_changes = '-r' if options.report_changes else ''
+    # -e
+    extend_contig = '-e' if options.extend_contig else ''
+    # -g FILE
+    reference_genome = "-g %s" % (options.reference_genome) if options.reference_genome else ''
+    # contigs_refined.fa
+    contigs_refined = options.contigs_refined
+    # logprep & logseq
+    logprep = options.logprep
+    logseq = options.logseq
+    # x.refined-fa
+    # x.stdout
+    # logfile
+    logfile_prep = options.logfile_prep
+    logfile_seq = options.logfile_seq
+
+    # Build SEQuel (pre-processing) command
+    cmd1 = ' '.join(['prep.pl', prep_input, bases_length, n_threads, cmd_prep_dir])
+    print '\nSEQuel (pre-processing) command to be executed:\n %s' % (cmd1)
+
+    # Execution of SEQuel (pre-processing)
+    print 'Executing SEQuel (pre-processing)...'
+    logfile_prep_file = open(logfile_prep, 'w') if logfile_prep else sys.stdout
+    try:
+        try:
+            subprocess.check_call(cmd1, stdout=logfile_prep_file, stderr=subprocess.STDOUT, shell=True)
+        finally:
+            if logfile_prep_file != sys.stdout:
+                logfile_prep_file.close()
+        print 'SEQuel (pre-processing) executed!'
+
+        prep_file = os.path.join(prep_directory, 'prep.log')
+        ext_ins_size = external_insert_size(prep_file)
+
+        # Build SEQuel command
+        cmd2 = 'java -Xmx12g -jar %s %s -i %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s' % (os.path.join(options.sequel_jar_path, 'SEQuel.jar'), input_directory, ext_ins_size, max_threads, min_threads, kmer_size, max_positional_error, min_fraction, min_aln_length, min_avg_coverage, discard_kmers, discard_positional, min_aln_score, single_cell_mode, report_changes, extend_contig, reference_genome, cmd_seq_dir)
+        print '\nSEQuel command to be executed:\n %s' % (cmd2)
+
+        # Execution of SEQuel
+        print 'Executing SEQuel...'
+        logfile_seq_file = open(logfile_seq, 'w') if logfile_seq else sys.stdout
+        try:
+            subprocess.check_call(cmd2, stdout=logfile_seq_file, stderr=subprocess.STDOUT, shell=True)
+        except subprocess.CalledProcessError, e:
+            if e.retcode == 24:
+                sys.exit("Too many contigs in the assembly!")
+            else:
+                raise
+        finally:
+            if logfile_seq_file != sys.stdout:
+                logfile_seq_file.close()
+        print 'SEQuel executed!'
+
+        shutil.move(prep_file, logprep)
+        shutil.move(os.path.join(seq_directory, 'Log'), logseq)
+        shutil.move(os.path.join(seq_directory, 'contigs_refined.fa'), contigs_refined)
+    finally:
+        shutil.rmtree(prep_directory, ignore_errors=True)
+        shutil.rmtree(seq_directory, ignore_errors=True)
+
+
+if __name__ == "__main__":
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sequel_wrapper.xml	Tue Oct 15 11:15:28 2013 -0400
@@ -0,0 +1,132 @@
+<tool id="sequel_wrapper" name="SEQuel" version="0.2">
+  <description></description>
+  <requirements>
+    <requirement type="package" version="0.6.2">bwa</requirement>
+    <requirement type="package" version="35">blat</requirement>
+    <requirement type="package" version="1.0.2">sequel</requirement>
+  </requirements>
+  <command interpreter="python">
+    sequel_wrapper.py
+    \${SEQUEL_SITE_OPTIONS:--t 8 -p 8 -u 1}
+    --sequel_jar_path=\$SEQUEL_JAR_PATH --read1=$read1 --read2=$read2 --contigs=$contigs
+    #if str($bases_length)
+      --bases_length=$bases_length
+    #end if
+    #if str($kmer_size)
+      --kmer_size=$kmer_size
+    #end if
+    #if str($max_positional_error)
+      --max_positional_error=$max_positional_error
+    #end if
+    #if str($min_fraction)
+      --min_fraction=$min_fraction
+    #end if
+    #if str($min_aln_length)
+      --min_aln_length=$min_aln_length
+    #end if
+    #if str($min_avg_coverage)
+      --min_avg_coverage=$min_avg_coverage
+    #end if
+    #if str($discard_kmers)
+      --discard_kmers=$discard_kmers
+    #end if
+    #if str($discard_positional)
+      --discard_positional=$discard_positional
+    #end if
+    #if str($min_aln_score)
+      --min_aln_score=$min_aln_score
+    #end if
+    #if $single_cell_mode
+      --single_cell_mode
+    #end if
+    #if $report_changes
+      --report_changes
+    #end if
+    #if $extend_contig
+      --extend_contig
+    #end if
+    #if $reference_genome
+      --reference_genome=$reference_genome
+    #end if
+    --contigs_refined=$contigs_refined
+    --logprep=$logprep
+    --logseq=$logseq
+    --logfile_prep=$logfile_prep
+    --logfile_seq=$logfile_seq
+  </command>
+
+  <inputs>
+    <param name="read1" type="data" format="fasta,fastq" label="Paired-end reads 1 from sequencing (-r1)" help="FASTA or FASTQ format" />
+    <param name="read2" type="data" format="fasta,fastq" label="Paired-end reads 2 from sequencing (-r2)" help="FASTA or FASTQ format" />
+    <param name="contigs" type="data" format="fasta,fastq" label="Contigs from assembly (-c)" help="FASTA or FASTQ format" />
+
+    <param name="bases_length" type="integer" value="0" optional="true" label="Preprocessing: do not refine contigs shorter than n bases (-l)" help="Contigs shorter than n bases will appear unchanged in the final output file" />
+
+    <param name="kmer_size" type="integer" value="50" optional="true" label="K-mer size (-k)" help="" />
+
+    <param name="max_positional_error" type="integer" value="25" optional="true" label="Max positional error Delta (-d)" help="" />
+
+    <param name="min_fraction" type="float" value="0.9" optional="true" label="Min fraction of matches in alignment (-f)" help="" />
+
+    <param name="min_aln_length" type="integer" value="" optional="true" label="Min alignment length (-l)" help="bp or fraction of contig. Optional." />
+
+    <param name="min_avg_coverage" type="float" value="20.0" optional="true" label="Min average coverage to incorporate changes (-v)" help="" />
+
+    <param name="discard_kmers" type="integer" value="1" optional="true" label="Discard k-mers observed less than m times (-m)" help="" />
+
+    <param name="discard_positional" type="integer" value="1" optional="true" label="Discard positional k-mers observed less than n times (-n)" help="" />
+
+    <param name="min_aln_score" type="integer" value="1" optional="true" label="Min alignment score (MAPQ) of reads to consider (-q)" help="" />
+
+    <param name="single_cell_mode" type="boolean" optional="true" checked="false" label="Single cell mode, sort partial-contigs by coverage (-s)" />
+
+    <param name="report_changes" type="boolean" optional="true" checked="false" label="Report changes (slow) for all input-contigs (-r)" />
+
+    <param name="extend_contig" type="boolean" optional="true" checked="false" label="Extend contig with flanking regions of alignment (-e)" />
+
+    <param name="reference_genome" type="data" format="fasta,twobit" optional="true" label="Evaluate refinement using reference genome (-g)" help="FASTA or 2bit format" />
+  </inputs>
+
+  <outputs>
+    <data name="logfile_prep" format="txt" label="${tool.name} on ${on_string}: log (pre-processing)" />
+    <data name="logfile_seq" format="txt" label="${tool.name} on ${on_string}: log (SEQuel)" />
+    <data name="logprep" format="txt" label="${tool.name} on ${on_string}: log (pre-processing, official)" />
+    <data name="logseq" format="txt" label="${tool.name} on ${on_string}: log (SEQuel, official)" />
+    <data name="contigs_refined" format="fasta" label="${tool.name} on ${on_string}: refined contigs" />
+  </outputs>
+
+  <tests>
+
+  </tests>
+  <help>
+**What it does**
+
+SEQuel is a tool for correcting errors (i.e., insertions, deletions, and substitutions) in contigs output from assembly. While assemblies of next generation sequencing (NGS) data are accurate, they still contain a substantial number of errors that need to be corrected after the assembly process. The algorithm behind SEQuel makes use of a graph structure called the positional de Bruijn graph, which models k-mers within reads while incorporating their approximate positions into the model.
+
+SEQuel substantially reduces the number of small insertions, deletions and substitutions errors in assemblies of both standard (multi-cell) and single-cell sequencing data. SEQuel was tested mainly on Illumina sequence data, in combination with multiple NGS assemblers, such as Euler-SR, Velvet, SoapDeNovo, ALLPATHS and SPAdes.
+
+**Known issues**
+
+.. class:: warningmark
+
+During the pre-processing stage, a SAM file per contig is created. Due to runtime considerations, these files are kept open simultaneously. The program will crash when the number of contigs in the assembly is too high.
+
+**License and citation**
+
+This Galaxy tool is Copyright © 2013 `CRS4 Srl.`_ and is released under the `MIT license`_.
+
+.. _CRS4 Srl.: http://www.crs4.it/
+.. _MIT license: http://opensource.org/licenses/MIT
+
+If you use this tool in Galaxy, please cite |Cuccuru2013|_.
+
+.. |Cuccuru2013| replace:: Cuccuru, G., Orsini, M., Pinna, A., Sbardellati, A., Soranzo, N., Travaglione, A., Uva, P., Zanetti, G., Fotia, G. (2013) Orione, a web-based framework for NGS analysis in microbiology. *Submitted*
+.. _Cuccuru2013: http://orione.crs4.it/
+
+This tool uses `SEQuel`_, which is licensed separately. Please cite |Ronen2012|_.
+
+.. _SEQuel: http://bix.ucsd.edu/SEQuel/
+.. |Ronen2012| replace:: Ronen R., *et al.* (2012) SEQuel: improving the accuracy of genome assemblies. *Bioinformatics* 28 (12), i188-i196
+.. _Ronen2012: http://bioinformatics.oxfordjournals.org/content/28/12/i188
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Tue Oct 15 11:15:28 2013 -0400
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package name="bwa" version="0.6.2">
+    <repository changeset_revision="0778635a84ba" name="package_bwa_0_6_2" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" />
+  </package>
+  <package name="blat" version="35">
+<!--    <repository name="package_blat_35x1" owner="iuc" /> This may be used instead of everything inside <install> when a stable Galaxy release will support the 'download_binary' action type -->
+  </package>
+  <package name="sequel" version="1.0.2">
+    <install version="1.0">
+      <actions>
+        <action target_filename="SEQuel.tar.gz" type="download_by_url">http://bix.ucsd.edu/SEQuel/download/SEQuel.v102.tar.gz</action>
+        <action type="move_directory_files">
+          <source_directory>.</source_directory>
+          <destination_directory>$INSTALL_DIR</destination_directory>
+        </action>
+        <action type="set_environment">
+          <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR</environment_variable>
+        </action>
+        <action type="set_environment">
+          <environment_variable action="set_to" name="SEQUEL_JAR_PATH">$INSTALL_DIR</environment_variable>
+        </action>
+        <action type="set_environment">
+          <environment_variable action="set_to" name="SEQUEL_SITE_OPTIONS">"-t 8 -p 8 -u 1"</environment_variable>
+        </action>
+      </actions>
+    </install>
+    <readme>
+Change the SEQUEL_SITE_OPTIONS variable in the installed env.sh file to adjust the number of threads to use in BWA alignment (-t) or the maximum number of threads for SEQuel (-p) or the minimum number of threads for SEQuel (-u).
+    </readme>
+  </package>
+</tool_dependency>