# HG changeset patch # User crs4 # Date 1381850128 14400 # Node ID 58e1eb37fddcd08b261d6ee3ccaaa3fb46eeb62e Uploaded diff -r 000000000000 -r 58e1eb37fddc COPYING --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/COPYING Tue Oct 15 11:15:28 2013 -0400 @@ -0,0 +1,23 @@ +Copyright © 2013 CRS4 Srl. http://www.crs4.it/ +Created by: +Andrea Pinna +Nicola Soranzo + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff -r 000000000000 -r 58e1eb37fddc sequel_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sequel_wrapper.py Tue Oct 15 11:15:28 2013 -0400 @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- +""" +SEQuel +version 0.2 (andrea.pinna@crs4.it) +""" + +import optparse +import os +import shutil +import subprocess +import sys +import tempfile +import uuid + +def external_insert_size(prep_file): + """ Retrieve average external insert-size from pre-processing log """ + with open(prep_file, 'r') as f: + for line in f: + if line[0:30] == '\tAverage external insert-size:': + ext_int_size = line.split(':')[1][1:-1] + break + else: + sys.exit('Average external insert-size not found in %s' % prep_file) + return ext_int_size + + +def __main__(): + # load arguments + print 'Parsing SEQuel input options...' + parser = optparse.OptionParser() + parser.add_option('--sequel_jar_path', dest='sequel_jar_path', help='') + parser.add_option('--read1', dest='read1', help='') + parser.add_option('--read2', dest='read2', help='') + parser.add_option('--contigs', dest='contigs', help='') + parser.add_option('--bases_length', dest='bases_length', type='int', help='') + parser.add_option('-t', dest='n_threads', type='int', help='') + parser.add_option('-p', dest='max_threads', type='int', help='') + parser.add_option('-u', dest='min_threads', type='int', help='') + parser.add_option('--kmer_size', dest='kmer_size', type='int', help='') + parser.add_option('--max_positional_error', dest='max_positional_error', type='int', help='') + parser.add_option('--min_fraction', dest='min_fraction', type='float', help='') + parser.add_option('--min_aln_length', dest='min_aln_length', type='int', help='') + parser.add_option('--min_avg_coverage', dest='min_avg_coverage', type='float', help='') + parser.add_option('--discard_kmers', dest='discard_kmers', type='int', help='') + parser.add_option('--discard_positional', dest='discard_positional', type='int', help='') + parser.add_option('--min_aln_score', dest='min_aln_score', type='int', help='') + parser.add_option('--single_cell_mode', action='store_true', dest='single_cell_mode', help='') + parser.add_option('--report_changes', action='store_true', dest='report_changes', help='') + parser.add_option('--extend_contig', action='store_true', dest='extend_contig', help='') + parser.add_option('--reference_genome', dest='reference_genome', help='') + parser.add_option('--contigs_refined', dest='contigs_refined', help='') + parser.add_option('--logprep', dest='logprep', help='') + parser.add_option('--logseq', dest='logseq', help='') + parser.add_option('--logfile_prep', dest='logfile_prep', help='') + parser.add_option('--logfile_seq', dest='logfile_seq', help='') + (options, args) = parser.parse_args() + if len(args) > 0: + parser.error('Wrong number of arguments') + + prep_directory = os.path.join(tempfile.gettempdir(), str(uuid.uuid4())) # prep.pl dies if the directory already exists, so just define a unique name + seq_directory = os.path.join(tempfile.gettempdir(), str(uuid.uuid4())) # SEQuel.jar would work in another directory (with '_1' suffix) if the directory already exists, so just define a unique name + + # Build SEQuel (pre-processing) command to be executed + # basic preprocessing + prep_input = "-r1 %s -r2 %s -c %s" % (options.read1, options.read2, options.contigs) + bases_length = "-l %d" % (options.bases_length) if options.bases_length is not None else '' + n_threads = "-t %d" % (options.n_threads) if options.n_threads is not None else '' + cmd_prep_dir = "-o %s" % (prep_directory) + cmd_seq_dir = "-o %s" % (seq_directory) + # -i INT (external insert size of paired-end reads (from prep.log) + # -p INT + max_threads = "-p %d" % (options.max_threads) if options.max_threads is not None else '' + # -u INT + min_threads = "-u %d" % (options.min_threads) if options.min_threads is not None else '' + # -A DIR + input_directory = "-A %s" % (prep_directory) + # -k INT + kmer_size = "-k %d" % (options.kmer_size) if options.kmer_size is not None else '' + # -d INT + max_positional_error = "-d %d" % (options.max_positional_error) if options.max_positional_error is not None else '' + # -f FLOAT + min_fraction = "-f %s" % (options.min_fraction) if options.min_fraction is not None else '' + # -l INT + min_aln_length = "-l %d" % (options.min_aln_length) if options.min_aln_length is not None else '' + # -v FLOAT + min_avg_coverage = "-v %s" % (options.min_avg_coverage) if options.min_avg_coverage is not None else '' + # -m INT + discard_kmers = "-m %d" % (options.discard_kmers) if options.discard_kmers is not None else '' + # -n INT + discard_positional = "-n %d" % (options.discard_positional) if options.discard_positional is not None else '' + # -q INT + min_aln_score = "-q %d" % (options.min_aln_score) if options.min_aln_score is not None else '' + # -s + single_cell_mode = '-s' if options.single_cell_mode else '' + # -r + report_changes = '-r' if options.report_changes else '' + # -e + extend_contig = '-e' if options.extend_contig else '' + # -g FILE + reference_genome = "-g %s" % (options.reference_genome) if options.reference_genome else '' + # contigs_refined.fa + contigs_refined = options.contigs_refined + # logprep & logseq + logprep = options.logprep + logseq = options.logseq + # x.refined-fa + # x.stdout + # logfile + logfile_prep = options.logfile_prep + logfile_seq = options.logfile_seq + + # Build SEQuel (pre-processing) command + cmd1 = ' '.join(['prep.pl', prep_input, bases_length, n_threads, cmd_prep_dir]) + print '\nSEQuel (pre-processing) command to be executed:\n %s' % (cmd1) + + # Execution of SEQuel (pre-processing) + print 'Executing SEQuel (pre-processing)...' + logfile_prep_file = open(logfile_prep, 'w') if logfile_prep else sys.stdout + try: + try: + subprocess.check_call(cmd1, stdout=logfile_prep_file, stderr=subprocess.STDOUT, shell=True) + finally: + if logfile_prep_file != sys.stdout: + logfile_prep_file.close() + print 'SEQuel (pre-processing) executed!' + + prep_file = os.path.join(prep_directory, 'prep.log') + ext_ins_size = external_insert_size(prep_file) + + # Build SEQuel command + cmd2 = 'java -Xmx12g -jar %s %s -i %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s' % (os.path.join(options.sequel_jar_path, 'SEQuel.jar'), input_directory, ext_ins_size, max_threads, min_threads, kmer_size, max_positional_error, min_fraction, min_aln_length, min_avg_coverage, discard_kmers, discard_positional, min_aln_score, single_cell_mode, report_changes, extend_contig, reference_genome, cmd_seq_dir) + print '\nSEQuel command to be executed:\n %s' % (cmd2) + + # Execution of SEQuel + print 'Executing SEQuel...' + logfile_seq_file = open(logfile_seq, 'w') if logfile_seq else sys.stdout + try: + subprocess.check_call(cmd2, stdout=logfile_seq_file, stderr=subprocess.STDOUT, shell=True) + except subprocess.CalledProcessError, e: + if e.retcode == 24: + sys.exit("Too many contigs in the assembly!") + else: + raise + finally: + if logfile_seq_file != sys.stdout: + logfile_seq_file.close() + print 'SEQuel executed!' + + shutil.move(prep_file, logprep) + shutil.move(os.path.join(seq_directory, 'Log'), logseq) + shutil.move(os.path.join(seq_directory, 'contigs_refined.fa'), contigs_refined) + finally: + shutil.rmtree(prep_directory, ignore_errors=True) + shutil.rmtree(seq_directory, ignore_errors=True) + + +if __name__ == "__main__": + __main__() diff -r 000000000000 -r 58e1eb37fddc sequel_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sequel_wrapper.xml Tue Oct 15 11:15:28 2013 -0400 @@ -0,0 +1,132 @@ + + + + bwa + blat + sequel + + + sequel_wrapper.py + \${SEQUEL_SITE_OPTIONS:--t 8 -p 8 -u 1} + --sequel_jar_path=\$SEQUEL_JAR_PATH --read1=$read1 --read2=$read2 --contigs=$contigs + #if str($bases_length) + --bases_length=$bases_length + #end if + #if str($kmer_size) + --kmer_size=$kmer_size + #end if + #if str($max_positional_error) + --max_positional_error=$max_positional_error + #end if + #if str($min_fraction) + --min_fraction=$min_fraction + #end if + #if str($min_aln_length) + --min_aln_length=$min_aln_length + #end if + #if str($min_avg_coverage) + --min_avg_coverage=$min_avg_coverage + #end if + #if str($discard_kmers) + --discard_kmers=$discard_kmers + #end if + #if str($discard_positional) + --discard_positional=$discard_positional + #end if + #if str($min_aln_score) + --min_aln_score=$min_aln_score + #end if + #if $single_cell_mode + --single_cell_mode + #end if + #if $report_changes + --report_changes + #end if + #if $extend_contig + --extend_contig + #end if + #if $reference_genome + --reference_genome=$reference_genome + #end if + --contigs_refined=$contigs_refined + --logprep=$logprep + --logseq=$logseq + --logfile_prep=$logfile_prep + --logfile_seq=$logfile_seq + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +SEQuel is a tool for correcting errors (i.e., insertions, deletions, and substitutions) in contigs output from assembly. While assemblies of next generation sequencing (NGS) data are accurate, they still contain a substantial number of errors that need to be corrected after the assembly process. The algorithm behind SEQuel makes use of a graph structure called the positional de Bruijn graph, which models k-mers within reads while incorporating their approximate positions into the model. + +SEQuel substantially reduces the number of small insertions, deletions and substitutions errors in assemblies of both standard (multi-cell) and single-cell sequencing data. SEQuel was tested mainly on Illumina sequence data, in combination with multiple NGS assemblers, such as Euler-SR, Velvet, SoapDeNovo, ALLPATHS and SPAdes. + +**Known issues** + +.. class:: warningmark + +During the pre-processing stage, a SAM file per contig is created. Due to runtime considerations, these files are kept open simultaneously. The program will crash when the number of contigs in the assembly is too high. + +**License and citation** + +This Galaxy tool is Copyright © 2013 `CRS4 Srl.`_ and is released under the `MIT license`_. + +.. _CRS4 Srl.: http://www.crs4.it/ +.. _MIT license: http://opensource.org/licenses/MIT + +If you use this tool in Galaxy, please cite |Cuccuru2013|_. + +.. |Cuccuru2013| replace:: Cuccuru, G., Orsini, M., Pinna, A., Sbardellati, A., Soranzo, N., Travaglione, A., Uva, P., Zanetti, G., Fotia, G. (2013) Orione, a web-based framework for NGS analysis in microbiology. *Submitted* +.. _Cuccuru2013: http://orione.crs4.it/ + +This tool uses `SEQuel`_, which is licensed separately. Please cite |Ronen2012|_. + +.. _SEQuel: http://bix.ucsd.edu/SEQuel/ +.. |Ronen2012| replace:: Ronen R., *et al.* (2012) SEQuel: improving the accuracy of genome assemblies. *Bioinformatics* 28 (12), i188-i196 +.. _Ronen2012: http://bioinformatics.oxfordjournals.org/content/28/12/i188 + + diff -r 000000000000 -r 58e1eb37fddc tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Tue Oct 15 11:15:28 2013 -0400 @@ -0,0 +1,32 @@ + + + + + + + + + + + + http://bix.ucsd.edu/SEQuel/download/SEQuel.v102.tar.gz + + . + $INSTALL_DIR + + + $INSTALL_DIR + + + $INSTALL_DIR + + + "-t 8 -p 8 -u 1" + + + + +Change the SEQUEL_SITE_OPTIONS variable in the installed env.sh file to adjust the number of threads to use in BWA alignment (-t) or the maximum number of threads for SEQuel (-p) or the minimum number of threads for SEQuel (-u). + + +