Mercurial > repos > crs4 > ssake
changeset 0:0ec408bcfc80 draft
Uploaded
author | crs4 |
---|---|
date | Wed, 11 Sep 2013 12:51:21 -0400 |
parents | |
children | 386166019772 |
files | COPYING ssake.py ssake.xml tool_dependencies.xml |
diffstat | 4 files changed, 307 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/COPYING Wed Sep 11 12:51:21 2013 -0400 @@ -0,0 +1,24 @@ +Copyright © 2012-2013 CRS4 Srl. http://www.crs4.it/ +Created by: +Massimiliano Orsini <massimiliano.orsini@crs4.it> +Gianmauro Cuccuru <gianmauro.cuccuru@crs4.it> +Nicola Soranzo <nicola.soranzo@crs4.it> + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ssake.py Wed Sep 11 12:51:21 2013 -0400 @@ -0,0 +1,162 @@
# -*- coding: utf-8 -*-
"""
SSAKE wrapper
"""

import logging
import optparse
import os
import shutil
import subprocess
import tempfile


def execute(cmd):
    """
    Run an external command, discarding its standard output.

    ``cmd`` may be a list/tuple of arguments, which is executed directly
    without a shell (so file names containing spaces or shell
    metacharacters are passed through untouched), or a single string,
    which is handed to the shell as before.

    Raises subprocess.CalledProcessError if the command exits non-zero.
    """
    use_shell = not isinstance(cmd, (list, tuple))
    # Use a context manager so the /dev/null handle is always closed.
    with open(os.devnull, 'w') as devnull:
        subprocess.check_call(args=cmd, stdout=devnull, shell=use_shell)


def which(name, flags=os.X_OK):
    """
    Search PATH for executable files with the given name.

    Every PATH directory is checked for ``name`` itself and for ``name``
    followed by each extension listed in the PATHEXT environment variable.
    Returns the list of matching paths in PATH order; the list is empty
    when PATH is not set or nothing matches.
    """
    path = os.environ.get('PATH', None)
    if path is None:
        return []
    # Build a real list: a lazy filter object would be exhausted after the
    # first PATH directory on Python 3.
    exts = [ext for ext in os.environ.get('PATHEXT', '').split(os.pathsep) if ext]
    result = []
    for directory in path.split(os.pathsep):
        candidate = os.path.join(directory, str(name))
        if os.access(candidate, flags):
            result.append(candidate)
        for ext in exts:
            candidate_ext = candidate + ext
            if os.access(candidate_ext, flags):
                result.append(candidate_ext)
    return result


def _absolute_or_none(path):
    """Return the absolute form of ``path``, or None when no path was given."""
    if path is None:
        return None
    return os.path.abspath(path)


class SSAKE:
    """
    Driver for one SSAKE assembly run inside a private temporary directory.

    The constructor copies the parsed command-line options onto the
    instance; run() prepares paired input when requested, executes SSAKE
    and moves the result files to the requested output locations.
    """

    def __init__(self, logger, options):
        """
        Store the logger and options and create the working directory.

        ``options`` is the optparse result produced by __main__().
        """
        self.logger = logger
        self.executables = ('SSAKE', 'makePairedOutput2EQUALfiles.pl', 'makePairedOutput2UNEQUALfiles.pl')
        for executable in self.executables:
            self.logger.debug(which(executable))
        self.logger.debug('Creating temp dir')
        self.wd = tempfile.mkdtemp()

        # run() changes the current directory to self.wd, so every
        # user-supplied path is made absolute here, before that happens.
        self.kind_of_reads = int(options.kind_of_reads)
        if not self.kind_of_reads:
            self.infile = _absolute_or_none(options.if_unpaired)
            self.paired = 0
        else:
            self.infile_r1 = _absolute_or_none(options.if_paired_r1)
            self.infile_r2 = _absolute_or_none(options.if_paired_r2)
            self.paired = 1
            self.insert_size = options.insert_size
            self.minnumlinks = options.minnumlinks
            self.error = options.error
            self.maxlinkratio = options.maxlinkratio
            self.minoverlap = options.minoverlap
        self.mindepthofcoverage = options.mindepthofcoverage
        self.minoverlappingbases = options.minoverlappingbases
        self.mincall = options.mincall
        self.baseratio = options.baseratio
        self.ignore_header = options.ignore_header
        self.prefix = options.prefix
        self.contigs = _absolute_or_none(options.contigs)
        self.log = options.logfile
        self.short = _absolute_or_none(options.short)
        self.singlets = _absolute_or_none(options.singlets)
        # The attribute is only defined when seeds were supplied, as before.
        if options.seeds_file:
            self.seeds_file = _absolute_or_none(options.seeds_file)

    def run(self):
        """
        Execute the assembly and collect its results.

        For paired reads (kind_of_reads 1 or 2) the matching
        makePairedOutput2*files.pl script is run first. SSAKE is then
        executed, its log is copied into the Python log, and the
        .contigs, .short and .singlets files are moved to the requested
        output paths.

        Raises subprocess.CalledProcessError if any external command fails.
        """
        os.chdir(self.wd)
        command = [self.executables[0]]
        if self.kind_of_reads in (1, 2):
            # executables[1] handles equal pairs, executables[2] unequal ones.
            # "%d" keeps the original failure mode (TypeError) for a
            # missing numeric option instead of passing the text "None" on.
            cmd = [
                self.executables[self.kind_of_reads],
                self.infile_r1,
                self.infile_r2,
                "%d" % self.insert_size,
            ]
            self.logger.info("Preparing data")
            execute(cmd)
            command += ['-f', os.path.join(self.wd, 'paired.fa')]
            if self.kind_of_reads == 2:
                command += ['-g', os.path.join(self.wd, 'unpaired.fa')]
            command += [
                '-k', "%d" % self.minnumlinks,
                '-e', str(self.error),
                '-a', str(self.maxlinkratio),
                '-x', "%d" % self.minoverlap,
            ]
        else:
            command += ['-f', self.infile]
        if hasattr(self, 'seeds_file'):
            command += ['-s', self.seeds_file]
        command += [
            '-w', "%d" % self.mindepthofcoverage,
            '-m', "%d" % self.minoverlappingbases,
            '-o', "%d" % self.mincall,
            '-r', str(self.baseratio),
            '-h', str(self.ignore_header),
            '-b', str(self.prefix),
            '-p', str(self.paired),
        ]
        self.logger.debug(' '.join(command))
        self.logger.info("Executing SSAKE")
        execute(command)

        result_base = os.path.join(self.wd, self.prefix)
        with open("%s.log" % result_base, 'rb') as ssake_log_file:
            # Decode explicitly so the join works on Python 2 and 3 and
            # stray bytes in the tool log cannot abort the run.
            ssake_log = ssake_log_file.read().decode('utf-8', 'replace')
        self.logger.info("\n".join(["Log from SSAKE", ssake_log]))
        self.logger.info("Moving result files")
        shutil.move("%s.contigs" % result_base, self.contigs)
        shutil.move("%s.short" % result_base, self.short)
        shutil.move("%s.singlets" % result_base, self.singlets)

    def __del__(self):
        """Remove the working directory, if one was created."""
        # __init__ may have failed before self.wd was assigned, and errors
        # raised in a finalizer are not useful to anyone: clean up quietly.
        wd = getattr(self, 'wd', None)
        if wd:
            shutil.rmtree(wd, ignore_errors=True)


LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']


def _check_required(parser, options):
    """Exit with a usage error when an option needed by the run is missing."""
    if options.kind_of_reads is None:
        parser.error('--kind_of_reads is required')
    if options.kind_of_reads == '0':
        if not options.if_unpaired:
            parser.error('--if_unpaired is required for unpaired reads')
    else:
        if not (options.if_paired_r1 and options.if_paired_r2):
            parser.error('--if_paired_r1 and --if_paired_r2 are required for paired reads')
        if options.insert_size is None:
            parser.error('--iz is required for paired reads')
    if options.mindepthofcoverage is None:
        parser.error('-w is required')
    for flag, value in (('--out1', options.contigs), ('--out2', options.short), ('--out3', options.singlets)):
        if not value:
            parser.error('%s is required' % flag)


def __main__():
    """
    Parse the command line, configure logging and run SSAKE.

    Exits through optparse's error handling (status 2) on unexpected
    positional arguments or missing required options.
    """
    parser = optparse.OptionParser()
    parser.add_option('--if_unpaired', dest='if_unpaired', help='Unpaired FASTA input file name')
    parser.add_option('--if_paired_r1', dest='if_paired_r1', help='Paired FASTA reads 1 input file name')
    parser.add_option('--if_paired_r2', dest='if_paired_r2', help='Paired FASTA reads 2 input file name')
    parser.add_option('-s', dest='seeds_file', help='FASTA as seeds, input file name')
    parser.add_option('-w', dest='mindepthofcoverage', type='int', help='minimum depth of coverage allowed for contigs')
    parser.add_option('-m', dest='minoverlappingbases', type='int', default=20, help='Minimum number of overlapping bases with the seed/contig during overhang consensus build up (default -m 20)')
    parser.add_option('-o', dest='mincall', type='int', default=2, help='mincall -o ')
    parser.add_option('-r', dest='baseratio', type='float', default=0.7, help='baseratio -r')
    parser.add_option('-k', dest='minnumlinks', type='int', default=4, help='Minimum number of links (read pairs) to compute scaffold -k')
    parser.add_option('-e', dest='error', type='float', default=0.75, help='Error (%) allowed on mean distance -e')
    parser.add_option('-a', dest='maxlinkratio', type='float', default=0.5, help='Maximum link ratio between two best contig pairs -a')
    parser.add_option('-x', dest='minoverlap', type='int', default=20, help='Minimum overlap required between contigs to merge adjacent contigs in a scaffold -x')
    parser.add_option('--ignore_header', dest='ignore_header', choices=['0', '1'], default='1', help='Ignore read name/header *will use less RAM if set to 1* -h')
    parser.add_option('--kind_of_reads', dest='kind_of_reads', choices=['0', '1', '2'], help='Kind of reads (-p)')
    parser.add_option('--iz', dest='insert_size', type='int', help='Library insert size')
    parser.add_option('--prefix', dest='prefix', default='ssake_pre', help='prefix')
    parser.add_option('--out1', dest='contigs', help='contig file')
    parser.add_option('--out2', dest='short', help='short file')
    parser.add_option('--out3', dest='singlets', help='singlets file')
    parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO', help='logging level (default: INFO)')
    parser.add_option('--logfile', help='log file (default=stderr)')
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error('Wrong number of arguments')
    # Fail early with a usage message instead of a TypeError deep in the run.
    _check_required(parser, options)

    log_level = getattr(logging, options.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    if options.logfile:
        kwargs['filename'] = options.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger('SSAKE scaffold assembly')

    S = SSAKE(logger, options)
    S.run()
    return


if __name__ == "__main__":
    __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ssake.xml Wed Sep 11 12:51:21 2013 -0400 @@ -0,0 +1,99 @@ +<tool id="ssake" name="SSAKE" version="0.0.10"> + <description>short DNA sequences assembler</description> + <requirements> + <requirement type="package" version="3.8">ssake</requirement> + </requirements> + <command interpreter="python"> + ssake.py + #if $kind_of_reads.kind_of_reads_select == '0' + --if_unpaired $infile + #else + --if_paired_r1 $infile_r1 + --if_paired_r2 $infile_r2 + --iz ${kind_of_reads.insert_size} + -k ${kind_of_reads.minnumlinks} + -e ${kind_of_reads.error} + -a ${kind_of_reads.maxlinkratio} + -x ${kind_of_reads.minoverlap} + #end if + #if $seeds + -s $seeds + #end if + -w $mindepthofcoverage + -m $minoverlap + -o $mincall + -r $baseratio + --ignore_header 1 + --kind_of_reads ${kind_of_reads.kind_of_reads_select} + --out1 $contig + --out2 $short + --out3 $singlets + --logfile $log + </command> + <inputs> + <conditional name="kind_of_reads"> + <param name="kind_of_reads_select" type="select" label="Kind of reads (-p)"> + <option value="0">Unpaired </option> + <option value="1">Paired and equal (both files must have the same number of sequences, arranged in the same order)</option> + <option value="2">Paired and unequal (files can have different number of sequences in any order)</option> + </param> + <when value="0"> + <param name="infile" type="data" format="fasta" label="Input FASTA file" /> + </when> + <when value="1"> + <param name="infile_r1" type="data" format="fasta" label="Input FASTA file (read 1)" /> + <param name="infile_r2" type="data" format="fasta" label="Input FASTA file (read 2)" /> + <param name="insert_size" type="integer" value="200" label="Library insert size" /> + <param name="minnumlinks" type="integer" value="4" label="Minimum number of links (read pairs) to compute scaffold (-k)" /> + <param name="error" type="float" value="0.75" min="0" max="1" label="Error (%) allowed on mean distance (-e)" /> + <param 
name="maxlinkratio" type="float" value="0.5" label="Maximum link ratio between two best contig pairs (-a)" /> + <param name="minoverlap" type="integer" value="20" label="Minimum overlap required between contigs to merge adjacent contigs in a scaffold (-x)" /> + </when> + <when value="2"> + <param name="infile_r1" type="data" format="fasta" label="Input FASTA file (read 1)" /> + <param name="infile_r2" type="data" format="fasta" label="Input FASTA file (read 2)" /> + <param name="insert_size" type="integer" value="200" label="Library insert size" /> + <param name="minnumlinks" type="integer" value="4" label="Minimum number of links (read pairs) to compute scaffold (-k)" /> + <param name="error" type="float" value="0.75" min="0" max="1" label="Error (%) allowed on mean distance (-e)" /> + <param name="maxlinkratio" type="float" value="0.5" label="Maximum link ratio between two best contig pairs (-a)" /> + <param name="minoverlap" type="integer" value="20" label="Minimum overlap required between contigs to merge adjacent contigs in a scaffold (-x)" /> + </when> + </conditional> + <param name="seeds" type="data" format="fasta" optional="true" label="FASTA file containing sequences to use as seeds exclusively (-s)" help="Optional, specify only if different from read set" /> + <param name="mindepthofcoverage" type="integer" value="1" label="Minimum depth of coverage allowed for contigs (-w)" /> + <param name="minoverlap" type="integer" value="20" label="Minimum number of overlapping bases with the seed/contig during overhang consensus build up (-m)" /> + <param name="mincall" type="integer" value="2" label="Minimum number of reads needed to call a base during an extension (-o)" /> + <param name="baseratio" type="float" value="0.7" label="Minimum base ratio used to accept an overhang consensus base (-r)" /> + </inputs> + + <outputs> + <data name="contig" format="fasta" label="${tool.name} on ${on_string}: contigs" /> + <data name="log" format="txt" label="${tool.name} on 
${on_string}: log" /> + <data name="short" format="txt" label="${tool.name} on ${on_string}: unacceptable reads" /> + <data name="singlets" format="fasta" label="${tool.name} on ${on_string}: unassembled reads" /> + </outputs> + <help> +**What it does** + +SSAKE is a genomics application for de novo assembly of millions of very short DNA sequences. +It is an easy-to-use, robust, reliable and tractable clustering algorithm for very short sequence reads, such as those generated by Illumina Ltd. + +**License and citation** + +This Galaxy tool is Copyright © 2012-2013 `CRS4 Srl.`_ and is released under the `MIT license`_. + +.. _CRS4 Srl.: http://www.crs4.it/ +.. _MIT license: http://opensource.org/licenses/MIT + +If you use this tool in Galaxy, please cite |Cuccuru2013|_. + +.. |Cuccuru2013| replace:: Cuccuru, G., Orsini, M., Pinna, A., Sbardellati, A., Soranzo, N., Travaglione, A., Uva, P., Zanetti, G., Fotia, G. (2013) Orione, a web-based framework for NGS analysis in microbiology. *Submitted* +.. _Cuccuru2013: http://orione.crs4.it/ + +This tool uses `SSAKE`_, which is licensed separately. Please cite |Warren2007|_. + +.. _SSAKE: http://www.bcgsc.ca/platform/bioinfo/software/ssake/ +.. |Warren2007| replace:: Warren RL, Sutton GG, Jones SJM, Holt RA. 2007. Assembling millions of short DNA sequences using SSAKE. Bioinformatics. 23(4):500-501 +.. _Warren2007: http://bioinformatics.oxfordjournals.org/content/23/4/500 + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Wed Sep 11 12:51:21 2013 -0400 @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="ssake" version="3.8"> + <install version="1.0"> + <actions> + <action type="download_by_url" target_filename="ssake_v3-8.tar.gz">http://www.bcgsc.ca/platform/bioinfo/software/ssake/releases/3.8/ssake_v3-8-tar.gz</action> + <!-- fix for Perl >= 5.16.0--> + <action type="shell_command">sed -i -e 's/require "getopts.pl"/use Getopt::Std/' -e 's/&Getopts/getopts/' SSAKE tools/TQSfastq.pl</action> + <action type="move_directory_files"> + <source_directory>.</source_directory> + <destination_directory>$INSTALL_DIR</destination_directory> + </action> + <action type="set_environment"> + <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR</environment_variable> + </action> + <action type="set_environment"> + <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/tools</environment_variable> + </action> + </actions> + </install> + </package> +</tool_dependency>