Mercurial > repos > cjav > smalt
changeset 0:747433a6de00 draft
Initial commit.
author | cjav |
---|---|
date | Wed, 13 Feb 2013 13:27:44 -0500 |
parents | |
children | 54855bd8d107 |
files | tools/smalt_wrapper.py tools/smalt_wrapper.xml tools/tool-data/smalt_index.loc.sample tools/tool_data_table_conf.xml.sample |
diffstat | 4 files changed, 523 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# Source file: tools/smalt_wrapper.py

"""
Runs Smalt on single-end or paired-end data.
Produces a SAM file containing the mappings.
Works with Smalt version 0.7.1.

usage: smalt_wrapper.py [options]

See below for options
"""

import optparse
import os
import shutil
import subprocess
import sys
import tempfile

# SAM header record types that are dropped when --suppressHeader=true.
SAM_HEADER_PREFIXES = (b'@HD', b'@SQ', b'@RG', b'@PG', b'@CO')


def stop_err(msg):
    """Write ``msg`` to stderr and terminate with a non-zero exit status."""
    sys.stderr.write('%s\n' % msg)
    sys.exit(1)


def _to_text(data):
    """Return ``data`` as ``str``, decoding bytes (Python 3) leniently."""
    if isinstance(data, str):
        return data
    return data.decode('utf-8', 'replace')


def _smalt_version():
    """Return the line of bare ``smalt`` output mentioning its version.

    Returns None when smalt cannot be started or prints no such line.
    """
    try:
        proc = subprocess.Popen(['smalt'], stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        output = _to_text(proc.communicate()[0])
    except OSError:
        return None
    for line in output.splitlines():
        if 'version' in line.lower():
            return line.strip()
    return None


def _run(args, cwd=None):
    """Run ``args`` (an argument list, no shell) and capture its stderr.

    stdout is inherited from this process. stderr is spooled to an anonymous
    temporary file rather than a pipe so that very large output cannot block
    the child. Returns ``(returncode, stderr_text)``.
    """
    with tempfile.TemporaryFile() as err:
        proc = subprocess.Popen(args, cwd=cwd, stderr=err)
        returncode = proc.wait()
        err.seek(0)
        return returncode, _to_text(err.read())


def build_map_command(options, ref_name):
    """Build the ``smalt map`` argument list from the parsed options.

    ``options`` is the optparse result; ``ref_name`` is the index name to map
    against. With ``--params=pre_set`` only the thread count is passed; in
    any other mode every tuning option that was supplied is forwarded.
    Paired-end mode additionally passes the insert-size/library options and
    the reverse read file.
    """
    paired = options.genAlignType == 'paired'
    cmd = ['smalt', 'map']
    # The thread count applies to both parameter modes.
    if options.threads is not None:
        cmd += ['-n', str(options.threads)]
    if options.params != 'pre_set':
        if options.exhaustiveSearch == 'true':
            cmd.append('-x')
            # -c is only valid in conjunction with -x.
            if options.minCover is not None:
                cmd += ['-c', str(options.minCover)]
        if options.partialAlignments == 'true':
            cmd.append('-p')
        if options.complexityWeighted == 'true':
            cmd.append('-w')
        valued = [
            ('-d', options.scorDiff),
            ('-m', options.minScor),
            ('-q', options.minBasq),
            ('-r', options.seed),
            ('-y', options.minId),
        ]
        if paired:
            valued += [
                ('-i', options.insertMax),
                ('-j', options.insertMin),
                ('-l', options.pairTyp),
            ]
        # Skip options that were not given instead of passing "None".
        for flag, value in valued:
            if value is not None:
                cmd += [flag, str(value)]
    cmd += ['-o', options.output, ref_name, options.fastq]
    if paired:
        cmd.append(options.rfastq)
    return cmd


def _strip_sam_header(sam_path, work_dir):
    """Rewrite ``sam_path`` in place without @HD/@SQ/@RG/@PG/@CO lines.

    The original file is first moved into ``work_dir`` and then filtered back
    to ``sam_path``.
    """
    backup = os.path.join(work_dir, 'output_with_header.sam')
    try:
        shutil.move(sam_path, backup)
    except Exception as e:
        raise Exception('Error moving output file before removing headers. ' + str(e))
    with open(backup, 'rb') as src:
        with open(sam_path, 'wb') as dst:
            for line in src:
                if not line.startswith(SAM_HEADER_PREFIXES):
                    dst.write(line)


def __main__():
    """Parse the command line, index the reference if needed and run smalt map."""
    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-t', '--threads', dest='threads', help='The number of threads to use')
    parser.add_option('-r', '--ref', dest='ref', help='The reference genome to use or index')
    parser.add_option('-f', '--input1', dest='fastq', help='The (forward) fastq file to use for the mapping')
    parser.add_option('-F', '--input2', dest='rfastq', help='The reverse fastq file to use for mapping if paired-end data')
    parser.add_option('-u', '--output', dest='output', help='The file to save the output (SAM format)')
    parser.add_option('-g', '--genAlignType', dest='genAlignType', help='The type of pairing (single or paired)')
    parser.add_option('-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)')
    parser.add_option('-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one form history (indexed or history)')
    parser.add_option('-x', '--exhaustiveSearch', dest='exhaustiveSearch', help='This flag triggers a more exhaustive search for alignments at the cost of decreased speed')
    parser.add_option('-c', '--minCover', dest='minCover', help='Only consider mappings where the k-mer word seeds cover the query read to a minimum extent')
    parser.add_option('-d', '--scorDiff', dest='scorDiff', help='Set a threshold of the Smith-Waterman alignment score relative to the maximum score')
    parser.add_option('-i', '--insertMax', dest='insertMax', help='Maximum insert size (Only in paired-end mode)')
    parser.add_option('-j', '--insertMin', dest='insertMin', help='Minimum insert size (Only in paired-end mode)')
    parser.add_option('-l', '--pairTyp', dest='pairTyp', help='Type of read pair library, can be either pe, mp or pp')
    parser.add_option('-m', '--minScor', dest='minScor', help='Sets an absolute threshold of the Smith-Waterman scores')
    parser.add_option('-a', '--partialAlignments', dest='partialAlignments', help='Report partial alignments if they are complementary on the read (split reads)')
    parser.add_option('-q', '--minBasq', dest='minBasq', help='Sets a base quality threshold (0 <= minbasq <= 10, default 0)')
    parser.add_option('-e', '--seed', dest='seed', help='If <seed> >= 0 report an alignment selected at random where there are multiple mappings with the same best alignment score. With <seed> = 0 (default) a seed is derived from the current calendar time. If <seed> < 0 reads with multiple best mappings are reported as "not mapped".')
    parser.add_option('-w', '--complexityWeighted', dest='complexityWeighted', help='Smith-Waterman scores are complexity weighted')
    parser.add_option('-y', '--minId', dest='minId', help='Sets an identity threshold for a mapping to be reported')
    parser.add_option('-D', '--dbkey', dest='dbkey', help='Dbkey for reference genome')
    parser.add_option('-X', '--do_not_build_index', dest='do_not_build_index', action='store_true', help="Don't build index")
    parser.add_option('-H', '--suppressHeader', dest='suppressHeader', help='Suppress header')
    (options, args) = parser.parse_args()

    # output version # of tool
    version = _smalt_version()
    if version:
        sys.stdout.write('SMALT %s\n' % version)
    else:
        sys.stdout.write('Could not determine SMALT version\n')

    # Fail early with a readable message instead of a NameError or a
    # literal "None" ending up in the smalt command line.
    required = [('--ref', options.ref), ('--input1', options.fastq), ('--output', options.output)]
    if options.genAlignType == 'paired':
        required.append(('--input2', options.rfastq))
    missing = [flag for flag, value in required if not value]
    if missing:
        stop_err('Missing required option(s): %s' % ', '.join(missing))

    # smalt runs with a temporary working directory, so relative paths must
    # be resolved against the caller's directory first.
    options.ref = os.path.abspath(options.ref)
    options.fastq = os.path.abspath(options.fastq)
    options.output = os.path.abspath(options.output)
    if options.rfastq:
        options.rfastq = os.path.abspath(options.rfastq)

    # make temp directory for placement of indices
    tmp_index_dir = tempfile.mkdtemp()
    tmp_dir = tempfile.mkdtemp()
    try:
        # index if necessary
        if options.fileSource == 'history' and not options.do_not_build_index:
            # The symlink doubles as FASTA input and as index name, so the
            # index files are created next to it inside tmp_index_dir.
            ref_file_name = os.path.join(tmp_index_dir, 'reference')
            try:
                os.symlink(options.ref, ref_file_name)
                returncode, stderr = _run(['smalt', 'index', ref_file_name, ref_file_name],
                                          cwd=tmp_index_dir)
                if returncode != 0:
                    raise Exception(stderr)
            except Exception as e:
                stop_err('Error indexing reference sequence. ' + str(e))
        else:
            ref_file_name = options.ref

        cmd = build_map_command(options, ref_file_name)
        try:
            # align
            try:
                returncode, stderr = _run(cmd, cwd=tmp_dir)
                if returncode != 0:
                    raise Exception(stderr)
            except Exception as e:
                raise Exception('Error aligning sequence. ' + str(e))
            # remove header if necessary
            if options.suppressHeader == 'true':
                _strip_sam_header(options.output, tmp_dir)
            # check that there are results in the output file
            if os.path.getsize(options.output) > 0:
                sys.stdout.write('SMALT run on %s-end data' % options.genAlignType)
            else:
                raise Exception('The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.')
        except Exception as e:
            stop_err('The alignment failed.\n' + str(e))
    finally:
        # clean up temp dirs; this also runs when stop_err raises SystemExit
        for path in (tmp_index_dir, tmp_dir):
            shutil.rmtree(path, ignore_errors=True)


if __name__ == "__main__":
    __main__()
<!-- Source file: tools/smalt_wrapper.xml -->
<tool id="smalt_wrapper" name="SMALT" version="0.0.1">
  <requirements>
    <requirement type="package" version="0.7.1">smalt</requirement>
  </requirements>
  <description>maps query reads onto the reference sequences</description>
  <command interpreter="python">
    smalt_wrapper.py
      --threads="4"

      ## reference source
      --fileSource=$genomeSource.refGenomeSource
      #if $genomeSource.refGenomeSource == "history":
        ##build index on the fly
        --ref="${genomeSource.ownFile}"
        --dbkey=$dbkey
      #else:
        ##use precomputed indexes
        --ref="${genomeSource.indices.fields.path}"
        --do_not_build_index
      #end if

      ## input file(s)
      --input1=$paired.input1
      #if $paired.sPaired == "paired":
        --input2=$paired.input2
      #end if

      ## output file
      --output=$output

      ## run parameters
      --genAlignType=$paired.sPaired
      --params=$params.source_select
      #if $params.source_select != "pre_set":
        --scorDiff=$params.scorDiff
        #if $paired.sPaired == "paired":
          --insertMax=$params.insertMax
          --insertMin=$params.insertMin
          --pairTyp=$params.pairTyp
        #end if
        --minScor=$params.minScor
        --partialAlignments=$params.partialAlignments
        --minBasq=$params.minBasq
        --seed=$params.seed
        --complexityWeighted=$params.complexityWeighted
        --exhaustiveSearch=$params.cExhaustiveSearch.exhaustiveSearch
        #if $params.cExhaustiveSearch.exhaustiveSearch == "true":
          --minCover=$params.cExhaustiveSearch.minCover
        #end if
        --minId=$params.minId
      #end if

      ## suppress output SAM header
      --suppressHeader=$suppressHeader
  </command>
  <inputs>
    <conditional name="genomeSource">
      <param name="refGenomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?">
        <option value="indexed">Use a built-in index</option>
        <option value="history">Use one from the history</option>
      </param>
      <when value="indexed">
        <param name="indices" type="select" label="Select a reference genome">
          <options from_data_table="smalt_indexes">
            <filter type="sort_by" column="2" />
            <validator type="no_options" message="No indexes are available" />
          </options>
        </param>
      </when>
      <when value="history">
        <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />
      </when>
    </conditional>
    <conditional name="paired">
      <param name="sPaired" type="select" label="Is this library mate-paired?">
        <option value="single">Single-end</option>
        <option value="paired">Paired-end</option>
      </param>
      <when value="single">
        <param name="input1" type="data" format="fastqsanger" label="FASTQ file" help="FASTQ with Sanger-scaled quality values (fastqsanger)" />
      </when>
      <when value="paired">
        <param name="input1" type="data" format="fastqsanger" label="Forward FASTQ file" help="FASTQ with Sanger-scaled quality values (fastqsanger)" />
        <param name="input2" type="data" format="fastqsanger" label="Reverse FASTQ file" help="FASTQ with Sanger-scaled quality values (fastqsanger)" />
      </when>
    </conditional>
    <conditional name="params">
      <param name="source_select" type="select" label="Smalt settings to use" help="For most mapping needs use Commonly Used settings. If you want full control use Full Parameter List">
        <option value="pre_set">Commonly Used</option>
        <option value="full">Full Parameter List</option>
      </param>
      <when value="pre_set" />
      <when value="full">
        <conditional name="cExhaustiveSearch">
          <param name="exhaustiveSearch" type="boolean" truevalue="true" falsevalue="false" checked="no" label="Do exhaustive search? (map -x)" help="This flag triggers a more exhaustive search for alignments at the cost of decreased speed." />
          <when value="true">
            <param name="minCover" type="float" value="0" label="Minimum cover (map -c)" help="Only consider mappings where the k-mer word seeds cover the query read to a minimum extent." />
          </when>
          <!-- must equal the boolean's falsevalue so the unchecked state has a matching case -->
          <when value="false" />
        </conditional>
        <param name="scorDiff" type="integer" value="0" label="Score diff (map -d)" help="Set a threshold of the Smith-Waterman alignment score relative to the maximum score." />
        <param name="insertMax" type="integer" value="500" label="Maximum insert size (map -i)" help="Only in paired-end mode." />
        <param name="insertMin" type="integer" value="0" label="Minimum insert size (map -j)" help="Only in paired-end mode." />
        <param name="pairTyp" type="text" size="2" value="pe" label="Type of read pair library (map -l)" help="Can be either 'pe', 'mp' or 'pp'." />
        <param name="minScor" type="integer" value="0" label="Minimum score (map -m)" help="Sets an absolute threshold of the Smith-Waterman scores." />
        <param name="partialAlignments" type="boolean" truevalue="true" falsevalue="false" checked="no" label="Partial alignments (map -p)" help="Report partial alignments if they are complementary on the read (split reads)." />
        <param name="minBasq" type="integer" value="0" label="Base quality threshold (map -q)" help="Sets a base quality threshold (0 &lt;= minbasq &lt;= 10, default 0)." />
        <param name="seed" type="integer" value="0" label="Seed (map -r)" help="See below." />
        <param name="complexityWeighted" type="boolean" truevalue="true" falsevalue="false" checked="no" label="Complexity weighted (map -w)" help="Smith-Waterman scores are complexity weighted." />
        <param name="minId" type="float" value="0" label="Identity threshold (map -y)" help="Sets an identity threshold for a mapping to be reported." />
      </when>
    </conditional>
    <param name="suppressHeader" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Suppress the header in the output SAM file" help="Smalt produces SAM with several lines of header information" />
  </inputs>
  <outputs>
    <data format="sam" name="output" label="${tool.name} on ${on_string}: mapped reads">
      <actions>
        <conditional name="genomeSource.refGenomeSource">
          <when value="indexed">
            <action type="metadata" name="dbkey">
              <option type="from_data_table" name="smalt_indexes" column="1">
                <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
                <filter type="param_value" ref="genomeSource.indices" column="0"/>
              </option>
            </action>
          </when>
          <when value="history">
            <action type="metadata" name="dbkey">
              <option type="from_param" name="genomeSource.ownFile" param_attribute="dbkey" />
            </action>
          </when>
        </conditional>
      </actions>
    </data>
  </outputs>
  <help>

**What it does**

SMALT is a pairwise sequence alignment program for the efficient mapping of DNA sequencing reads onto genomic reference sequences. It uses a combination of short-word hashing and dynamic programming. Most types of sequencing platforms are supported including paired-end sequencing reads.

------

Please cite the website "http://www.sanger.ac.uk/resources/software/smalt/".

------

**Know what you are doing**

.. class:: warningmark

There is no such thing (yet) as an automated gearshift in short read mapping. It is all like stick-shift driving in San Francisco. In other words = running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy.

 .. __: http://www.sanger.ac.uk/resources/software/smalt/

------

**Input formats**

SMALT accepts files in Sanger FASTQ format (galaxy type *fastqsanger*). Use the FASTQ Groomer to prepare your files.

------

**A Note on Built-in Reference Genomes**

The default variant for all genomes is "Full", defined as all primary chromosomes (or scaffolds/contigs) including mitochondrial plus associated unmapped, plasmid, and other segments. When only one version of a genome is available in this tool, it represents the default "Full" variant. Some genomes will have more than one variant available. The "Canonical Male" or sometimes simply "Canonical" variant contains the primary chromosomes for a genome. For example a human "Canonical" variant contains chr1-chr22, chrX, chrY, and chrM. The "Canonical Female" variant contains the primary chromosomes excluding chrY.

------

**Outputs**

The output is in SAM format.

-------

**SMALT parameter list**

This is an exhaustive list of SMALT options:

For **map**::

  -a
      Output explicit alignments along with the mappings.

  -c &lt;mincover&gt;
      Only consider mappings where the k-mer word seeds cover the query read to
      a minimum extent. If &lt;mincover&gt; is an integer or floating point &gt; 1.0, at
      least this many bases of the read must be covered by k-mer word seeds. If
      &lt;mincover&gt; is a floating point &lt;= 1.0, it specifies the fraction of the
      query read length that must be covered by k-mer word seeds. This option
      is only valid in conjunction with the '-x' flag.

  -d &lt;scordiff&gt;
      Set a threshold of the Smith-Waterman alignment score relative to the
      maximum score. When mapping single reads, all alignments are reported
      that have Smith-Waterman scores within &lt;scorediff&gt; of the maximum.
      Mappings with lower scores are skipped. If &lt;scorediff&gt; is set to a
      value &lt; 0, all alignments are printed that have scores above the
      threshold specified with the '-m &lt;minscor&gt;' option.
      For paired reads, only a value of 0 is supported. With the option '-d 0'
      all alignments (pairings) with the best score are output. By default
      (without the option '-d 0') single reads/mates with multiple best mappings
      are reported as 'not mapped'.

  -f &lt;format&gt;
      Specifies the output format. &lt;format&gt; can be either 'bam', 'cigar', 'gff',
      'sam' (default), 'samsoft' or 'ssaha'. Optional extension 'sam:nohead,clip'
      (see manual)

  -F &lt;inform&gt;
      Specifies the input format. &lt;inform&gt; can be either 'fastq' (default),
      'sam' or 'bam' (see: samtools.sourceforge.net). SAM and BAM formats
      require additional libraries to be installed.

  -g &lt;insfil&gt;
      Use the distribution of insert sizes stored in the file &lt;insfil&gt;. This
      file is in ASCII format and can be generated using the 'sample' task (see
      'smalt sample -H' for help).

  -H
      Print these instructions.

  -i &lt;insertmax&gt;
      Maximum insert size (only in paired-end mode). The default is 500.

  -j &lt;insertmin&gt;
      Minimum insert size (only in paired-end mode). The default is 0.

  -l &lt;pairtyp&gt;
      Type of read pair library. &lt;pairtyp&gt; can be either 'pe', i.e. for
      the Illumina paired-end library for short inserts (|--&gt; &lt;--|). 'mp'
      for the Illumina mate-pair library for long inserts (&lt;--| |--&gt;) or
      'pp' for mates sequenced on the same strand (|--&gt; |--&gt;). 'pe' is the
      default.

  -m &lt;minscor&gt;
      Sets an absolute threshold of the Smith-Waterman scores. Mappings with
      scores below that threshold will not be reported. The default is
      &lt;minscor&gt; = &lt;wordlen&gt; + &lt;stepsiz&gt; - 1

  -n &lt;nthreads&gt;
      Run smalt using multiple threads. &lt;nthreads&gt; is the number of additional
      threads forked from the main thread. The order of the reads in the
      input files is not preserved for the output unless '-O' is also specified.

  -o &lt;oufilnam&gt;
      Write mapping output (e.g. SAM lines) to a separate file. If this option
      is not specified, mappings are written to standard output together with
      other messages.

  -O
      Output mappings in the order of the reads in the input files when using
      multiple threads (option '-n &lt;nthreads&gt;').

  -p
      Report partial alignments if they are complementary on the read (split
      reads).

  -q &lt;minbasq&gt;
      Sets a base quality threshold (0 &lt;= minbasq &lt;= 10, default 0).
      K-mer words of the read with nucleotides that have a base quality below
      this threshold are not looked up in the hash index.

  -r &lt;seed&gt;
      If &lt;seed&gt; &gt;= 0 report an alignment selected at random where there are
      multiple mappings with the same best alignment score. With &lt;seed&gt; = 0
      (default) a seed is derived from the current calendar time. If &lt;seed&gt;
      &lt; 0 reads with multiple best mappings are reported as 'not mapped'.

  -T &lt;tmp_dir&gt;
      Write temporary files to directory &lt;tmp_dir&gt; (used with input files in
      SAM/BAM format).

  -w
      Smith-Waterman scores are complexity weighted.

  -x
      This flag triggers a more exhaustive search for alignments at the cost
      of decreased speed. In paired-end mode each mate is mapped independently.
      (By default the mate with fewer hits in the hash index is mapped first
      and the vicinity is searched for mappings of its mate.)

  -y &lt;minid&gt;
      Sets an identity threshold for a mapping to be reported (default: 0).
      &lt;minid&gt; specifies the number of exactly matching nucleotides either as
      a positive integer or as a fraction of the read length (&lt;= 1.0).

  </help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/tool-data/smalt_index.loc.sample Wed Feb 13 13:27:44 2013 -0500 @@ -0,0 +1,29 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of Smalt indexed sequences data files. You will need +#to create these data files and then create a smalt_index.loc file +#similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The smalt_index.loc +#file has this format (longer white space characters are TAB characters): +# +#<unique_build_id> <dbkey> <display_name> <file_path> +# +#So, for example, if you had phiX indexed and stored in +#/depot/data2/galaxy/phiX/base/, +#then the smalt_index.loc entry would look like this: +# +#phiX174 phiX phiX Pretty /depot/data2/galaxy/phiX/base/phiX.fa +# +#and your /depot/data2/galaxy/phiX/base/ directory +#would contain phiX.fa.* files: +# +#-rw-r--r-- 1 cborroto universe 527388 2005-09-13 10:12 phiX.fa.sma +#-rw-r--r-- 1 cborroto universe 269808 2005-09-13 10:12 phiX.fa.smi +# +#Your smalt_index.loc file should include an entry per line for each +#index set you have stored. The "file" in the path does not actually +#exist, but it is the prefix for the actual index files. For example: +# +#phiX174 phiX phiX174 /depot/data2/galaxy/phiX/base/phiX.fa +#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/hg18/base/hg18canon.fa +#hg18full hg18 hg18 Full /depot/data2/galaxy/hg18/base/hg18full.fa +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/tool_data_table_conf.xml.sample Wed Feb 13 13:27:44 2013 -0500 @@ -0,0 +1,8 @@ +<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc--> +<tables> + <!-- Locations of indexes in the SMALT mapper format --> + <table name="smalt_indexes" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/smalt_index.loc" /> + </table> +</tables>