Mercurial > repos > nilshomer > srma_wrapper

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/srma_ref.loc	Tue Jun 07 17:43:07 2011 -0400
@@ -0,0 +1,25 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of SRMA indexed sequences data files.  You will need
+#to create these data files and then create a srma_ref.loc file
+#similar to this one (store it in this directory) that points to
+#the directories in which those files are stored. The srma_ref.loc
+#file has this format (white space characters are TAB characters):
+#
+#<build>	<fasta_file>
+#
+#So, for example, if you had hg18 indexed stored in
+#/depot/data2/galaxy/srma/hg18/,
+#then the srma_ref.loc entry would look like this:
+#
+#hg18	/depot/data2/galaxy/srma/hg18/hg18.fa
+#
+#and your /depot/data2/galaxy/srma/hg18/ directory
+#would contain hg18.fa.*.brg and hg18.fa.*.bif files:
+#hg18.fa
+#hg18.fa.dict
+#...etc...
+#
+#The dictionary file for each reference (ex. hg18.fa.dict) must be
+#created with Picard (http://picard.sourceforge.net).
+#
+20079	/Users/nhomer/hg/galaxy-central/srma/DH10B.fa
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/srma_wrapper.py	Tue Jun 07 17:43:07 2011 -0400
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+
+"""
+Runs SRMA on a SAM/BAM file;
+TODO: more documentation
+
+usage: srma_wrapper.py [options]
+    -r, --ref=r: The reference genome to use or index
+    -i, --input=i: The SAM/BAM input file
+    -o, --output=o: The SAM/BAM output file
+    -O, --offset=O: The alignment offset
+    -Q, --minMappingQuality=Q: The minimum mapping quality
+    -P, --minAlleleProbability=P: The minimum allele probability conditioned on coverage (for the binomial quantile).
+    -C, --minAlleleCoverage=C: The minimum haploid coverage for the consensus. Default value: 3. This option can be set
+    -R, --range=R: A range to examine
+    -c, --correctBases=c: Correct bases
+    -q, --useSequenceQualities=q: Use sequence qualities
+    -M, --maxHeapSize=M: The maximum number of nodes on the heap before re-alignment is ignored
+    -s, --fileSource=s: Whether to use a previously indexed reference sequence or one from history (indexed or history)
+    -p, --params=p: Parameter setting to use (pre_set or full)
+    -D, --dbkey=D: Dbkey for reference genome
+"""
+
+import optparse, os, shutil, subprocess, sys, tempfile
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' )
+    parser.add_option( '-i', '--input', dest='input', help='The SAM/BAM input file' )
+    parser.add_option( '-o', '--output', dest='output', help='The SAM/BAM output file' )
+    parser.add_option( '-O', '--offset', dest='offset', help='The alignment offset' )
+    parser.add_option( '-Q', '--minMappingQuality', dest='minMappingQuality', help='The minimum mapping quality' )
+    parser.add_option( '-P', '--minAlleleProbability', dest='minAlleleProbability', help='The minimum allele probability conditioned on coverage (for the binomial quantile).' )
+    parser.add_option( '-C', '--minAlleleCoverage', dest='minAlleleCoverage', help='The minimum haploid coverage for the consensus' )
+    parser.add_option( '-R', '--range', dest='range', help='A range to examine' )
+    parser.add_option( '-c', '--correctBases', dest='correctBases', help='Correct bases ' )
+    parser.add_option( '-q', '--useSequenceQualities', dest='useSequenceQualities', help='Use sequence qualities ' )
+    parser.add_option( '-M', '--maxHeapSize', dest='maxHeapSize', help='The maximum number of nodes on the heap before re-alignment is ignored' )
+    parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one from history (indexed or history)' )
+    parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' )
+    parser.add_option( '-D', '--dbkey', dest='dbkey', help='Dbkey for reference genome' )
+    (options, args) = parser.parse_args()
+
+    # make temp directory for bfast
+    tmp_dir = '%s/' % tempfile.mkdtemp()
+
+    # assume indexing has already been done
+    if options.fileSource == 'history':
+        stop_err( 'Error: indexing not implemented' )
+
+    # set up aligning and generate aligning command options
+    if options.params == 'pre_set':
+        srma_cmds = ''
+    else:
+        if options.useSequenceQualities == 'true':
+            useSequenceQualities = 'true'
+        else:
+            useSequenceQualities = 'false'
+        ranges = 'null'
+        if options.range == 'None':
+            range = 'null'
+        else:
+            range = options.range
+
+        srma_cmds = "OFFSET=%s MIN_MAPQ=%s MINIMUM_ALLELE_PROBABILITY=%s MINIMUM_ALLELE_COVERAGE=%s RANGES=%s RANGE=%s CORRECT_BASES=%s USE_SEQUENCE_QUALITIES=%s MAX_HEAP_SIZE=%s" % ( options.offset, options.minMappingQuality, options.minAlleleProbability, options.minAlleleCoverage, ranges, range, options.correctBases, options.useSequenceQualities, options.maxHeapSize )
+
+    # how do we call a JAR file?
+    cmd = 'java -jar /Users/nhomer/srma/build/jar/srma.jar I=%s O=%s R=%s %s' % ( options.input, options.output, options.ref, srma_cmds )
+
+    # perform alignments
+    buffsize = 1048576
+    try:
+        # need to nest try-except in try-finally to handle 2.4
+        try:
+            try:
+                tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+                tmp_stderr = open( tmp, 'wb' )
+                proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                # get stderr, allowing for case where it's very large
+                tmp_stderr = open( tmp, 'rb' )
+                stderr = ''
+                try:
+                    while True:
+                        stderr += tmp_stderr.read( buffsize )
+                        if not stderr or len( stderr ) % buffsize != 0:
+                            break
+                except OverflowError:
+                    pass
+                tmp_stderr.close()
+                if returncode != 0:
+                    raise Exception, stderr
+            except Exception, e:
+                raise Exception, 'Error executing SRMA. ' + str( e )
+            # check that there are results in the output file
+            if os.path.getsize( options.output ) > 0:
+                if "0" == options.space:
+                    sys.stdout.write( 'BFAST run on Base Space data' )
+                else:
+                    sys.stdout.write( 'BFAST run on Color Space data' )
+            else:
+                raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
+        except Exception, e:
+            stop_err( 'The alignment failed.\n' + str( e ) )
+    finally:
+        # clean up temp dir
+        if os.path.exists( tmp_dir ):
+            shutil.rmtree( tmp_dir )
+
+if __name__=="__main__": __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/srma_wrapper.xml	Tue Jun 07 17:43:07 2011 -0400
@@ -0,0 +1,150 @@
+<tool id="srma_wrapper" name="Re-align with SRMA" version="0.1.1">
+  <description></description>
+  <command interpreter="python">
+    srma_wrapper.py
+
+    #if $refGenomeSource.refGenomeSource == "history":
+    --ref=$refGenomeSource.ownFile
+    #else:
+    --ref=$refGenomeSource.ref.value
+    #end if
+	--input=$input --output=$output
+	--params=$params.source_select --fileSource=$refGenomeSource.refGenomeSource
+    #if $params.source_select == "pre_set":
+    --offset="None", --minMappingQuality="None", --minAlleleProbability="None", --minAlleleCoverage="None", --range="None", --correctBases="None", --useSequenceQualities="None", --maxHeapSize="None", --fileSource="None", --params="None", --dbkey="None",
+    #else:
+    --offset=$params.offset, --minMappingQuality=$params.minMappingQuality, --minAlleleProbability=$params.minAlleleProbability, --minAlleleCoverage=$params.minAlleleCoverage, --range=$params.range, --correctBases=$params.correctBases, --useSequenceQualities=$params.useSequenceQualities, --maxHeapSize=$params.maxHeapSize, --fileSource=$params.fileSource, --params=$params.params, --dbkey=$params.dbkey,     #end if
+    #if $refGenomeSource.refGenomeSource == "history":
+    --dbkey=$dbkey
+    #else:
+    --dbkey="None"
+    #end if
+  </command>
+  <inputs>
+    <conditional name="refGenomeSource">
+      <param name="refGenomeSource" type="select" label="Will you select a reference genome from your history or use a built-in reference?">
+        <option value="built-in">Use a built-in reference</option>
+        <option value="history">Use one from the history</option>
+      </param>
+      <when value="built-in">
+        <param name="ref" type="select" label="Select a reference genome">
+          <options from_file="srma_ref.loc">
+            <column name="value" index="1" />
+            <column name="name" index="0" />
+          </options>
+        </param>
+      </when>
+      <when value="history">
+        <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />
+      </when>
+    </conditional>
+    <param name="input" type="data" format="bam" label="Input BAM file" help="The input BAM file to re-align"/>
+    <conditional name="params">
+      <param name="source_select" type="select" label="SRMA settings to use" help="For most re-alignment needs, use Commonly Used settings. If you want full control use Full Parameter List">
+        <option value="pre_set">Commonly Used</option>
+        <option value="full">Full Parameter List</option>
+      </param>
+	  <when value="pre_set" />
+      <when value="full">
+		<param name="offset" type="integer" value="20" label="Offset" help="The alignment offset" />
+		<param name="minMappingQuality" type="integer" value="0" label="Minimum mapping quality" help="The minimum mapping quality" />
+		<param name="minAlleleProbability" type="float" value="0.1" label="Minimum allele probability" help="The minimum allele probability conditioned on coverage (for the binomial quantile)." />
+		<param name="minAlleleCoverage" type="integer" value="2" label="Minimum allele coverage" help="The minimum haploid coverage for the consensus. Default value: 3. This option can be set " />
+		<param name="range" type="text" value="null" label="Range" help="A range to examine" />
+		<param name="correctBases" type="boolean" truevalue="true" falsevalue="false" checked="no" label="Correct bases" help="Correct bases " />
+		<param name="useSequenceQualities" type="boolean" truevalue="true" falsevalue="false" checked="no" label="Use sequence qualities" help="Use sequence qualities " />
+		<param name="maxHeapSize" type="integer" value="8192" label="Maximum heap size" help="The maximum number of nodes on the heap before re-alignment is ignored" />
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="bam" name="output" />
+  </outputs>
+  <help>
+**What it does**
+
+SRMA is a short read micro re-aligner for next-generation high throughput sequencing data.
+
+Sequence alignment algorithms examine each read independently. When indels occur towards the ends of reads, the alignment can lead to false SNPs as well as improperly placed indels. This tool aims to perform a re-alignment of each read to a graphical representation of all alignments within a local region to provide a better overall base-resolution consensus.
+
+Currently this tool works well with and has been tested on 30x diploid coverage genome sequencing data from Illumina and ABI SOLiD technology. This tool may not work well with 454 data, as indels are a significant error mode for 454 data.
+
+------
+
+Please cite the website "http://srma.sourceforge.net" as well as:
+
+Homer N, and Nelson SF.  SRMA: short read micro re-aligner. 2010.
+
+------
+
+**Know what you are doing**
+
+.. class:: warningmark
+
+There is no such thing (yet) as an automated gearshift in short read mapping. It is all like stick-shift driving in San Francisco. In other words = running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy.
+
+.. __: http://srma.sourceforge.net/
+
+------
+
+**Input formats**
+
+SRMA accepts a BAM input file.
+
+------
+
+**Outputs**
+
+The output is in BAM format, see http://samtools.sourceforge.net for more details.
+
+-------
+
+**SRMA settings**
+
+All of the options have a default value. You can change any of them. Most of the options in SRMA have been implemented here.
+
+------
+
+**SRMA parameter list**
+
+This is an exhaustive list of SRMA options:
+
+For **SRMA**::
+
+  INPUT=File
+  I=File                        The input SAM or BAM file. Required.
+
+  OUTPUT=File
+  O=File                        The output SAM or BAM file. Default value: null.
+
+  REFERENCE=File
+  R=File                        The reference FASTA file. Required.
+
+  OFFSET=Integer                The alignment offset. Default value: 20. This option can be set to 'null' to clear the
+                                default value.
+
+  MIN_MAPQ=Integer              The minimum mapping quality. Default value: 0. This option can be set to 'null' to clear
+                                the default value.
+
+  MINIMUM_ALLELE_PROBABILITY=Double
+                                The minimum allele probability conditioned on coverage (for the binomial quantile).
+                                Default value: 0.1. This option can be set to 'null' to clear the default value.
+
+  MINIMUM_ALLELE_COVERAGE=Integer
+                                The minimum haploid coverage for the consensus. Default value: 3. This option can be set
+                                to 'null' to clear the default value.
+
+  RANGE=String                  A range to examine. Default value: null.
+
+  CORRECT_BASES=Boolean         Correct bases. Default value: false. This option can be set to 'null' to clear the
+                                default value. Possible values: {true, false}
+
+  USE_SEQUENCE_QUALITIES=BooleanUse sequence qualities Default value: true. This option can be set to 'null' to clear the
+                                default value. Possible values: {true, false}
+
+  MAX_HEAP_SIZE=Integer         The maximum number of nodes on the heap before re-alignment is ignored Default value:
+                                8192. This option can be set to 'null' to clear the default value.
+
+  </help>
+  <code file="srma_wrapper_code.py" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/srma_wrapper_code.py	Tue Jun 07 17:43:07 2011 -0400
@@ -0,0 +1,11 @@
+import os
+
+def exec_before_job(app, inp_data, out_data, param_dict, tool):
+    try:
+        refFile = param_dict[ 'refGenomeSource' ][ 'index' ].value
+    except:
+        try:
+            refFile = param_dict[ 'refGenomeSource' ][ 'ownFile' ].dbkey
+        except:
+            refFile = '?'
+    dbkey = os.path.split( refFile )[1].split( '.' )[0]