annotate tools/sr_mapping/srma_wrapper.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 Runs SRMA on a SAM/BAM file;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 TODO: more documentation
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 usage: srma_wrapper.py [options]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 See below for options
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 import optparse, os, shutil, subprocess, sys, tempfile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 def stop_err( msg ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 sys.stderr.write( '%s\n' % msg )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 sys.exit()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 def parseRefLoc( refLoc, refUID ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 for line in open( refLoc ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 if not line.startswith( '#' ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 fields = line.strip().split( '\t' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 if len( fields ) >= 3:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 if fields[0] == refUID:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 return fields[1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 return None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 def __main__():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 #Parse Command Line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 parser = optparse.OptionParser()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to index and use' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 parser.add_option( '-u', '--refUID', dest='refUID', help='The pre-index reference genome unique Identifier' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 parser.add_option( '-L', '--refLocations', dest='refLocations', help='The filepath to the srma indices location file' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 parser.add_option( '-i', '--input', dest='input', help='The SAM/BAM input file' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 parser.add_option( '-I', '--inputIndex', dest='inputIndex', help='The SAM/BAM input index file' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 parser.add_option( '-o', '--output', dest='output', help='The SAM/BAM output file' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 parser.add_option( '-O', '--offset', dest='offset', help='The alignment offset' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 parser.add_option( '-Q', '--minMappingQuality', dest='minMappingQuality', help='The minimum mapping quality' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 parser.add_option( '-P', '--minAlleleProbability', dest='minAlleleProbability', help='The minimum allele probability conditioned on coverage (for the binomial quantile).' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 parser.add_option( '-C', '--minAlleleCoverage', dest='minAlleleCoverage', help='The minimum haploid coverage for the consensus' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 parser.add_option( '-R', '--range', dest='range', help='A range to examine' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 parser.add_option( '-c', '--correctBases', dest='correctBases', help='Correct bases ' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 parser.add_option( '-q', '--useSequenceQualities', dest='useSequenceQualities', help='Use sequence qualities ' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 parser.add_option( '-M', '--maxHeapSize', dest='maxHeapSize', help='The maximum number of nodes on the heap before re-alignment is ignored' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one from history (indexed or history)' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 parser.add_option( '-j', '--jarBin', dest='jarBin', default='', help='The path to where jars are stored' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 parser.add_option( '-f', '--jarFile', dest='jarFile', help='The file name of the jar file to use')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 (options, args) = parser.parse_args()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 # make temp directory for srma
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 tmp_dir = tempfile.mkdtemp()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 buffsize = 1048576
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 # set up reference filenames
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 reference_filepath_name = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 # need to create SRMA dict and Samtools fai files for custom genome
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 if options.fileSource == 'history':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 reference_filepath = tempfile.NamedTemporaryFile( dir=tmp_dir, suffix='.fa' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 reference_filepath_name = reference_filepath.name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 reference_filepath.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 fai_filepath_name = '%s.fai' % reference_filepath_name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 dict_filepath_name = reference_filepath_name.replace( '.fa', '.dict' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 os.symlink( options.ref, reference_filepath_name )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 # create fai file using Samtools
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 index_fai_cmd = 'samtools faidx %s' % reference_filepath_name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 tmp_stderr = open( tmp, 'wb' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 proc = subprocess.Popen( args=index_fai_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 returncode = proc.wait()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 tmp_stderr.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 # get stderr, allowing for case where it's very large
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 tmp_stderr = open( tmp, 'rb' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 stderr = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 while True:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 stderr += tmp_stderr.read( buffsize )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 if not stderr or len( stderr ) % buffsize != 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 except OverflowError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 pass
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 tmp_stderr.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 if returncode != 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 raise Exception, stderr
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 except Exception, e:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 # clean up temp dir
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 if os.path.exists( tmp_dir ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 shutil.rmtree( tmp_dir )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 stop_err( 'Error creating Samtools index for custom genome file: %s\n' % str( e ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 # create dict file using SRMA
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 dict_cmd = 'java -cp "%s" net.sf.picard.sam.CreateSequenceDictionary R=%s O=%s' % ( os.path.join( options.jarBin, options.jarFile ), reference_filepath_name, dict_filepath_name )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 tmp_stderr = open( tmp, 'wb' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 proc = subprocess.Popen( args=dict_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 returncode = proc.wait()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 tmp_stderr.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 # get stderr, allowing for case where it's very large
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 tmp_stderr = open( tmp, 'rb' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 stderr = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 while True:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 stderr += tmp_stderr.read( buffsize )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 if not stderr or len( stderr ) % buffsize != 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 except OverflowError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108 pass
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 tmp_stderr.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 if returncode != 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 raise Exception, stderr
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112 except Exception, e:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 # clean up temp dir
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 if os.path.exists( tmp_dir ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 shutil.rmtree( tmp_dir )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 stop_err( 'Error creating index for custom genome file: %s\n' % str( e ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 except Exception, e:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 # clean up temp dir
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 if os.path.exists( tmp_dir ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 shutil.rmtree( tmp_dir )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 stop_err( 'Problem handling SRMA index (dict file) for custom genome file: %s\n' % str( e ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122 # using built-in dict/index files
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124 if options.ref:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 reference_filepath_name = options.ref
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127 reference_filepath_name = parseRefLoc( options.refLocation, options.refUID )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128 if reference_filepath_name is None:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129 raise ValueError( 'A valid genome reference was not provided.' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131 # set up aligning and generate aligning command options
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132 if options.params == 'pre_set':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133 srma_cmds = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135 if options.useSequenceQualities == 'true':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136 useSequenceQualities = 'true'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138 useSequenceQualities = 'false'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139 ranges = 'null'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
140 if options.range == 'None':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
141 range = 'null'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143 range = options.range
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144 srma_cmds = "OFFSET=%s MIN_MAPQ=%s MINIMUM_ALLELE_PROBABILITY=%s MINIMUM_ALLELE_COVERAGE=%s RANGES=%s RANGE=%s CORRECT_BASES=%s USE_SEQUENCE_QUALITIES=%s MAX_HEAP_SIZE=%s" % ( options.offset, options.minMappingQuality, options.minAlleleProbability, options.minAlleleCoverage, ranges, range, options.correctBases, options.useSequenceQualities, options.maxHeapSize )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146 # perform alignments
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147 buffsize = 1048576
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
148 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
149 #symlink input bam and index files due to the naming conventions required by srma here
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
150 input_bam_filename = os.path.join( tmp_dir, '%s.bam' % os.path.split( options.input )[-1] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
151 os.symlink( options.input, input_bam_filename )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
152 input_bai_filename = "%s.bai" % os.path.splitext( input_bam_filename )[0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
153 os.symlink( options.inputIndex, input_bai_filename )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
154
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
155 #create a temp output name, ending in .bam due to required naming conventions? unkown if required
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
156 output_bam_filename = os.path.join( tmp_dir, "%s.bam" % os.path.split( options.output )[-1] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
157 # generate commandline
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
158 cmd = 'java -jar %s I=%s O=%s R=%s %s' % ( os.path.join( options.jarBin, options.jarFile ), input_bam_filename, output_bam_filename, reference_filepath_name, srma_cmds )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
159
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
160 # need to nest try-except in try-finally to handle 2.4
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
161 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
162 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
163 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
164 tmp_stderr = open( tmp, 'wb' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
165 proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
166 returncode = proc.wait()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
167 tmp_stderr.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
168 # get stderr, allowing for case where it's very large
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
169 tmp_stderr = open( tmp, 'rb' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
170 stderr = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
171 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
172 while True:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
173 stderr += tmp_stderr.read( buffsize )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
174 if not stderr or len( stderr ) % buffsize != 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
175 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
176 except OverflowError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
177 pass
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
178 tmp_stderr.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
179 if returncode != 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
180 raise Exception, stderr
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
181 except Exception, e:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
182 raise Exception, 'Error executing SRMA. ' + str( e )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
183 # move file from temp location (with .bam name) to provided path
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
184 shutil.move( output_bam_filename, options.output )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
185 # check that there are results in the output file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
186 if os.path.getsize( options.output ) <= 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
187 raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
188 except Exception, e:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
189 stop_err( 'The re-alignment failed.\n' + str( e ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
190 finally:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
191 # clean up temp dir
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
192 if os.path.exists( tmp_dir ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
193 shutil.rmtree( tmp_dir )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
194
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
195 if __name__=="__main__": __main__()