0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 Runs BFAST on single-end or paired-end data.
|
|
5 TODO: more documentation
|
|
6
|
|
7 TODO:
|
|
8 - auto-detect gzip or bz2
|
|
9 - split options (?)
|
|
10 - queue lengths (?)
|
|
11 - assumes reference always has been indexed
|
|
12 - main and secondary indexes
|
|
13 - scoring matrix file ?
|
|
14 - read group file ?
|
|
15
|
|
16 usage: bfast_wrapper.py [options]
|
|
17 -r, --ref=r: The reference genome to use or index
|
|
18 -f, --fastq=f: The fastq file to use for the mapping
|
|
19 -F, --output=u: The file to save the output (SAM format)
|
|
20 -s, --fileSource=s: Whether to use a previously indexed reference sequence or one from history (indexed or history)
|
|
21 -p, --params=p: Parameter setting to use (pre_set or full)
|
|
22 -n, --numThreads=n: The number of threads to use
|
|
23 -A, --space=A: The encoding space (0: base 1: color)
|
|
24 -o, --offsets=o: The offsets for 'match'
|
|
25 -l, --loadAllIndexes=l: Load all indexes into memory
|
|
26 -k, --keySize=k: truncate key size in 'match'
|
|
27 -K, --maxKeyMatches=K: the maximum number of matches to allow before a key is ignored
|
|
28 -M, --maxNumMatches=M: the maximum number of matches to allow before the read is discarded
|
|
29 -w, --whichStrand=w: the strands to consider (0: both 1: forward 2: reverse)
|
|
30 -t, --timing=t: output timing information to stderr
|
|
31 -u, --ungapped=u: performed ungapped local alignment
|
|
32 -U, --unconstrained=U: performed local alignment without mask constraints
|
|
33 -O, --offset=O: the number of bases before and after each hit to consider in local alignment
|
|
34 -q, --avgMismatchQuality=q: average mismatch quality
|
|
35 -a, --algorithm=a: post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all)
|
|
36 -P, --disallowPairing=P: do not choose alignments based on pairing
|
|
37 -R, --reverse=R: paired end reads are given on reverse strands
|
|
38 -z, --random=z: output a random best scoring alignment
|
|
39 -D, --dbkey=D: Dbkey for reference genome
|
|
40 -H, --suppressHeader=H: Suppress the sam header
|
|
41 """
|
|
42
|
|
43 import optparse, os, shutil, subprocess, sys, tempfile
|
|
44
|
|
45 def stop_err( msg ):
|
|
46 sys.stderr.write( '%s\n' % msg )
|
|
47 sys.exit()
|
|
48
|
|
49 def __main__():
|
|
50 #Parse Command Line
|
|
51 parser = optparse.OptionParser()
|
|
52 parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to index and use' )
|
|
53 parser.add_option( '-f', '--fastq', dest='fastq', help='The fastq file to use for the mapping' )
|
|
54 parser.add_option( '-F', '--output', dest='output', help='The file to save the output (SAM format)' )
|
|
55 parser.add_option( '-A', '--space', dest='space', type="choice", default='0', choices=('0','1' ), help='The encoding space (0: base 1: color)' )
|
|
56 parser.add_option( '-H', '--suppressHeader', action="store_true", dest='suppressHeader', default=False, help='Suppress header' )
|
|
57 parser.add_option( '-n', '--numThreads', dest='numThreads', type="int", default="1", help='The number of threads to use' )
|
|
58 parser.add_option( '-t', '--timing', action="store_true", default=False, dest='timing', help='output timming information to stderr' )
|
|
59 parser.add_option( '-l', '--loadAllIndexes', action="store_true", default=False, dest='loadAllIndexes', help='Load all indexes into memory' )
|
|
60 parser.add_option( '-m', '--indexMask', dest='indexMask', help='String containing info on how to build custom indexes' )
|
|
61 parser.add_option( "-b", "--buildIndex", action="store_true", dest="buildIndex", default=False, help='String containing info on how to build custom indexes' )
|
|
62 parser.add_option( "--indexRepeatMasker", action="store_true", dest="indexRepeatMasker", default=False, help='Do not index lower case sequences. Such as those created by RepeatMasker' )
|
|
63 parser.add_option( '--indexContigOptions', dest='indexContigOptions', default="", help='The contig range options to use for the indexing' )
|
|
64 parser.add_option( '--indexExonsFileName', dest='indexExonsFileName', default="", help='The exons file to use for the indexing' )
|
|
65
|
|
66 parser.add_option( '-o', '--offsets', dest='offsets', default="", help='The offsets for \'match\'' )
|
|
67 parser.add_option( '-k', '--keySize', dest='keySize', type="int", default="-1", help='truncate key size in \'match\'' )
|
|
68 parser.add_option( '-K', '--maxKeyMatches', dest='maxKeyMatches', type="int", default="-1", help='the maximum number of matches to allow before a key is ignored' )
|
|
69 parser.add_option( '-M', '--maxNumMatches', dest='maxNumMatches', type="int", default="-1", help='the maximum number of matches to allow bfore the read is discarded' )
|
|
70 parser.add_option( '-w', '--whichStrand', dest='whichStrand', type="choice", default='0', choices=('0','1','2'), help='the strands to consider (0: both 1: forward 2: reverse)' )
|
|
71
|
|
72 parser.add_option( '--scoringMatrixFileName', dest='scoringMatrixFileName', help='Scoring Matrix file used to score the alignments' )
|
|
73 parser.add_option( '-u', '--ungapped', dest='ungapped', action="store_true", default=False, help='performed ungapped local alignment' )
|
|
74 parser.add_option( '-U', '--unconstrained', dest='unconstrained', action="store_true", default=False, help='performed local alignment without mask constraints' )
|
|
75 parser.add_option( '-O', '--offset', dest='offset', type="int", default="0", help='the number of bases before and after each hit to consider in local alignment' )
|
|
76 parser.add_option( '-q', '--avgMismatchQuality', type="int", default="-1", dest='avgMismatchQuality', help='average mismatch quality' )
|
|
77
|
|
78 parser.add_option( '-a', '--algorithm', dest='algorithm', default='0', type="choice", choices=('0','1','2','3','4' ), help='post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all' )
|
|
79 parser.add_option( '--unpaired', dest='unpaired', action="store_true", default=False, help='do not choose alignments based on pairing' )
|
|
80 parser.add_option( '--reverseStrand', dest='reverseStrand', action="store_true", default=False, help='paired end reads are given on reverse strands' )
|
|
81 parser.add_option( '--pairedEndInfer', dest='pairedEndInfer', action="store_true", default=False, help='break ties when one end of a paired end read by estimating the insert size distribution' )
|
|
82 parser.add_option( '--randomBest', dest='randomBest', action="store_true", default=False, help='output a random best scoring alignment' )
|
|
83
|
|
84 (options, args) = parser.parse_args()
|
|
85
|
|
86 # output version # of tool
|
|
87 try:
|
|
88 tmp = tempfile.NamedTemporaryFile().name
|
|
89 tmp_stdout = open( tmp, 'wb' )
|
|
90 proc = subprocess.Popen( args='bfast 2>&1', shell=True, stdout=tmp_stdout )
|
|
91 tmp_stdout.close()
|
|
92 returncode = proc.wait()
|
|
93 stdout = None
|
|
94 for line in open( tmp_stdout.name, 'rb' ):
|
|
95 if line.lower().find( 'version' ) >= 0:
|
|
96 stdout = line.strip()
|
|
97 break
|
|
98 if stdout:
|
|
99 sys.stdout.write( '%s\n' % stdout )
|
|
100 else:
|
|
101 raise Exception
|
|
102 except:
|
|
103 sys.stdout.write( 'Could not determine BFAST version\n' )
|
|
104
|
|
105 buffsize = 1048576
|
|
106
|
|
107 # make temp directory for bfast, requires trailing slash
|
|
108 tmp_dir = '%s/' % tempfile.mkdtemp()
|
|
109
|
|
110 #'generic' options used in all bfast commands here
|
|
111 if options.timing:
|
|
112 all_cmd_options = "-t"
|
|
113 else:
|
|
114 all_cmd_options = ""
|
|
115
|
|
116 try:
|
|
117 if options.buildIndex:
|
|
118 reference_filepath = tempfile.NamedTemporaryFile( dir=tmp_dir, suffix='.fa' ).name
|
|
119 #build bfast indexes
|
|
120 os.symlink( options.ref, reference_filepath )
|
|
121
|
|
122 #bfast fast2brg
|
|
123 try:
|
|
124 nuc_space = [ "0" ]
|
|
125 if options.space == "1":
|
|
126 #color space localalign appears to require nuc space brg
|
|
127 nuc_space.append( "1" )
|
|
128 for space in nuc_space:
|
|
129 cmd = 'bfast fasta2brg -f "%s" -A "%s" %s' % ( reference_filepath, space, all_cmd_options )
|
|
130 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
|
|
131 tmp_stderr = open( tmp, 'wb' )
|
|
132 proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
|
|
133 returncode = proc.wait()
|
|
134 tmp_stderr.close()
|
|
135 # get stderr, allowing for case where it's very large
|
|
136 tmp_stderr = open( tmp, 'rb' )
|
|
137 stderr = ''
|
|
138 try:
|
|
139 while True:
|
|
140 stderr += tmp_stderr.read( buffsize )
|
|
141 if not stderr or len( stderr ) % buffsize != 0:
|
|
142 break
|
|
143 except OverflowError:
|
|
144 pass
|
|
145 tmp_stderr.close()
|
|
146 if returncode != 0:
|
|
147 raise Exception, stderr
|
|
148 except Exception, e:
|
|
149 raise Exception, 'Error in \'bfast fasta2brg\'.\n' + str( e )
|
|
150
|
|
151 #bfast index
|
|
152 try:
|
|
153 all_index_cmds = 'bfast index %s -f "%s" -A "%s" -n "%s"' % ( all_cmd_options, reference_filepath, options.space, options.numThreads )
|
|
154
|
|
155 if options.indexRepeatMasker:
|
|
156 all_index_cmds += " -R"
|
|
157
|
|
158 if options.indexContigOptions:
|
|
159 index_contig_options = map( int, options.indexContigOptions.split( ',' ) )
|
|
160 if index_contig_options[0] >= 0:
|
|
161 all_index_cmds += ' -s "%s"' % index_contig_options[0]
|
|
162 if index_contig_options[1] >= 0:
|
|
163 all_index_cmds += ' -S "%s"' % index_contig_options[1]
|
|
164 if index_contig_options[2] >= 0:
|
|
165 all_index_cmds += ' -e "%s"' % index_contig_options[2]
|
|
166 if index_contig_options[3] >= 0:
|
|
167 all_index_cmds += ' -E "%s"' % index_contig_options[3]
|
|
168 elif options.indexExonsFileName:
|
|
169 all_index_cmds += ' -x "%s"' % options.indexExonsFileName
|
|
170
|
|
171 index_count = 1
|
|
172 for mask, hash_width in [ mask.split( ':' ) for mask in options.indexMask.split( ',' ) ]:
|
|
173 cmd = '%s -m "%s" -w "%s" -i "%i"' % ( all_index_cmds, mask, hash_width, index_count )
|
|
174 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
|
|
175 tmp_stderr = open( tmp, 'wb' )
|
|
176 proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
|
|
177 returncode = proc.wait()
|
|
178 tmp_stderr.close()
|
|
179 # get stderr, allowing for case where it's very large
|
|
180 tmp_stderr = open( tmp, 'rb' )
|
|
181 stderr = ''
|
|
182 try:
|
|
183 while True:
|
|
184 stderr += tmp_stderr.read( buffsize )
|
|
185 if not stderr or len( stderr ) % buffsize != 0:
|
|
186 break
|
|
187 except OverflowError:
|
|
188 pass
|
|
189 tmp_stderr.close()
|
|
190 if returncode != 0:
|
|
191 raise Exception, stderr
|
|
192 index_count += 1
|
|
193 except Exception, e:
|
|
194 raise Exception, 'Error in \'bfast index\'.\n' + str( e )
|
|
195
|
|
196 else:
|
|
197 reference_filepath = options.ref
|
|
198 assert reference_filepath and os.path.exists( reference_filepath ), 'A valid genome reference was not provided.'
|
|
199
|
|
200 # set up aligning and generate aligning command options
|
|
201 # set up temp output files
|
|
202 tmp_bmf = tempfile.NamedTemporaryFile( dir=tmp_dir )
|
|
203 tmp_bmf_name = tmp_bmf.name
|
|
204 tmp_bmf.close()
|
|
205 tmp_baf = tempfile.NamedTemporaryFile( dir=tmp_dir )
|
|
206 tmp_baf_name = tmp_baf.name
|
|
207 tmp_baf.close()
|
|
208
|
|
209 bfast_match_cmd = 'bfast match -f "%s" -r "%s" -n "%s" -A "%s" -T "%s" -w "%s" %s' % ( reference_filepath, options.fastq, options.numThreads, options.space, tmp_dir, options.whichStrand, all_cmd_options )
|
|
210 bfast_localalign_cmd = 'bfast localalign -f "%s" -m "%s" -n "%s" -A "%s" -o "%s" %s' % ( reference_filepath, tmp_bmf_name, options.numThreads, options.space, options.offset, all_cmd_options )
|
|
211 bfast_postprocess_cmd = 'bfast postprocess -O 1 -f "%s" -i "%s" -n "%s" -A "%s" -a "%s" %s' % ( reference_filepath, tmp_baf_name, options.numThreads, options.space, options.algorithm, all_cmd_options )
|
|
212
|
|
213 if options.offsets:
|
|
214 bfast_match_cmd += ' -o "%s"' % options.offsets
|
|
215 if options.keySize >= 0:
|
|
216 bfast_match_cmd += ' -k "%s"' % options.keySize
|
|
217 if options.maxKeyMatches >= 0:
|
|
218 bfast_match_cmd += ' -K "%s"' % options.maxKeyMatches
|
|
219 if options.maxNumMatches >= 0:
|
|
220 bfast_match_cmd += ' -M "%s"' % options.maxNumMatches
|
|
221 bfast_localalign_cmd += ' -M "%s"' % options.maxNumMatches
|
|
222 if options.scoringMatrixFileName:
|
|
223 bfast_localalign_cmd += ' -x "%s"' % options.scoringMatrixFileName
|
|
224 bfast_postprocess_cmd += ' -x "%s"' % options.scoringMatrixFileName
|
|
225 if options.ungapped:
|
|
226 bfast_localalign_cmd += ' -u'
|
|
227 if options.unconstrained:
|
|
228 bfast_localalign_cmd += ' -U'
|
|
229 if options.avgMismatchQuality >= 0:
|
|
230 bfast_localalign_cmd += ' -q "%s"' % options.avgMismatchQuality
|
|
231 bfast_postprocess_cmd += ' -q "%s"' % options.avgMismatchQuality
|
|
232 if options.algorithm == 3:
|
|
233 if options.pairedEndInfer:
|
|
234 bfast_postprocess_cmd += ' -P'
|
|
235 if options.randomBest:
|
|
236 bfast_postprocess_cmd += ' -z'
|
|
237 if options.unpaired:
|
|
238 bfast_postprocess_cmd += ' -U'
|
|
239 if options.reverseStrand:
|
|
240 bfast_postprocess_cmd += ' -R'
|
|
241
|
|
242 #instead of using temp files, should we stream through pipes?
|
|
243 bfast_match_cmd += " > %s" % tmp_bmf_name
|
|
244 bfast_localalign_cmd += " > %s" % tmp_baf_name
|
|
245 bfast_postprocess_cmd += " > %s" % options.output
|
|
246
|
|
247 # need to nest try-except in try-finally to handle 2.4
|
|
248 try:
|
|
249 # bfast 'match'
|
|
250 try:
|
|
251 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
|
|
252 tmp_stderr = open( tmp, 'wb' )
|
|
253 proc = subprocess.Popen( args=bfast_match_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
|
|
254 returncode = proc.wait()
|
|
255 tmp_stderr.close()
|
|
256 # get stderr, allowing for case where it's very large
|
|
257 tmp_stderr = open( tmp, 'rb' )
|
|
258 stderr = ''
|
|
259 try:
|
|
260 while True:
|
|
261 stderr += tmp_stderr.read( buffsize )
|
|
262 if not stderr or len( stderr ) % buffsize != 0:
|
|
263 break
|
|
264 except OverflowError:
|
|
265 pass
|
|
266 tmp_stderr.close()
|
|
267 if returncode != 0:
|
|
268 raise Exception, stderr
|
|
269 except Exception, e:
|
|
270 raise Exception, 'Error in \'bfast match\'. \n' + str( e )
|
|
271 # bfast 'localalign'
|
|
272 try:
|
|
273 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
|
|
274 tmp_stderr = open( tmp, 'wb' )
|
|
275 proc = subprocess.Popen( args=bfast_localalign_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
|
|
276 returncode = proc.wait()
|
|
277 tmp_stderr.close()
|
|
278 # get stderr, allowing for case where it's very large
|
|
279 tmp_stderr = open( tmp, 'rb' )
|
|
280 stderr = ''
|
|
281 try:
|
|
282 while True:
|
|
283 stderr += tmp_stderr.read( buffsize )
|
|
284 if not stderr or len( stderr ) % buffsize != 0:
|
|
285 break
|
|
286 except OverflowError:
|
|
287 pass
|
|
288 tmp_stderr.close()
|
|
289 if returncode != 0:
|
|
290 raise Exception, stderr
|
|
291 except Exception, e:
|
|
292 raise Exception, 'Error in \'bfast localalign\'. \n' + str( e )
|
|
293 # bfast 'postprocess'
|
|
294 try:
|
|
295 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
|
|
296 tmp_stderr = open( tmp, 'wb' )
|
|
297 proc = subprocess.Popen( args=bfast_postprocess_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
|
|
298 returncode = proc.wait()
|
|
299 tmp_stderr.close()
|
|
300 # get stderr, allowing for case where it's very large
|
|
301 tmp_stderr = open( tmp, 'rb' )
|
|
302 stderr = ''
|
|
303 try:
|
|
304 while True:
|
|
305 stderr += tmp_stderr.read( buffsize )
|
|
306 if not stderr or len( stderr ) % buffsize != 0:
|
|
307 break
|
|
308 except OverflowError:
|
|
309 pass
|
|
310 tmp_stderr.close()
|
|
311 if returncode != 0:
|
|
312 raise Exception, stderr
|
|
313 except Exception, e:
|
|
314 raise Exception, 'Error in \'bfast postprocess\'. \n' + str( e )
|
|
315 # remove header if necessary
|
|
316 if options.suppressHeader:
|
|
317 tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir)
|
|
318 tmp_out_name = tmp_out.name
|
|
319 tmp_out.close()
|
|
320 try:
|
|
321 shutil.move( options.output, tmp_out_name )
|
|
322 except Exception, e:
|
|
323 raise Exception, 'Error moving output file before removing headers. \n' + str( e )
|
|
324 fout = file( options.output, 'w' )
|
|
325 for line in file( tmp_out.name, 'r' ):
|
|
326 if len( line ) < 3 or line[0:3] not in [ '@HD', '@SQ', '@RG', '@PG', '@CO' ]:
|
|
327 fout.write( line )
|
|
328 fout.close()
|
|
329 # check that there are results in the output file
|
|
330 if os.path.getsize( options.output ) > 0:
|
|
331 if "0" == options.space:
|
|
332 sys.stdout.write( 'BFAST run on Base Space data' )
|
|
333 else:
|
|
334 sys.stdout.write( 'BFAST run on Color Space data' )
|
|
335 else:
|
|
336 raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
|
|
337 except Exception, e:
|
|
338 stop_err( 'The alignment failed.\n' + str( e ) )
|
|
339 finally:
|
|
340 # clean up temp dir
|
|
341 if os.path.exists( tmp_dir ):
|
|
342 shutil.rmtree( tmp_dir )
|
|
343
|
|
344 if __name__=="__main__": __main__()
|