Mercurial > repos > xuebing > sharplabtool
comparison tools/sr_mapping/bfast_wrapper.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 Runs BFAST on single-end or paired-end data. | |
5 TODO: more documentation | |
6 | |
7 TODO: | |
8 - auto-detect gzip or bz2 | |
9 - split options (?) | |
10 - queue lengths (?) | |
11 - assumes reference always has been indexed | |
12 - main and secondary indexes | |
13 - scoring matrix file ? | |
14 - read group file ? | |
15 | |
16 usage: bfast_wrapper.py [options] | |
17 -r, --ref=r: The reference genome to use or index | |
18 -f, --fastq=f: The fastq file to use for the mapping | |
19 -F, --output=u: The file to save the output (SAM format) | |
20 -s, --fileSource=s: Whether to use a previously indexed reference sequence or one from history (indexed or history) | |
21 -p, --params=p: Parameter setting to use (pre_set or full) | |
22 -n, --numThreads=n: The number of threads to use | |
23 -A, --space=A: The encoding space (0: base 1: color) | |
24 -o, --offsets=o: The offsets for 'match' | |
25 -l, --loadAllIndexes=l: Load all indexes into memory | |
26 -k, --keySize=k: truncate key size in 'match' | |
27 -K, --maxKeyMatches=K: the maximum number of matches to allow before a key is ignored | |
28 -M, --maxNumMatches=M: the maximum number of matches to allow before the read is discarded | |
29 -w, --whichStrand=w: the strands to consider (0: both 1: forward 2: reverse) | |
30 -t, --timing=t: output timing information to stderr | |
31 -u, --ungapped=u: performed ungapped local alignment | |
32 -U, --unconstrained=U: performed local alignment without mask constraints | |
33 -O, --offset=O: the number of bases before and after each hit to consider in local alignment | |
34 -q, --avgMismatchQuality=q: average mismatch quality | |
35 -a, --algorithm=a: post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all) | |
36 -P, --disallowPairing=P: do not choose alignments based on pairing | |
37 -R, --reverse=R: paired end reads are given on reverse strands | |
38 -z, --random=z: output a random best scoring alignment | |
39 -D, --dbkey=D: Dbkey for reference genome | |
40 -H, --suppressHeader=H: Suppress the sam header | |
41 """ | |
42 | |
43 import optparse, os, shutil, subprocess, sys, tempfile | |
44 | |
45 def stop_err( msg ): | |
46 sys.stderr.write( '%s\n' % msg ) | |
47 sys.exit() | |
48 | |
49 def __main__(): | |
50 #Parse Command Line | |
51 parser = optparse.OptionParser() | |
52 parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to index and use' ) | |
53 parser.add_option( '-f', '--fastq', dest='fastq', help='The fastq file to use for the mapping' ) | |
54 parser.add_option( '-F', '--output', dest='output', help='The file to save the output (SAM format)' ) | |
55 parser.add_option( '-A', '--space', dest='space', type="choice", default='0', choices=('0','1' ), help='The encoding space (0: base 1: color)' ) | |
56 parser.add_option( '-H', '--suppressHeader', action="store_true", dest='suppressHeader', default=False, help='Suppress header' ) | |
57 parser.add_option( '-n', '--numThreads', dest='numThreads', type="int", default="1", help='The number of threads to use' ) | |
58 parser.add_option( '-t', '--timing', action="store_true", default=False, dest='timing', help='output timming information to stderr' ) | |
59 parser.add_option( '-l', '--loadAllIndexes', action="store_true", default=False, dest='loadAllIndexes', help='Load all indexes into memory' ) | |
60 parser.add_option( '-m', '--indexMask', dest='indexMask', help='String containing info on how to build custom indexes' ) | |
61 parser.add_option( "-b", "--buildIndex", action="store_true", dest="buildIndex", default=False, help='String containing info on how to build custom indexes' ) | |
62 parser.add_option( "--indexRepeatMasker", action="store_true", dest="indexRepeatMasker", default=False, help='Do not index lower case sequences. Such as those created by RepeatMasker' ) | |
63 parser.add_option( '--indexContigOptions', dest='indexContigOptions', default="", help='The contig range options to use for the indexing' ) | |
64 parser.add_option( '--indexExonsFileName', dest='indexExonsFileName', default="", help='The exons file to use for the indexing' ) | |
65 | |
66 parser.add_option( '-o', '--offsets', dest='offsets', default="", help='The offsets for \'match\'' ) | |
67 parser.add_option( '-k', '--keySize', dest='keySize', type="int", default="-1", help='truncate key size in \'match\'' ) | |
68 parser.add_option( '-K', '--maxKeyMatches', dest='maxKeyMatches', type="int", default="-1", help='the maximum number of matches to allow before a key is ignored' ) | |
69 parser.add_option( '-M', '--maxNumMatches', dest='maxNumMatches', type="int", default="-1", help='the maximum number of matches to allow bfore the read is discarded' ) | |
70 parser.add_option( '-w', '--whichStrand', dest='whichStrand', type="choice", default='0', choices=('0','1','2'), help='the strands to consider (0: both 1: forward 2: reverse)' ) | |
71 | |
72 parser.add_option( '--scoringMatrixFileName', dest='scoringMatrixFileName', help='Scoring Matrix file used to score the alignments' ) | |
73 parser.add_option( '-u', '--ungapped', dest='ungapped', action="store_true", default=False, help='performed ungapped local alignment' ) | |
74 parser.add_option( '-U', '--unconstrained', dest='unconstrained', action="store_true", default=False, help='performed local alignment without mask constraints' ) | |
75 parser.add_option( '-O', '--offset', dest='offset', type="int", default="0", help='the number of bases before and after each hit to consider in local alignment' ) | |
76 parser.add_option( '-q', '--avgMismatchQuality', type="int", default="-1", dest='avgMismatchQuality', help='average mismatch quality' ) | |
77 | |
78 parser.add_option( '-a', '--algorithm', dest='algorithm', default='0', type="choice", choices=('0','1','2','3','4' ), help='post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all' ) | |
79 parser.add_option( '--unpaired', dest='unpaired', action="store_true", default=False, help='do not choose alignments based on pairing' ) | |
80 parser.add_option( '--reverseStrand', dest='reverseStrand', action="store_true", default=False, help='paired end reads are given on reverse strands' ) | |
81 parser.add_option( '--pairedEndInfer', dest='pairedEndInfer', action="store_true", default=False, help='break ties when one end of a paired end read by estimating the insert size distribution' ) | |
82 parser.add_option( '--randomBest', dest='randomBest', action="store_true", default=False, help='output a random best scoring alignment' ) | |
83 | |
84 (options, args) = parser.parse_args() | |
85 | |
86 # output version # of tool | |
87 try: | |
88 tmp = tempfile.NamedTemporaryFile().name | |
89 tmp_stdout = open( tmp, 'wb' ) | |
90 proc = subprocess.Popen( args='bfast 2>&1', shell=True, stdout=tmp_stdout ) | |
91 tmp_stdout.close() | |
92 returncode = proc.wait() | |
93 stdout = None | |
94 for line in open( tmp_stdout.name, 'rb' ): | |
95 if line.lower().find( 'version' ) >= 0: | |
96 stdout = line.strip() | |
97 break | |
98 if stdout: | |
99 sys.stdout.write( '%s\n' % stdout ) | |
100 else: | |
101 raise Exception | |
102 except: | |
103 sys.stdout.write( 'Could not determine BFAST version\n' ) | |
104 | |
105 buffsize = 1048576 | |
106 | |
107 # make temp directory for bfast, requires trailing slash | |
108 tmp_dir = '%s/' % tempfile.mkdtemp() | |
109 | |
110 #'generic' options used in all bfast commands here | |
111 if options.timing: | |
112 all_cmd_options = "-t" | |
113 else: | |
114 all_cmd_options = "" | |
115 | |
116 try: | |
117 if options.buildIndex: | |
118 reference_filepath = tempfile.NamedTemporaryFile( dir=tmp_dir, suffix='.fa' ).name | |
119 #build bfast indexes | |
120 os.symlink( options.ref, reference_filepath ) | |
121 | |
122 #bfast fast2brg | |
123 try: | |
124 nuc_space = [ "0" ] | |
125 if options.space == "1": | |
126 #color space localalign appears to require nuc space brg | |
127 nuc_space.append( "1" ) | |
128 for space in nuc_space: | |
129 cmd = 'bfast fasta2brg -f "%s" -A "%s" %s' % ( reference_filepath, space, all_cmd_options ) | |
130 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name | |
131 tmp_stderr = open( tmp, 'wb' ) | |
132 proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) | |
133 returncode = proc.wait() | |
134 tmp_stderr.close() | |
135 # get stderr, allowing for case where it's very large | |
136 tmp_stderr = open( tmp, 'rb' ) | |
137 stderr = '' | |
138 try: | |
139 while True: | |
140 stderr += tmp_stderr.read( buffsize ) | |
141 if not stderr or len( stderr ) % buffsize != 0: | |
142 break | |
143 except OverflowError: | |
144 pass | |
145 tmp_stderr.close() | |
146 if returncode != 0: | |
147 raise Exception, stderr | |
148 except Exception, e: | |
149 raise Exception, 'Error in \'bfast fasta2brg\'.\n' + str( e ) | |
150 | |
151 #bfast index | |
152 try: | |
153 all_index_cmds = 'bfast index %s -f "%s" -A "%s" -n "%s"' % ( all_cmd_options, reference_filepath, options.space, options.numThreads ) | |
154 | |
155 if options.indexRepeatMasker: | |
156 all_index_cmds += " -R" | |
157 | |
158 if options.indexContigOptions: | |
159 index_contig_options = map( int, options.indexContigOptions.split( ',' ) ) | |
160 if index_contig_options[0] >= 0: | |
161 all_index_cmds += ' -s "%s"' % index_contig_options[0] | |
162 if index_contig_options[1] >= 0: | |
163 all_index_cmds += ' -S "%s"' % index_contig_options[1] | |
164 if index_contig_options[2] >= 0: | |
165 all_index_cmds += ' -e "%s"' % index_contig_options[2] | |
166 if index_contig_options[3] >= 0: | |
167 all_index_cmds += ' -E "%s"' % index_contig_options[3] | |
168 elif options.indexExonsFileName: | |
169 all_index_cmds += ' -x "%s"' % options.indexExonsFileName | |
170 | |
171 index_count = 1 | |
172 for mask, hash_width in [ mask.split( ':' ) for mask in options.indexMask.split( ',' ) ]: | |
173 cmd = '%s -m "%s" -w "%s" -i "%i"' % ( all_index_cmds, mask, hash_width, index_count ) | |
174 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name | |
175 tmp_stderr = open( tmp, 'wb' ) | |
176 proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) | |
177 returncode = proc.wait() | |
178 tmp_stderr.close() | |
179 # get stderr, allowing for case where it's very large | |
180 tmp_stderr = open( tmp, 'rb' ) | |
181 stderr = '' | |
182 try: | |
183 while True: | |
184 stderr += tmp_stderr.read( buffsize ) | |
185 if not stderr or len( stderr ) % buffsize != 0: | |
186 break | |
187 except OverflowError: | |
188 pass | |
189 tmp_stderr.close() | |
190 if returncode != 0: | |
191 raise Exception, stderr | |
192 index_count += 1 | |
193 except Exception, e: | |
194 raise Exception, 'Error in \'bfast index\'.\n' + str( e ) | |
195 | |
196 else: | |
197 reference_filepath = options.ref | |
198 assert reference_filepath and os.path.exists( reference_filepath ), 'A valid genome reference was not provided.' | |
199 | |
200 # set up aligning and generate aligning command options | |
201 # set up temp output files | |
202 tmp_bmf = tempfile.NamedTemporaryFile( dir=tmp_dir ) | |
203 tmp_bmf_name = tmp_bmf.name | |
204 tmp_bmf.close() | |
205 tmp_baf = tempfile.NamedTemporaryFile( dir=tmp_dir ) | |
206 tmp_baf_name = tmp_baf.name | |
207 tmp_baf.close() | |
208 | |
209 bfast_match_cmd = 'bfast match -f "%s" -r "%s" -n "%s" -A "%s" -T "%s" -w "%s" %s' % ( reference_filepath, options.fastq, options.numThreads, options.space, tmp_dir, options.whichStrand, all_cmd_options ) | |
210 bfast_localalign_cmd = 'bfast localalign -f "%s" -m "%s" -n "%s" -A "%s" -o "%s" %s' % ( reference_filepath, tmp_bmf_name, options.numThreads, options.space, options.offset, all_cmd_options ) | |
211 bfast_postprocess_cmd = 'bfast postprocess -O 1 -f "%s" -i "%s" -n "%s" -A "%s" -a "%s" %s' % ( reference_filepath, tmp_baf_name, options.numThreads, options.space, options.algorithm, all_cmd_options ) | |
212 | |
213 if options.offsets: | |
214 bfast_match_cmd += ' -o "%s"' % options.offsets | |
215 if options.keySize >= 0: | |
216 bfast_match_cmd += ' -k "%s"' % options.keySize | |
217 if options.maxKeyMatches >= 0: | |
218 bfast_match_cmd += ' -K "%s"' % options.maxKeyMatches | |
219 if options.maxNumMatches >= 0: | |
220 bfast_match_cmd += ' -M "%s"' % options.maxNumMatches | |
221 bfast_localalign_cmd += ' -M "%s"' % options.maxNumMatches | |
222 if options.scoringMatrixFileName: | |
223 bfast_localalign_cmd += ' -x "%s"' % options.scoringMatrixFileName | |
224 bfast_postprocess_cmd += ' -x "%s"' % options.scoringMatrixFileName | |
225 if options.ungapped: | |
226 bfast_localalign_cmd += ' -u' | |
227 if options.unconstrained: | |
228 bfast_localalign_cmd += ' -U' | |
229 if options.avgMismatchQuality >= 0: | |
230 bfast_localalign_cmd += ' -q "%s"' % options.avgMismatchQuality | |
231 bfast_postprocess_cmd += ' -q "%s"' % options.avgMismatchQuality | |
232 if options.algorithm == 3: | |
233 if options.pairedEndInfer: | |
234 bfast_postprocess_cmd += ' -P' | |
235 if options.randomBest: | |
236 bfast_postprocess_cmd += ' -z' | |
237 if options.unpaired: | |
238 bfast_postprocess_cmd += ' -U' | |
239 if options.reverseStrand: | |
240 bfast_postprocess_cmd += ' -R' | |
241 | |
242 #instead of using temp files, should we stream through pipes? | |
243 bfast_match_cmd += " > %s" % tmp_bmf_name | |
244 bfast_localalign_cmd += " > %s" % tmp_baf_name | |
245 bfast_postprocess_cmd += " > %s" % options.output | |
246 | |
247 # need to nest try-except in try-finally to handle 2.4 | |
248 try: | |
249 # bfast 'match' | |
250 try: | |
251 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name | |
252 tmp_stderr = open( tmp, 'wb' ) | |
253 proc = subprocess.Popen( args=bfast_match_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) | |
254 returncode = proc.wait() | |
255 tmp_stderr.close() | |
256 # get stderr, allowing for case where it's very large | |
257 tmp_stderr = open( tmp, 'rb' ) | |
258 stderr = '' | |
259 try: | |
260 while True: | |
261 stderr += tmp_stderr.read( buffsize ) | |
262 if not stderr or len( stderr ) % buffsize != 0: | |
263 break | |
264 except OverflowError: | |
265 pass | |
266 tmp_stderr.close() | |
267 if returncode != 0: | |
268 raise Exception, stderr | |
269 except Exception, e: | |
270 raise Exception, 'Error in \'bfast match\'. \n' + str( e ) | |
271 # bfast 'localalign' | |
272 try: | |
273 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name | |
274 tmp_stderr = open( tmp, 'wb' ) | |
275 proc = subprocess.Popen( args=bfast_localalign_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) | |
276 returncode = proc.wait() | |
277 tmp_stderr.close() | |
278 # get stderr, allowing for case where it's very large | |
279 tmp_stderr = open( tmp, 'rb' ) | |
280 stderr = '' | |
281 try: | |
282 while True: | |
283 stderr += tmp_stderr.read( buffsize ) | |
284 if not stderr or len( stderr ) % buffsize != 0: | |
285 break | |
286 except OverflowError: | |
287 pass | |
288 tmp_stderr.close() | |
289 if returncode != 0: | |
290 raise Exception, stderr | |
291 except Exception, e: | |
292 raise Exception, 'Error in \'bfast localalign\'. \n' + str( e ) | |
293 # bfast 'postprocess' | |
294 try: | |
295 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name | |
296 tmp_stderr = open( tmp, 'wb' ) | |
297 proc = subprocess.Popen( args=bfast_postprocess_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) | |
298 returncode = proc.wait() | |
299 tmp_stderr.close() | |
300 # get stderr, allowing for case where it's very large | |
301 tmp_stderr = open( tmp, 'rb' ) | |
302 stderr = '' | |
303 try: | |
304 while True: | |
305 stderr += tmp_stderr.read( buffsize ) | |
306 if not stderr or len( stderr ) % buffsize != 0: | |
307 break | |
308 except OverflowError: | |
309 pass | |
310 tmp_stderr.close() | |
311 if returncode != 0: | |
312 raise Exception, stderr | |
313 except Exception, e: | |
314 raise Exception, 'Error in \'bfast postprocess\'. \n' + str( e ) | |
315 # remove header if necessary | |
316 if options.suppressHeader: | |
317 tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir) | |
318 tmp_out_name = tmp_out.name | |
319 tmp_out.close() | |
320 try: | |
321 shutil.move( options.output, tmp_out_name ) | |
322 except Exception, e: | |
323 raise Exception, 'Error moving output file before removing headers. \n' + str( e ) | |
324 fout = file( options.output, 'w' ) | |
325 for line in file( tmp_out.name, 'r' ): | |
326 if len( line ) < 3 or line[0:3] not in [ '@HD', '@SQ', '@RG', '@PG', '@CO' ]: | |
327 fout.write( line ) | |
328 fout.close() | |
329 # check that there are results in the output file | |
330 if os.path.getsize( options.output ) > 0: | |
331 if "0" == options.space: | |
332 sys.stdout.write( 'BFAST run on Base Space data' ) | |
333 else: | |
334 sys.stdout.write( 'BFAST run on Color Space data' ) | |
335 else: | |
336 raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.' | |
337 except Exception, e: | |
338 stop_err( 'The alignment failed.\n' + str( e ) ) | |
339 finally: | |
340 # clean up temp dir | |
341 if os.path.exists( tmp_dir ): | |
342 shutil.rmtree( tmp_dir ) | |
343 | |
344 if __name__=="__main__": __main__() |