comparison bwa_mem.py @ 1:ebb02ba5987c draft default tip

Rewrite of param handling. interPairEnd param moved to "paired" section. Add param for '-a' option. Remove basic parallelism tag, which does not work with multiple inputs (thanks Bjoern Gruening for the notice). Simplify Python code.
author crs4
date Fri, 21 Mar 2014 12:56:15 -0400
parents 6820983ba5d5
children
comparison
equal deleted inserted replaced
0:6820983ba5d5 1:ebb02ba5987c
15 15
16 See below for options 16 See below for options
17 """ 17 """
18 18
19 import optparse, os, shutil, subprocess, sys, tempfile 19 import optparse, os, shutil, subprocess, sys, tempfile
20
21 def stop_err( msg ):
22 sys.stderr.write( '%s\n' % msg )
23 sys.exit()
24
25 def check_is_double_encoded( fastq ):
26 # check that first read is bases, not one base followed by numbers
27 bases = [ 'A', 'C', 'G', 'T', 'a', 'c', 'g', 't', 'N' ]
28 nums = [ '0', '1', '2', '3' ]
29 for line in file( fastq, 'rb'):
30 if not line.strip() or line.startswith( '@' ):
31 continue
32 if len( [ b for b in line.strip() if b in nums ] ) > 0:
33 return False
34 elif line.strip()[0] in bases and len( [ b for b in line.strip() if b in bases ] ) == len( line.strip() ):
35 return True
36 else:
37 raise Exception, 'First line in first read does not appear to be a valid FASTQ read in either base-space or color-space'
38 raise Exception, 'There is no non-comment and non-blank line in your FASTQ file'
39 20
40 def __main__(): 21 def __main__():
41 descr = "bwa_mem.py: Map (long length) reads against a reference genome with BWA-MEM." 22 descr = "bwa_mem.py: Map (long length) reads against a reference genome with BWA-MEM."
42 parser = optparse.OptionParser(description=descr) 23 parser = optparse.OptionParser(description=descr)
43 parser.add_option( '-t', '--threads', default=1, help='The number of threads to use [1]' ) 24 parser.add_option( '-t', '--threads', default=1, help='The number of threads to use [1]' )
48 parser.add_option( '-g', '--genAlignType', help='The type of pairing (single or paired)' ) 29 parser.add_option( '-g', '--genAlignType', help='The type of pairing (single or paired)' )
49 parser.add_option( '--params', help='Parameter setting to use (pre_set or full)' ) 30 parser.add_option( '--params', help='Parameter setting to use (pre_set or full)' )
50 parser.add_option( '-s', '--fileSource', help='Whether to use a previously indexed reference sequence or one form history (indexed or history)' ) 31 parser.add_option( '-s', '--fileSource', help='Whether to use a previously indexed reference sequence or one form history (indexed or history)' )
51 parser.add_option( '-D', '--dbkey', help='Dbkey for reference genome' ) 32 parser.add_option( '-D', '--dbkey', help='Dbkey for reference genome' )
52 33
53 parser.add_option( '-k', '--minEditDistSeed', default=19, type=int, help='Minimum edit distance to the seed [19]' ) 34 parser.add_option( '-k', '--minSeedLength', type=int, help='Minimum seed length [19]' )
54 parser.add_option( '-w', '--bandWidth', default=100, type=int, help='Band width for banded alignment [100]' ) 35 parser.add_option( '-w', '--bandWidth', type=int, help='Band width for banded alignment [100]' )
55 parser.add_option( '-d', '--offDiagonal', default=100, type=int, help='off-diagonal X-dropoff [100]' ) 36 parser.add_option( '-d', '--offDiagonal', type=int, help='Off-diagonal X-dropoff [100]' )
56 parser.add_option( '-r', '--internalSeeds', default=1.5, type=float, help='look for internal seeds inside a seed longer than {-k} * FLOAT [1.5]' ) 37 parser.add_option( '-r', '--internalSeeds', type=float, help='Look for internal seeds inside a seed longer than {-k} * FLOAT [1.5]' )
57 parser.add_option( '-c', '--seedsOccurrence', default=10000, type=int, help='skip seeds with more than INT occurrences [10000]' ) 38 parser.add_option( '-c', '--seedsOccurrence', type=int, help='Skip seeds with more than INT occurrences [10000]' )
58 parser.add_option( '-S', '--mateRescue', default=False, help='skip mate rescue' ) 39 parser.add_option( '-S', '--mateRescue', action='store_true', help='Skip mate rescue' )
59 parser.add_option( '-P', '--skipPairing', default=False, help='skpe pairing, mate rescue performed unless -S also in use' ) 40 parser.add_option( '-P', '--skipPairing', action='store_true', help='Skip pairing' )
60 parser.add_option( '-A', '--seqMatch', default=1, type=int, help='score of a sequence match' ) 41 parser.add_option( '-A', '--seqMatch', type=int, help='Score for a sequence match [1]' )
61 parser.add_option( '-B', '--mismatch', default=4, type=int, help='penalty for a mismatch' ) 42 parser.add_option( '-B', '--mismatch', type=int, help='Penalty for a mismatch [4]' )
62 parser.add_option( '-O', '--gapOpen', default=6, type=int, help='gap open penalty' ) 43 parser.add_option( '-O', '--gapOpen', type=int, help='Gap open penalty [6]' )
63 parser.add_option( '-E', '--gapExtension', default=None, help='gap extension penalty; a gap of size k cost {-O} + {-E}*k [1]' ) 44 parser.add_option( '-E', '--gapExtension', type=int, help='Gap extension penalty; a gap of length k costs {-O} + {-E}*k [1]' )
64 parser.add_option( '-L', '--clipping', default=5, type=int, help='penalty for clipping [5]' ) 45 parser.add_option( '-L', '--clipping', help='Penalty for clipping [5]' )
65 parser.add_option( '-U', '--unpairedReadpair', default=17, type=int, help='penalty for an unpaired read pair [17]' ) 46 parser.add_option( '-U', '--unpairedReadpair', type=int, help='Penalty for an unpaired read pair [17]' )
66 parser.add_option( '-p', '--interPairEnd', default=False, help='first query file consists of interleaved paired-end sequences' ) 47 parser.add_option( '-p', '--interPairEnd', action='store_true', help='FASTQ file consists of interleaved paired-end sequences' )
67 parser.add_option( '--rgid', help='Read group identifier' ) 48 parser.add_option( '--rgid', help='Read group identifier' )
68 parser.add_option( '--rgsm', help='Sample' ) 49 parser.add_option( '--rgsm', help='Sample' )
69 parser.add_option( '--rgpl', choices=[ 'CAPILLARY', 'LS454', 'ILLUMINA', 'SOLID', 'HELICOS', 'IONTORRENT', 'PACBIO' ], help='Platform/technology used to produce the reads' ) 50 parser.add_option( '--rgpl', choices=[ 'CAPILLARY', 'LS454', 'ILLUMINA', 'SOLID', 'HELICOS', 'IONTORRENT', 'PACBIO' ], help='Platform/technology used to produce the reads' )
70 parser.add_option( '--rglb', help='Library name' ) 51 parser.add_option( '--rglb', help='Library name' )
71 parser.add_option( '--rgpu', help='Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD)' ) 52 parser.add_option( '--rgpu', help='Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD)' )
74 parser.add_option( '--rgdt', help='Date that run was produced (ISO8601 format date or date/time, like YYYY-MM-DD)' ) 55 parser.add_option( '--rgdt', help='Date that run was produced (ISO8601 format date or date/time, like YYYY-MM-DD)' )
75 parser.add_option( '--rgfo', help='Flow order' ) 56 parser.add_option( '--rgfo', help='Flow order' )
76 parser.add_option( '--rgks', help='The array of nucleotide bases that correspond to the key sequence of each read' ) 57 parser.add_option( '--rgks', help='The array of nucleotide bases that correspond to the key sequence of each read' )
77 parser.add_option( '--rgpg', help='Programs used for processing the read group' ) 58 parser.add_option( '--rgpg', help='Programs used for processing the read group' )
78 parser.add_option( '--rgpi', help='Predicted median insert size' ) 59 parser.add_option( '--rgpi', help='Predicted median insert size' )
79 parser.add_option( '-T', '--minScore', default=30, type=int, help='minimum score to output [30]' ) 60 parser.add_option( '-T', '--minScore', type=int, help='Minimum score to output [30]' )
80 parser.add_option( '-M', '--mark', default=False, help='mark shorter split hits as secondary (for Picard/GATK compatibility)' ) 61 parser.add_option( '-a', '--outputAll', action='store_true', help='Output all found alignments for single-end or unpaired paired-end reads' )
81 parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' ) 62 parser.add_option( '-M', '--mark', action='store_true', help='Mark shorter split hits as secondary (for Picard/GATK compatibility)' )
63 parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', action='store_true', help='Suppress header' )
82 (options, args) = parser.parse_args() 64 (options, args) = parser.parse_args()
83 if len(args) > 0: 65 if len(args) > 0:
84 parser.error('Wrong number of arguments') 66 parser.error('Wrong number of arguments')
85 67
86 # output version # of tool 68 # output version # of tool
150 # clean up temp dirs 132 # clean up temp dirs
151 if os.path.exists( tmp_index_dir ): 133 if os.path.exists( tmp_index_dir ):
152 shutil.rmtree( tmp_index_dir ) 134 shutil.rmtree( tmp_index_dir )
153 if os.path.exists( tmp_dir ): 135 if os.path.exists( tmp_dir ):
154 shutil.rmtree( tmp_dir ) 136 shutil.rmtree( tmp_dir )
155 stop_err( 'Error indexing reference sequence. ' + str( e ) ) 137 raise Exception, 'Error indexing reference sequence. ' + str( e )
156 else: 138 else:
157 ref_file_name = options.ref 139 ref_file_name = options.ref
158 # if options.illumina13qual: 140 # if options.illumina13qual:
159 # illumina_quals = "-I" 141 # illumina_quals = "-I"
160 # else: 142 # else:
161 # illumina_quals = "" 143 # illumina_quals = ""
162 144
163 # set up aligning and generate aligning command args 145 # set up aligning and generate aligning command args
164 start_cmds = '-t %s ' % options.threads 146 start_cmds = "bwa mem -t %s" % options.threads
165 if options.params == 'pre_set': 147 if options.interPairEnd:
166 # aligning_cmds = '-t %s %s' % ( options.threads, illumina_quals ) 148 start_cmds += ' -p'
167 #start_cmds = '-t %s ' % options.threads 149 if options.params != 'pre_set':
168 end_cmds = ' ' 150 if options.minSeedLength is not None:
169 print start_cmds, end_cmds 151 start_cmds += " -k %d" % options.minSeedLength
170 152 if options.bandWidth is not None:
171 else: 153 start_cmds += " -w %d" % options.bandWidth
172 end_cmds = '-k %s -w %s -d %s -r %s -c %s -A %s -B %s -O %s -L %s -U %s -T %s ' % (options.minEditDistSeed, options.bandWidth, options.offDiagonal, options.internalSeeds, options.seedsOccurrence, options.seqMatch, options.mismatch, options.gapOpen, options.clipping, options.unpairedReadpair, options.minScore) 154 if options.offDiagonal is not None:
155 start_cmds += " -d %d" % options.offDiagonal
156 if options.internalSeeds is not None:
157 start_cmds += " -r %s" % options.internalSeeds
158 if options.seedsOccurrence is not None:
159 start_cmds += " -c %d" % options.seedsOccurrence
173 if options.mateRescue: 160 if options.mateRescue:
174 end_cmds += '-S ' 161 start_cmds += ' -S'
175 if options.skipPairing: 162 if options.skipPairing:
176 end_cmds += '-P ' 163 start_cmds += ' -P'
177 else: 164 if options.seqMatch is not None:
178 if options.skipPairing: 165 start_cmds += " -A %d" % options.seqMatch
179 print "Option Error and will not be considered, you should also choose 'skip mate rescue -S' option! " 166 if options.mismatch is not None:
180 if options.gapExtension != None: 167 start_cmds += " -B %d" % options.mismatch
181 end_cmds += '-E %s ' % options.gapExtension 168 if options.gapOpen is not None:
182 169 start_cmds += " -O %d" % options.gapOpen
170 if options.gapExtension is not None:
171 start_cmds += " -E %d" % options.gapExtension
172 if options.clipping:
173 start_cmds += " -L %s" % options.clipping
174 if options.unpairedReadpair is not None:
175 start_cmds += " -U %d" % options.unpairedReadpair
176 if options.minScore is not None:
177 start_cmds += " -T %d" % options.minScore
178 if options.outputAll:
179 start_cmds += ' -a'
180 if options.mark:
181 start_cmds += ' -M'
183 if options.rgid: 182 if options.rgid:
184 if not options.rglb or not options.rgpl or not options.rgsm or not options.rglb: 183 if not options.rglb or not options.rgpl or not options.rgsm or not options.rglb:
185 stop_err( 'If you want to specify read groups, you must include the ID, LB, PL, and SM tags.' ) 184 sys.exit( 'If you want to specify read groups, you must include the ID, LB, PL, and SM tags.' )
186 # readGroup = '@RG\tID:%s\tLB:%s\tPL:%s\tSM:%s' % ( options.rgid, options.rglb, options.rgpl, options.rgsm )
187 readGroup = '@RG\tID:%s\tLB:%s\tPL:%s\tSM:%s' % ( options.rgid, options.rglb, options.rgpl, options.rgsm ) 185 readGroup = '@RG\tID:%s\tLB:%s\tPL:%s\tSM:%s' % ( options.rgid, options.rglb, options.rgpl, options.rgsm )
188 if options.rgpu: 186 if options.rgpu:
189 readGroup += '\tPU:%s' % options.rgpu 187 readGroup += '\tPU:%s' % options.rgpu
190 if options.rgcn: 188 if options.rgcn:
191 readGroup += '\tCN:%s' % options.rgcn 189 readGroup += '\tCN:%s' % options.rgcn
199 readGroup += '\tKS:%s' % options.rgks 197 readGroup += '\tKS:%s' % options.rgks
200 if options.rgpg: 198 if options.rgpg:
201 readGroup += '\tPG:%s' % options.rgpg 199 readGroup += '\tPG:%s' % options.rgpg
202 if options.rgpi: 200 if options.rgpi:
203 readGroup += '\tPI:%s' % options.rgpi 201 readGroup += '\tPI:%s' % options.rgpi
204 end_cmds += ' -R "%s" ' % readGroup 202 start_cmds += " -R '%s'" % readGroup
205
206 if options.interPairEnd:
207 end_cmds += '-p %s ' % options.interPairEnd
208 if options.mark:
209 end_cmds += '-M '
210
211 203
212 if options.genAlignType == 'paired': 204 if options.genAlignType == 'paired':
213 cmd = 'bwa mem %s %s %s %s %s > %s' % ( start_cmds, ref_file_name, fastq, rfastq, end_cmds, options.output ) 205 cmd = "%s %s %s %s > %s" % ( start_cmds, ref_file_name, fastq, rfastq, options.output )
214 else: 206 else:
215 cmd = 'bwa mem %s %s %s %s > %s' % ( start_cmds, ref_file_name, fastq, end_cmds, options.output ) 207 cmd = "%s %s %s > %s" % ( start_cmds, ref_file_name, fastq, options.output )
216 208
217 # perform alignments 209 # perform alignments
218 buffsize = 1048576 210 buffsize = 1048576
219 try: 211 try:
220 # need to nest try-except in try-finally to handle 2.4 212 # need to nest try-except in try-finally to handle 2.4
221 try: 213 try:
214 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
215 tmp_stderr = open( tmp, 'wb' )
216 print "The cmd is %s" % cmd
217 proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
218 returncode = proc.wait()
219 tmp_stderr.close()
220 # get stderr, allowing for case where it's very large
221 tmp_stderr = open( tmp, 'rb' )
222 stderr = ''
222 try: 223 try:
223 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name 224 while True:
224 tmp_stderr = open( tmp, 'wb' ) 225 stderr += tmp_stderr.read( buffsize )
225 print "The cmd is %s" % cmd 226 if not stderr or len( stderr ) % buffsize != 0:
226 proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) 227 break
227 returncode = proc.wait() 228 except OverflowError:
228 tmp_stderr.close() 229 pass
229 # get stderr, allowing for case where it's very large 230 tmp_stderr.close()
230 tmp_stderr = open( tmp, 'rb' ) 231 if returncode != 0:
231 stderr = '' 232 raise Exception, stderr
232 try: 233 except Exception, e:
233 while True: 234 raise Exception, 'Error generating alignments. ' + str( e )
234 stderr += tmp_stderr.read( buffsize ) 235 # remove header if necessary
235 if not stderr or len( stderr ) % buffsize != 0: 236 if options.suppressHeader:
236 break 237 tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir)
237 except OverflowError: 238 tmp_out_name = tmp_out.name
238 pass 239 tmp_out.close()
239 tmp_stderr.close() 240 try:
240 if returncode != 0: 241 shutil.move( options.output, tmp_out_name )
241 raise Exception, stderr
242 except Exception, e: 242 except Exception, e:
243 raise Exception, 'Error generating alignments. ' + str( e ) 243 raise Exception, 'Error moving output file before removing headers. ' + str( e )
244 # remove header if necessary 244 fout = file( options.output, 'w' )
245 if options.suppressHeader == 'true': 245 for line in file( tmp_out.name, 'r' ):
246 tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir) 246 if not ( line.startswith( '@HD' ) or line.startswith( '@SQ' ) or line.startswith( '@RG' ) or line.startswith( '@PG' ) or line.startswith( '@CO' ) ):
247 tmp_out_name = tmp_out.name 247 fout.write( line )
248 tmp_out.close() 248 fout.close()
249 try: 249 # check that there are results in the output file
250 shutil.move( options.output, tmp_out_name ) 250 if os.path.getsize( options.output ) > 0:
251 except Exception, e: 251 sys.stdout.write( 'BWA run on %s-end data' % options.genAlignType )
252 raise Exception, 'Error moving output file before removing headers. ' + str( e ) 252 else:
253 fout = file( options.output, 'w' ) 253 raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
254 for line in file( tmp_out.name, 'r' ):
255 if not ( line.startswith( '@HD' ) or line.startswith( '@SQ' ) or line.startswith( '@RG' ) or line.startswith( '@PG' ) or line.startswith( '@CO' ) ):
256 fout.write( line )
257 fout.close()
258 # check that there are results in the output file
259 if os.path.getsize( options.output ) > 0:
260 sys.stdout.write( 'BWA run on %s-end data' % options.genAlignType )
261 else:
262 raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
263 except Exception, e:
264 stop_err( 'The alignment failed.\n' + str( e ) )
265 finally: 254 finally:
266 # clean up temp dir 255 # clean up temp dir
267 if os.path.exists( tmp_index_dir ): 256 if os.path.exists( tmp_index_dir ):
268 shutil.rmtree( tmp_index_dir ) 257 shutil.rmtree( tmp_index_dir )
269 if os.path.exists( tmp_dir ): 258 if os.path.exists( tmp_dir ):