annotate bwa_wrapper.py @ 2:3a001705dc94 draft default tip

Uploaded
author evan
date Fri, 06 Jun 2014 12:55:09 -0400
parents 8d92246f41bb
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
8d92246f41bb Uploaded
evan
parents:
diff changeset
1 #!/usr/bin/env python
8d92246f41bb Uploaded
evan
parents:
diff changeset
2
8d92246f41bb Uploaded
evan
parents:
diff changeset
3 """
8d92246f41bb Uploaded
evan
parents:
diff changeset
4 Runs BWA on single-end or paired-end data.
8d92246f41bb Uploaded
evan
parents:
diff changeset
5 Produces a SAM file containing the mappings.
8d92246f41bb Uploaded
evan
parents:
diff changeset
6 Works with BWA version 0.5.9.
8d92246f41bb Uploaded
evan
parents:
diff changeset
7
8d92246f41bb Uploaded
evan
parents:
diff changeset
8 usage: bwa_wrapper.py [options]
8d92246f41bb Uploaded
evan
parents:
diff changeset
9
8d92246f41bb Uploaded
evan
parents:
diff changeset
10 See below for options
8d92246f41bb Uploaded
evan
parents:
diff changeset
11 """
8d92246f41bb Uploaded
evan
parents:
diff changeset
12
8d92246f41bb Uploaded
evan
parents:
diff changeset
13 import optparse, os, shutil, subprocess, sys, tempfile
8d92246f41bb Uploaded
evan
parents:
diff changeset
14
8d92246f41bb Uploaded
evan
parents:
diff changeset
15 def stop_err( msg ):
8d92246f41bb Uploaded
evan
parents:
diff changeset
16 sys.stderr.write( '%s\n' % msg )
8d92246f41bb Uploaded
evan
parents:
diff changeset
17 sys.exit()
8d92246f41bb Uploaded
evan
parents:
diff changeset
18
8d92246f41bb Uploaded
evan
parents:
diff changeset
19 def check_is_double_encoded( fastq ):
8d92246f41bb Uploaded
evan
parents:
diff changeset
20 # check that first read is bases, not one base followed by numbers
8d92246f41bb Uploaded
evan
parents:
diff changeset
21 bases = [ 'A', 'C', 'G', 'T', 'a', 'c', 'g', 't', 'N' ]
8d92246f41bb Uploaded
evan
parents:
diff changeset
22 nums = [ '0', '1', '2', '3' ]
8d92246f41bb Uploaded
evan
parents:
diff changeset
23 for line in file( fastq, 'rb'):
8d92246f41bb Uploaded
evan
parents:
diff changeset
24 if not line.strip() or line.startswith( '@' ):
8d92246f41bb Uploaded
evan
parents:
diff changeset
25 continue
8d92246f41bb Uploaded
evan
parents:
diff changeset
26 if len( [ b for b in line.strip() if b in nums ] ) > 0:
8d92246f41bb Uploaded
evan
parents:
diff changeset
27 return False
8d92246f41bb Uploaded
evan
parents:
diff changeset
28 elif line.strip()[0] in bases and len( [ b for b in line.strip() if b in bases ] ) == len( line.strip() ):
8d92246f41bb Uploaded
evan
parents:
diff changeset
29 return True
8d92246f41bb Uploaded
evan
parents:
diff changeset
30 else:
8d92246f41bb Uploaded
evan
parents:
diff changeset
31 raise Exception, 'First line in first read does not appear to be a valid FASTQ read in either base-space or color-space'
8d92246f41bb Uploaded
evan
parents:
diff changeset
32 raise Exception, 'There is no non-comment and non-blank line in your FASTQ file'
8d92246f41bb Uploaded
evan
parents:
diff changeset
33
8d92246f41bb Uploaded
evan
parents:
diff changeset
34 def __main__():
8d92246f41bb Uploaded
evan
parents:
diff changeset
35 #Parse Command Line
8d92246f41bb Uploaded
evan
parents:
diff changeset
36 parser = optparse.OptionParser()
8d92246f41bb Uploaded
evan
parents:
diff changeset
37 parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to use' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
38 parser.add_option( '-c', '--color-space', dest='color_space', action='store_true', help='If the input files are SOLiD format' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
39 parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
40 parser.add_option( '-f', '--input1', dest='fastq', help='The (forward) fastq file to use for the mapping' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
41 parser.add_option( '-F', '--input2', dest='rfastq', help='The reverse fastq file to use for mapping if paired-end data' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
42 parser.add_option( '-u', '--output', dest='output', help='The file to save the output (SAM format)' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
43 parser.add_option( '-g', '--genAlignType', dest='genAlignType', help='The type of pairing (single or paired)' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
44 parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
45 parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one form history (indexed or history)' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
46 parser.add_option( '-n', '--maxEditDist', dest='maxEditDist', help='Maximum edit distance if integer' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
47 parser.add_option( '-m', '--fracMissingAligns', dest='fracMissingAligns', help='Fraction of missing alignments given 2% uniform base error rate if fraction' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
48 parser.add_option( '-o', '--maxGapOpens', dest='maxGapOpens', help='Maximum number of gap opens' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
49 parser.add_option( '-e', '--maxGapExtens', dest='maxGapExtens', help='Maximum number of gap extensions' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
50 parser.add_option( '-d', '--disallowLongDel', dest='disallowLongDel', help='Disallow a long deletion within specified bps' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
51 parser.add_option( '-i', '--disallowIndel', dest='disallowIndel', help='Disallow indel within specified bps' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
52 parser.add_option( '-l', '--seed', dest='seed', help='Take the first specified subsequences' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
53 parser.add_option( '-k', '--maxEditDistSeed', dest='maxEditDistSeed', help='Maximum edit distance to the seed' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
54 parser.add_option( '-M', '--mismatchPenalty', dest='mismatchPenalty', help='Mismatch penalty' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
55 parser.add_option( '-O', '--gapOpenPenalty', dest='gapOpenPenalty', help='Gap open penalty' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
56 parser.add_option( '-E', '--gapExtensPenalty', dest='gapExtensPenalty', help='Gap extension penalty' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
57 parser.add_option( '-R', '--suboptAlign', dest='suboptAlign', default=None, help='Proceed with suboptimal alignments even if the top hit is a repeat' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
58 parser.add_option( '-N', '--noIterSearch', dest='noIterSearch', help='Disable iterative search' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
59 parser.add_option( '-T', '--outputTopN', dest='outputTopN', help='Maximum number of alignments to output in the XA tag for reads paired properly' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
60 parser.add_option( '', '--outputTopNDisc', dest='outputTopNDisc', help='Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons)' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
61 parser.add_option( '-S', '--maxInsertSize', dest='maxInsertSize', help='Maximum insert size for a read pair to be considered mapped good' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
62 parser.add_option( '-P', '--maxOccurPairing', dest='maxOccurPairing', help='Maximum occurrences of a read for pairings' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
63 parser.add_option( '', '--rgid', dest='rgid', help='Read group identifier' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
64 parser.add_option( '', '--rgcn', dest='rgcn', help='Sequencing center that produced the read' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
65 parser.add_option( '', '--rgds', dest='rgds', help='Description' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
66 parser.add_option( '', '--rgdt', dest='rgdt', help='Date that run was produced (ISO8601 format date or date/time, like YYYY-MM-DD)' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
67 parser.add_option( '', '--rgfo', dest='rgfo', help='Flow order' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
68 parser.add_option( '', '--rgks', dest='rgks', help='The array of nucleotide bases that correspond to the key sequence of each read' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
69 parser.add_option( '', '--rglb', dest='rglb', help='Library name' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
70 parser.add_option( '', '--rgpg', dest='rgpg', help='Programs used for processing the read group' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
71 parser.add_option( '', '--rgpi', dest='rgpi', help='Predicted median insert size' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
72 parser.add_option( '', '--rgpl', dest='rgpl', choices=[ 'CAPILLARY', 'LS454', 'ILLUMINA', 'SOLID', 'HELICOS', 'IONTORRENT' and 'PACBIO' ], help='Platform/technology used to produce the reads' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
73 parser.add_option( '', '--rgpu', dest='rgpu', help='Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD)' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
74 parser.add_option( '', '--rgsm', dest='rgsm', help='Sample' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
75 parser.add_option( '-D', '--dbkey', dest='dbkey', help='Dbkey for reference genome' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
76 parser.add_option( '-X', '--do_not_build_index', dest='do_not_build_index', action='store_true', help="Don't build index" )
8d92246f41bb Uploaded
evan
parents:
diff changeset
77 parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
78 parser.add_option( '-I', '--illumina1.3', dest='illumina13qual', help='Input FASTQ files have Illuina 1.3 quality scores' )
2
3a001705dc94 Uploaded
evan
parents: 0
diff changeset
79 parser.add_option( '', '--meta_tsv', dest='metadata_tsv', help='Meta data for filling things like sequencing center' )
0
8d92246f41bb Uploaded
evan
parents:
diff changeset
80 (options, args) = parser.parse_args()
8d92246f41bb Uploaded
evan
parents:
diff changeset
81
8d92246f41bb Uploaded
evan
parents:
diff changeset
82 # output version # of tool
8d92246f41bb Uploaded
evan
parents:
diff changeset
83 try:
8d92246f41bb Uploaded
evan
parents:
diff changeset
84 tmp = tempfile.NamedTemporaryFile().name
8d92246f41bb Uploaded
evan
parents:
diff changeset
85 tmp_stdout = open( tmp, 'wb' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
86 proc = subprocess.Popen( args='bwa 2>&1', shell=True, stdout=tmp_stdout )
8d92246f41bb Uploaded
evan
parents:
diff changeset
87 tmp_stdout.close()
8d92246f41bb Uploaded
evan
parents:
diff changeset
88 returncode = proc.wait()
8d92246f41bb Uploaded
evan
parents:
diff changeset
89 stdout = None
8d92246f41bb Uploaded
evan
parents:
diff changeset
90 for line in open( tmp_stdout.name, 'rb' ):
8d92246f41bb Uploaded
evan
parents:
diff changeset
91 if line.lower().find( 'version' ) >= 0:
8d92246f41bb Uploaded
evan
parents:
diff changeset
92 stdout = line.strip()
8d92246f41bb Uploaded
evan
parents:
diff changeset
93 break
8d92246f41bb Uploaded
evan
parents:
diff changeset
94 if stdout:
8d92246f41bb Uploaded
evan
parents:
diff changeset
95 sys.stdout.write( 'BWA %s\n' % stdout )
8d92246f41bb Uploaded
evan
parents:
diff changeset
96 else:
8d92246f41bb Uploaded
evan
parents:
diff changeset
97 raise Exception
8d92246f41bb Uploaded
evan
parents:
diff changeset
98 except:
8d92246f41bb Uploaded
evan
parents:
diff changeset
99 sys.stdout.write( 'Could not determine BWA version\n' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
100
8d92246f41bb Uploaded
evan
parents:
diff changeset
101 # check for color space fastq that's not double-encoded and exit if appropriate
8d92246f41bb Uploaded
evan
parents:
diff changeset
102 if options.color_space:
8d92246f41bb Uploaded
evan
parents:
diff changeset
103 if not check_is_double_encoded( options.fastq ):
8d92246f41bb Uploaded
evan
parents:
diff changeset
104 stop_err( 'Your file must be double-encoded (it must be converted from "numbers" to "bases"). See the help section for details' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
105 if options.genAlignType == 'paired':
8d92246f41bb Uploaded
evan
parents:
diff changeset
106 if not check_is_double_encoded( options.rfastq ):
8d92246f41bb Uploaded
evan
parents:
diff changeset
107 stop_err( 'Your reverse reads file must also be double-encoded (it must be converted from "numbers" to "bases"). See the help section for details' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
108
8d92246f41bb Uploaded
evan
parents:
diff changeset
109 fastq = options.fastq
8d92246f41bb Uploaded
evan
parents:
diff changeset
110 if options.rfastq:
8d92246f41bb Uploaded
evan
parents:
diff changeset
111 rfastq = options.rfastq
8d92246f41bb Uploaded
evan
parents:
diff changeset
112
8d92246f41bb Uploaded
evan
parents:
diff changeset
113 # set color space variable
8d92246f41bb Uploaded
evan
parents:
diff changeset
114 if options.color_space:
8d92246f41bb Uploaded
evan
parents:
diff changeset
115 color_space = '-c'
8d92246f41bb Uploaded
evan
parents:
diff changeset
116 else:
8d92246f41bb Uploaded
evan
parents:
diff changeset
117 color_space = ''
8d92246f41bb Uploaded
evan
parents:
diff changeset
118
8d92246f41bb Uploaded
evan
parents:
diff changeset
119 # make temp directory for placement of indices
8d92246f41bb Uploaded
evan
parents:
diff changeset
120 tmp_index_dir = tempfile.mkdtemp()
8d92246f41bb Uploaded
evan
parents:
diff changeset
121 tmp_dir = tempfile.mkdtemp()
8d92246f41bb Uploaded
evan
parents:
diff changeset
122 # index if necessary
8d92246f41bb Uploaded
evan
parents:
diff changeset
123 if options.fileSource == 'history' and not options.do_not_build_index:
8d92246f41bb Uploaded
evan
parents:
diff changeset
124 ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
8d92246f41bb Uploaded
evan
parents:
diff changeset
125 ref_file_name = ref_file.name
8d92246f41bb Uploaded
evan
parents:
diff changeset
126 ref_file.close()
8d92246f41bb Uploaded
evan
parents:
diff changeset
127 os.symlink( options.ref, ref_file_name )
8d92246f41bb Uploaded
evan
parents:
diff changeset
128 # determine which indexing algorithm to use, based on size
8d92246f41bb Uploaded
evan
parents:
diff changeset
129 try:
8d92246f41bb Uploaded
evan
parents:
diff changeset
130 size = os.stat( options.ref ).st_size
8d92246f41bb Uploaded
evan
parents:
diff changeset
131 if size <= 2**30:
8d92246f41bb Uploaded
evan
parents:
diff changeset
132 indexingAlg = 'is'
8d92246f41bb Uploaded
evan
parents:
diff changeset
133 else:
8d92246f41bb Uploaded
evan
parents:
diff changeset
134 indexingAlg = 'bwtsw'
8d92246f41bb Uploaded
evan
parents:
diff changeset
135 except:
8d92246f41bb Uploaded
evan
parents:
diff changeset
136 indexingAlg = 'is'
8d92246f41bb Uploaded
evan
parents:
diff changeset
137 indexing_cmds = '%s -a %s' % ( color_space, indexingAlg )
8d92246f41bb Uploaded
evan
parents:
diff changeset
138 cmd1 = 'bwa index %s %s' % ( indexing_cmds, ref_file_name )
8d92246f41bb Uploaded
evan
parents:
diff changeset
139 try:
8d92246f41bb Uploaded
evan
parents:
diff changeset
140 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
8d92246f41bb Uploaded
evan
parents:
diff changeset
141 tmp_stderr = open( tmp, 'wb' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
142 proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
8d92246f41bb Uploaded
evan
parents:
diff changeset
143 returncode = proc.wait()
8d92246f41bb Uploaded
evan
parents:
diff changeset
144 tmp_stderr.close()
8d92246f41bb Uploaded
evan
parents:
diff changeset
145 # get stderr, allowing for case where it's very large
8d92246f41bb Uploaded
evan
parents:
diff changeset
146 tmp_stderr = open( tmp, 'rb' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
147 stderr = ''
8d92246f41bb Uploaded
evan
parents:
diff changeset
148 buffsize = 1048576
8d92246f41bb Uploaded
evan
parents:
diff changeset
149 try:
8d92246f41bb Uploaded
evan
parents:
diff changeset
150 while True:
8d92246f41bb Uploaded
evan
parents:
diff changeset
151 stderr += tmp_stderr.read( buffsize )
8d92246f41bb Uploaded
evan
parents:
diff changeset
152 if not stderr or len( stderr ) % buffsize != 0:
8d92246f41bb Uploaded
evan
parents:
diff changeset
153 break
8d92246f41bb Uploaded
evan
parents:
diff changeset
154 except OverflowError:
8d92246f41bb Uploaded
evan
parents:
diff changeset
155 pass
8d92246f41bb Uploaded
evan
parents:
diff changeset
156 tmp_stderr.close()
8d92246f41bb Uploaded
evan
parents:
diff changeset
157 if returncode != 0:
8d92246f41bb Uploaded
evan
parents:
diff changeset
158 raise Exception, stderr
8d92246f41bb Uploaded
evan
parents:
diff changeset
159 except Exception, e:
8d92246f41bb Uploaded
evan
parents:
diff changeset
160 # clean up temp dirs
8d92246f41bb Uploaded
evan
parents:
diff changeset
161 if os.path.exists( tmp_index_dir ):
8d92246f41bb Uploaded
evan
parents:
diff changeset
162 shutil.rmtree( tmp_index_dir )
8d92246f41bb Uploaded
evan
parents:
diff changeset
163 if os.path.exists( tmp_dir ):
8d92246f41bb Uploaded
evan
parents:
diff changeset
164 shutil.rmtree( tmp_dir )
8d92246f41bb Uploaded
evan
parents:
diff changeset
165 stop_err( 'Error indexing reference sequence. ' + str( e ) )
8d92246f41bb Uploaded
evan
parents:
diff changeset
166 else:
8d92246f41bb Uploaded
evan
parents:
diff changeset
167 ref_file_name = options.ref
8d92246f41bb Uploaded
evan
parents:
diff changeset
168 if options.illumina13qual:
8d92246f41bb Uploaded
evan
parents:
diff changeset
169 illumina_quals = "-I"
8d92246f41bb Uploaded
evan
parents:
diff changeset
170 else:
8d92246f41bb Uploaded
evan
parents:
diff changeset
171 illumina_quals = ""
8d92246f41bb Uploaded
evan
parents:
diff changeset
172
8d92246f41bb Uploaded
evan
parents:
diff changeset
173 # set up aligning and generate aligning command options
8d92246f41bb Uploaded
evan
parents:
diff changeset
174 if options.params == 'pre_set':
8d92246f41bb Uploaded
evan
parents:
diff changeset
175 aligning_cmds = '-t %s %s %s' % ( options.threads, color_space, illumina_quals )
8d92246f41bb Uploaded
evan
parents:
diff changeset
176 gen_alignment_cmds = ''
8d92246f41bb Uploaded
evan
parents:
diff changeset
177 else:
8d92246f41bb Uploaded
evan
parents:
diff changeset
178 if options.maxEditDist != '0':
8d92246f41bb Uploaded
evan
parents:
diff changeset
179 editDist = options.maxEditDist
8d92246f41bb Uploaded
evan
parents:
diff changeset
180 else:
8d92246f41bb Uploaded
evan
parents:
diff changeset
181 editDist = options.fracMissingAligns
8d92246f41bb Uploaded
evan
parents:
diff changeset
182 if options.seed != '-1':
8d92246f41bb Uploaded
evan
parents:
diff changeset
183 seed = '-l %s' % options.seed
8d92246f41bb Uploaded
evan
parents:
diff changeset
184 else:
8d92246f41bb Uploaded
evan
parents:
diff changeset
185 seed = ''
8d92246f41bb Uploaded
evan
parents:
diff changeset
186 if options.suboptAlign:
8d92246f41bb Uploaded
evan
parents:
diff changeset
187 suboptAlign = '-R "%s"' % ( options.suboptAlign )
8d92246f41bb Uploaded
evan
parents:
diff changeset
188 else:
8d92246f41bb Uploaded
evan
parents:
diff changeset
189 suboptAlign = ''
8d92246f41bb Uploaded
evan
parents:
diff changeset
190 if options.noIterSearch == 'true':
8d92246f41bb Uploaded
evan
parents:
diff changeset
191 noIterSearch = '-N'
8d92246f41bb Uploaded
evan
parents:
diff changeset
192 else:
8d92246f41bb Uploaded
evan
parents:
diff changeset
193 noIterSearch = ''
8d92246f41bb Uploaded
evan
parents:
diff changeset
194 aligning_cmds = '-n %s -o %s -e %s -d %s -i %s %s -k %s -t %s -M %s -O %s -E %s %s %s %s %s' % \
8d92246f41bb Uploaded
evan
parents:
diff changeset
195 ( editDist, options.maxGapOpens, options.maxGapExtens, options.disallowLongDel,
8d92246f41bb Uploaded
evan
parents:
diff changeset
196 options.disallowIndel, seed, options.maxEditDistSeed, options.threads,
8d92246f41bb Uploaded
evan
parents:
diff changeset
197 options.mismatchPenalty, options.gapOpenPenalty, options.gapExtensPenalty,
8d92246f41bb Uploaded
evan
parents:
diff changeset
198 suboptAlign, noIterSearch, color_space, illumina_quals )
8d92246f41bb Uploaded
evan
parents:
diff changeset
199 if options.genAlignType == 'paired':
8d92246f41bb Uploaded
evan
parents:
diff changeset
200 gen_alignment_cmds = '-a %s -o %s' % ( options.maxInsertSize, options.maxOccurPairing )
8d92246f41bb Uploaded
evan
parents:
diff changeset
201 if options.outputTopNDisc:
8d92246f41bb Uploaded
evan
parents:
diff changeset
202 gen_alignment_cmds += ' -N %s' % options.outputTopNDisc
8d92246f41bb Uploaded
evan
parents:
diff changeset
203 else:
8d92246f41bb Uploaded
evan
parents:
diff changeset
204 gen_alignment_cmds = ''
8d92246f41bb Uploaded
evan
parents:
diff changeset
205 if options.metadata_tsv:
8d92246f41bb Uploaded
evan
parents:
diff changeset
206 f = open(options.metadata_tsv, 'r')
8d92246f41bb Uploaded
evan
parents:
diff changeset
207 cols = f.readline().split('\t')
2
3a001705dc94 Uploaded
evan
parents: 0
diff changeset
208 readGroup = '@RG\tID:%s\tDS:%s\tPU:%s\tLB:%s\tCN:BMGC\tDT:%s\tFR:%s\tRR:%s\tPL:ILLUMINA\tSM:%s' % ( cols[0], cols[4], cols[16], cols[14], cols[1], cols[19], cols[20], cols[18] )
0
8d92246f41bb Uploaded
evan
parents:
diff changeset
209 gen_alignment_cmds += ' -r "%s"' % readGroup
8d92246f41bb Uploaded
evan
parents:
diff changeset
210 print readGroup
8d92246f41bb Uploaded
evan
parents:
diff changeset
211 if options.rgid:
8d92246f41bb Uploaded
evan
parents:
diff changeset
212 if not options.rglb or not options.rgpl or not options.rgsm:
8d92246f41bb Uploaded
evan
parents:
diff changeset
213 stop_err( 'If you want to specify read groups, you must include the ID, LB, PL, and SM tags.' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
214 readGroup = '@RG\tID:%s\tLB:%s\tPL:%s\tSM:%s' % ( options.rgid, options.rglb, options.rgpl, options.rgsm )
8d92246f41bb Uploaded
evan
parents:
diff changeset
215 if options.rgcn:
8d92246f41bb Uploaded
evan
parents:
diff changeset
216 readGroup += '\tCN:%s' % options.rgcn
8d92246f41bb Uploaded
evan
parents:
diff changeset
217 if options.rgds:
8d92246f41bb Uploaded
evan
parents:
diff changeset
218 readGroup += '\tDS:%s' % options.rgds
8d92246f41bb Uploaded
evan
parents:
diff changeset
219 if options.rgdt:
8d92246f41bb Uploaded
evan
parents:
diff changeset
220 readGroup += '\tDT:%s' % options.rgdt
8d92246f41bb Uploaded
evan
parents:
diff changeset
221 if options.rgfo:
8d92246f41bb Uploaded
evan
parents:
diff changeset
222 readGroup += '\tFO:%s' % options.rgfo
8d92246f41bb Uploaded
evan
parents:
diff changeset
223 if options.rgks:
8d92246f41bb Uploaded
evan
parents:
diff changeset
224 readGroup += '\tKS:%s' % options.rgks
8d92246f41bb Uploaded
evan
parents:
diff changeset
225 if options.rgpg:
8d92246f41bb Uploaded
evan
parents:
diff changeset
226 readGroup += '\tPG:%s' % options.rgpg
8d92246f41bb Uploaded
evan
parents:
diff changeset
227 if options.rgpi:
8d92246f41bb Uploaded
evan
parents:
diff changeset
228 readGroup += '\tPI:%s' % options.rgpi
8d92246f41bb Uploaded
evan
parents:
diff changeset
229 if options.rgpu:
8d92246f41bb Uploaded
evan
parents:
diff changeset
230 readGroup += '\tPU:%s' % options.rgpu
8d92246f41bb Uploaded
evan
parents:
diff changeset
231 gen_alignment_cmds += ' -r "%s"' % readGroup
8d92246f41bb Uploaded
evan
parents:
diff changeset
232 if options.outputTopN:
8d92246f41bb Uploaded
evan
parents:
diff changeset
233 gen_alignment_cmds += ' -n %s' % options.outputTopN
8d92246f41bb Uploaded
evan
parents:
diff changeset
234 # set up output files
8d92246f41bb Uploaded
evan
parents:
diff changeset
235 tmp_align_out = tempfile.NamedTemporaryFile( dir=tmp_dir )
8d92246f41bb Uploaded
evan
parents:
diff changeset
236 tmp_align_out_name = tmp_align_out.name
8d92246f41bb Uploaded
evan
parents:
diff changeset
237 tmp_align_out.close()
8d92246f41bb Uploaded
evan
parents:
diff changeset
238 tmp_align_out2 = tempfile.NamedTemporaryFile( dir=tmp_dir )
8d92246f41bb Uploaded
evan
parents:
diff changeset
239 tmp_align_out2_name = tmp_align_out2.name
8d92246f41bb Uploaded
evan
parents:
diff changeset
240 tmp_align_out2.close()
8d92246f41bb Uploaded
evan
parents:
diff changeset
241 # prepare actual aligning and generate aligning commands
8d92246f41bb Uploaded
evan
parents:
diff changeset
242 cmd2 = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, fastq, tmp_align_out_name )
8d92246f41bb Uploaded
evan
parents:
diff changeset
243 cmd2b = ''
8d92246f41bb Uploaded
evan
parents:
diff changeset
244 if options.genAlignType == 'paired':
8d92246f41bb Uploaded
evan
parents:
diff changeset
245 cmd2b = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, rfastq, tmp_align_out2_name )
8d92246f41bb Uploaded
evan
parents:
diff changeset
246 cmd3 = 'bwa sampe %s %s %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, tmp_align_out2_name, fastq, rfastq, options.output )
8d92246f41bb Uploaded
evan
parents:
diff changeset
247 else:
8d92246f41bb Uploaded
evan
parents:
diff changeset
248 cmd3 = 'bwa samse %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, fastq, options.output )
8d92246f41bb Uploaded
evan
parents:
diff changeset
249 # perform alignments
8d92246f41bb Uploaded
evan
parents:
diff changeset
250 buffsize = 1048576
8d92246f41bb Uploaded
evan
parents:
diff changeset
251 try:
8d92246f41bb Uploaded
evan
parents:
diff changeset
252 # need to nest try-except in try-finally to handle 2.4
8d92246f41bb Uploaded
evan
parents:
diff changeset
253 try:
8d92246f41bb Uploaded
evan
parents:
diff changeset
254 # align
8d92246f41bb Uploaded
evan
parents:
diff changeset
255 try:
8d92246f41bb Uploaded
evan
parents:
diff changeset
256 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
8d92246f41bb Uploaded
evan
parents:
diff changeset
257 tmp_stderr = open( tmp, 'wb' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
258 proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
8d92246f41bb Uploaded
evan
parents:
diff changeset
259 returncode = proc.wait()
8d92246f41bb Uploaded
evan
parents:
diff changeset
260 tmp_stderr.close()
8d92246f41bb Uploaded
evan
parents:
diff changeset
261 # get stderr, allowing for case where it's very large
8d92246f41bb Uploaded
evan
parents:
diff changeset
262 tmp_stderr = open( tmp, 'rb' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
263 stderr = ''
8d92246f41bb Uploaded
evan
parents:
diff changeset
264 try:
8d92246f41bb Uploaded
evan
parents:
diff changeset
265 while True:
8d92246f41bb Uploaded
evan
parents:
diff changeset
266 stderr += tmp_stderr.read( buffsize )
8d92246f41bb Uploaded
evan
parents:
diff changeset
267 if not stderr or len( stderr ) % buffsize != 0:
8d92246f41bb Uploaded
evan
parents:
diff changeset
268 break
8d92246f41bb Uploaded
evan
parents:
diff changeset
269 except OverflowError:
8d92246f41bb Uploaded
evan
parents:
diff changeset
270 pass
8d92246f41bb Uploaded
evan
parents:
diff changeset
271 tmp_stderr.close()
8d92246f41bb Uploaded
evan
parents:
diff changeset
272 if returncode != 0:
8d92246f41bb Uploaded
evan
parents:
diff changeset
273 raise Exception, stderr
8d92246f41bb Uploaded
evan
parents:
diff changeset
274 except Exception, e:
8d92246f41bb Uploaded
evan
parents:
diff changeset
275 raise Exception, 'Error aligning sequence. ' + str( e )
8d92246f41bb Uploaded
evan
parents:
diff changeset
276 # and again if paired data
8d92246f41bb Uploaded
evan
parents:
diff changeset
277 try:
8d92246f41bb Uploaded
evan
parents:
diff changeset
278 if cmd2b:
8d92246f41bb Uploaded
evan
parents:
diff changeset
279 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
8d92246f41bb Uploaded
evan
parents:
diff changeset
280 tmp_stderr = open( tmp, 'wb' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
281 proc = subprocess.Popen( args=cmd2b, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
8d92246f41bb Uploaded
evan
parents:
diff changeset
282 returncode = proc.wait()
8d92246f41bb Uploaded
evan
parents:
diff changeset
283 tmp_stderr.close()
8d92246f41bb Uploaded
evan
parents:
diff changeset
284 # get stderr, allowing for case where it's very large
8d92246f41bb Uploaded
evan
parents:
diff changeset
285 tmp_stderr = open( tmp, 'rb' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
286 stderr = ''
8d92246f41bb Uploaded
evan
parents:
diff changeset
287 try:
8d92246f41bb Uploaded
evan
parents:
diff changeset
288 while True:
8d92246f41bb Uploaded
evan
parents:
diff changeset
289 stderr += tmp_stderr.read( buffsize )
8d92246f41bb Uploaded
evan
parents:
diff changeset
290 if not stderr or len( stderr ) % buffsize != 0:
8d92246f41bb Uploaded
evan
parents:
diff changeset
291 break
8d92246f41bb Uploaded
evan
parents:
diff changeset
292 except OverflowError:
8d92246f41bb Uploaded
evan
parents:
diff changeset
293 pass
8d92246f41bb Uploaded
evan
parents:
diff changeset
294 tmp_stderr.close()
8d92246f41bb Uploaded
evan
parents:
diff changeset
295 if returncode != 0:
8d92246f41bb Uploaded
evan
parents:
diff changeset
296 raise Exception, stderr
8d92246f41bb Uploaded
evan
parents:
diff changeset
297 except Exception, e:
8d92246f41bb Uploaded
evan
parents:
diff changeset
298 raise Exception, 'Error aligning second sequence. ' + str( e )
8d92246f41bb Uploaded
evan
parents:
diff changeset
299 # generate align
8d92246f41bb Uploaded
evan
parents:
diff changeset
300 try:
8d92246f41bb Uploaded
evan
parents:
diff changeset
301 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
8d92246f41bb Uploaded
evan
parents:
diff changeset
302 tmp_stderr = open( tmp, 'wb' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
303 proc = subprocess.Popen( args=cmd3, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
8d92246f41bb Uploaded
evan
parents:
diff changeset
304 returncode = proc.wait()
8d92246f41bb Uploaded
evan
parents:
diff changeset
305 tmp_stderr.close()
8d92246f41bb Uploaded
evan
parents:
diff changeset
306 # get stderr, allowing for case where it's very large
8d92246f41bb Uploaded
evan
parents:
diff changeset
307 tmp_stderr = open( tmp, 'rb' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
308 stderr = ''
8d92246f41bb Uploaded
evan
parents:
diff changeset
309 try:
8d92246f41bb Uploaded
evan
parents:
diff changeset
310 while True:
8d92246f41bb Uploaded
evan
parents:
diff changeset
311 stderr += tmp_stderr.read( buffsize )
8d92246f41bb Uploaded
evan
parents:
diff changeset
312 if not stderr or len( stderr ) % buffsize != 0:
8d92246f41bb Uploaded
evan
parents:
diff changeset
313 break
8d92246f41bb Uploaded
evan
parents:
diff changeset
314 except OverflowError:
8d92246f41bb Uploaded
evan
parents:
diff changeset
315 pass
8d92246f41bb Uploaded
evan
parents:
diff changeset
316 tmp_stderr.close()
8d92246f41bb Uploaded
evan
parents:
diff changeset
317 if returncode != 0:
8d92246f41bb Uploaded
evan
parents:
diff changeset
318 raise Exception, stderr
8d92246f41bb Uploaded
evan
parents:
diff changeset
319 except Exception, e:
8d92246f41bb Uploaded
evan
parents:
diff changeset
320 raise Exception, 'Error generating alignments. ' + str( e )
8d92246f41bb Uploaded
evan
parents:
diff changeset
321 # remove header if necessary
8d92246f41bb Uploaded
evan
parents:
diff changeset
322 if options.suppressHeader == 'true':
8d92246f41bb Uploaded
evan
parents:
diff changeset
323 tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir)
8d92246f41bb Uploaded
evan
parents:
diff changeset
324 tmp_out_name = tmp_out.name
8d92246f41bb Uploaded
evan
parents:
diff changeset
325 tmp_out.close()
8d92246f41bb Uploaded
evan
parents:
diff changeset
326 try:
8d92246f41bb Uploaded
evan
parents:
diff changeset
327 shutil.move( options.output, tmp_out_name )
8d92246f41bb Uploaded
evan
parents:
diff changeset
328 except Exception, e:
8d92246f41bb Uploaded
evan
parents:
diff changeset
329 raise Exception, 'Error moving output file before removing headers. ' + str( e )
8d92246f41bb Uploaded
evan
parents:
diff changeset
330 fout = file( options.output, 'w' )
8d92246f41bb Uploaded
evan
parents:
diff changeset
331 for line in file( tmp_out.name, 'r' ):
8d92246f41bb Uploaded
evan
parents:
diff changeset
332 if not ( line.startswith( '@HD' ) or line.startswith( '@SQ' ) or line.startswith( '@RG' ) or line.startswith( '@PG' ) or line.startswith( '@CO' ) ):
8d92246f41bb Uploaded
evan
parents:
diff changeset
333 fout.write( line )
8d92246f41bb Uploaded
evan
parents:
diff changeset
334 fout.close()
8d92246f41bb Uploaded
evan
parents:
diff changeset
335 # check that there are results in the output file
8d92246f41bb Uploaded
evan
parents:
diff changeset
336 if os.path.getsize( options.output ) > 0:
8d92246f41bb Uploaded
evan
parents:
diff changeset
337 sys.stdout.write( 'BWA run on %s-end data' % options.genAlignType )
8d92246f41bb Uploaded
evan
parents:
diff changeset
338 else:
8d92246f41bb Uploaded
evan
parents:
diff changeset
339 raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
8d92246f41bb Uploaded
evan
parents:
diff changeset
340 except Exception, e:
8d92246f41bb Uploaded
evan
parents:
diff changeset
341 stop_err( 'The alignment failed.\n' + str( e ) )
8d92246f41bb Uploaded
evan
parents:
diff changeset
342 finally:
8d92246f41bb Uploaded
evan
parents:
diff changeset
343 # clean up temp dir
8d92246f41bb Uploaded
evan
parents:
diff changeset
344 if os.path.exists( tmp_index_dir ):
8d92246f41bb Uploaded
evan
parents:
diff changeset
345 shutil.rmtree( tmp_index_dir )
8d92246f41bb Uploaded
evan
parents:
diff changeset
346 if os.path.exists( tmp_dir ):
8d92246f41bb Uploaded
evan
parents:
diff changeset
347 shutil.rmtree( tmp_dir )
8d92246f41bb Uploaded
evan
parents:
diff changeset
348
8d92246f41bb Uploaded
evan
parents:
diff changeset
349 if __name__=="__main__": __main__()