annotate bwa_wrapper.py @ 1:150b3fe44caa draft

Updated command line format per dev team standards.
author devteam <devteam@galaxyproject.org>
date Tue, 02 Apr 2013 12:23:14 -0400
parents ffa8aaa14f7c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
1 #!/usr/bin/env python
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
2
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
3 """
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
4 Runs BWA on single-end or paired-end data.
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
5 Produces a SAM file containing the mappings.
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
6 Works with BWA version 0.5.9.
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
7
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
8 usage: bwa_wrapper.py [options]
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
9
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
10 See below for options
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
11 """
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
12
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
13 import optparse, os, shutil, subprocess, sys, tempfile
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
14
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
15 def stop_err( msg ):
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
16 sys.stderr.write( '%s\n' % msg )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
17 sys.exit()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
18
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
19 def check_is_double_encoded( fastq ):
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
20 # check that first read is bases, not one base followed by numbers
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
21 bases = [ 'A', 'C', 'G', 'T', 'a', 'c', 'g', 't', 'N' ]
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
22 nums = [ '0', '1', '2', '3' ]
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
23 for line in file( fastq, 'rb'):
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
24 if not line.strip() or line.startswith( '@' ):
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
25 continue
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
26 if len( [ b for b in line.strip() if b in nums ] ) > 0:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
27 return False
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
28 elif line.strip()[0] in bases and len( [ b for b in line.strip() if b in bases ] ) == len( line.strip() ):
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
29 return True
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
30 else:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
31 raise Exception, 'First line in first read does not appear to be a valid FASTQ read in either base-space or color-space'
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
32 raise Exception, 'There is no non-comment and non-blank line in your FASTQ file'
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
33
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
34 def __main__():
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
35 #Parse Command Line
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
36 parser = optparse.OptionParser()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
37 parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to use' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
38 parser.add_option( '-c', '--color-space', dest='color_space', action='store_true', help='If the input files are SOLiD format' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
39 parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
40 parser.add_option( '-f', '--input1', dest='fastq', help='The (forward) fastq file to use for the mapping' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
41 parser.add_option( '-F', '--input2', dest='rfastq', help='The reverse fastq file to use for mapping if paired-end data' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
42 parser.add_option( '-u', '--output', dest='output', help='The file to save the output (SAM format)' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
43 parser.add_option( '-g', '--genAlignType', dest='genAlignType', help='The type of pairing (single or paired)' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
44 parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
45 parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one form history (indexed or history)' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
46 parser.add_option( '-n', '--maxEditDist', dest='maxEditDist', help='Maximum edit distance if integer' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
47 parser.add_option( '-m', '--fracMissingAligns', dest='fracMissingAligns', help='Fraction of missing alignments given 2% uniform base error rate if fraction' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
48 parser.add_option( '-o', '--maxGapOpens', dest='maxGapOpens', help='Maximum number of gap opens' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
49 parser.add_option( '-e', '--maxGapExtens', dest='maxGapExtens', help='Maximum number of gap extensions' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
50 parser.add_option( '-d', '--disallowLongDel', dest='disallowLongDel', help='Disallow a long deletion within specified bps' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
51 parser.add_option( '-i', '--disallowIndel', dest='disallowIndel', help='Disallow indel within specified bps' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
52 parser.add_option( '-l', '--seed', dest='seed', help='Take the first specified subsequences' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
53 parser.add_option( '-k', '--maxEditDistSeed', dest='maxEditDistSeed', help='Maximum edit distance to the seed' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
54 parser.add_option( '-M', '--mismatchPenalty', dest='mismatchPenalty', help='Mismatch penalty' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
55 parser.add_option( '-O', '--gapOpenPenalty', dest='gapOpenPenalty', help='Gap open penalty' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
56 parser.add_option( '-E', '--gapExtensPenalty', dest='gapExtensPenalty', help='Gap extension penalty' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
57 parser.add_option( '-R', '--suboptAlign', dest='suboptAlign', default=None, help='Proceed with suboptimal alignments even if the top hit is a repeat' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
58 parser.add_option( '-N', '--noIterSearch', dest='noIterSearch', help='Disable iterative search' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
59 parser.add_option( '-T', '--outputTopN', dest='outputTopN', help='Maximum number of alignments to output in the XA tag for reads paired properly' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
60 parser.add_option( '', '--outputTopNDisc', dest='outputTopNDisc', help='Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons)' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
61 parser.add_option( '-S', '--maxInsertSize', dest='maxInsertSize', help='Maximum insert size for a read pair to be considered mapped good' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
62 parser.add_option( '-P', '--maxOccurPairing', dest='maxOccurPairing', help='Maximum occurrences of a read for pairings' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
63 parser.add_option( '', '--rgid', dest='rgid', help='Read group identifier' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
64 parser.add_option( '', '--rgcn', dest='rgcn', help='Sequencing center that produced the read' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
65 parser.add_option( '', '--rgds', dest='rgds', help='Description' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
66 parser.add_option( '', '--rgdt', dest='rgdt', help='Date that run was produced (ISO8601 format date or date/time, like YYYY-MM-DD)' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
67 parser.add_option( '', '--rgfo', dest='rgfo', help='Flow order' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
68 parser.add_option( '', '--rgks', dest='rgks', help='The array of nucleotide bases that correspond to the key sequence of each read' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
69 parser.add_option( '', '--rglb', dest='rglb', help='Library name' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
70 parser.add_option( '', '--rgpg', dest='rgpg', help='Programs used for processing the read group' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
71 parser.add_option( '', '--rgpi', dest='rgpi', help='Predicted median insert size' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
72 parser.add_option( '', '--rgpl', dest='rgpl', choices=[ 'CAPILLARY', 'LS454', 'ILLUMINA', 'SOLID', 'HELICOS', 'IONTORRENT' and 'PACBIO' ], help='Platform/technology used to produce the reads' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
73 parser.add_option( '', '--rgpu', dest='rgpu', help='Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD)' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
74 parser.add_option( '', '--rgsm', dest='rgsm', help='Sample' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
75 parser.add_option( '-D', '--dbkey', dest='dbkey', help='Dbkey for reference genome' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
76 parser.add_option( '-X', '--do_not_build_index', dest='do_not_build_index', action='store_true', help="Don't build index" )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
77 parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
78 parser.add_option( '-I', '--illumina1.3', dest='illumina13qual', help='Input FASTQ files have Illuina 1.3 quality scores' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
79 (options, args) = parser.parse_args()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
80
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
81 # output version # of tool
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
82 try:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
83 tmp = tempfile.NamedTemporaryFile().name
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
84 tmp_stdout = open( tmp, 'wb' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
85 proc = subprocess.Popen( args='bwa 2>&1', shell=True, stdout=tmp_stdout )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
86 tmp_stdout.close()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
87 returncode = proc.wait()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
88 stdout = None
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
89 for line in open( tmp_stdout.name, 'rb' ):
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
90 if line.lower().find( 'version' ) >= 0:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
91 stdout = line.strip()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
92 break
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
93 if stdout:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
94 sys.stdout.write( 'BWA %s\n' % stdout )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
95 else:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
96 raise Exception
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
97 except:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
98 sys.stdout.write( 'Could not determine BWA version\n' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
99
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
100 # check for color space fastq that's not double-encoded and exit if appropriate
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
101 if options.color_space:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
102 if not check_is_double_encoded( options.fastq ):
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
103 stop_err( 'Your file must be double-encoded (it must be converted from "numbers" to "bases"). See the help section for details' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
104 if options.genAlignType == 'paired':
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
105 if not check_is_double_encoded( options.rfastq ):
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
106 stop_err( 'Your reverse reads file must also be double-encoded (it must be converted from "numbers" to "bases"). See the help section for details' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
107
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
108 fastq = options.fastq
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
109 if options.rfastq:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
110 rfastq = options.rfastq
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
111
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
112 # set color space variable
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
113 if options.color_space:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
114 color_space = '-c'
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
115 else:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
116 color_space = ''
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
117
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
118 # make temp directory for placement of indices
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
119 tmp_index_dir = tempfile.mkdtemp()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
120 tmp_dir = tempfile.mkdtemp()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
121 # index if necessary
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
122 if options.fileSource == 'history' and not options.do_not_build_index:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
123 ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
124 ref_file_name = ref_file.name
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
125 ref_file.close()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
126 os.symlink( options.ref, ref_file_name )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
127 # determine which indexing algorithm to use, based on size
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
128 try:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
129 size = os.stat( options.ref ).st_size
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
130 if size <= 2**30:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
131 indexingAlg = 'is'
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
132 else:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
133 indexingAlg = 'bwtsw'
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
134 except:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
135 indexingAlg = 'is'
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
136 indexing_cmds = '%s -a %s' % ( color_space, indexingAlg )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
137 cmd1 = 'bwa index %s %s' % ( indexing_cmds, ref_file_name )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
138 try:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
139 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
140 tmp_stderr = open( tmp, 'wb' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
141 proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
142 returncode = proc.wait()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
143 tmp_stderr.close()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
144 # get stderr, allowing for case where it's very large
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
145 tmp_stderr = open( tmp, 'rb' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
146 stderr = ''
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
147 buffsize = 1048576
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
148 try:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
149 while True:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
150 stderr += tmp_stderr.read( buffsize )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
151 if not stderr or len( stderr ) % buffsize != 0:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
152 break
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
153 except OverflowError:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
154 pass
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
155 tmp_stderr.close()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
156 if returncode != 0:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
157 raise Exception, stderr
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
158 except Exception, e:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
159 # clean up temp dirs
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
160 if os.path.exists( tmp_index_dir ):
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
161 shutil.rmtree( tmp_index_dir )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
162 if os.path.exists( tmp_dir ):
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
163 shutil.rmtree( tmp_dir )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
164 stop_err( 'Error indexing reference sequence. ' + str( e ) )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
165 else:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
166 ref_file_name = options.ref
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
167 if options.illumina13qual:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
168 illumina_quals = "-I"
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
169 else:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
170 illumina_quals = ""
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
171
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
172 # set up aligning and generate aligning command options
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
173 if options.params == 'pre_set':
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
174 aligning_cmds = '-t %s %s %s' % ( options.threads, color_space, illumina_quals )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
175 gen_alignment_cmds = ''
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
176 else:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
177 if options.maxEditDist != '0':
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
178 editDist = options.maxEditDist
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
179 else:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
180 editDist = options.fracMissingAligns
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
181 if options.seed != '-1':
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
182 seed = '-l %s' % options.seed
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
183 else:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
184 seed = ''
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
185 if options.suboptAlign:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
186 suboptAlign = '-R "%s"' % ( options.suboptAlign )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
187 else:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
188 suboptAlign = ''
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
189 if options.noIterSearch == 'true':
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
190 noIterSearch = '-N'
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
191 else:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
192 noIterSearch = ''
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
193 aligning_cmds = '-n %s -o %s -e %s -d %s -i %s %s -k %s -t %s -M %s -O %s -E %s %s %s %s %s' % \
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
194 ( editDist, options.maxGapOpens, options.maxGapExtens, options.disallowLongDel,
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
195 options.disallowIndel, seed, options.maxEditDistSeed, options.threads,
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
196 options.mismatchPenalty, options.gapOpenPenalty, options.gapExtensPenalty,
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
197 suboptAlign, noIterSearch, color_space, illumina_quals )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
198 if options.genAlignType == 'paired':
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
199 gen_alignment_cmds = '-a %s -o %s' % ( options.maxInsertSize, options.maxOccurPairing )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
200 if options.outputTopNDisc:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
201 gen_alignment_cmds += ' -N %s' % options.outputTopNDisc
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
202 else:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
203 gen_alignment_cmds = ''
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
204 if options.rgid:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
205 if not options.rglb or not options.rgpl or not options.rgsm:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
206 stop_err( 'If you want to specify read groups, you must include the ID, LB, PL, and SM tags.' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
207 readGroup = '@RG\tID:%s\tLB:%s\tPL:%s\tSM:%s' % ( options.rgid, options.rglb, options.rgpl, options.rgsm )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
208 if options.rgcn:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
209 readGroup += '\tCN:%s' % options.rgcn
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
210 if options.rgds:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
211 readGroup += '\tDS:%s' % options.rgds
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
212 if options.rgdt:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
213 readGroup += '\tDT:%s' % options.rgdt
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
214 if options.rgfo:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
215 readGroup += '\tFO:%s' % options.rgfo
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
216 if options.rgks:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
217 readGroup += '\tKS:%s' % options.rgks
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
218 if options.rgpg:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
219 readGroup += '\tPG:%s' % options.rgpg
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
220 if options.rgpi:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
221 readGroup += '\tPI:%s' % options.rgpi
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
222 if options.rgpu:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
223 readGroup += '\tPU:%s' % options.rgpu
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
224 gen_alignment_cmds += ' -r "%s"' % readGroup
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
225 if options.outputTopN:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
226 gen_alignment_cmds += ' -n %s' % options.outputTopN
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
227 # set up output files
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
228 tmp_align_out = tempfile.NamedTemporaryFile( dir=tmp_dir )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
229 tmp_align_out_name = tmp_align_out.name
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
230 tmp_align_out.close()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
231 tmp_align_out2 = tempfile.NamedTemporaryFile( dir=tmp_dir )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
232 tmp_align_out2_name = tmp_align_out2.name
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
233 tmp_align_out2.close()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
234 # prepare actual aligning and generate aligning commands
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
235 cmd2 = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, fastq, tmp_align_out_name )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
236 cmd2b = ''
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
237 if options.genAlignType == 'paired':
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
238 cmd2b = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, rfastq, tmp_align_out2_name )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
239 cmd3 = 'bwa sampe %s %s %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, tmp_align_out2_name, fastq, rfastq, options.output )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
240 else:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
241 cmd3 = 'bwa samse %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, fastq, options.output )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
242 # perform alignments
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
243 buffsize = 1048576
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
244 try:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
245 # need to nest try-except in try-finally to handle 2.4
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
246 try:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
247 # align
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
248 try:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
249 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
250 tmp_stderr = open( tmp, 'wb' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
251 proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
252 returncode = proc.wait()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
253 tmp_stderr.close()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
254 # get stderr, allowing for case where it's very large
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
255 tmp_stderr = open( tmp, 'rb' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
256 stderr = ''
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
257 try:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
258 while True:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
259 stderr += tmp_stderr.read( buffsize )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
260 if not stderr or len( stderr ) % buffsize != 0:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
261 break
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
262 except OverflowError:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
263 pass
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
264 tmp_stderr.close()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
265 if returncode != 0:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
266 raise Exception, stderr
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
267 except Exception, e:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
268 raise Exception, 'Error aligning sequence. ' + str( e )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
269 # and again if paired data
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
270 try:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
271 if cmd2b:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
272 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
273 tmp_stderr = open( tmp, 'wb' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
274 proc = subprocess.Popen( args=cmd2b, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
275 returncode = proc.wait()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
276 tmp_stderr.close()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
277 # get stderr, allowing for case where it's very large
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
278 tmp_stderr = open( tmp, 'rb' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
279 stderr = ''
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
280 try:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
281 while True:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
282 stderr += tmp_stderr.read( buffsize )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
283 if not stderr or len( stderr ) % buffsize != 0:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
284 break
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
285 except OverflowError:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
286 pass
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
287 tmp_stderr.close()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
288 if returncode != 0:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
289 raise Exception, stderr
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
290 except Exception, e:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
291 raise Exception, 'Error aligning second sequence. ' + str( e )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
292 # generate align
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
293 try:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
294 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
295 tmp_stderr = open( tmp, 'wb' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
296 proc = subprocess.Popen( args=cmd3, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
297 returncode = proc.wait()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
298 tmp_stderr.close()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
299 # get stderr, allowing for case where it's very large
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
300 tmp_stderr = open( tmp, 'rb' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
301 stderr = ''
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
302 try:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
303 while True:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
304 stderr += tmp_stderr.read( buffsize )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
305 if not stderr or len( stderr ) % buffsize != 0:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
306 break
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
307 except OverflowError:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
308 pass
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
309 tmp_stderr.close()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
310 if returncode != 0:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
311 raise Exception, stderr
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
312 except Exception, e:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
313 raise Exception, 'Error generating alignments. ' + str( e )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
314 # remove header if necessary
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
315 if options.suppressHeader == 'true':
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
316 tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir)
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
317 tmp_out_name = tmp_out.name
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
318 tmp_out.close()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
319 try:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
320 shutil.move( options.output, tmp_out_name )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
321 except Exception, e:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
322 raise Exception, 'Error moving output file before removing headers. ' + str( e )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
323 fout = file( options.output, 'w' )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
324 for line in file( tmp_out.name, 'r' ):
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
325 if not ( line.startswith( '@HD' ) or line.startswith( '@SQ' ) or line.startswith( '@RG' ) or line.startswith( '@PG' ) or line.startswith( '@CO' ) ):
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
326 fout.write( line )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
327 fout.close()
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
328 # check that there are results in the output file
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
329 if os.path.getsize( options.output ) > 0:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
330 sys.stdout.write( 'BWA run on %s-end data' % options.genAlignType )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
331 else:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
332 raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
333 except Exception, e:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
334 stop_err( 'The alignment failed.\n' + str( e ) )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
335 finally:
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
336 # clean up temp dir
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
337 if os.path.exists( tmp_index_dir ):
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
338 shutil.rmtree( tmp_index_dir )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
339 if os.path.exists( tmp_dir ):
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
340 shutil.rmtree( tmp_dir )
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
341
ffa8aaa14f7c Uploaded initial tarball.
devteam
parents:
diff changeset
342 if __name__=="__main__": __main__()