Mercurial > repos > bgruening > bismark
comparison bismark_wrapper.py @ 0:62c6da72dd4a draft
Uploaded
author | bgruening |
---|---|
date | Sat, 06 Jul 2013 09:57:36 -0400 |
parents | |
children | 82814a8a2395 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:62c6da72dd4a |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import argparse | |
4 import os | |
5 import shutil | |
6 import subprocess | |
7 import sys | |
8 import shlex | |
9 import tempfile | |
10 import fileinput | |
11 import fileinput | |
12 from glob import glob | |
13 | |
14 def stop_err( msg ): | |
15 sys.stderr.write( "%s\n" % msg ) | |
16 sys.exit() | |
17 | |
18 def __main__(): | |
19 | |
20 print 'tempfile_location',tempfile.gettempdir() | |
21 #Parse Command Line | |
22 parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.') | |
23 parser.add_argument( '-p', '--num-threads', dest='num_threads', | |
24 type=int, default=4, help='Use this many threads to align reads. The default is 4.' ) | |
25 | |
26 parser.add_argument( '--bismark_path', dest='bismark_path', help='Path to the bismark perl scripts' ) | |
27 | |
28 parser.add_argument( '--bowtie2', action='store_true', default=False, help='Running bismark with bowtie2 and not with bowtie.' ) | |
29 | |
30 # input options | |
31 parser.add_argument( '--own-file', dest='own_file', help='' ) | |
32 parser.add_argument( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' ) | |
33 parser.add_argument( '-O', '--output', dest='output' ) | |
34 | |
35 | |
36 parser.add_argument( '--output-report-file', dest='output_report_file' ) | |
37 parser.add_argument( '--suppress-header', dest='suppress_header', action="store_true" ) | |
38 | |
39 parser.add_argument( '--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired', default=False) | |
40 | |
41 | |
42 parser.add_argument( '-1', '--mate1', dest='mate1', | |
43 help='The forward reads file in Sanger FASTQ or FASTA format.' ) | |
44 parser.add_argument( '-2', '--mate2', dest='mate2', | |
45 help='The reverse reads file in Sanger FASTQ or FASTA format.' ) | |
46 | |
47 parser.add_argument( '--output-unmapped-reads', dest='output_unmapped_reads', | |
48 help='Additional output file with unmapped reads (single-end).' ) | |
49 parser.add_argument( '--output-unmapped-reads-l', dest='output_unmapped_reads_l', | |
50 help='File name for unmapped reads (left, paired-end).' ) | |
51 parser.add_argument( '--output-unmapped-reads-r', dest='output_unmapped_reads_r', | |
52 help='File name for unmapped reads (right, paired-end).' ) | |
53 | |
54 | |
55 parser.add_argument( '--output-suppressed-reads', dest='output_suppressed_reads', | |
56 help='Additional output file with suppressed reads (single-end).' ) | |
57 parser.add_argument( '--output-suppressed-reads-l', dest='output_suppressed_reads_l', | |
58 help='File name for suppressed reads (left, paired-end).' ) | |
59 parser.add_argument( '--output-suppressed-reads-r', dest='output_suppressed_reads_r', | |
60 help='File name for suppressed reads (right, paired-end).' ) | |
61 parser.add_argument( '--stdout', dest='output_stdout', | |
62 help='File name for the standard output of bismark.' ) | |
63 | |
64 | |
65 parser.add_argument( '--single-paired', dest='single_paired', | |
66 help='The single-end reads file in Sanger FASTQ or FASTA format.' ) | |
67 | |
68 parser.add_argument( '--fastq', action='store_true', help='Query filetype is in FASTQ format') | |
69 parser.add_argument( '--fasta', action='store_true', help='Query filetype is in FASTA format') | |
70 parser.add_argument( '--phred64-quals', dest='phred64', action="store_true" ) | |
71 | |
72 | |
73 parser.add_argument( '--skip-reads', dest='skip_reads', type=int ) | |
74 parser.add_argument( '--qupto', type=int) | |
75 | |
76 | |
77 # paired end options | |
78 parser.add_argument( '-I', '--minins', dest='min_insert' ) | |
79 parser.add_argument( '-X', '--maxins', dest='max_insert' ) | |
80 parser.add_argument( '--no-mixed', dest='no_mixed', action="store_true" ) | |
81 parser.add_argument( '--no-discordant', dest='no_discordant', action="store_true" ) | |
82 | |
83 #parse general options | |
84 # default 20 | |
85 parser.add_argument( '--seed-len', dest='seed_len', type=int) | |
86 # default 15 | |
87 parser.add_argument( '--seed-extention-attempts', dest='seed_extention_attempts', type=int ) | |
88 # default 0 | |
89 parser.add_argument( '--seed-mismatches', dest='seed_mismatches', type=int ) | |
90 # default 2 | |
91 parser.add_argument( '--max-reseed', dest='max_reseed', type=int ) | |
92 """ | |
93 # default 70 | |
94 parser.add_argument( '--maqerr', dest='maqerr', type=int ) | |
95 """ | |
96 | |
97 """ | |
98 The number of megabytes of memory a given thread is given to store path | |
99 descriptors in --best mode. Best-first search must keep track of many paths | |
100 at once to ensure it is always extending the path with the lowest cumulative | |
101 cost. Bowtie tries to minimize the memory impact of the descriptors, but | |
102 they can still grow very large in some cases. If you receive an error message | |
103 saying that chunk memory has been exhausted in --best mode, try adjusting | |
104 this parameter up to dedicate more memory to the descriptors. Default: 512. | |
105 """ | |
106 parser.add_argument( '--chunkmbs', type=int, default=512 ) | |
107 | |
108 args = parser.parse_args() | |
109 | |
110 # Create bismark index if necessary. | |
111 index_dir = "" | |
112 if args.own_file: | |
113 """ | |
114 Create a temporary index with the offered files from the user. | |
115 Utilizing the script: bismark_genome_preparation | |
116 bismark_genome_preparation --bowtie2 hg19/ | |
117 """ | |
118 tmp_index_dir = tempfile.mkdtemp() | |
119 index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( args.own_file )[1].split( '.' )[:-1] ) ) | |
120 try: | |
121 """ | |
122 Create a hard link pointing to args.own_file named 'index_path'.fa. | |
123 """ | |
124 os.symlink( args.own_file, index_path + '.fa' ) | |
125 except Exception, e: | |
126 if os.path.exists( tmp_index_dir ): | |
127 shutil.rmtree( tmp_index_dir ) | |
128 stop_err( 'Error in linking the reference database.\n' + str( e ) ) | |
129 # bismark_genome_preparation needs the complete path to the folder in which the database is stored | |
130 if args.bowtie2: | |
131 cmd_index = 'bismark_genome_preparation --bowtie2 %s ' % ( tmp_index_dir ) | |
132 else: | |
133 cmd_index = 'bismark_genome_preparation %s ' % ( tmp_index_dir ) | |
134 if args.bismark_path: | |
135 if os.path.exists(args.bismark_path): | |
136 # add the path to the bismark perl scripts, that is needed for galaxy | |
137 cmd_index = os.path.join(args.bismark_path, cmd_index) | |
138 else: | |
139 # assume the same directory as that script | |
140 cmd_index = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd_index) | |
141 try: | |
142 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name | |
143 tmp_stderr = open( tmp, 'wb' ) | |
144 proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stdout=open(os.devnull, 'wb'), stderr=tmp_stderr.fileno() ) | |
145 returncode = proc.wait() | |
146 tmp_stderr.close() | |
147 # get stderr, allowing for case where it's very large | |
148 tmp_stderr = open( tmp, 'rb' ) | |
149 stderr = '' | |
150 buffsize = 1048576 | |
151 try: | |
152 while True: | |
153 stderr += tmp_stderr.read( buffsize ) | |
154 if not stderr or len( stderr ) % buffsize != 0: | |
155 break | |
156 except OverflowError: | |
157 pass | |
158 tmp_stderr.close() | |
159 if returncode != 0: | |
160 raise Exception, stderr | |
161 except Exception, e: | |
162 if os.path.exists( tmp_index_dir ): | |
163 shutil.rmtree( tmp_index_dir ) | |
164 stop_err( 'Error indexing reference sequence\n' + str( e ) ) | |
165 index_dir = tmp_index_dir | |
166 else: | |
167 # bowtie path is the path to the index directory and the first path of the index file name | |
168 index_dir = os.path.dirname( args.index_path ) | |
169 | |
170 # Build bismark command | |
171 | |
172 """ | |
173 Bismark requires a large amount of temporary disc space. If that is not available, for example on a cluster you can hardcode the | |
174 TMP to some larger space. It's not recommended but it works. | |
175 """ | |
176 #tmp_bismark_dir = tempfile.mkdtemp( dir='/data/0/galaxy_db/tmp/' ) | |
177 tmp_bismark_dir = tempfile.mkdtemp() | |
178 output_dir = os.path.join( tmp_bismark_dir, 'results') | |
179 cmd = 'bismark %(args)s --bam --temp_dir %(tmp_bismark_dir)s -o %(output_dir)s --quiet %(genome_folder)s %(reads)s' | |
180 | |
181 if args.fasta: | |
182 # he query input files (specified as mate1,mate2 or singles) are FastA | |
183 cmd = '%s %s' % (cmd, '--fasta') | |
184 elif args.fastq: | |
185 cmd = '%s %s' % (cmd, '--fastq') | |
186 | |
187 if args.bismark_path: | |
188 # add the path to the bismark perl scripts, that is needed for galaxy | |
189 if os.path.exists(args.bismark_path): | |
190 cmd = os.path.join(args.bismark_path, cmd) | |
191 else: | |
192 # assume the same directory as that script | |
193 cmd = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd) | |
194 | |
195 arguments = { | |
196 'genome_folder': index_dir, | |
197 'args': '', | |
198 'tmp_bismark_dir': tmp_bismark_dir, | |
199 'output_dir': output_dir, | |
200 } | |
201 | |
202 additional_opts = '' | |
203 # Set up the reads | |
204 if args.mate_paired: | |
205 # paired-end reads library | |
206 reads = '-1 %s ' % ( args.mate1 ) | |
207 reads += ' -2 %s ' % ( args.mate2 ) | |
208 additional_opts += ' -I %s -X %s ' % (args.min_insert, args.max_insert) | |
209 else: | |
210 # single paired reads library | |
211 reads = ' %s ' % ( args.single_paired ) | |
212 | |
213 | |
214 if not args.bowtie2: | |
215 # use bowtie specific options | |
216 #additional_opts += ' --best ' # bug in bismark, --best is not available as option. Only --non-best, best-mode is activated by default | |
217 if args.seed_mismatches: | |
218 # --seedmms | |
219 additional_opts += ' -n %s ' % args.seed_mismatches | |
220 if args.seed_len: | |
221 # --seedlen | |
222 additional_opts += ' -l %s ' % args.seed_len | |
223 | |
224 # alignment options | |
225 if args.bowtie2: | |
226 additional_opts += ' -p %s --bowtie2 ' % args.num_threads | |
227 if args.seed_mismatches: | |
228 additional_opts += ' -N %s ' % args.seed_mismatches | |
229 if args.seed_len: | |
230 additional_opts += ' -L %s ' % args.seed_len | |
231 if args.seed_extention_attempts: | |
232 additional_opts += ' -D %s ' % args.seed_extention_attempts | |
233 if args.max_reseed: | |
234 additional_opts += ' -R %s ' % args.max_reseed | |
235 if args.no_discordant: | |
236 additional_opts += ' --no-discordant ' | |
237 if args.no_mixed: | |
238 additional_opts += ' --no-mixed ' | |
239 """ | |
240 if args.maqerr: | |
241 additional_opts += ' --maqerr %s ' % args.maqerr | |
242 """ | |
243 if args.skip_reads: | |
244 additional_opts += ' --skip %s ' % args.skip_reads | |
245 if args.qupto: | |
246 additional_opts += ' --qupto %s ' % args.qupto | |
247 if args.phred64: | |
248 additional_opts += ' --phred64-quals ' | |
249 if args.suppress_header: | |
250 additional_opts += ' --sam-no-hd ' | |
251 if args.output_unmapped_reads or ( args.output_unmapped_reads_l and args.output_unmapped_reads_r): | |
252 additional_opts += ' --un ' | |
253 if args.output_suppressed_reads or ( args.output_suppressed_reads_l and args.output_suppressed_reads_r): | |
254 additional_opts += ' --ambiguous ' | |
255 | |
256 arguments.update( {'args': additional_opts, 'reads': reads} ) | |
257 | |
258 # Final bismark command: | |
259 cmd = cmd % arguments | |
260 print 'bismark_cmd:', cmd | |
261 #sys.stderr.write( cmd ) | |
262 #sys.exit(1) | |
263 # Run | |
264 try: | |
265 tmp_out = tempfile.NamedTemporaryFile().name | |
266 tmp_stdout = open( tmp_out, 'wb' ) | |
267 tmp_err = tempfile.NamedTemporaryFile().name | |
268 tmp_stderr = open( tmp_err, 'wb' ) | |
269 proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) | |
270 returncode = proc.wait() | |
271 | |
272 if returncode != 0: | |
273 tmp_stdout.close() | |
274 tmp_stderr.close() | |
275 # get stderr, allowing for case where it's very large | |
276 tmp_stderr = open( tmp_err, 'rb' ) | |
277 stderr = '' | |
278 buffsize = 1048576 | |
279 try: | |
280 while True: | |
281 stderr += tmp_stderr.read( buffsize ) | |
282 if not stderr or len( stderr ) % buffsize != 0: | |
283 break | |
284 except OverflowError: | |
285 pass | |
286 | |
287 raise Exception, stderr | |
288 | |
289 tmp_stdout.close() | |
290 tmp_stderr.close() | |
291 | |
292 # TODO: look for errors in program output. | |
293 except Exception, e: | |
294 stop_err( 'Error in bismark:\n' + str( e ) ) | |
295 | |
296 # collect and copy output files | |
297 if args.output_report_file: | |
298 output_report_file = open(args.output_report_file, 'w+') | |
299 for line in fileinput.input(glob( os.path.join( output_dir, '*report.txt') )): | |
300 output_report_file.write(line) | |
301 output_report_file.close() | |
302 | |
303 | |
304 if args.output_suppressed_reads: | |
305 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads ) | |
306 if args.output_suppressed_reads_l: | |
307 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_1.txt'))[0], args.output_suppressed_reads_l ) | |
308 if args.output_suppressed_reads_r: | |
309 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_2.txt'))[0], args.output_suppressed_reads_r ) | |
310 | |
311 if args.output_unmapped_reads: | |
312 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads.txt'))[0], args.output_unmapped_reads ) | |
313 if args.output_unmapped_reads_l: | |
314 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_1.txt'))[0], args.output_unmapped_reads_l ) | |
315 if args.output_unmapped_reads_r: | |
316 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r ) | |
317 | |
318 try: | |
319 """ | |
320 merge all bam files | |
321 """ | |
322 #tmp_out = tempfile.NamedTemporaryFile( dir=output_dir ).name | |
323 tmp_stdout = open( tmp_out, 'wab' ) | |
324 tmp_err = tempfile.NamedTemporaryFile( dir=output_dir ).name | |
325 tmp_stderr = open( tmp_err, 'wb' ) | |
326 | |
327 tmp_res = tempfile.NamedTemporaryFile( dir= output_dir).name | |
328 | |
329 bam_files = glob( os.path.join( output_dir, '*.bam') ) | |
330 if len( bam_files ) > 1: | |
331 cmd = 'samtools merge -@ %s -f - %s ' % ( args.num_threads, ' '.join( bam_files ) ) | |
332 | |
333 p1 = subprocess.Popen( args=shlex.split( cmd ), stdout=subprocess.PIPE ) | |
334 proc = subprocess.Popen( ['samtools', 'sort', '-@', str(args.num_threads), '-', tmp_res], stdin=p1.stdout, stdout=tmp_stdout, stderr=tmp_stderr ) | |
335 else: | |
336 proc = subprocess.Popen( ['samtools', 'sort', '-@', str(args.num_threads), bam_files[0], tmp_res], stdout=tmp_stdout, stderr=tmp_stderr ) | |
337 | |
338 returncode = proc.wait() | |
339 tmp_stdout.close() | |
340 tmp_stderr.close() | |
341 if returncode != 0: | |
342 raise Exception, open( tmp_stderr.name ).read() | |
343 | |
344 bam_path = "%s.bam" % tmp_res | |
345 if os.path.exists( bam_path ): | |
346 shutil.copy( bam_path, args.output ) | |
347 else: | |
348 stop_err( 'BAM file no found:\n' + str( bam_path ) ) | |
349 | |
350 # TODO: look for errors in program output. | |
351 except Exception, e: | |
352 stop_err( 'Error in merging bam files:\n' + str( e ) ) | |
353 | |
354 | |
355 if args.output_stdout: | |
356 # copy the temporary saved stdout from bismark | |
357 shutil.move( tmp_out, args.output_stdout ) | |
358 | |
359 # Clean up temp dirs | |
360 if args.own_file: | |
361 if os.path.exists( tmp_index_dir ): | |
362 shutil.rmtree( tmp_index_dir ) | |
363 if os.path.exists( tmp_bismark_dir ): | |
364 shutil.rmtree( tmp_bismark_dir ) | |
365 if os.path.exists( output_dir ): | |
366 shutil.rmtree( output_dir ) | |
367 | |
368 if __name__=="__main__": __main__() |