Mercurial > repos > bjoern-gruening > bismark
comparison bismark_wrapper.py @ 0:36d124f44c0a draft
inital commit
author | bjoern-gruening |
---|---|
date | Tue, 25 Dec 2012 05:45:46 -0500 |
parents | |
children | 427fb56f2e41 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:36d124f44c0a |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import argparse, os, shutil, subprocess, sys, tempfile, fileinput | |
4 import fileinput | |
5 from glob import glob | |
6 | |
7 def stop_err( msg ): | |
8 sys.stderr.write( "%s\n" % msg ) | |
9 sys.exit() | |
10 | |
11 def __main__(): | |
12 #Parse Command Line | |
13 parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.') | |
14 parser.add_argument( '-p', '--num-threads', dest='num_threads', | |
15 type=int, default=4, help='Use this many threads to align reads. The default is 4.' ) | |
16 | |
17 parser.add_argument( '--bismark_path', dest='bismark_path', help='Path to the bismark perl scripts' ) | |
18 | |
19 parser.add_argument( '--bowtie2', action='store_true', default=False, help='Running bismark with bowtie2 and not with bowtie.' ) | |
20 | |
21 # input options | |
22 parser.add_argument( '--own-file', dest='own_file', help='' ) | |
23 parser.add_argument( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' ) | |
24 parser.add_argument( '-O', '--output', dest='output' ) | |
25 parser.add_argument( '--output-report-file', dest='output_report_file' ) | |
26 parser.add_argument( '--suppress-header', dest='suppress_header', action="store_true" ) | |
27 | |
28 parser.add_argument( '--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired', default=False) | |
29 | |
30 | |
31 parser.add_argument( '-1', '--mate1', dest='mate1', | |
32 help='The forward reads file in Sanger FASTQ or FASTA format.' ) | |
33 parser.add_argument( '-2', '--mate2', dest='mate2', | |
34 help='The reverse reads file in Sanger FASTQ or FASTA format.' ) | |
35 | |
36 parser.add_argument( '--output-unmapped-reads', dest='output_unmapped_reads', | |
37 help='Additional output file with unmapped reads (single-end).' ) | |
38 parser.add_argument( '--output-unmapped-reads-l', dest='output_unmapped_reads_l', | |
39 help='File name for unmapped reads (left, paired-end).' ) | |
40 parser.add_argument( '--output-unmapped-reads-r', dest='output_unmapped_reads_r', | |
41 help='File name for unmapped reads (right, paired-end).' ) | |
42 | |
43 | |
44 parser.add_argument( '--output-suppressed-reads', dest='output_suppressed_reads', | |
45 help='Additional output file with suppressed reads (single-end).' ) | |
46 parser.add_argument( '--output-suppressed-reads-l', dest='output_suppressed_reads_l', | |
47 help='File name for suppressed reads (left, paired-end).' ) | |
48 parser.add_argument( '--output-suppressed-reads-r', dest='output_suppressed_reads_r', | |
49 help='File name for suppressed reads (right, paired-end).' ) | |
50 | |
51 | |
52 parser.add_argument( '--single-paired', dest='single_paired', | |
53 help='The single-end reads file in Sanger FASTQ or FASTA format.' ) | |
54 | |
55 parser.add_argument( '--fastq', action='store_true', help='Query filetype is in FASTQ format') | |
56 parser.add_argument( '--fasta', action='store_true', help='Query filetype is in FASTA format') | |
57 parser.add_argument( '--phred64-quals', dest='phred64', action="store_true" ) | |
58 | |
59 | |
60 parser.add_argument( '--skip-reads', dest='skip_reads', type=int ) | |
61 parser.add_argument( '--qupto', type=int) | |
62 | |
63 | |
64 # paired end options | |
65 parser.add_argument( '-I', '--minins', dest='min_insert' ) | |
66 parser.add_argument( '-X', '--maxins', dest='max_insert' ) | |
67 parser.add_argument( '--no-mixed', dest='no_mixed', action="store_true" ) | |
68 parser.add_argument( '--no-discordant', dest='no_discordant', action="store_true" ) | |
69 | |
70 #parse general options | |
71 # default 20 | |
72 parser.add_argument( '--seed-len', dest='seed_len', type=int) | |
73 # default 15 | |
74 parser.add_argument( '--seed-extention-attempts', dest='seed_extention_attempts', type=int ) | |
75 # default 0 | |
76 parser.add_argument( '--seed-mismatches', dest='seed_mismatches', type=int ) | |
77 # default 2 | |
78 parser.add_argument( '--max-reseed', dest='max_reseed', type=int ) | |
79 """ | |
80 # default 70 | |
81 parser.add_argument( '--maqerr', dest='maqerr', type=int ) | |
82 """ | |
83 | |
84 """ | |
85 The number of megabytes of memory a given thread is given to store path | |
86 descriptors in --best mode. Best-first search must keep track of many paths | |
87 at once to ensure it is always extending the path with the lowest cumulative | |
88 cost. Bowtie tries to minimize the memory impact of the descriptors, but | |
89 they can still grow very large in some cases. If you receive an error message | |
90 saying that chunk memory has been exhausted in --best mode, try adjusting | |
91 this parameter up to dedicate more memory to the descriptors. Default: 512. | |
92 """ | |
93 parser.add_argument( '--chunkmbs', type=int, default=512 ) | |
94 | |
95 args = parser.parse_args() | |
96 | |
97 # Create bismark index if necessary. | |
98 index_dir = "" | |
99 if args.own_file: | |
100 """ | |
101 Create a temporary index with the offered files from the user. | |
102 Utilizing the script: bismark_genome_preparation | |
103 bismark_genome_preparation --bowtie2 hg19/ | |
104 """ | |
105 tmp_index_dir = tempfile.mkdtemp() | |
106 index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( args.own_file )[1].split( '.' )[:-1] ) ) | |
107 try: | |
108 """ | |
109 Create a hard link pointing to args.own_file named 'index_path'.fa. | |
110 """ | |
111 os.symlink( args.own_file, index_path + '.fa' ) | |
112 except Exception, e: | |
113 if os.path.exists( tmp_index_dir ): | |
114 shutil.rmtree( tmp_index_dir ) | |
115 stop_err( 'Error in linking the reference database.\n' + str( e ) ) | |
116 # bismark_genome_preparation needs the complete path to the folder in which the database is stored | |
117 if args.bowtie2: | |
118 cmd_index = 'bismark_genome_preparation --bowtie2 %s ' % ( tmp_index_dir ) | |
119 else: | |
120 cmd_index = 'bismark_genome_preparation %s ' % ( tmp_index_dir ) | |
121 if args.bismark_path: | |
122 # add the path to the bismark perl scripts, that is needed for galaxy | |
123 cmd_index = '%s/%s' % (args.bismark_path, cmd_index) | |
124 try: | |
125 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name | |
126 tmp_stderr = open( tmp, 'wb' ) | |
127 proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stdout=open(os.devnull, 'wb'), stderr=tmp_stderr.fileno() ) | |
128 returncode = proc.wait() | |
129 tmp_stderr.close() | |
130 # get stderr, allowing for case where it's very large | |
131 tmp_stderr = open( tmp, 'rb' ) | |
132 stderr = '' | |
133 buffsize = 1048576 | |
134 try: | |
135 while True: | |
136 stderr += tmp_stderr.read( buffsize ) | |
137 if not stderr or len( stderr ) % buffsize != 0: | |
138 break | |
139 except OverflowError: | |
140 pass | |
141 tmp_stderr.close() | |
142 if returncode != 0: | |
143 raise Exception, stderr | |
144 except Exception, e: | |
145 if os.path.exists( tmp_index_dir ): | |
146 shutil.rmtree( tmp_index_dir ) | |
147 stop_err( 'Error indexing reference sequence\n' + str( e ) ) | |
148 index_dir = tmp_index_dir | |
149 else: | |
150 index_dir = args.index_path | |
151 | |
152 # Build bismark command | |
153 tmp_bismark_dir = tempfile.mkdtemp() | |
154 output_dir = os.path.join( tmp_bismark_dir, 'results') | |
155 cmd = 'bismark %(args)s --temp_dir %(tmp_bismark_dir)s -o %(output_dir)s --quiet %(genome_folder)s %(reads)s' | |
156 if args.bismark_path: | |
157 # add the path to the bismark perl scripts, that is needed for galaxy | |
158 cmd = '%s/%s' % (args.bismark_path, cmd) | |
159 | |
160 arguments = { | |
161 'genome_folder': index_dir, | |
162 'args': '', | |
163 'tmp_bismark_dir': tmp_bismark_dir, | |
164 'output_dir': output_dir, | |
165 } | |
166 | |
167 additional_opts = '' | |
168 # Set up the reads | |
169 if args.mate_paired: | |
170 # paired-end reads library | |
171 reads = '-1 %s ' % ( args.mate1 ) | |
172 reads += ' -2 %s ' % ( args.mate2 ) | |
173 additional_opts += ' -I %s -X %s ' % (args.min_insert, args.max_insert) | |
174 else: | |
175 # single paired reads library | |
176 reads = ' %s ' % ( args.single_paired ) | |
177 | |
178 | |
179 if not args.bowtie2: | |
180 # use bowtie specific options | |
181 additional_opts += ' --best ' | |
182 if args.seed_mismatches: | |
183 # --seedmms | |
184 additional_opts += ' -n %s ' % args.seed_mismatches | |
185 if args.seed_len: | |
186 # --seedlen | |
187 additional_opts += ' -l %s ' % args.seed_len | |
188 | |
189 # alignment options | |
190 if args.bowtie2: | |
191 additional_opts += ' -p %s --bowtie2 ' % args.num_threads | |
192 if args.seed_mismatches: | |
193 additional_opts += ' -N %s ' % args.seed_mismatches | |
194 if args.seed_len: | |
195 additional_opts += ' -L %s ' % args.seed_len | |
196 if args.seed_extention_attempts: | |
197 additional_opts += ' -D %s ' % args.seed_extention_attempts | |
198 if args.max_reseed: | |
199 additional_opts += ' -R %s ' % args.max_reseed | |
200 if args.no_discordant: | |
201 additional_opts += ' --no-discordant ' | |
202 if args.no_mixed: | |
203 additional_opts += ' --no-mixed ' | |
204 """ | |
205 if args.maqerr: | |
206 additional_opts += ' --maqerr %s ' % args.maqerr | |
207 """ | |
208 if args.skip_reads: | |
209 additional_opts += ' --skip %s ' % args.skip_reads | |
210 if args.qupto: | |
211 additional_opts += ' --qupto %s ' % args.qupto | |
212 if args.phred64: | |
213 additional_opts += ' --phred64-quals ' | |
214 if args.suppress_header: | |
215 additional_opts += ' --sam-no-hd ' | |
216 if args.output_unmapped_reads or ( args.output_unmapped_reads_l and args.output_unmapped_reads_r): | |
217 additional_opts += ' --un ' | |
218 if args.output_suppressed_reads or ( args.output_suppressed_reads_l and args.output_suppressed_reads_r): | |
219 additional_opts += ' --ambiguous ' | |
220 | |
221 arguments.update( {'args': additional_opts, 'reads': reads} ) | |
222 | |
223 # Final command: | |
224 cmd = cmd % arguments | |
225 | |
226 # Run | |
227 try: | |
228 tmp_out = tempfile.NamedTemporaryFile().name | |
229 tmp_stdout = open( tmp_out, 'wb' ) | |
230 tmp_err = tempfile.NamedTemporaryFile().name | |
231 tmp_stderr = open( tmp_err, 'wb' ) | |
232 proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) | |
233 returncode = proc.wait() | |
234 tmp_stderr.close() | |
235 # get stderr, allowing for case where it's very large | |
236 tmp_stderr = open( tmp_err, 'rb' ) | |
237 stderr = '' | |
238 buffsize = 1048576 | |
239 try: | |
240 while True: | |
241 stderr += tmp_stderr.read( buffsize ) | |
242 if not stderr or len( stderr ) % buffsize != 0: | |
243 break | |
244 except OverflowError: | |
245 pass | |
246 tmp_stdout.close() | |
247 tmp_stderr.close() | |
248 if returncode != 0: | |
249 raise Exception, stderr | |
250 | |
251 # TODO: look for errors in program output. | |
252 except Exception, e: | |
253 stop_err( 'Error in bismark:\n' + str( e ) ) | |
254 | |
255 | |
256 # collect and copy output files | |
257 """ | |
258 if args.output_report_file: | |
259 output_report_file = open(args.output_report_file, 'w+') | |
260 for line in fileinput.input(glob( os.path.join( output_dir, '*.txt') )): | |
261 output_report_file.write(line) | |
262 output_report_file.close() | |
263 """ | |
264 | |
265 if args.output_suppressed_reads: | |
266 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads ) | |
267 if args.output_suppressed_reads_l: | |
268 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_1.txt'))[0], args.output_suppressed_reads_l ) | |
269 if args.output_suppressed_reads_r: | |
270 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_2.txt'))[0], args.output_suppressed_reads_r ) | |
271 | |
272 if args.output_unmapped_reads: | |
273 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads.txt'))[0], args.output_unmapped_reads ) | |
274 if args.output_unmapped_reads_l: | |
275 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_1.txt'))[0], args.output_unmapped_reads_l ) | |
276 if args.output_unmapped_reads_r: | |
277 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r ) | |
278 | |
279 shutil.move( glob( os.path.join( output_dir, '*.sam'))[0] , args.output) | |
280 | |
281 # Clean up temp dirs | |
282 if args.own_file: | |
283 if os.path.exists( tmp_index_dir ): | |
284 shutil.rmtree( tmp_index_dir ) | |
285 if os.path.exists( tmp_bismark_dir ): | |
286 shutil.rmtree( tmp_bismark_dir ) | |
287 | |
288 if __name__=="__main__": __main__() |