Mercurial > repos > bgruening > bismark
comparison bismark_wrapper.py @ 8:9bfe38410155 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit 51299fa62f0566a4a897b1c149db564631282fff
author | bgruening |
---|---|
date | Wed, 22 Aug 2018 08:09:42 -0400 |
parents | |
children | 7bffcb6fc81d |
comparison
equal
deleted
inserted
replaced
7:fcadce4d9a06 | 8:9bfe38410155 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import argparse | |
4 import fileinput | |
5 import logging | |
6 import math | |
7 import os | |
8 import shutil | |
9 import subprocess | |
10 import sys | |
11 import tempfile | |
12 from glob import glob | |
13 | |
14 | |
15 def stop_err(logger, msg): | |
16 logger.critical(msg) | |
17 sys.exit(1) | |
18 | |
19 | |
20 def log_subprocess_output(logger, pipe): | |
21 for line in iter(pipe.readline, b''): | |
22 logger.debug(line.decode().rstrip()) | |
23 | |
24 | |
25 def __main__(): | |
26 # Parse Command Line | |
27 parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.') | |
28 parser.add_argument('-p', '--num-threads', dest='num_threads', | |
29 type=int, default=4, help='Use this many threads to align reads. The default is 4.') | |
30 | |
31 # input options | |
32 parser.add_argument('--own-file', dest='own_file', help='') | |
33 parser.add_argument('-D', '--indexes-path', dest='index_path', | |
34 help='Indexes directory; location of .ebwt and .fa files.') | |
35 parser.add_argument('-O', '--output', dest='output') | |
36 | |
37 parser.add_argument('--output-report-file', dest='output_report_file') | |
38 parser.add_argument('--suppress-header', dest='suppress_header', action="store_true") | |
39 | |
40 parser.add_argument('--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired', | |
41 default=False) | |
42 | |
43 parser.add_argument('-1', '--mate1', dest='mate1', | |
44 help='The forward reads file in Sanger FASTQ or FASTA format.') | |
45 parser.add_argument('-2', '--mate2', dest='mate2', | |
46 help='The reverse reads file in Sanger FASTQ or FASTA format.') | |
47 parser.add_argument('--sort-bam', dest='sort_bam', action="store_true") | |
48 | |
49 parser.add_argument('--output-unmapped-reads', dest='output_unmapped_reads', | |
50 help='Additional output file with unmapped reads (single-end).') | |
51 parser.add_argument('--output-unmapped-reads-l', dest='output_unmapped_reads_l', | |
52 help='File name for unmapped reads (left, paired-end).') | |
53 parser.add_argument('--output-unmapped-reads-r', dest='output_unmapped_reads_r', | |
54 help='File name for unmapped reads (right, paired-end).') | |
55 | |
56 parser.add_argument('--output-suppressed-reads', dest='output_suppressed_reads', | |
57 help='Additional output file with suppressed reads (single-end).') | |
58 parser.add_argument('--output-suppressed-reads-l', dest='output_suppressed_reads_l', | |
59 help='File name for suppressed reads (left, paired-end).') | |
60 parser.add_argument('--output-suppressed-reads-r', dest='output_suppressed_reads_r', | |
61 help='File name for suppressed reads (right, paired-end).') | |
62 parser.add_argument('--stdout', dest='output_stdout', | |
63 help='File name for the standard output of bismark.') | |
64 | |
65 parser.add_argument('--single-paired', dest='single_paired', | |
66 help='The single-end reads file in Sanger FASTQ or FASTA format.') | |
67 | |
68 parser.add_argument('--fastq', action='store_true', help='Query filetype is in FASTQ format') | |
69 parser.add_argument('--fasta', action='store_true', help='Query filetype is in FASTA format') | |
70 parser.add_argument('--phred64-quals', dest='phred64', action="store_true") | |
71 | |
72 parser.add_argument('--skip-reads', dest='skip_reads', type=int) | |
73 parser.add_argument('--qupto', type=int) | |
74 | |
75 # paired end options | |
76 parser.add_argument('-I', '--minins', dest='min_insert') | |
77 parser.add_argument('-X', '--maxins', dest='max_insert') | |
78 parser.add_argument('--no-mixed', dest='no_mixed', action="store_true") | |
79 parser.add_argument('--no-discordant', dest='no_discordant', action="store_true") | |
80 | |
81 # parse general options | |
82 # default 20 | |
83 parser.add_argument('--seed-len', dest='seed_len', type=int) | |
84 # default 15 | |
85 parser.add_argument('--seed-extention-attempts', dest='seed_extention_attempts', type=int) | |
86 # default 0 | |
87 parser.add_argument('--seed-mismatches', dest='seed_mismatches', type=int) | |
88 # default 2 | |
89 parser.add_argument('--max-reseed', dest='max_reseed', type=int) | |
90 """ | |
91 # default 70 | |
92 parser.add_argument( '--maqerr', dest='maqerr', type=int ) | |
93 """ | |
94 | |
95 """ | |
96 The number of megabytes of memory a given thread is given to store path | |
97 descriptors in --best mode. Best-first search must keep track of many paths | |
98 at once to ensure it is always extending the path with the lowest cumulative | |
99 cost. Bowtie tries to minimize the memory impact of the descriptors, but | |
100 they can still grow very large in some cases. If you receive an error message | |
101 saying that chunk memory has been exhausted in --best mode, try adjusting | |
102 this parameter up to dedicate more memory to the descriptors. Default: 512. | |
103 """ | |
104 parser.add_argument('--chunkmbs', type=int, default=512) | |
105 | |
106 args = parser.parse_args() | |
107 | |
108 logger = logging.getLogger('bismark_wrapper') | |
109 logger.setLevel(logging.DEBUG) | |
110 ch = logging.StreamHandler(sys.stdout) | |
111 if args.output_stdout: | |
112 ch.setLevel(logging.WARNING) | |
113 handler = logging.FileHandler(args.output_stdout) | |
114 handler.setLevel(logging.DEBUG) | |
115 logger.addHandler(handler) | |
116 else: | |
117 ch.setLevel(logging.DEBUG) | |
118 logger.addHandler(ch) | |
119 | |
120 # Create bismark index if necessary. | |
121 index_dir = "" | |
122 tmp_index_dir = None | |
123 if args.own_file: | |
124 logger.info("Create a temporary index with the offered files from the user. " | |
125 "Utilizing the script: bismark_genome_preparation") | |
126 tmp_index_dir = tempfile.mkdtemp() | |
127 index_path = os.path.join(tmp_index_dir, '.'.join(os.path.split(args.own_file)[1].split('.')[:-1])) | |
128 try: | |
129 # Create a hard link pointing to args.own_file named 'index_path'.fa. | |
130 os.symlink(args.own_file, index_path + '.fa') | |
131 except Exception as e: | |
132 if os.path.exists(tmp_index_dir): | |
133 shutil.rmtree(tmp_index_dir) | |
134 stop_err(logger, 'Error in linking the reference database!\n%s' % e) | |
135 # bismark_genome_preparation needs the complete path to the folder in which the database is stored | |
136 cmd_index = ['bismark_genome_preparation', '--bowtie2', tmp_index_dir] | |
137 try: | |
138 logger.info("Generating index with: '%s'", " ".join(cmd_index)) | |
139 process = subprocess.Popen(cmd_index, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) | |
140 with process.stdout: | |
141 log_subprocess_output(logger, process.stdout) | |
142 exitcode = process.wait() | |
143 if exitcode != 0: | |
144 raise Exception(process.stderr) | |
145 except Exception as e: | |
146 if os.path.exists(tmp_index_dir): | |
147 shutil.rmtree(tmp_index_dir) | |
148 stop_err(logger, 'Error indexing reference sequence!\n%s' % e) | |
149 index_dir = tmp_index_dir | |
150 else: | |
151 # bowtie path is the path to the index directory and the first path of the index file name | |
152 index_dir = os.path.dirname(args.index_path) | |
153 | |
154 # Build bismark command | |
155 tmp_bismark_dir = tempfile.mkdtemp() | |
156 output_dir = os.path.join(tmp_bismark_dir, 'results') | |
157 cmd = ['bismark', '--bam', '--gzip', '--temp_dir', tmp_bismark_dir, '-o', output_dir, '--quiet'] | |
158 | |
159 if args.fasta: | |
160 # the query input files (specified as mate1,mate2 or singles) are FastA | |
161 cmd.append('--fasta') | |
162 elif args.fastq: | |
163 cmd.append('--fastq') | |
164 | |
165 # alignment options | |
166 if args.num_threads > 2: | |
167 # divide num_threads by 2 here since bismark will spawn 2 jobs with -p threads each | |
168 cmd.extend(['-p', str(math.ceil(args.num_threads / 2))]) | |
169 if args.seed_mismatches: | |
170 cmd.extend(['-N', str(args.seed_mismatches)]) | |
171 if args.seed_len: | |
172 cmd.extend(['-L', str(args.seed_len)]) | |
173 if args.seed_extention_attempts: | |
174 cmd.extend(['-D', str(args.seed_extention_attempts)]) | |
175 if args.max_reseed: | |
176 cmd.extend(['-R', str(args.max_reseed)]) | |
177 if args.no_discordant: | |
178 cmd.append('--no-discordant') | |
179 if args.no_mixed: | |
180 cmd.append('--no-mixed') | |
181 if args.skip_reads: | |
182 cmd.extend(['--skip', str(args.skip_reads)]) | |
183 if args.qupto: | |
184 cmd.extend(['--upto', 'args.qupto']) | |
185 if args.phred64: | |
186 cmd.append('--phred64-quals') | |
187 if args.suppress_header: | |
188 cmd.append('--sam-no-hd') | |
189 if args.output_unmapped_reads or (args.output_unmapped_reads_l and args.output_unmapped_reads_r): | |
190 cmd.append('--un') | |
191 if args.output_suppressed_reads or (args.output_suppressed_reads_l and args.output_suppressed_reads_r): | |
192 cmd.append('--ambiguous') | |
193 | |
194 cmd.append(index_dir) | |
195 # Set up the reads | |
196 if args.mate_paired: | |
197 # paired-end reads library | |
198 cmd.extend(['-1', args.mate1, '-2', args.mate2, '-I', str(args.min_insert), '-X', str(args.max_insert)]) | |
199 else: | |
200 # single paired reads library | |
201 cmd.append(args.single_paired) | |
202 | |
203 # Run | |
204 try: | |
205 logger.info("Running bismark with: '%s'", " ".join(cmd)) | |
206 process = subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) | |
207 with process.stdout: | |
208 log_subprocess_output(logger, process.stdout) | |
209 exitcode = process.wait() | |
210 if exitcode != 0: | |
211 raise Exception(process.stderr) | |
212 except Exception as e: | |
213 stop_err(logger, 'Error in running bismark!\n%s' % e) | |
214 | |
215 # collect and copy output files | |
216 if args.output_report_file: | |
217 output_report_file = open(args.output_report_file, 'w+') | |
218 for line in fileinput.input(glob(os.path.join(output_dir, '*report.txt'))): | |
219 output_report_file.write(line) | |
220 output_report_file.close() | |
221 | |
222 if args.output_suppressed_reads: | |
223 if glob(os.path.join(output_dir, '*ambiguous_reads.txt')): | |
224 shutil.move(glob(os.path.join(output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads) | |
225 if args.output_suppressed_reads_l: | |
226 if glob(os.path.join(output_dir, '*ambiguous_reads_1.txt')): | |
227 shutil.move(glob(os.path.join(output_dir, '*ambiguous_reads_1.txt'))[0], args.output_suppressed_reads_l) | |
228 if args.output_suppressed_reads_r: | |
229 if glob(os.path.join(output_dir, '*ambiguous_reads_2.txt')): | |
230 shutil.move(glob(os.path.join(output_dir, '*ambiguous_reads_2.txt'))[0], args.output_suppressed_reads_r) | |
231 | |
232 if args.output_unmapped_reads: | |
233 if glob(os.path.join(output_dir, '*unmapped_reads.txt')): | |
234 shutil.move(glob(os.path.join(output_dir, '*unmapped_reads.txt'))[0], args.output_unmapped_reads) | |
235 if args.output_unmapped_reads_l: | |
236 if glob(os.path.join(output_dir, '*unmapped_reads_1.txt')): | |
237 shutil.move(glob(os.path.join(output_dir, '*unmapped_reads_1.txt'))[0], args.output_unmapped_reads_l) | |
238 if args.output_unmapped_reads_r: | |
239 if glob(os.path.join(output_dir, '*unmapped_reads_2.txt')): | |
240 shutil.move(glob(os.path.join(output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r) | |
241 | |
242 try: | |
243 # merge all bam files | |
244 tmp_res = tempfile.NamedTemporaryFile(dir=output_dir).name | |
245 | |
246 bam_files = glob(os.path.join(output_dir, '*.bam')) | |
247 if len(bam_files) > 1: | |
248 cmd = ['samtools', 'merge', '-@', str(args.num_threads), '-f', tmp_res, ' '.join(bam_files)] | |
249 logger.info("Merging bams with: '%s'", cmd) | |
250 process = subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) | |
251 with process.stdout: | |
252 log_subprocess_output(logger, process.stdout) | |
253 exitcode = process.wait() | |
254 if exitcode != 0: | |
255 raise Exception(process.stderr) | |
256 else: | |
257 tmp_res = bam_files[0] | |
258 | |
259 bam_path = "%s" % tmp_res | |
260 | |
261 if os.path.exists(bam_path): | |
262 if args.sort_bam: | |
263 cmd = ['samtools', 'sort', '-@', str(args.num_threads), bam_path, 'sorted_bam'] | |
264 logger.info("Sorting bam with: '%s'", cmd) | |
265 process = subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) | |
266 with process.stdout: | |
267 log_subprocess_output(logger, process.stdout) | |
268 exitcode = process.wait() | |
269 if exitcode != 0: | |
270 raise Exception(process.stderr) | |
271 shutil.move('sorted_bam.bam', args.output) | |
272 else: | |
273 shutil.move(bam_path, args.output) | |
274 else: | |
275 stop_err(logger, 'BAM file no found:\n%s' % bam_path) | |
276 | |
277 except Exception as e: | |
278 stop_err(logger, 'Error in merging bam files!\n%s' % e) | |
279 | |
280 # Clean up temp dirs | |
281 if tmp_index_dir and os.path.exists(tmp_index_dir): | |
282 shutil.rmtree(tmp_index_dir) | |
283 if os.path.exists(tmp_bismark_dir): | |
284 shutil.rmtree(tmp_bismark_dir) | |
285 if os.path.exists(output_dir): | |
286 shutil.rmtree(output_dir) | |
287 | |
288 | |
289 if __name__ == "__main__": __main__() |