comparison bismark_wrapper.py @ 8:9bfe38410155 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit 51299fa62f0566a4a897b1c149db564631282fff
author bgruening
date Wed, 22 Aug 2018 08:09:42 -0400
parents
children 7bffcb6fc81d
comparison
equal deleted inserted replaced
7:fcadce4d9a06 8:9bfe38410155
1 #!/usr/bin/env python
2
3 import argparse
4 import fileinput
5 import logging
6 import math
7 import os
8 import shutil
9 import subprocess
10 import sys
11 import tempfile
12 from glob import glob
13
14
15 def stop_err(logger, msg):
16 logger.critical(msg)
17 sys.exit(1)
18
19
20 def log_subprocess_output(logger, pipe):
21 for line in iter(pipe.readline, b''):
22 logger.debug(line.decode().rstrip())
23
24
25 def __main__():
26 # Parse Command Line
27 parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.')
28 parser.add_argument('-p', '--num-threads', dest='num_threads',
29 type=int, default=4, help='Use this many threads to align reads. The default is 4.')
30
31 # input options
32 parser.add_argument('--own-file', dest='own_file', help='')
33 parser.add_argument('-D', '--indexes-path', dest='index_path',
34 help='Indexes directory; location of .ebwt and .fa files.')
35 parser.add_argument('-O', '--output', dest='output')
36
37 parser.add_argument('--output-report-file', dest='output_report_file')
38 parser.add_argument('--suppress-header', dest='suppress_header', action="store_true")
39
40 parser.add_argument('--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired',
41 default=False)
42
43 parser.add_argument('-1', '--mate1', dest='mate1',
44 help='The forward reads file in Sanger FASTQ or FASTA format.')
45 parser.add_argument('-2', '--mate2', dest='mate2',
46 help='The reverse reads file in Sanger FASTQ or FASTA format.')
47 parser.add_argument('--sort-bam', dest='sort_bam', action="store_true")
48
49 parser.add_argument('--output-unmapped-reads', dest='output_unmapped_reads',
50 help='Additional output file with unmapped reads (single-end).')
51 parser.add_argument('--output-unmapped-reads-l', dest='output_unmapped_reads_l',
52 help='File name for unmapped reads (left, paired-end).')
53 parser.add_argument('--output-unmapped-reads-r', dest='output_unmapped_reads_r',
54 help='File name for unmapped reads (right, paired-end).')
55
56 parser.add_argument('--output-suppressed-reads', dest='output_suppressed_reads',
57 help='Additional output file with suppressed reads (single-end).')
58 parser.add_argument('--output-suppressed-reads-l', dest='output_suppressed_reads_l',
59 help='File name for suppressed reads (left, paired-end).')
60 parser.add_argument('--output-suppressed-reads-r', dest='output_suppressed_reads_r',
61 help='File name for suppressed reads (right, paired-end).')
62 parser.add_argument('--stdout', dest='output_stdout',
63 help='File name for the standard output of bismark.')
64
65 parser.add_argument('--single-paired', dest='single_paired',
66 help='The single-end reads file in Sanger FASTQ or FASTA format.')
67
68 parser.add_argument('--fastq', action='store_true', help='Query filetype is in FASTQ format')
69 parser.add_argument('--fasta', action='store_true', help='Query filetype is in FASTA format')
70 parser.add_argument('--phred64-quals', dest='phred64', action="store_true")
71
72 parser.add_argument('--skip-reads', dest='skip_reads', type=int)
73 parser.add_argument('--qupto', type=int)
74
75 # paired end options
76 parser.add_argument('-I', '--minins', dest='min_insert')
77 parser.add_argument('-X', '--maxins', dest='max_insert')
78 parser.add_argument('--no-mixed', dest='no_mixed', action="store_true")
79 parser.add_argument('--no-discordant', dest='no_discordant', action="store_true")
80
81 # parse general options
82 # default 20
83 parser.add_argument('--seed-len', dest='seed_len', type=int)
84 # default 15
85 parser.add_argument('--seed-extention-attempts', dest='seed_extention_attempts', type=int)
86 # default 0
87 parser.add_argument('--seed-mismatches', dest='seed_mismatches', type=int)
88 # default 2
89 parser.add_argument('--max-reseed', dest='max_reseed', type=int)
90 """
91 # default 70
92 parser.add_argument( '--maqerr', dest='maqerr', type=int )
93 """
94
95 """
96 The number of megabytes of memory a given thread is given to store path
97 descriptors in --best mode. Best-first search must keep track of many paths
98 at once to ensure it is always extending the path with the lowest cumulative
99 cost. Bowtie tries to minimize the memory impact of the descriptors, but
100 they can still grow very large in some cases. If you receive an error message
101 saying that chunk memory has been exhausted in --best mode, try adjusting
102 this parameter up to dedicate more memory to the descriptors. Default: 512.
103 """
104 parser.add_argument('--chunkmbs', type=int, default=512)
105
106 args = parser.parse_args()
107
108 logger = logging.getLogger('bismark_wrapper')
109 logger.setLevel(logging.DEBUG)
110 ch = logging.StreamHandler(sys.stdout)
111 if args.output_stdout:
112 ch.setLevel(logging.WARNING)
113 handler = logging.FileHandler(args.output_stdout)
114 handler.setLevel(logging.DEBUG)
115 logger.addHandler(handler)
116 else:
117 ch.setLevel(logging.DEBUG)
118 logger.addHandler(ch)
119
120 # Create bismark index if necessary.
121 index_dir = ""
122 tmp_index_dir = None
123 if args.own_file:
124 logger.info("Create a temporary index with the offered files from the user. "
125 "Utilizing the script: bismark_genome_preparation")
126 tmp_index_dir = tempfile.mkdtemp()
127 index_path = os.path.join(tmp_index_dir, '.'.join(os.path.split(args.own_file)[1].split('.')[:-1]))
128 try:
129 # Create a hard link pointing to args.own_file named 'index_path'.fa.
130 os.symlink(args.own_file, index_path + '.fa')
131 except Exception as e:
132 if os.path.exists(tmp_index_dir):
133 shutil.rmtree(tmp_index_dir)
134 stop_err(logger, 'Error in linking the reference database!\n%s' % e)
135 # bismark_genome_preparation needs the complete path to the folder in which the database is stored
136 cmd_index = ['bismark_genome_preparation', '--bowtie2', tmp_index_dir]
137 try:
138 logger.info("Generating index with: '%s'", " ".join(cmd_index))
139 process = subprocess.Popen(cmd_index, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
140 with process.stdout:
141 log_subprocess_output(logger, process.stdout)
142 exitcode = process.wait()
143 if exitcode != 0:
144 raise Exception(process.stderr)
145 except Exception as e:
146 if os.path.exists(tmp_index_dir):
147 shutil.rmtree(tmp_index_dir)
148 stop_err(logger, 'Error indexing reference sequence!\n%s' % e)
149 index_dir = tmp_index_dir
150 else:
151 # bowtie path is the path to the index directory and the first path of the index file name
152 index_dir = os.path.dirname(args.index_path)
153
154 # Build bismark command
155 tmp_bismark_dir = tempfile.mkdtemp()
156 output_dir = os.path.join(tmp_bismark_dir, 'results')
157 cmd = ['bismark', '--bam', '--gzip', '--temp_dir', tmp_bismark_dir, '-o', output_dir, '--quiet']
158
159 if args.fasta:
160 # the query input files (specified as mate1,mate2 or singles) are FastA
161 cmd.append('--fasta')
162 elif args.fastq:
163 cmd.append('--fastq')
164
165 # alignment options
166 if args.num_threads > 2:
167 # divide num_threads by 2 here since bismark will spawn 2 jobs with -p threads each
168 cmd.extend(['-p', str(math.ceil(args.num_threads / 2))])
169 if args.seed_mismatches:
170 cmd.extend(['-N', str(args.seed_mismatches)])
171 if args.seed_len:
172 cmd.extend(['-L', str(args.seed_len)])
173 if args.seed_extention_attempts:
174 cmd.extend(['-D', str(args.seed_extention_attempts)])
175 if args.max_reseed:
176 cmd.extend(['-R', str(args.max_reseed)])
177 if args.no_discordant:
178 cmd.append('--no-discordant')
179 if args.no_mixed:
180 cmd.append('--no-mixed')
181 if args.skip_reads:
182 cmd.extend(['--skip', str(args.skip_reads)])
183 if args.qupto:
184 cmd.extend(['--upto', 'args.qupto'])
185 if args.phred64:
186 cmd.append('--phred64-quals')
187 if args.suppress_header:
188 cmd.append('--sam-no-hd')
189 if args.output_unmapped_reads or (args.output_unmapped_reads_l and args.output_unmapped_reads_r):
190 cmd.append('--un')
191 if args.output_suppressed_reads or (args.output_suppressed_reads_l and args.output_suppressed_reads_r):
192 cmd.append('--ambiguous')
193
194 cmd.append(index_dir)
195 # Set up the reads
196 if args.mate_paired:
197 # paired-end reads library
198 cmd.extend(['-1', args.mate1, '-2', args.mate2, '-I', str(args.min_insert), '-X', str(args.max_insert)])
199 else:
200 # single paired reads library
201 cmd.append(args.single_paired)
202
203 # Run
204 try:
205 logger.info("Running bismark with: '%s'", " ".join(cmd))
206 process = subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
207 with process.stdout:
208 log_subprocess_output(logger, process.stdout)
209 exitcode = process.wait()
210 if exitcode != 0:
211 raise Exception(process.stderr)
212 except Exception as e:
213 stop_err(logger, 'Error in running bismark!\n%s' % e)
214
215 # collect and copy output files
216 if args.output_report_file:
217 output_report_file = open(args.output_report_file, 'w+')
218 for line in fileinput.input(glob(os.path.join(output_dir, '*report.txt'))):
219 output_report_file.write(line)
220 output_report_file.close()
221
222 if args.output_suppressed_reads:
223 if glob(os.path.join(output_dir, '*ambiguous_reads.txt')):
224 shutil.move(glob(os.path.join(output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads)
225 if args.output_suppressed_reads_l:
226 if glob(os.path.join(output_dir, '*ambiguous_reads_1.txt')):
227 shutil.move(glob(os.path.join(output_dir, '*ambiguous_reads_1.txt'))[0], args.output_suppressed_reads_l)
228 if args.output_suppressed_reads_r:
229 if glob(os.path.join(output_dir, '*ambiguous_reads_2.txt')):
230 shutil.move(glob(os.path.join(output_dir, '*ambiguous_reads_2.txt'))[0], args.output_suppressed_reads_r)
231
232 if args.output_unmapped_reads:
233 if glob(os.path.join(output_dir, '*unmapped_reads.txt')):
234 shutil.move(glob(os.path.join(output_dir, '*unmapped_reads.txt'))[0], args.output_unmapped_reads)
235 if args.output_unmapped_reads_l:
236 if glob(os.path.join(output_dir, '*unmapped_reads_1.txt')):
237 shutil.move(glob(os.path.join(output_dir, '*unmapped_reads_1.txt'))[0], args.output_unmapped_reads_l)
238 if args.output_unmapped_reads_r:
239 if glob(os.path.join(output_dir, '*unmapped_reads_2.txt')):
240 shutil.move(glob(os.path.join(output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r)
241
242 try:
243 # merge all bam files
244 tmp_res = tempfile.NamedTemporaryFile(dir=output_dir).name
245
246 bam_files = glob(os.path.join(output_dir, '*.bam'))
247 if len(bam_files) > 1:
248 cmd = ['samtools', 'merge', '-@', str(args.num_threads), '-f', tmp_res, ' '.join(bam_files)]
249 logger.info("Merging bams with: '%s'", cmd)
250 process = subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
251 with process.stdout:
252 log_subprocess_output(logger, process.stdout)
253 exitcode = process.wait()
254 if exitcode != 0:
255 raise Exception(process.stderr)
256 else:
257 tmp_res = bam_files[0]
258
259 bam_path = "%s" % tmp_res
260
261 if os.path.exists(bam_path):
262 if args.sort_bam:
263 cmd = ['samtools', 'sort', '-@', str(args.num_threads), bam_path, 'sorted_bam']
264 logger.info("Sorting bam with: '%s'", cmd)
265 process = subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
266 with process.stdout:
267 log_subprocess_output(logger, process.stdout)
268 exitcode = process.wait()
269 if exitcode != 0:
270 raise Exception(process.stderr)
271 shutil.move('sorted_bam.bam', args.output)
272 else:
273 shutil.move(bam_path, args.output)
274 else:
275 stop_err(logger, 'BAM file no found:\n%s' % bam_path)
276
277 except Exception as e:
278 stop_err(logger, 'Error in merging bam files!\n%s' % e)
279
280 # Clean up temp dirs
281 if tmp_index_dir and os.path.exists(tmp_index_dir):
282 shutil.rmtree(tmp_index_dir)
283 if os.path.exists(tmp_bismark_dir):
284 shutil.rmtree(tmp_bismark_dir)
285 if os.path.exists(output_dir):
286 shutil.rmtree(output_dir)
287
288
289 if __name__ == "__main__": __main__()