Mercurial > repos > peterjc > coverage_stats
comparison coverage_stats-1b5523d3d2c2/tools/coverage_stats/coverage_stats.py @ 3:57b3ea22aff3 draft
Uploaded v0.1.0 which was already on the Test Tool Shed. Included Python 3 support.
author | peterjc |
---|---|
date | Tue, 11 Aug 2020 18:23:05 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:7254ece0c0ff | 3:57b3ea22aff3 |
---|---|
1 #!/usr/bin/env python | |
2 """BAM coverage statistics using samtools idxstats and depth. | |
3 | |
4 This script takes exactly three command line arguments: | |
5 * Input BAM filename | |
6 * Input BAI filename (via Galaxy metadata) | |
7 * Output tabular filename | |
8 | |
9 Optional fourth argument: | |
10 * Max coverage depth (integer) | |
11 | |
12 This messes about with the filenames to make samtools happy, then | |
13 runs "samtools idxstats" and "samtools depth", captures and combines | |
14 the output to the desired output tabular file. | |
15 | |
16 Because "samtools depth" treats the max depth a little fuzzily, this | |
17 tool tries to account for this and applies a clear max-depth cut off. | |
18 """ | |
19 | |
20 import os | |
21 import subprocess | |
22 import sys | |
23 import tempfile | |
24 | |
25 from optparse import OptionParser | |
26 | |
27 usage = """Example usage: | |
28 | |
29 $ python coverage_stats.py -b mapped.bam -i mapped.bai -o summary.tsv -d 10000 | |
30 """ | |
31 | |
32 parser = OptionParser(usage=usage) | |
33 parser.add_option( | |
34 "-b", "--bam", dest="input_bam", default=None, help="Input BAM file", metavar="FILE" | |
35 ) | |
36 parser.add_option( | |
37 "-i", | |
38 "--bai", | |
39 dest="input_bai", | |
40 default="-", | |
41 help="Input BAI file (BAM index, optional, default or - infer from BAM filename)", | |
42 metavar="FILE", | |
43 ) | |
44 parser.add_option( | |
45 "-o", | |
46 dest="output_tabular", | |
47 default="-", | |
48 help="Output tabular file (optional, default stdout)", | |
49 metavar="FILE", | |
50 ) | |
51 parser.add_option( | |
52 "-d", | |
53 "--maxdepth", | |
54 dest="max_depth", | |
55 default=8000, | |
56 help="Max coverage depth (integer, default 8000)", | |
57 type="int", | |
58 ) | |
59 parser.add_option( | |
60 "-v", | |
61 "--version", | |
62 dest="version", | |
63 default=False, | |
64 action="store_true", | |
65 help="Show version and quit", | |
66 ) | |
67 | |
68 options, args = parser.parse_args() | |
69 | |
70 if options.version: | |
71 # Galaxy seems to invert the order of the two lines | |
72 print("BAM coverage statistics v0.1.0") | |
73 cmd = "samtools 2>&1 | grep -i ^Version" | |
74 sys.exit(os.system(cmd)) | |
75 | |
76 bam_filename = options.input_bam | |
77 bai_filename = options.input_bai | |
78 tabular_filename = options.output_tabular | |
79 max_depth = options.max_depth | |
80 | |
81 if args: | |
82 sys.exit("Sorry, the legacy API has been dropped.") | |
83 | |
84 if not bam_filename: | |
85 sys.exit("Input BAM filename is required.") | |
86 if not os.path.isfile(bam_filename): | |
87 sys.exit("Input BAM file not found: %s" % bam_filename) | |
88 if bai_filename == "-": | |
89 if os.path.isfile(bam_filename + ".bai"): | |
90 bai_filename = bam_filename + ".bai" | |
91 if not os.path.isfile(bai_filename): | |
92 if bai_filename == "None": | |
93 sys.exit("Error: Galaxy did not index your BAM file") | |
94 sys.exit("Input BAI file not found: %s" % bai_filename) | |
95 | |
96 try: | |
97 max_depth = int(max_depth) | |
98 except ValueError: | |
99 sys.exit("Bad argument for max depth: %r" % max_depth) | |
100 if max_depth < 0: | |
101 sys.exit("Bad argument for max depth: %r" % max_depth) | |
102 | |
103 # fuzz factor to ensure can reach max_depth, e.g. region with | |
104 # many reads having a deletion present could underestimate the | |
105 # coverage by capping the number of reads considered | |
106 depth_margin = 100 | |
107 | |
108 # Assign sensible names with real extensions, and setup symlinks: | |
109 tmp_dir = tempfile.mkdtemp() | |
110 bam_file = os.path.join(tmp_dir, "temp.bam") | |
111 bai_file = os.path.join(tmp_dir, "temp.bam.bai") | |
112 idxstats_filename = os.path.join(tmp_dir, "idxstats.tsv") | |
113 depth_filename = os.path.join(tmp_dir, "depth.tsv") | |
114 os.symlink(os.path.abspath(bam_filename), bam_file) | |
115 os.symlink(os.path.abspath(bai_filename), bai_file) | |
116 assert os.path.isfile(bam_file), bam_file | |
117 assert os.path.isfile(bai_file), bai_file | |
118 assert os.path.isfile(bam_file + ".bai"), bam_file | |
119 | |
120 | |
121 def clean_up(): | |
122 """Remove our temporary files and directory.""" | |
123 os.remove(idxstats_filename) | |
124 os.remove(depth_filename) | |
125 os.remove(bam_file) | |
126 os.remove(bai_file) | |
127 os.rmdir(tmp_dir) | |
128 | |
129 | |
130 def samtools_depth_opt_available(): | |
131 """Determine if samtools depth supports maximum coverage argument.""" | |
132 child = subprocess.Popen( | |
133 ["samtools", "depth"], | |
134 universal_newlines=True, | |
135 stdout=subprocess.PIPE, | |
136 stderr=subprocess.STDOUT, | |
137 ) | |
138 # Combined stdout/stderr in case samtools is ever inconsistent | |
139 output, tmp = child.communicate() | |
140 assert tmp is None | |
141 # Expect to find this line in the help text, exact wording could change: | |
142 # -d/-m <int> maximum coverage depth [8000] | |
143 return " -d/-m " in output | |
144 | |
145 | |
146 depth_hack = False | |
147 if not samtools_depth_opt_available(): | |
148 if max_depth + depth_margin <= 8000: | |
149 sys.stderr.write( | |
150 "WARNING: The version of samtools depth installed does not " | |
151 "support the -d option, however, the requested max-depth " | |
152 "is safely under the default of 8000.\n" | |
153 ) | |
154 depth_hack = True | |
155 else: | |
156 sys.exit( | |
157 "The version of samtools depth installed does not support the -d option." | |
158 ) | |
159 | |
160 # Run samtools idxstats: | |
161 cmd = 'samtools idxstats "%s" > "%s"' % (bam_file, idxstats_filename) | |
162 return_code = os.system(cmd) | |
163 if return_code: | |
164 clean_up() | |
165 sys.exit("Return code %i from command:\n%s" % (return_code, cmd)) | |
166 | |
167 # Run samtools depth: | |
168 # TODO - Parse stdout instead? | |
169 if depth_hack: | |
170 # Using an old samtools without the -d option, but hard coded default | |
171 # of 8000 should be fine even allowing a margin for fuzzy output | |
172 cmd = 'samtools depth "%s" > "%s"' % (bam_file, depth_filename) | |
173 else: | |
174 cmd = 'samtools depth -d %i "%s" > "%s"' % ( | |
175 max_depth + depth_margin, | |
176 bam_file, | |
177 depth_filename, | |
178 ) | |
179 return_code = os.system(cmd) | |
180 if return_code: | |
181 clean_up() | |
182 sys.exit("Return code %i from command:\n%s" % (return_code, cmd)) | |
183 | |
184 | |
185 def load_total_coverage(depth_handle, identifier, length): | |
186 """Parse some of the 'samtools depth' output for coverages. | |
187 | |
188 Returns min_cov (int), max_cov (int) and mean cov (float). | |
189 | |
190 Uses global variables to cache the first line of output from the | |
191 next reference sequence. | |
192 """ | |
193 global depth_ref, depth_pos, depth_reads | |
194 | |
195 # print("====") | |
196 # print("%s coverage calculation, length %i, ..." % (identifier, length)) | |
197 | |
198 if depth_ref is None: | |
199 # Right at start of file / new contig | |
200 line = depth_handle.readline() | |
201 # Are we at the end of the file? | |
202 if not line: | |
203 # Must be at the end of the file. | |
204 # This can happen if the file contig(s) had no reads mapped | |
205 return 0, 0, 0.0, 0 | |
206 depth_ref, depth_pos, depth_reads = line.rstrip("\n").split() | |
207 depth_pos = int(depth_pos) | |
208 depth_reads = min(max_depth, int(depth_reads)) | |
209 # Can now treat as later references where first line cached | |
210 elif identifier != depth_ref: | |
211 # Infer that identifier had coverage zero, | |
212 # and so was not in the 'samtools depth' | |
213 # output. | |
214 # print("%s appears to have no coverage at all" % identifier) | |
215 return 0, 0, 0.0, 0 | |
216 | |
217 # Good, at start of expected reference | |
218 bases = depth_reads | |
219 if depth_pos == 1: | |
220 min_cov = depth_reads | |
221 else: | |
222 # print("%s has no coverage at start" % identifier) | |
223 min_cov = 0 | |
224 max_cov = depth_reads | |
225 | |
226 last_pos = depth_pos | |
227 depth_ref = None | |
228 depth_pos = 0 | |
229 depth_reads = 0 | |
230 for line in depth_handle: | |
231 ref, pos, depth = line.rstrip("\n").split() | |
232 pos = int(pos) | |
233 depth = min(max_depth, int(depth)) | |
234 if ref != identifier: | |
235 # Reached the end of this identifier's coverage | |
236 # so cache this ready for next identifier | |
237 depth_ref, depth_pos, depth_reads = ref, pos, depth | |
238 break | |
239 bases += depth | |
240 if last_pos + 1 < pos: | |
241 # print("%s has no coverage between %i and %i" | |
242 # % (identifier, last_pos, pos)) | |
243 min_cov = 0 | |
244 else: | |
245 min_cov = min(min_cov, depth) | |
246 max_cov = max(max_cov, depth) | |
247 last_pos = pos | |
248 | |
249 # Reach the end of this identifier's coverage or end of file | |
250 if last_pos < length: | |
251 # print("%s has no coverage at end" % identifier) | |
252 min_cov = 0 | |
253 mean_cov = bases / float(length) | |
254 return min_cov, max_cov, mean_cov, bases | |
255 | |
256 | |
257 # Parse and combine the output | |
258 if tabular_filename == "-": | |
259 out_handle = sys.stdout | |
260 else: | |
261 out_handle = open(tabular_filename, "w") | |
262 out_handle.write("#identifer\tlength\tmapped\tplaced\tmin_cov\tmax_cov\tmean_cov\n") | |
263 | |
264 idxstats_handle = open(idxstats_filename) | |
265 depth_handle = open(depth_filename) | |
266 | |
267 depth_ref = None | |
268 depth_pos = 0 | |
269 depth_reads = 0 | |
270 | |
271 global_bases = 0 | |
272 global_length = 0 | |
273 for line in idxstats_handle: | |
274 identifier, length, mapped, placed = line.rstrip("\n").split() | |
275 length = int(length) | |
276 mapped = int(mapped) | |
277 placed = int(placed) | |
278 if identifier == "*": | |
279 min_cov = 0 | |
280 max_cov = 0 | |
281 mean_cov = 0.0 | |
282 bases = 0 | |
283 else: | |
284 min_cov, max_cov, mean_cov, bases = load_total_coverage( | |
285 depth_handle, identifier, length | |
286 ) | |
287 if max_cov > max_depth: | |
288 sys.exit( | |
289 "Using max depth %i yet saw max coverage %i for %s" | |
290 % (max_depth, max_cov, identifier) | |
291 ) | |
292 out_handle.write( | |
293 "%s\t%i\t%i\t%i\t%i\t%i\t%0.2f\n" | |
294 % (identifier, length, mapped, placed, min_cov, max_cov, mean_cov) | |
295 ) | |
296 if not (min_cov <= mean_cov <= max_cov): | |
297 out_handle.write("ERROR, expect min_cov <= mean_cov <= max_cov\n") | |
298 idxstats_handle.close() | |
299 depth_handle.close() | |
300 out_handle.close() | |
301 clean_up() | |
302 sys.exit( | |
303 "Problem with coverage for %s, expect min_cov <= mean_cov <= max_cov" | |
304 % identifier | |
305 ) | |
306 global_length += length | |
307 global_bases += bases | |
308 | |
309 idxstats_handle.close() | |
310 depth_handle.close() | |
311 if tabular_filename != "-": | |
312 out_handle.close() | |
313 | |
314 print( | |
315 "Total reference length %i with overall mean coverage %0.2f" | |
316 % (global_length, float(global_bases) / global_length) | |
317 ) | |
318 | |
319 # Remove the temp symlinks and files: | |
320 clean_up() | |
321 | |
322 if depth_ref is not None: | |
323 sys.exit("Left over output from 'samtools depth'? %r" % depth_ref) |