comparison coverage_stats-1b5523d3d2c2/tools/coverage_stats/coverage_stats.py @ 3:57b3ea22aff3 draft

Uploaded v0.1.0 which was already on the Test Tool Shed. Included Python 3 support.
author peterjc
date Tue, 11 Aug 2020 18:23:05 -0400
parents
children
comparison
equal deleted inserted replaced
2:7254ece0c0ff 3:57b3ea22aff3
1 #!/usr/bin/env python
2 """BAM coverage statistics using samtools idxstats and depth.
3
4 This script takes exactly three command line arguments:
5 * Input BAM filename
6 * Input BAI filename (via Galaxy metadata)
7 * Output tabular filename
8
9 Optional fourth argument:
10 * Max coverage depth (integer)
11
12 This messes about with the filenames to make samtools happy, then
13 runs "samtools idxstats" and "samtools depth", captures and combines
14 the output to the desired output tabular file.
15
16 Because "samtools depth" treats the max depth a little fuzzily, this
17 tool tries to account for this and applies a clear max-depth cut off.
18 """
19
20 import os
21 import subprocess
22 import sys
23 import tempfile
24
25 from optparse import OptionParser
26
27 usage = """Example usage:
28
29 $ python coverage_stats.py -b mapped.bam -i mapped.bai -o summary.tsv -d 10000
30 """
31
32 parser = OptionParser(usage=usage)
33 parser.add_option(
34 "-b", "--bam", dest="input_bam", default=None, help="Input BAM file", metavar="FILE"
35 )
36 parser.add_option(
37 "-i",
38 "--bai",
39 dest="input_bai",
40 default="-",
41 help="Input BAI file (BAM index, optional, default or - infer from BAM filename)",
42 metavar="FILE",
43 )
44 parser.add_option(
45 "-o",
46 dest="output_tabular",
47 default="-",
48 help="Output tabular file (optional, default stdout)",
49 metavar="FILE",
50 )
51 parser.add_option(
52 "-d",
53 "--maxdepth",
54 dest="max_depth",
55 default=8000,
56 help="Max coverage depth (integer, default 8000)",
57 type="int",
58 )
59 parser.add_option(
60 "-v",
61 "--version",
62 dest="version",
63 default=False,
64 action="store_true",
65 help="Show version and quit",
66 )
67
68 options, args = parser.parse_args()
69
70 if options.version:
71 # Galaxy seems to invert the order of the two lines
72 print("BAM coverage statistics v0.1.0")
73 cmd = "samtools 2>&1 | grep -i ^Version"
74 sys.exit(os.system(cmd))
75
76 bam_filename = options.input_bam
77 bai_filename = options.input_bai
78 tabular_filename = options.output_tabular
79 max_depth = options.max_depth
80
81 if args:
82 sys.exit("Sorry, the legacy API has been dropped.")
83
84 if not bam_filename:
85 sys.exit("Input BAM filename is required.")
86 if not os.path.isfile(bam_filename):
87 sys.exit("Input BAM file not found: %s" % bam_filename)
88 if bai_filename == "-":
89 if os.path.isfile(bam_filename + ".bai"):
90 bai_filename = bam_filename + ".bai"
91 if not os.path.isfile(bai_filename):
92 if bai_filename == "None":
93 sys.exit("Error: Galaxy did not index your BAM file")
94 sys.exit("Input BAI file not found: %s" % bai_filename)
95
96 try:
97 max_depth = int(max_depth)
98 except ValueError:
99 sys.exit("Bad argument for max depth: %r" % max_depth)
100 if max_depth < 0:
101 sys.exit("Bad argument for max depth: %r" % max_depth)
102
103 # fuzz factor to ensure can reach max_depth, e.g. region with
104 # many reads having a deletion present could underestimate the
105 # coverage by capping the number of reads considered
106 depth_margin = 100
107
108 # Assign sensible names with real extensions, and setup symlinks:
109 tmp_dir = tempfile.mkdtemp()
110 bam_file = os.path.join(tmp_dir, "temp.bam")
111 bai_file = os.path.join(tmp_dir, "temp.bam.bai")
112 idxstats_filename = os.path.join(tmp_dir, "idxstats.tsv")
113 depth_filename = os.path.join(tmp_dir, "depth.tsv")
114 os.symlink(os.path.abspath(bam_filename), bam_file)
115 os.symlink(os.path.abspath(bai_filename), bai_file)
116 assert os.path.isfile(bam_file), bam_file
117 assert os.path.isfile(bai_file), bai_file
118 assert os.path.isfile(bam_file + ".bai"), bam_file
119
120
121 def clean_up():
122 """Remove our temporary files and directory."""
123 os.remove(idxstats_filename)
124 os.remove(depth_filename)
125 os.remove(bam_file)
126 os.remove(bai_file)
127 os.rmdir(tmp_dir)
128
129
130 def samtools_depth_opt_available():
131 """Determine if samtools depth supports maximum coverage argument."""
132 child = subprocess.Popen(
133 ["samtools", "depth"],
134 universal_newlines=True,
135 stdout=subprocess.PIPE,
136 stderr=subprocess.STDOUT,
137 )
138 # Combined stdout/stderr in case samtools is ever inconsistent
139 output, tmp = child.communicate()
140 assert tmp is None
141 # Expect to find this line in the help text, exact wording could change:
142 # -d/-m <int> maximum coverage depth [8000]
143 return " -d/-m " in output
144
145
146 depth_hack = False
147 if not samtools_depth_opt_available():
148 if max_depth + depth_margin <= 8000:
149 sys.stderr.write(
150 "WARNING: The version of samtools depth installed does not "
151 "support the -d option, however, the requested max-depth "
152 "is safely under the default of 8000.\n"
153 )
154 depth_hack = True
155 else:
156 sys.exit(
157 "The version of samtools depth installed does not support the -d option."
158 )
159
160 # Run samtools idxstats:
161 cmd = 'samtools idxstats "%s" > "%s"' % (bam_file, idxstats_filename)
162 return_code = os.system(cmd)
163 if return_code:
164 clean_up()
165 sys.exit("Return code %i from command:\n%s" % (return_code, cmd))
166
167 # Run samtools depth:
168 # TODO - Parse stdout instead?
169 if depth_hack:
170 # Using an old samtools without the -d option, but hard coded default
171 # of 8000 should be fine even allowing a margin for fuzzy output
172 cmd = 'samtools depth "%s" > "%s"' % (bam_file, depth_filename)
173 else:
174 cmd = 'samtools depth -d %i "%s" > "%s"' % (
175 max_depth + depth_margin,
176 bam_file,
177 depth_filename,
178 )
179 return_code = os.system(cmd)
180 if return_code:
181 clean_up()
182 sys.exit("Return code %i from command:\n%s" % (return_code, cmd))
183
184
185 def load_total_coverage(depth_handle, identifier, length):
186 """Parse some of the 'samtools depth' output for coverages.
187
188 Returns min_cov (int), max_cov (int) and mean cov (float).
189
190 Uses global variables to cache the first line of output from the
191 next reference sequence.
192 """
193 global depth_ref, depth_pos, depth_reads
194
195 # print("====")
196 # print("%s coverage calculation, length %i, ..." % (identifier, length))
197
198 if depth_ref is None:
199 # Right at start of file / new contig
200 line = depth_handle.readline()
201 # Are we at the end of the file?
202 if not line:
203 # Must be at the end of the file.
204 # This can happen if the file contig(s) had no reads mapped
205 return 0, 0, 0.0, 0
206 depth_ref, depth_pos, depth_reads = line.rstrip("\n").split()
207 depth_pos = int(depth_pos)
208 depth_reads = min(max_depth, int(depth_reads))
209 # Can now treat as later references where first line cached
210 elif identifier != depth_ref:
211 # Infer that identifier had coverage zero,
212 # and so was not in the 'samtools depth'
213 # output.
214 # print("%s appears to have no coverage at all" % identifier)
215 return 0, 0, 0.0, 0
216
217 # Good, at start of expected reference
218 bases = depth_reads
219 if depth_pos == 1:
220 min_cov = depth_reads
221 else:
222 # print("%s has no coverage at start" % identifier)
223 min_cov = 0
224 max_cov = depth_reads
225
226 last_pos = depth_pos
227 depth_ref = None
228 depth_pos = 0
229 depth_reads = 0
230 for line in depth_handle:
231 ref, pos, depth = line.rstrip("\n").split()
232 pos = int(pos)
233 depth = min(max_depth, int(depth))
234 if ref != identifier:
235 # Reached the end of this identifier's coverage
236 # so cache this ready for next identifier
237 depth_ref, depth_pos, depth_reads = ref, pos, depth
238 break
239 bases += depth
240 if last_pos + 1 < pos:
241 # print("%s has no coverage between %i and %i"
242 # % (identifier, last_pos, pos))
243 min_cov = 0
244 else:
245 min_cov = min(min_cov, depth)
246 max_cov = max(max_cov, depth)
247 last_pos = pos
248
249 # Reach the end of this identifier's coverage or end of file
250 if last_pos < length:
251 # print("%s has no coverage at end" % identifier)
252 min_cov = 0
253 mean_cov = bases / float(length)
254 return min_cov, max_cov, mean_cov, bases
255
256
257 # Parse and combine the output
258 if tabular_filename == "-":
259 out_handle = sys.stdout
260 else:
261 out_handle = open(tabular_filename, "w")
262 out_handle.write("#identifer\tlength\tmapped\tplaced\tmin_cov\tmax_cov\tmean_cov\n")
263
264 idxstats_handle = open(idxstats_filename)
265 depth_handle = open(depth_filename)
266
267 depth_ref = None
268 depth_pos = 0
269 depth_reads = 0
270
271 global_bases = 0
272 global_length = 0
273 for line in idxstats_handle:
274 identifier, length, mapped, placed = line.rstrip("\n").split()
275 length = int(length)
276 mapped = int(mapped)
277 placed = int(placed)
278 if identifier == "*":
279 min_cov = 0
280 max_cov = 0
281 mean_cov = 0.0
282 bases = 0
283 else:
284 min_cov, max_cov, mean_cov, bases = load_total_coverage(
285 depth_handle, identifier, length
286 )
287 if max_cov > max_depth:
288 sys.exit(
289 "Using max depth %i yet saw max coverage %i for %s"
290 % (max_depth, max_cov, identifier)
291 )
292 out_handle.write(
293 "%s\t%i\t%i\t%i\t%i\t%i\t%0.2f\n"
294 % (identifier, length, mapped, placed, min_cov, max_cov, mean_cov)
295 )
296 if not (min_cov <= mean_cov <= max_cov):
297 out_handle.write("ERROR, expect min_cov <= mean_cov <= max_cov\n")
298 idxstats_handle.close()
299 depth_handle.close()
300 out_handle.close()
301 clean_up()
302 sys.exit(
303 "Problem with coverage for %s, expect min_cov <= mean_cov <= max_cov"
304 % identifier
305 )
306 global_length += length
307 global_bases += bases
308
309 idxstats_handle.close()
310 depth_handle.close()
311 if tabular_filename != "-":
312 out_handle.close()
313
314 print(
315 "Total reference length %i with overall mean coverage %0.2f"
316 % (global_length, float(global_bases) / global_length)
317 )
318
319 # Remove the temp symlinks and files:
320 clean_up()
321
322 if depth_ref is not None:
323 sys.exit("Left over output from 'samtools depth'? %r" % depth_ref)