Mercurial > repos > peterjc > coverage_stats
comparison tools/coverage_stats/coverage_stats.py @ 2:7254ece0c0ff draft
v0.0.5 - Supports max coverage depth in recent samtools. Expects samtools 1.4.1 via Conda, not via Tool Shed.
author | peterjc |
---|---|
date | Thu, 11 May 2017 12:16:10 -0400 |
parents | ca8f63f2f7d4 |
children |
comparison
equal
deleted
inserted
replaced
1:d1fdfaae5dbe | 2:7254ece0c0ff |
---|---|
4 This script takes exactly three command line arguments: | 4 This script takes exactly three command line arguments: |
5 * Input BAM filename | 5 * Input BAM filename |
6 * Input BAI filename (via Galaxy metadata) | 6 * Input BAI filename (via Galaxy metadata) |
7 * Output tabular filename | 7 * Output tabular filename |
8 | 8 |
9 Optional fourth argument: | |
10 * Max coverage depth (integer) | |
11 | |
9 This messes about with the filenames to make samtools happy, then | 12 This messes about with the filenames to make samtools happy, then |
10 runs "samtools idxstats" and "samtools depth", captures and combines | 13 runs "samtools idxstats" and "samtools depth", captures and combines |
11 the output to the desired output tabular file. | 14 the output to the desired output tabular file. |
15 | |
16 Because "samtools depth" treats the max depth a little fuzzily, this | |
17 tool tries to account for this and applies a clear max-depth cut off. | |
12 """ | 18 """ |
13 import sys | 19 |
14 import os | 20 import os |
15 import subprocess | 21 import subprocess |
22 import sys | |
16 import tempfile | 23 import tempfile |
17 | 24 |
18 if "-v" in sys.argv or "--version" in sys.argv: | 25 if "-v" in sys.argv or "--version" in sys.argv: |
19 #Galaxy seems to invert the order of the two lines | 26 # Galaxy seems to invert the order of the two lines |
20 print("BAM coverage statistics v0.0.1") | 27 print("BAM coverage statistics v0.0.5") |
21 cmd = "samtools 2>&1 | grep -i ^Version" | 28 cmd = "samtools 2>&1 | grep -i ^Version" |
22 sys.exit(os.system(cmd)) | 29 sys.exit(os.system(cmd)) |
23 | 30 |
24 def stop_err(msg, error_level=1): | 31 # TODO - Proper command line API |
25 """Print error message to stdout and quit with given error level.""" | 32 if len(sys.argv) == 4: |
26 sys.stderr.write("%s\n" % msg) | 33 bam_filename, bai_filename, tabular_filename = sys.argv[1:] |
27 sys.exit(error_level) | 34 max_depth = "8000" |
28 | 35 elif len(sys.argv) == 5: |
29 if len(sys.argv) != 4: | 36 bam_filename, bai_filename, tabular_filename, max_depth = sys.argv[1:] |
30 stop_err("Require three arguments: BAM, BAI, tabular filenames") | 37 else: |
31 | 38 sys.exit("Require 3 or 4 arguments: BAM, BAI, tabular filename, [max depth]") |
32 bam_filename, bai_filename, tabular_filename = sys.argv[1:] | |
33 | 39 |
34 if not os.path.isfile(bam_filename): | 40 if not os.path.isfile(bam_filename): |
35 stop_err("Input BAM file not found: %s" % bam_filename) | 41 sys.exit("Input BAM file not found: %s" % bam_filename) |
42 if bai_filename == "-": | |
43 # Make this optional for ease of use at the command line by hand: | |
44 if os.path.isfile(bam_filename + ".bai"): | |
45 bai_filename = bam_filename + ".bai" | |
36 if not os.path.isfile(bai_filename): | 46 if not os.path.isfile(bai_filename): |
37 if bai_filename == "None": | 47 if bai_filename == "None": |
38 stop_err("Error: Galaxy did not index your BAM file") | 48 sys.exit("Error: Galaxy did not index your BAM file") |
39 stop_err("Input BAI file not found: %s" % bai_filename) | 49 sys.exit("Input BAI file not found: %s" % bai_filename) |
40 | 50 |
41 #Assign sensible names with real extensions, and setup symlinks: | 51 try: |
52 max_depth = int(max_depth) | |
53 except ValueError: | |
54 sys.exit("Bad argument for max depth: %r" % max_depth) | |
55 if max_depth < 0: | |
56 sys.exit("Bad argument for max depth: %r" % max_depth) | |
57 | |
58 # fuzz factor to ensure can reach max_depth, e.g. region with | |
59 # many reads having a deletion present could underestimate the | |
60 # coverage by capping the number of reads considered | |
61 depth_margin = 100 | |
62 | |
63 # Assign sensible names with real extensions, and setup symlinks: | |
42 tmp_dir = tempfile.mkdtemp() | 64 tmp_dir = tempfile.mkdtemp() |
43 bam_file = os.path.join(tmp_dir, "temp.bam") | 65 bam_file = os.path.join(tmp_dir, "temp.bam") |
44 bai_file = os.path.join(tmp_dir, "temp.bam.bai") | 66 bai_file = os.path.join(tmp_dir, "temp.bam.bai") |
45 idxstats_filename = os.path.join(tmp_dir, "idxstats.tsv") | 67 idxstats_filename = os.path.join(tmp_dir, "idxstats.tsv") |
46 depth_filename = os.path.join(tmp_dir, "depth.tsv") | 68 depth_filename = os.path.join(tmp_dir, "depth.tsv") |
48 os.symlink(os.path.abspath(bai_filename), bai_file) | 70 os.symlink(os.path.abspath(bai_filename), bai_file) |
49 assert os.path.isfile(bam_file), bam_file | 71 assert os.path.isfile(bam_file), bam_file |
50 assert os.path.isfile(bai_file), bai_file | 72 assert os.path.isfile(bai_file), bai_file |
51 assert os.path.isfile(bam_file + ".bai"), bam_file | 73 assert os.path.isfile(bam_file + ".bai"), bam_file |
52 | 74 |
75 | |
53 def clean_up(): | 76 def clean_up(): |
77 """Remove our temporary files and directory.""" | |
54 os.remove(idxstats_filename) | 78 os.remove(idxstats_filename) |
55 os.remove(depth_filename) | 79 os.remove(depth_filename) |
56 os.remove(bam_file) | 80 os.remove(bam_file) |
57 os.remove(bai_file) | 81 os.remove(bai_file) |
58 os.rmdir(tmp_dir) | 82 os.rmdir(tmp_dir) |
59 | 83 |
84 | |
85 def samtools_depth_opt_available(): | |
86 """Determine if samtools depth supports maximum coverage argument.""" | |
87 child = subprocess.Popen(["samtools", "depth"], | |
88 stdout=subprocess.PIPE, stderr=subprocess.STDOUT) | |
89 # Combined stdout/stderr in case samtools is ever inconsistent | |
90 output, tmp = child.communicate() | |
91 assert tmp is None | |
92 # Expect to find this line in the help text, exact wording could change: | |
93 # -d/-m <int> maximum coverage depth [8000] | |
94 return " -d/-m " in output | |
95 | |
96 | |
97 depth_hack = False | |
98 if not samtools_depth_opt_available(): | |
99 if max_depth + depth_margin <= 8000: | |
100 sys.stderr.write("WARNING: The version of samtools depth installed does not " | |
101 "support the -d option, however, the requested max-depth " | |
102 "is safely under the default of 8000.\n") | |
103 depth_hack = True | |
104 else: | |
105 sys.exit("The version of samtools depth installed does not support the -d option.") | |
106 | |
60 # Run samtools idxstats: | 107 # Run samtools idxstats: |
61 cmd = 'samtools idxstats "%s" > "%s"' % (bam_file, idxstats_filename) | 108 cmd = 'samtools idxstats "%s" > "%s"' % (bam_file, idxstats_filename) |
62 return_code = os.system(cmd) | 109 return_code = os.system(cmd) |
63 if return_code: | 110 if return_code: |
64 clean_up() | 111 clean_up() |
65 stop_err("Return code %i from command:\n%s" % (return_code, cmd)) | 112 sys.exit("Return code %i from command:\n%s" % (return_code, cmd)) |
66 | 113 |
67 # Run samtools depth: | 114 # Run samtools depth: |
68 # TODO - Parse stdout instead? | 115 # TODO - Parse stdout instead? |
69 cmd = 'samtools depth "%s" > "%s"' % (bam_file, depth_filename) | 116 if depth_hack: |
117 # Using an old samtools without the -d option, but hard coded default | |
118 # of 8000 should be fine even allowing a margin for fuzzy output | |
119 cmd = 'samtools depth "%s" > "%s"' % (bam_file, depth_filename) | |
120 else: | |
121 cmd = 'samtools depth -d %i "%s" > "%s"' % (max_depth + depth_margin, bam_file, depth_filename) | |
70 return_code = os.system(cmd) | 122 return_code = os.system(cmd) |
71 if return_code: | 123 if return_code: |
72 clean_up() | 124 clean_up() |
73 stop_err("Return code %i from command:\n%s" % (return_code, cmd)) | 125 sys.exit("Return code %i from command:\n%s" % (return_code, cmd)) |
126 | |
74 | 127 |
75 def load_total_coverage(depth_handle, identifier, length): | 128 def load_total_coverage(depth_handle, identifier, length): |
76 """Parse some of the 'samtools depth' output for coverages. | 129 """Parse some of the 'samtools depth' output for coverages. |
77 | 130 |
78 Returns min_cov (int), max_cov (int) and mean cov (float). | 131 Returns min_cov (int), max_cov (int) and mean cov (float). |
84 | 137 |
85 # print("====") | 138 # print("====") |
86 # print("%s coverage calculation, length %i, ..." % (identifier, length)) | 139 # print("%s coverage calculation, length %i, ..." % (identifier, length)) |
87 | 140 |
88 if depth_ref is None: | 141 if depth_ref is None: |
89 # Right at start of file! | 142 # Right at start of file / new contig |
90 line = depth_handle.readline() | 143 line = depth_handle.readline() |
144 # Are we at the end of the file? | |
145 if not line: | |
146 # Must be at the end of the file. | |
147 # This can happen if the file contig(s) had no reads mapped | |
148 return 0, 0, 0.0, 0 | |
91 depth_ref, depth_pos, depth_reads = line.rstrip("\n").split() | 149 depth_ref, depth_pos, depth_reads = line.rstrip("\n").split() |
92 depth_pos = int(depth_pos) | 150 depth_pos = int(depth_pos) |
93 depth_reads = int(depth_reads) | 151 depth_reads = min(max_depth, int(depth_reads)) |
94 # Can now treat as later references where first line cached | 152 # Can now treat as later references where first line cached |
95 elif identifier != depth_ref: | 153 elif identifier != depth_ref: |
96 # Infer that identifier had coverage zero, | 154 # Infer that identifier had coverage zero, |
97 # and so was not in the 'samtools depth' | 155 # and so was not in the 'samtools depth' |
98 # output. | 156 # output. |
99 # print("%s appears to have no coverage at all" % identifier) | 157 # print("%s appears to have no coverage at all" % identifier) |
100 return 0, 0, 0.0 | 158 return 0, 0, 0.0, 0 |
101 | 159 |
102 # Good, at start of expected reference | 160 # Good, at start of expected reference |
103 bases = depth_reads | 161 bases = depth_reads |
104 if depth_pos == 1: | 162 if depth_pos == 1: |
105 min_cov = depth_reads | 163 min_cov = depth_reads |
113 depth_pos = 0 | 171 depth_pos = 0 |
114 depth_reads = 0 | 172 depth_reads = 0 |
115 for line in depth_handle: | 173 for line in depth_handle: |
116 ref, pos, depth = line.rstrip("\n").split() | 174 ref, pos, depth = line.rstrip("\n").split() |
117 pos = int(pos) | 175 pos = int(pos) |
118 depth = int(depth) | 176 depth = min(max_depth, int(depth)) |
119 if ref != identifier: | 177 if ref != identifier: |
120 # Reached the end of this identifier's coverage | 178 # Reached the end of this identifier's coverage |
121 # so cache this ready for next identifier | 179 # so cache this ready for next identifier |
122 depth_ref, depth_pos, depth_reads = ref, pos, depth | 180 depth_ref, depth_pos, depth_reads = ref, pos, depth |
123 break | 181 break |
133 # Reach the end of this identifier's coverage or end of file | 191 # Reach the end of this identifier's coverage or end of file |
134 if last_pos < length: | 192 if last_pos < length: |
135 # print("%s has no coverage at end" % identifier) | 193 # print("%s has no coverage at end" % identifier) |
136 min_cov = 0 | 194 min_cov = 0 |
137 mean_cov = bases / float(length) | 195 mean_cov = bases / float(length) |
138 return min_cov, max_cov, mean_cov | 196 return min_cov, max_cov, mean_cov, bases |
197 | |
139 | 198 |
140 # Parse and combine the output | 199 # Parse and combine the output |
141 out_handle = open(tabular_filename, "w") | 200 out_handle = open(tabular_filename, "w") |
142 out_handle.write("#identifer\tlength\tmapped\tplaced\tmin_cov\tmax_cov\tmean_cov\n") | 201 out_handle.write("#identifer\tlength\tmapped\tplaced\tmin_cov\tmax_cov\tmean_cov\n") |
143 | 202 |
146 | 205 |
147 depth_ref = None | 206 depth_ref = None |
148 depth_pos = 0 | 207 depth_pos = 0 |
149 depth_reads = 0 | 208 depth_reads = 0 |
150 | 209 |
210 global_bases = 0 | |
211 global_length = 0 | |
151 for line in idxstats_handle: | 212 for line in idxstats_handle: |
152 identifier, length, mapped, placed = line.rstrip("\n").split() | 213 identifier, length, mapped, placed = line.rstrip("\n").split() |
153 length = int(length) | 214 length = int(length) |
154 mapped = int(mapped) | 215 mapped = int(mapped) |
155 placed = int(placed) | 216 placed = int(placed) |
156 if identifier == "*": | 217 if identifier == "*": |
157 min_cov = 0 | 218 min_cov = 0 |
158 max_cov = 0 | 219 max_cov = 0 |
159 mean_cov = 0.0 | 220 mean_cov = 0.0 |
221 bases = 0 | |
160 else: | 222 else: |
161 min_cov, max_cov, mean_cov = load_total_coverage(depth_handle, identifier, length) | 223 min_cov, max_cov, mean_cov, bases = load_total_coverage(depth_handle, identifier, length) |
224 if max_cov > max_depth: | |
225 sys.exit("Using max depth %i yet saw max coverage %i for %s" | |
226 % (max_depth, max_cov, identifier)) | |
162 out_handle.write("%s\t%i\t%i\t%i\t%i\t%i\t%0.2f\n" | 227 out_handle.write("%s\t%i\t%i\t%i\t%i\t%i\t%0.2f\n" |
163 % (identifier, length, mapped, placed, | 228 % (identifier, length, mapped, placed, |
164 min_cov, max_cov, mean_cov)) | 229 min_cov, max_cov, mean_cov)) |
165 if not (min_cov <= mean_cov <= max_cov): | 230 if not (min_cov <= mean_cov <= max_cov): |
166 out_handle.write("ERROR, expect min_cov <= mean_cov <= max_cov\n") | 231 out_handle.write("ERROR, expect min_cov <= mean_cov <= max_cov\n") |
167 idxstats_handle.close() | 232 idxstats_handle.close() |
168 depth_handle.close() | 233 depth_handle.close() |
169 out_handle.close() | 234 out_handle.close() |
170 clean_up() | 235 clean_up() |
171 stop_err("Problem with coverage for %s, expect min_cov <= mean_cov <= max_cov" | 236 sys.exit("Problem with coverage for %s, expect min_cov <= mean_cov <= max_cov" |
172 % identifier) | 237 % identifier) |
238 global_length += length | |
239 global_bases += bases | |
173 | 240 |
174 idxstats_handle.close() | 241 idxstats_handle.close() |
175 depth_handle.close() | 242 depth_handle.close() |
176 out_handle.close() | 243 out_handle.close() |
177 | 244 |
245 print("Total reference length %i with overall mean coverage %0.2f" % (global_length, float(global_bases) / global_length)) | |
246 | |
178 # Remove the temp symlinks and files: | 247 # Remove the temp symlinks and files: |
179 clean_up() | 248 clean_up() |
180 | 249 |
181 if depth_ref is not None: | 250 if depth_ref is not None: |
182 stop_err("Left over output from 'samtools depth'? %r" % depth_ref) | 251 sys.exit("Left over output from 'samtools depth'? %r" % depth_ref) |