Mercurial > repos > peterjc > coverage_stats
comparison tools/coverage_stats/coverage_stats.py @ 0:ca8f63f2f7d4 draft
Uploaded v0.0.1
author | peterjc |
---|---|
date | Fri, 21 Nov 2014 09:28:29 -0500 |
parents | |
children | 7254ece0c0ff |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ca8f63f2f7d4 |
---|---|
1 #!/usr/bin/env python | |
2 """BAM coverage statistics using samtools idxstats and depth. | |
3 | |
4 This script takes exactly three command line arguments: | |
5 * Input BAM filename | |
6 * Input BAI filename (via Galaxy metadata) | |
7 * Output tabular filename | |
8 | |
9 This messes about with the filenames to make samtools happy, then | |
10 runs "samtools idxstats" and "samtools depth", captures and combines | |
11 the output to the desired output tabular file. | |
12 """ | |
13 import sys | |
14 import os | |
15 import subprocess | |
16 import tempfile | |
17 | |
18 if "-v" in sys.argv or "--version" in sys.argv: | |
19 #Galaxy seems to invert the order of the two lines | |
20 print("BAM coverage statistics v0.0.1") | |
21 cmd = "samtools 2>&1 | grep -i ^Version" | |
22 sys.exit(os.system(cmd)) | |
23 | |
24 def stop_err(msg, error_level=1): | |
25 """Print error message to stdout and quit with given error level.""" | |
26 sys.stderr.write("%s\n" % msg) | |
27 sys.exit(error_level) | |
28 | |
29 if len(sys.argv) != 4: | |
30 stop_err("Require three arguments: BAM, BAI, tabular filenames") | |
31 | |
32 bam_filename, bai_filename, tabular_filename = sys.argv[1:] | |
33 | |
34 if not os.path.isfile(bam_filename): | |
35 stop_err("Input BAM file not found: %s" % bam_filename) | |
36 if not os.path.isfile(bai_filename): | |
37 if bai_filename == "None": | |
38 stop_err("Error: Galaxy did not index your BAM file") | |
39 stop_err("Input BAI file not found: %s" % bai_filename) | |
40 | |
41 #Assign sensible names with real extensions, and setup symlinks: | |
42 tmp_dir = tempfile.mkdtemp() | |
43 bam_file = os.path.join(tmp_dir, "temp.bam") | |
44 bai_file = os.path.join(tmp_dir, "temp.bam.bai") | |
45 idxstats_filename = os.path.join(tmp_dir, "idxstats.tsv") | |
46 depth_filename = os.path.join(tmp_dir, "depth.tsv") | |
47 os.symlink(os.path.abspath(bam_filename), bam_file) | |
48 os.symlink(os.path.abspath(bai_filename), bai_file) | |
49 assert os.path.isfile(bam_file), bam_file | |
50 assert os.path.isfile(bai_file), bai_file | |
51 assert os.path.isfile(bam_file + ".bai"), bam_file | |
52 | |
53 def clean_up(): | |
54 os.remove(idxstats_filename) | |
55 os.remove(depth_filename) | |
56 os.remove(bam_file) | |
57 os.remove(bai_file) | |
58 os.rmdir(tmp_dir) | |
59 | |
60 # Run samtools idxstats: | |
61 cmd = 'samtools idxstats "%s" > "%s"' % (bam_file, idxstats_filename) | |
62 return_code = os.system(cmd) | |
63 if return_code: | |
64 clean_up() | |
65 stop_err("Return code %i from command:\n%s" % (return_code, cmd)) | |
66 | |
67 # Run samtools depth: | |
68 # TODO - Parse stdout instead? | |
69 cmd = 'samtools depth "%s" > "%s"' % (bam_file, depth_filename) | |
70 return_code = os.system(cmd) | |
71 if return_code: | |
72 clean_up() | |
73 stop_err("Return code %i from command:\n%s" % (return_code, cmd)) | |
74 | |
75 def load_total_coverage(depth_handle, identifier, length): | |
76 """Parse some of the 'samtools depth' output for coverages. | |
77 | |
78 Returns min_cov (int), max_cov (int) and mean cov (float). | |
79 | |
80 Uses global variables to cache the first line of output from the | |
81 next reference sequence. | |
82 """ | |
83 global depth_ref, depth_pos, depth_reads | |
84 | |
85 # print("====") | |
86 # print("%s coverage calculation, length %i, ..." % (identifier, length)) | |
87 | |
88 if depth_ref is None: | |
89 # Right at start of file! | |
90 line = depth_handle.readline() | |
91 depth_ref, depth_pos, depth_reads = line.rstrip("\n").split() | |
92 depth_pos = int(depth_pos) | |
93 depth_reads = int(depth_reads) | |
94 # Can now treat as later references where first line cached | |
95 elif identifier != depth_ref: | |
96 # Infer that identifier had coverage zero, | |
97 # and so was not in the 'samtools depth' | |
98 # output. | |
99 # print("%s appears to have no coverage at all" % identifier) | |
100 return 0, 0, 0.0 | |
101 | |
102 # Good, at start of expected reference | |
103 bases = depth_reads | |
104 if depth_pos == 1: | |
105 min_cov = depth_reads | |
106 else: | |
107 # print("%s has no coverage at start" % identifier) | |
108 min_cov = 0 | |
109 max_cov = depth_reads | |
110 | |
111 last_pos = depth_pos | |
112 depth_ref = None | |
113 depth_pos = 0 | |
114 depth_reads = 0 | |
115 for line in depth_handle: | |
116 ref, pos, depth = line.rstrip("\n").split() | |
117 pos = int(pos) | |
118 depth = int(depth) | |
119 if ref != identifier: | |
120 # Reached the end of this identifier's coverage | |
121 # so cache this ready for next identifier | |
122 depth_ref, depth_pos, depth_reads = ref, pos, depth | |
123 break | |
124 bases += depth | |
125 if last_pos + 1 < pos: | |
126 # print("%s has no coverage between %i and %i" % (identifier, last_pos, pos)) | |
127 min_cov = 0 | |
128 else: | |
129 min_cov = min(min_cov, depth) | |
130 max_cov = max(max_cov, depth) | |
131 last_pos = pos | |
132 | |
133 # Reach the end of this identifier's coverage or end of file | |
134 if last_pos < length: | |
135 # print("%s has no coverage at end" % identifier) | |
136 min_cov = 0 | |
137 mean_cov = bases / float(length) | |
138 return min_cov, max_cov, mean_cov | |
139 | |
140 # Parse and combine the output | |
141 out_handle = open(tabular_filename, "w") | |
142 out_handle.write("#identifer\tlength\tmapped\tplaced\tmin_cov\tmax_cov\tmean_cov\n") | |
143 | |
144 idxstats_handle = open(idxstats_filename) | |
145 depth_handle = open(depth_filename) | |
146 | |
147 depth_ref = None | |
148 depth_pos = 0 | |
149 depth_reads = 0 | |
150 | |
151 for line in idxstats_handle: | |
152 identifier, length, mapped, placed = line.rstrip("\n").split() | |
153 length = int(length) | |
154 mapped = int(mapped) | |
155 placed = int(placed) | |
156 if identifier == "*": | |
157 min_cov = 0 | |
158 max_cov = 0 | |
159 mean_cov = 0.0 | |
160 else: | |
161 min_cov, max_cov, mean_cov = load_total_coverage(depth_handle, identifier, length) | |
162 out_handle.write("%s\t%i\t%i\t%i\t%i\t%i\t%0.2f\n" | |
163 % (identifier, length, mapped, placed, | |
164 min_cov, max_cov, mean_cov)) | |
165 if not (min_cov <= mean_cov <= max_cov): | |
166 out_handle.write("ERROR, expect min_cov <= mean_cov <= max_cov\n") | |
167 idxstats_handle.close() | |
168 depth_handle.close() | |
169 out_handle.close() | |
170 clean_up() | |
171 stop_err("Problem with coverage for %s, expect min_cov <= mean_cov <= max_cov" | |
172 % identifier) | |
173 | |
174 idxstats_handle.close() | |
175 depth_handle.close() | |
176 out_handle.close() | |
177 | |
178 # Remove the temp symlinks and files: | |
179 clean_up() | |
180 | |
181 if depth_ref is not None: | |
182 stop_err("Left over output from 'samtools depth'? %r" % depth_ref) |