annotate tools/coverage_stats/coverage_stats.py @ 0:ca8f63f2f7d4 draft

Uploaded v0.0.1
author peterjc
date Fri, 21 Nov 2014 09:28:29 -0500
parents
children 7254ece0c0ff
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
1 #!/usr/bin/env python
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
2 """BAM coverage statistics using samtools idxstats and depth.
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
3
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
4 This script takes exactly three command line arguments:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
5 * Input BAM filename
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
6 * Input BAI filename (via Galaxy metadata)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
7 * Output tabular filename
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
8
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
9 This messes about with the filenames to make samtools happy, then
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
10 runs "samtools idxstats" and "samtools depth", captures and combines
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
11 the output to the desired output tabular file.
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
12 """
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
13 import sys
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
14 import os
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
15 import subprocess
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
16 import tempfile
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
17
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
18 if "-v" in sys.argv or "--version" in sys.argv:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
19 #Galaxy seems to invert the order of the two lines
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
20 print("BAM coverage statistics v0.0.1")
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
21 cmd = "samtools 2>&1 | grep -i ^Version"
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
22 sys.exit(os.system(cmd))
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
23
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
24 def stop_err(msg, error_level=1):
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
25 """Print error message to stdout and quit with given error level."""
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
26 sys.stderr.write("%s\n" % msg)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
27 sys.exit(error_level)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
28
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
29 if len(sys.argv) != 4:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
30 stop_err("Require three arguments: BAM, BAI, tabular filenames")
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
31
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
32 bam_filename, bai_filename, tabular_filename = sys.argv[1:]
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
33
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
34 if not os.path.isfile(bam_filename):
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
35 stop_err("Input BAM file not found: %s" % bam_filename)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
36 if not os.path.isfile(bai_filename):
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
37 if bai_filename == "None":
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
38 stop_err("Error: Galaxy did not index your BAM file")
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
39 stop_err("Input BAI file not found: %s" % bai_filename)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
40
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
41 #Assign sensible names with real extensions, and setup symlinks:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
42 tmp_dir = tempfile.mkdtemp()
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
43 bam_file = os.path.join(tmp_dir, "temp.bam")
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
44 bai_file = os.path.join(tmp_dir, "temp.bam.bai")
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
45 idxstats_filename = os.path.join(tmp_dir, "idxstats.tsv")
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
46 depth_filename = os.path.join(tmp_dir, "depth.tsv")
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
47 os.symlink(os.path.abspath(bam_filename), bam_file)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
48 os.symlink(os.path.abspath(bai_filename), bai_file)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
49 assert os.path.isfile(bam_file), bam_file
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
50 assert os.path.isfile(bai_file), bai_file
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
51 assert os.path.isfile(bam_file + ".bai"), bam_file
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
52
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
53 def clean_up():
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
54 os.remove(idxstats_filename)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
55 os.remove(depth_filename)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
56 os.remove(bam_file)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
57 os.remove(bai_file)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
58 os.rmdir(tmp_dir)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
59
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
60 # Run samtools idxstats:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
61 cmd = 'samtools idxstats "%s" > "%s"' % (bam_file, idxstats_filename)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
62 return_code = os.system(cmd)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
63 if return_code:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
64 clean_up()
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
65 stop_err("Return code %i from command:\n%s" % (return_code, cmd))
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
66
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
67 # Run samtools depth:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
68 # TODO - Parse stdout instead?
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
69 cmd = 'samtools depth "%s" > "%s"' % (bam_file, depth_filename)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
70 return_code = os.system(cmd)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
71 if return_code:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
72 clean_up()
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
73 stop_err("Return code %i from command:\n%s" % (return_code, cmd))
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
74
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
75 def load_total_coverage(depth_handle, identifier, length):
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
76 """Parse some of the 'samtools depth' output for coverages.
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
77
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
78 Returns min_cov (int), max_cov (int) and mean cov (float).
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
79
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
80 Uses global variables to cache the first line of output from the
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
81 next reference sequence.
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
82 """
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
83 global depth_ref, depth_pos, depth_reads
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
84
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
85 # print("====")
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
86 # print("%s coverage calculation, length %i, ..." % (identifier, length))
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
87
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
88 if depth_ref is None:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
89 # Right at start of file!
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
90 line = depth_handle.readline()
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
91 depth_ref, depth_pos, depth_reads = line.rstrip("\n").split()
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
92 depth_pos = int(depth_pos)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
93 depth_reads = int(depth_reads)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
94 # Can now treat as later references where first line cached
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
95 elif identifier != depth_ref:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
96 # Infer that identifier had coverage zero,
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
97 # and so was not in the 'samtools depth'
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
98 # output.
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
99 # print("%s appears to have no coverage at all" % identifier)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
100 return 0, 0, 0.0
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
101
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
102 # Good, at start of expected reference
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
103 bases = depth_reads
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
104 if depth_pos == 1:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
105 min_cov = depth_reads
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
106 else:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
107 # print("%s has no coverage at start" % identifier)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
108 min_cov = 0
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
109 max_cov = depth_reads
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
110
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
111 last_pos = depth_pos
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
112 depth_ref = None
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
113 depth_pos = 0
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
114 depth_reads = 0
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
115 for line in depth_handle:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
116 ref, pos, depth = line.rstrip("\n").split()
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
117 pos = int(pos)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
118 depth = int(depth)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
119 if ref != identifier:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
120 # Reached the end of this identifier's coverage
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
121 # so cache this ready for next identifier
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
122 depth_ref, depth_pos, depth_reads = ref, pos, depth
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
123 break
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
124 bases += depth
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
125 if last_pos + 1 < pos:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
126 # print("%s has no coverage between %i and %i" % (identifier, last_pos, pos))
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
127 min_cov = 0
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
128 else:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
129 min_cov = min(min_cov, depth)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
130 max_cov = max(max_cov, depth)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
131 last_pos = pos
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
132
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
133 # Reach the end of this identifier's coverage or end of file
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
134 if last_pos < length:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
135 # print("%s has no coverage at end" % identifier)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
136 min_cov = 0
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
137 mean_cov = bases / float(length)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
138 return min_cov, max_cov, mean_cov
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
139
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
140 # Parse and combine the output
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
141 out_handle = open(tabular_filename, "w")
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
142 out_handle.write("#identifer\tlength\tmapped\tplaced\tmin_cov\tmax_cov\tmean_cov\n")
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
143
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
144 idxstats_handle = open(idxstats_filename)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
145 depth_handle = open(depth_filename)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
146
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
147 depth_ref = None
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
148 depth_pos = 0
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
149 depth_reads = 0
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
150
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
151 for line in idxstats_handle:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
152 identifier, length, mapped, placed = line.rstrip("\n").split()
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
153 length = int(length)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
154 mapped = int(mapped)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
155 placed = int(placed)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
156 if identifier == "*":
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
157 min_cov = 0
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
158 max_cov = 0
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
159 mean_cov = 0.0
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
160 else:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
161 min_cov, max_cov, mean_cov = load_total_coverage(depth_handle, identifier, length)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
162 out_handle.write("%s\t%i\t%i\t%i\t%i\t%i\t%0.2f\n"
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
163 % (identifier, length, mapped, placed,
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
164 min_cov, max_cov, mean_cov))
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
165 if not (min_cov <= mean_cov <= max_cov):
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
166 out_handle.write("ERROR, expect min_cov <= mean_cov <= max_cov\n")
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
167 idxstats_handle.close()
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
168 depth_handle.close()
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
169 out_handle.close()
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
170 clean_up()
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
171 stop_err("Problem with coverage for %s, expect min_cov <= mean_cov <= max_cov"
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
172 % identifier)
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
173
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
174 idxstats_handle.close()
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
175 depth_handle.close()
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
176 out_handle.close()
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
177
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
178 # Remove the temp symlinks and files:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
179 clean_up()
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
180
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
181 if depth_ref is not None:
ca8f63f2f7d4 Uploaded v0.0.1
peterjc
parents:
diff changeset
182 stop_err("Left over output from 'samtools depth'? %r" % depth_ref)