comparison tools/coverage_stats/coverage_stats.py @ 0:ca8f63f2f7d4 draft

Uploaded v0.0.1
author peterjc
date Fri, 21 Nov 2014 09:28:29 -0500
parents
children 7254ece0c0ff
comparison
equal deleted inserted replaced
-1:000000000000 0:ca8f63f2f7d4
1 #!/usr/bin/env python
2 """BAM coverage statistics using samtools idxstats and depth.
3
4 This script takes exactly three command line arguments:
5 * Input BAM filename
6 * Input BAI filename (via Galaxy metadata)
7 * Output tabular filename
8
9 This messes about with the filenames to make samtools happy, then
10 runs "samtools idxstats" and "samtools depth", captures and combines
11 the output to the desired output tabular file.
12 """
13 import sys
14 import os
15 import subprocess
16 import tempfile
17
18 if "-v" in sys.argv or "--version" in sys.argv:
19 #Galaxy seems to invert the order of the two lines
20 print("BAM coverage statistics v0.0.1")
21 cmd = "samtools 2>&1 | grep -i ^Version"
22 sys.exit(os.system(cmd))
23
24 def stop_err(msg, error_level=1):
25 """Print error message to stdout and quit with given error level."""
26 sys.stderr.write("%s\n" % msg)
27 sys.exit(error_level)
28
29 if len(sys.argv) != 4:
30 stop_err("Require three arguments: BAM, BAI, tabular filenames")
31
32 bam_filename, bai_filename, tabular_filename = sys.argv[1:]
33
34 if not os.path.isfile(bam_filename):
35 stop_err("Input BAM file not found: %s" % bam_filename)
36 if not os.path.isfile(bai_filename):
37 if bai_filename == "None":
38 stop_err("Error: Galaxy did not index your BAM file")
39 stop_err("Input BAI file not found: %s" % bai_filename)
40
41 #Assign sensible names with real extensions, and setup symlinks:
42 tmp_dir = tempfile.mkdtemp()
43 bam_file = os.path.join(tmp_dir, "temp.bam")
44 bai_file = os.path.join(tmp_dir, "temp.bam.bai")
45 idxstats_filename = os.path.join(tmp_dir, "idxstats.tsv")
46 depth_filename = os.path.join(tmp_dir, "depth.tsv")
47 os.symlink(os.path.abspath(bam_filename), bam_file)
48 os.symlink(os.path.abspath(bai_filename), bai_file)
49 assert os.path.isfile(bam_file), bam_file
50 assert os.path.isfile(bai_file), bai_file
51 assert os.path.isfile(bam_file + ".bai"), bam_file
52
53 def clean_up():
54 os.remove(idxstats_filename)
55 os.remove(depth_filename)
56 os.remove(bam_file)
57 os.remove(bai_file)
58 os.rmdir(tmp_dir)
59
60 # Run samtools idxstats:
61 cmd = 'samtools idxstats "%s" > "%s"' % (bam_file, idxstats_filename)
62 return_code = os.system(cmd)
63 if return_code:
64 clean_up()
65 stop_err("Return code %i from command:\n%s" % (return_code, cmd))
66
67 # Run samtools depth:
68 # TODO - Parse stdout instead?
69 cmd = 'samtools depth "%s" > "%s"' % (bam_file, depth_filename)
70 return_code = os.system(cmd)
71 if return_code:
72 clean_up()
73 stop_err("Return code %i from command:\n%s" % (return_code, cmd))
74
75 def load_total_coverage(depth_handle, identifier, length):
76 """Parse some of the 'samtools depth' output for coverages.
77
78 Returns min_cov (int), max_cov (int) and mean cov (float).
79
80 Uses global variables to cache the first line of output from the
81 next reference sequence.
82 """
83 global depth_ref, depth_pos, depth_reads
84
85 # print("====")
86 # print("%s coverage calculation, length %i, ..." % (identifier, length))
87
88 if depth_ref is None:
89 # Right at start of file!
90 line = depth_handle.readline()
91 depth_ref, depth_pos, depth_reads = line.rstrip("\n").split()
92 depth_pos = int(depth_pos)
93 depth_reads = int(depth_reads)
94 # Can now treat as later references where first line cached
95 elif identifier != depth_ref:
96 # Infer that identifier had coverage zero,
97 # and so was not in the 'samtools depth'
98 # output.
99 # print("%s appears to have no coverage at all" % identifier)
100 return 0, 0, 0.0
101
102 # Good, at start of expected reference
103 bases = depth_reads
104 if depth_pos == 1:
105 min_cov = depth_reads
106 else:
107 # print("%s has no coverage at start" % identifier)
108 min_cov = 0
109 max_cov = depth_reads
110
111 last_pos = depth_pos
112 depth_ref = None
113 depth_pos = 0
114 depth_reads = 0
115 for line in depth_handle:
116 ref, pos, depth = line.rstrip("\n").split()
117 pos = int(pos)
118 depth = int(depth)
119 if ref != identifier:
120 # Reached the end of this identifier's coverage
121 # so cache this ready for next identifier
122 depth_ref, depth_pos, depth_reads = ref, pos, depth
123 break
124 bases += depth
125 if last_pos + 1 < pos:
126 # print("%s has no coverage between %i and %i" % (identifier, last_pos, pos))
127 min_cov = 0
128 else:
129 min_cov = min(min_cov, depth)
130 max_cov = max(max_cov, depth)
131 last_pos = pos
132
133 # Reach the end of this identifier's coverage or end of file
134 if last_pos < length:
135 # print("%s has no coverage at end" % identifier)
136 min_cov = 0
137 mean_cov = bases / float(length)
138 return min_cov, max_cov, mean_cov
139
140 # Parse and combine the output
141 out_handle = open(tabular_filename, "w")
142 out_handle.write("#identifer\tlength\tmapped\tplaced\tmin_cov\tmax_cov\tmean_cov\n")
143
144 idxstats_handle = open(idxstats_filename)
145 depth_handle = open(depth_filename)
146
147 depth_ref = None
148 depth_pos = 0
149 depth_reads = 0
150
151 for line in idxstats_handle:
152 identifier, length, mapped, placed = line.rstrip("\n").split()
153 length = int(length)
154 mapped = int(mapped)
155 placed = int(placed)
156 if identifier == "*":
157 min_cov = 0
158 max_cov = 0
159 mean_cov = 0.0
160 else:
161 min_cov, max_cov, mean_cov = load_total_coverage(depth_handle, identifier, length)
162 out_handle.write("%s\t%i\t%i\t%i\t%i\t%i\t%0.2f\n"
163 % (identifier, length, mapped, placed,
164 min_cov, max_cov, mean_cov))
165 if not (min_cov <= mean_cov <= max_cov):
166 out_handle.write("ERROR, expect min_cov <= mean_cov <= max_cov\n")
167 idxstats_handle.close()
168 depth_handle.close()
169 out_handle.close()
170 clean_up()
171 stop_err("Problem with coverage for %s, expect min_cov <= mean_cov <= max_cov"
172 % identifier)
173
174 idxstats_handle.close()
175 depth_handle.close()
176 out_handle.close()
177
178 # Remove the temp symlinks and files:
179 clean_up()
180
181 if depth_ref is not None:
182 stop_err("Left over output from 'samtools depth'? %r" % depth_ref)