0
|
1 #!/usr/bin/env python
|
|
2 """BAM coverage statistics using samtools idxstats and depth.
|
|
3
|
|
4 This script takes exactly three command line arguments:
|
|
5 * Input BAM filename
|
|
6 * Input BAI filename (via Galaxy metadata)
|
|
7 * Output tabular filename
|
|
8
|
|
9 This messes about with the filenames to make samtools happy, then
|
|
10 runs "samtools idxstats" and "samtools depth", captures and combines
|
|
11 the output to the desired output tabular file.
|
|
12 """
|
|
13 import sys
|
|
14 import os
|
|
15 import subprocess
|
|
16 import tempfile
|
|
17
|
|
18 if "-v" in sys.argv or "--version" in sys.argv:
|
|
19 #Galaxy seems to invert the order of the two lines
|
|
20 print("BAM coverage statistics v0.0.1")
|
|
21 cmd = "samtools 2>&1 | grep -i ^Version"
|
|
22 sys.exit(os.system(cmd))
|
|
23
|
|
24 def stop_err(msg, error_level=1):
|
|
25 """Print error message to stdout and quit with given error level."""
|
|
26 sys.stderr.write("%s\n" % msg)
|
|
27 sys.exit(error_level)
|
|
28
|
|
29 if len(sys.argv) != 4:
|
|
30 stop_err("Require three arguments: BAM, BAI, tabular filenames")
|
|
31
|
|
32 bam_filename, bai_filename, tabular_filename = sys.argv[1:]
|
|
33
|
|
34 if not os.path.isfile(bam_filename):
|
|
35 stop_err("Input BAM file not found: %s" % bam_filename)
|
|
36 if not os.path.isfile(bai_filename):
|
|
37 if bai_filename == "None":
|
|
38 stop_err("Error: Galaxy did not index your BAM file")
|
|
39 stop_err("Input BAI file not found: %s" % bai_filename)
|
|
40
|
|
41 #Assign sensible names with real extensions, and setup symlinks:
|
|
42 tmp_dir = tempfile.mkdtemp()
|
|
43 bam_file = os.path.join(tmp_dir, "temp.bam")
|
|
44 bai_file = os.path.join(tmp_dir, "temp.bam.bai")
|
|
45 idxstats_filename = os.path.join(tmp_dir, "idxstats.tsv")
|
|
46 depth_filename = os.path.join(tmp_dir, "depth.tsv")
|
|
47 os.symlink(os.path.abspath(bam_filename), bam_file)
|
|
48 os.symlink(os.path.abspath(bai_filename), bai_file)
|
|
49 assert os.path.isfile(bam_file), bam_file
|
|
50 assert os.path.isfile(bai_file), bai_file
|
|
51 assert os.path.isfile(bam_file + ".bai"), bam_file
|
|
52
|
|
53 def clean_up():
|
|
54 os.remove(idxstats_filename)
|
|
55 os.remove(depth_filename)
|
|
56 os.remove(bam_file)
|
|
57 os.remove(bai_file)
|
|
58 os.rmdir(tmp_dir)
|
|
59
|
|
60 # Run samtools idxstats:
|
|
61 cmd = 'samtools idxstats "%s" > "%s"' % (bam_file, idxstats_filename)
|
|
62 return_code = os.system(cmd)
|
|
63 if return_code:
|
|
64 clean_up()
|
|
65 stop_err("Return code %i from command:\n%s" % (return_code, cmd))
|
|
66
|
|
67 # Run samtools depth:
|
|
68 # TODO - Parse stdout instead?
|
|
69 cmd = 'samtools depth "%s" > "%s"' % (bam_file, depth_filename)
|
|
70 return_code = os.system(cmd)
|
|
71 if return_code:
|
|
72 clean_up()
|
|
73 stop_err("Return code %i from command:\n%s" % (return_code, cmd))
|
|
74
|
|
75 def load_total_coverage(depth_handle, identifier, length):
|
|
76 """Parse some of the 'samtools depth' output for coverages.
|
|
77
|
|
78 Returns min_cov (int), max_cov (int) and mean cov (float).
|
|
79
|
|
80 Uses global variables to cache the first line of output from the
|
|
81 next reference sequence.
|
|
82 """
|
|
83 global depth_ref, depth_pos, depth_reads
|
|
84
|
|
85 # print("====")
|
|
86 # print("%s coverage calculation, length %i, ..." % (identifier, length))
|
|
87
|
|
88 if depth_ref is None:
|
|
89 # Right at start of file!
|
|
90 line = depth_handle.readline()
|
|
91 depth_ref, depth_pos, depth_reads = line.rstrip("\n").split()
|
|
92 depth_pos = int(depth_pos)
|
|
93 depth_reads = int(depth_reads)
|
|
94 # Can now treat as later references where first line cached
|
|
95 elif identifier != depth_ref:
|
|
96 # Infer that identifier had coverage zero,
|
|
97 # and so was not in the 'samtools depth'
|
|
98 # output.
|
|
99 # print("%s appears to have no coverage at all" % identifier)
|
|
100 return 0, 0, 0.0
|
|
101
|
|
102 # Good, at start of expected reference
|
|
103 bases = depth_reads
|
|
104 if depth_pos == 1:
|
|
105 min_cov = depth_reads
|
|
106 else:
|
|
107 # print("%s has no coverage at start" % identifier)
|
|
108 min_cov = 0
|
|
109 max_cov = depth_reads
|
|
110
|
|
111 last_pos = depth_pos
|
|
112 depth_ref = None
|
|
113 depth_pos = 0
|
|
114 depth_reads = 0
|
|
115 for line in depth_handle:
|
|
116 ref, pos, depth = line.rstrip("\n").split()
|
|
117 pos = int(pos)
|
|
118 depth = int(depth)
|
|
119 if ref != identifier:
|
|
120 # Reached the end of this identifier's coverage
|
|
121 # so cache this ready for next identifier
|
|
122 depth_ref, depth_pos, depth_reads = ref, pos, depth
|
|
123 break
|
|
124 bases += depth
|
|
125 if last_pos + 1 < pos:
|
|
126 # print("%s has no coverage between %i and %i" % (identifier, last_pos, pos))
|
|
127 min_cov = 0
|
|
128 else:
|
|
129 min_cov = min(min_cov, depth)
|
|
130 max_cov = max(max_cov, depth)
|
|
131 last_pos = pos
|
|
132
|
|
133 # Reach the end of this identifier's coverage or end of file
|
|
134 if last_pos < length:
|
|
135 # print("%s has no coverage at end" % identifier)
|
|
136 min_cov = 0
|
|
137 mean_cov = bases / float(length)
|
|
138 return min_cov, max_cov, mean_cov
|
|
139
|
|
140 # Parse and combine the output
|
|
141 out_handle = open(tabular_filename, "w")
|
|
142 out_handle.write("#identifer\tlength\tmapped\tplaced\tmin_cov\tmax_cov\tmean_cov\n")
|
|
143
|
|
144 idxstats_handle = open(idxstats_filename)
|
|
145 depth_handle = open(depth_filename)
|
|
146
|
|
147 depth_ref = None
|
|
148 depth_pos = 0
|
|
149 depth_reads = 0
|
|
150
|
|
151 for line in idxstats_handle:
|
|
152 identifier, length, mapped, placed = line.rstrip("\n").split()
|
|
153 length = int(length)
|
|
154 mapped = int(mapped)
|
|
155 placed = int(placed)
|
|
156 if identifier == "*":
|
|
157 min_cov = 0
|
|
158 max_cov = 0
|
|
159 mean_cov = 0.0
|
|
160 else:
|
|
161 min_cov, max_cov, mean_cov = load_total_coverage(depth_handle, identifier, length)
|
|
162 out_handle.write("%s\t%i\t%i\t%i\t%i\t%i\t%0.2f\n"
|
|
163 % (identifier, length, mapped, placed,
|
|
164 min_cov, max_cov, mean_cov))
|
|
165 if not (min_cov <= mean_cov <= max_cov):
|
|
166 out_handle.write("ERROR, expect min_cov <= mean_cov <= max_cov\n")
|
|
167 idxstats_handle.close()
|
|
168 depth_handle.close()
|
|
169 out_handle.close()
|
|
170 clean_up()
|
|
171 stop_err("Problem with coverage for %s, expect min_cov <= mean_cov <= max_cov"
|
|
172 % identifier)
|
|
173
|
|
174 idxstats_handle.close()
|
|
175 depth_handle.close()
|
|
176 out_handle.close()
|
|
177
|
|
178 # Remove the temp symlinks and files:
|
|
179 clean_up()
|
|
180
|
|
181 if depth_ref is not None:
|
|
182 stop_err("Left over output from 'samtools depth'? %r" % depth_ref)
|