diff tools/coverage_stats/coverage_stats.py @ 0:ca8f63f2f7d4 draft

Uploaded v0.0.1
author peterjc
date Fri, 21 Nov 2014 09:28:29 -0500
parents
children 7254ece0c0ff
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/coverage_stats/coverage_stats.py	Fri Nov 21 09:28:29 2014 -0500
@@ -0,0 +1,182 @@
+#!/usr/bin/env python
+"""BAM coverage statistics using samtools idxstats and depth.
+
+This script takes exactly three command line arguments:
+ * Input BAM filename
+ * Input BAI filename (via Galaxy metadata)
+ * Output tabular filename
+
+This messes about with the filenames to make samtools happy, then
+runs "samtools idxstats" and "samtools depth", captures and combines
+the output to the desired output tabular file.
+"""
+import sys
+import os
+import subprocess
+import tempfile
+
+if "-v" in sys.argv or "--version" in sys.argv:
+    #Galaxy seems to invert the order of the two lines
+    print("BAM coverage statistics v0.0.1")
+    cmd = "samtools 2>&1 | grep -i ^Version"
+    sys.exit(os.system(cmd))
+
+def stop_err(msg, error_level=1):
+   """Print error message to stdout and quit with given error level."""
+   sys.stderr.write("%s\n" % msg)
+   sys.exit(error_level)
+
+if len(sys.argv) != 4:
+   stop_err("Require three arguments: BAM, BAI, tabular filenames")
+
+bam_filename, bai_filename, tabular_filename = sys.argv[1:]
+
+if not os.path.isfile(bam_filename):
+    stop_err("Input BAM file not found: %s" % bam_filename)
+if not os.path.isfile(bai_filename):
+    if bai_filename == "None":
+        stop_err("Error: Galaxy did not index your BAM file")
+    stop_err("Input BAI file not found: %s" % bai_filename)
+
+#Assign sensible names with real extensions, and setup symlinks:
+tmp_dir = tempfile.mkdtemp()
+bam_file = os.path.join(tmp_dir, "temp.bam")
+bai_file = os.path.join(tmp_dir, "temp.bam.bai")
+idxstats_filename = os.path.join(tmp_dir, "idxstats.tsv")
+depth_filename = os.path.join(tmp_dir, "depth.tsv")
+os.symlink(os.path.abspath(bam_filename), bam_file)
+os.symlink(os.path.abspath(bai_filename), bai_file)
+assert os.path.isfile(bam_file), bam_file
+assert os.path.isfile(bai_file), bai_file
+assert os.path.isfile(bam_file + ".bai"), bam_file
+
+def clean_up():
+    os.remove(idxstats_filename)
+    os.remove(depth_filename)
+    os.remove(bam_file)
+    os.remove(bai_file)
+    os.rmdir(tmp_dir)
+
+# Run samtools idxstats:
+cmd = 'samtools idxstats "%s" > "%s"' % (bam_file, idxstats_filename)
+return_code = os.system(cmd)
+if return_code:
+    clean_up()
+    stop_err("Return code %i from command:\n%s" % (return_code, cmd))
+
+# Run samtools depth:
+# TODO - Parse stdout instead?
+cmd = 'samtools depth "%s" > "%s"' % (bam_file, depth_filename)
+return_code = os.system(cmd)
+if return_code:
+    clean_up()
+    stop_err("Return code %i from command:\n%s" % (return_code, cmd))
+
+def load_total_coverage(depth_handle, identifier, length):
+    """Parse some of the 'samtools depth' output for coverages.
+
+    Returns min_cov (int), max_cov (int) and mean cov (float).
+
+    Uses global variables to cache the first line of output from the
+    next reference sequence.
+    """
+    global depth_ref, depth_pos, depth_reads
+
+    # print("====")
+    # print("%s coverage calculation, length %i, ..." % (identifier, length))
+
+    if depth_ref is None:
+        # Right at start of file!
+        line = depth_handle.readline()
+        depth_ref, depth_pos, depth_reads = line.rstrip("\n").split()
+        depth_pos = int(depth_pos)
+        depth_reads = int(depth_reads)
+        # Can now treat as later references where first line cached
+    elif identifier != depth_ref:
+        # Infer that identifier had coverage zero,
+        # and so was not in the 'samtools depth'
+        # output.
+        # print("%s appears to have no coverage at all" % identifier)
+        return 0, 0, 0.0
+
+    # Good, at start of expected reference
+    bases = depth_reads
+    if depth_pos == 1:
+        min_cov = depth_reads
+    else:
+        # print("%s has no coverage at start" % identifier)
+        min_cov = 0
+    max_cov = depth_reads
+
+    last_pos = depth_pos
+    depth_ref = None
+    depth_pos = 0
+    depth_reads = 0
+    for line in depth_handle:
+        ref, pos, depth = line.rstrip("\n").split()
+        pos = int(pos)
+        depth = int(depth)
+        if ref != identifier:
+            # Reached the end of this identifier's coverage
+            # so cache this ready for next identifier
+            depth_ref, depth_pos, depth_reads = ref, pos, depth
+            break
+        bases += depth
+        if last_pos + 1 < pos:
+            # print("%s has no coverage between %i and %i" % (identifier, last_pos, pos))
+            min_cov = 0
+        else:
+            min_cov = min(min_cov, depth)
+        max_cov = max(max_cov, depth)
+        last_pos = pos
+
+    # Reach the end of this identifier's coverage or end of file
+    if last_pos < length:
+        # print("%s has no coverage at end" % identifier)
+        min_cov = 0
+    mean_cov = bases / float(length)
+    return min_cov, max_cov, mean_cov
+
+# Parse and combine the output
+out_handle = open(tabular_filename, "w")
+out_handle.write("#identifer\tlength\tmapped\tplaced\tmin_cov\tmax_cov\tmean_cov\n")
+
+idxstats_handle = open(idxstats_filename)
+depth_handle = open(depth_filename)
+
+depth_ref = None
+depth_pos = 0
+depth_reads = 0
+
+for line in idxstats_handle:
+    identifier, length, mapped, placed = line.rstrip("\n").split()
+    length = int(length)
+    mapped = int(mapped)
+    placed = int(placed)
+    if identifier == "*":
+        min_cov = 0
+        max_cov = 0
+        mean_cov = 0.0
+    else:
+        min_cov, max_cov, mean_cov = load_total_coverage(depth_handle, identifier, length)
+    out_handle.write("%s\t%i\t%i\t%i\t%i\t%i\t%0.2f\n"
+                     % (identifier, length, mapped, placed,
+                        min_cov, max_cov, mean_cov))
+    if not (min_cov <= mean_cov <= max_cov):
+        out_handle.write("ERROR, expect min_cov <= mean_cov <= max_cov\n")
+        idxstats_handle.close()
+        depth_handle.close()
+        out_handle.close()
+        clean_up()
+        stop_err("Problem with coverage for %s, expect min_cov <= mean_cov <= max_cov"
+                 % identifier)
+
+idxstats_handle.close()
+depth_handle.close()
+out_handle.close()
+
+# Remove the temp symlinks and files:
+clean_up()
+
+if depth_ref is not None:
+    stop_err("Left over output from 'samtools depth'? %r" % depth_ref)