diff tools/coverage_stats/coverage_stats.py @ 2:7254ece0c0ff draft

v0.0.5 - Supports max coverage depth in recent samtools. Expects samtools 1.4.1 via Conda, not via Tool Shed.
author peterjc
date Thu, 11 May 2017 12:16:10 -0400
parents ca8f63f2f7d4
children
line wrap: on
line diff
--- a/tools/coverage_stats/coverage_stats.py	Fri Nov 21 09:43:58 2014 -0500
+++ b/tools/coverage_stats/coverage_stats.py	Thu May 11 12:16:10 2017 -0400
@@ -6,39 +6,61 @@
  * Input BAI filename (via Galaxy metadata)
  * Output tabular filename
 
+Optional fourth argument:
+ * Max coverage depth (integer)
+
 This messes about with the filenames to make samtools happy, then
 runs "samtools idxstats" and "samtools depth", captures and combines
 the output to the desired output tabular file.
+
+Because "samtools depth" treats the max depth a little fuzzily, this
+tool tries to account for this and applies a clear max-depth cut off.
 """
-import sys
+
 import os
 import subprocess
+import sys
 import tempfile
 
 if "-v" in sys.argv or "--version" in sys.argv:
-    #Galaxy seems to invert the order of the two lines
-    print("BAM coverage statistics v0.0.1")
+    # Galaxy seems to invert the order of the two lines
+    print("BAM coverage statistics v0.0.5")
     cmd = "samtools 2>&1 | grep -i ^Version"
     sys.exit(os.system(cmd))
 
-def stop_err(msg, error_level=1):
-   """Print error message to stdout and quit with given error level."""
-   sys.stderr.write("%s\n" % msg)
-   sys.exit(error_level)
-
-if len(sys.argv) != 4:
-   stop_err("Require three arguments: BAM, BAI, tabular filenames")
-
-bam_filename, bai_filename, tabular_filename = sys.argv[1:]
+# TODO - Proper command line API
+if len(sys.argv) == 4:
+    bam_filename, bai_filename, tabular_filename = sys.argv[1:]
+    max_depth = "8000"
+elif len(sys.argv) == 5:
+    bam_filename, bai_filename, tabular_filename, max_depth = sys.argv[1:]
+else:
+    sys.exit("Require 3 or 4 arguments: BAM, BAI, tabular filename, [max depth]")
 
 if not os.path.isfile(bam_filename):
-    stop_err("Input BAM file not found: %s" % bam_filename)
+    sys.exit("Input BAM file not found: %s" % bam_filename)
+if bai_filename == "-":
+    # Make this optional for ease of use at the command line by hand:
+    if os.path.isfile(bam_filename + ".bai"):
+        bai_filename = bam_filename + ".bai"
 if not os.path.isfile(bai_filename):
     if bai_filename == "None":
-        stop_err("Error: Galaxy did not index your BAM file")
-    stop_err("Input BAI file not found: %s" % bai_filename)
+        sys.exit("Error: Galaxy did not index your BAM file")
+    sys.exit("Input BAI file not found: %s" % bai_filename)
 
-#Assign sensible names with real extensions, and setup symlinks:
+try:
+    max_depth = int(max_depth)
+except ValueError:
+    sys.exit("Bad argument for max depth: %r" % max_depth)
+if max_depth < 0:
+    sys.exit("Bad argument for max depth: %r" % max_depth)
+
+# fuzz factor to ensure can reach max_depth, e.g. region with
+# many reads having a deletion present could underestimate the
+# coverage by capping the number of reads considered
+depth_margin = 100
+
+# Assign sensible names with real extensions, and setup symlinks:
 tmp_dir = tempfile.mkdtemp()
 bam_file = os.path.join(tmp_dir, "temp.bam")
 bai_file = os.path.join(tmp_dir, "temp.bam.bai")
@@ -50,27 +72,58 @@
 assert os.path.isfile(bai_file), bai_file
 assert os.path.isfile(bam_file + ".bai"), bam_file
 
+
 def clean_up():
+    """Remove our temporary files and directory."""
     os.remove(idxstats_filename)
     os.remove(depth_filename)
     os.remove(bam_file)
     os.remove(bai_file)
     os.rmdir(tmp_dir)
 
+
+def samtools_depth_opt_available():
+    """Determine if samtools depth supports maximum coverage argument."""
+    child = subprocess.Popen(["samtools", "depth"],
+                             stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    # Combined stdout/stderr in case samtools is ever inconsistent
+    output, tmp = child.communicate()
+    assert tmp is None
+    # Expect to find this line in the help text, exact wording could change:
+    #    -d/-m <int>         maximum coverage depth [8000]
+    return " -d/-m " in output
+
+
+depth_hack = False
+if not samtools_depth_opt_available():
+    if max_depth + depth_margin <= 8000:
+        sys.stderr.write("WARNING: The version of samtools depth installed does not "
+                         "support the -d option, however, the requested max-depth "
+                         "is safely under the default of 8000.\n")
+        depth_hack = True
+    else:
+        sys.exit("The version of samtools depth installed does not support the -d option.")
+
 # Run samtools idxstats:
 cmd = 'samtools idxstats "%s" > "%s"' % (bam_file, idxstats_filename)
 return_code = os.system(cmd)
 if return_code:
     clean_up()
-    stop_err("Return code %i from command:\n%s" % (return_code, cmd))
+    sys.exit("Return code %i from command:\n%s" % (return_code, cmd))
 
 # Run samtools depth:
 # TODO - Parse stdout instead?
-cmd = 'samtools depth "%s" > "%s"' % (bam_file, depth_filename)
+if depth_hack:
+    # Using an old samtools without the -d option, but hard coded default
+    # of 8000 should be fine even allowing a margin for fuzzy output
+    cmd = 'samtools depth "%s" > "%s"' % (bam_file, depth_filename)
+else:
+    cmd = 'samtools depth -d %i "%s" > "%s"' % (max_depth + depth_margin, bam_file, depth_filename)
 return_code = os.system(cmd)
 if return_code:
     clean_up()
-    stop_err("Return code %i from command:\n%s" % (return_code, cmd))
+    sys.exit("Return code %i from command:\n%s" % (return_code, cmd))
+
 
 def load_total_coverage(depth_handle, identifier, length):
     """Parse some of the 'samtools depth' output for coverages.
@@ -86,18 +139,23 @@
     # print("%s coverage calculation, length %i, ..." % (identifier, length))
 
     if depth_ref is None:
-        # Right at start of file!
+        # Right at start of file / new contig
         line = depth_handle.readline()
+        # Are we at the end of the file?
+        if not line:
+            # Must be at the end of the file.
+            # This can happen if the file contig(s) had no reads mapped
+            return 0, 0, 0.0, 0
         depth_ref, depth_pos, depth_reads = line.rstrip("\n").split()
         depth_pos = int(depth_pos)
-        depth_reads = int(depth_reads)
+        depth_reads = min(max_depth, int(depth_reads))
         # Can now treat as later references where first line cached
     elif identifier != depth_ref:
         # Infer that identifier had coverage zero,
         # and so was not in the 'samtools depth'
         # output.
         # print("%s appears to have no coverage at all" % identifier)
-        return 0, 0, 0.0
+        return 0, 0, 0.0, 0
 
     # Good, at start of expected reference
     bases = depth_reads
@@ -115,7 +173,7 @@
     for line in depth_handle:
         ref, pos, depth = line.rstrip("\n").split()
         pos = int(pos)
-        depth = int(depth)
+        depth = min(max_depth, int(depth))
         if ref != identifier:
             # Reached the end of this identifier's coverage
             # so cache this ready for next identifier
@@ -135,7 +193,8 @@
         # print("%s has no coverage at end" % identifier)
         min_cov = 0
     mean_cov = bases / float(length)
-    return min_cov, max_cov, mean_cov
+    return min_cov, max_cov, mean_cov, bases
+
 
 # Parse and combine the output
 out_handle = open(tabular_filename, "w")
@@ -148,6 +207,8 @@
 depth_pos = 0
 depth_reads = 0
 
+global_bases = 0
+global_length = 0
 for line in idxstats_handle:
     identifier, length, mapped, placed = line.rstrip("\n").split()
     length = int(length)
@@ -157,8 +218,12 @@
         min_cov = 0
         max_cov = 0
         mean_cov = 0.0
+        bases = 0
     else:
-        min_cov, max_cov, mean_cov = load_total_coverage(depth_handle, identifier, length)
+        min_cov, max_cov, mean_cov, bases = load_total_coverage(depth_handle, identifier, length)
+    if max_cov > max_depth:
+        sys.exit("Using max depth %i yet saw max coverage %i for %s"
+                 % (max_depth, max_cov, identifier))
     out_handle.write("%s\t%i\t%i\t%i\t%i\t%i\t%0.2f\n"
                      % (identifier, length, mapped, placed,
                         min_cov, max_cov, mean_cov))
@@ -168,15 +233,19 @@
         depth_handle.close()
         out_handle.close()
         clean_up()
-        stop_err("Problem with coverage for %s, expect min_cov <= mean_cov <= max_cov"
+        sys.exit("Problem with coverage for %s, expect min_cov <= mean_cov <= max_cov"
                  % identifier)
+    global_length += length
+    global_bases += bases
 
 idxstats_handle.close()
 depth_handle.close()
 out_handle.close()
 
+print("Total reference length %i with overall mean coverage %0.2f" % (global_length, float(global_bases) / global_length))
+
 # Remove the temp symlinks and files:
 clean_up()
 
 if depth_ref is not None:
-    stop_err("Left over output from 'samtools depth'? %r" % depth_ref)
+    sys.exit("Left over output from 'samtools depth'? %r" % depth_ref)