coverage_stats: coverage_stats-1b5523d3d2c2/tools/coverage_stats/coverage

comparison coverage_stats-1b5523d3d2c2/tools/coverage_stats/coverage_stats.py @ 3:57b3ea22aff3 draft

Uploaded v0.1.0 which was already on the Test Tool Shed. Included Python 3 support.

author	peterjc
date	Tue, 11 Aug 2020 18:23:05 -0400
parents
children

comparison

equal deleted inserted replaced

-:7254ece0c0ff
+:57b3ea22aff3
+#!/usr/bin/env python
+"""BAM coverage statistics using samtools idxstats and depth.
+This script takes exactly three command line arguments:
+* Input BAM filename
+* Input BAI filename (via Galaxy metadata)
+* Output tabular filename
+Optional fourth argument:
+* Max coverage depth (integer)
+This messes about with the filenames to make samtools happy, then
+runs "samtools idxstats" and "samtools depth", captures and combines
+the output to the desired output tabular file.
+Because "samtools depth" treats the max depth a little fuzzily, this
+tool tries to account for this and applies a clear max-depth cut off.
+"""
+import os
+import subprocess
+import sys
+import tempfile
+from optparse import OptionParser
+usage = """Example usage:
+$ python coverage_stats.py -b mapped.bam -i mapped.bai -o summary.tsv -d 10000
+"""
+parser = OptionParser(usage=usage)
+parser.add_option(
+"-b", "--bam", dest="input_bam", default=None, help="Input BAM file", metavar="FILE"
+)
+parser.add_option(
+"-i",
+"--bai",
+dest="input_bai",
+default="-",
+help="Input BAI file (BAM index, optional, default or - infer from BAM filename)",
+metavar="FILE",
+)
+parser.add_option(
+"-o",
+dest="output_tabular",
+default="-",
+help="Output tabular file (optional, default stdout)",
+metavar="FILE",
+)
+parser.add_option(
+"-d",
+"--maxdepth",
+dest="max_depth",
+default=8000,
+help="Max coverage depth (integer, default 8000)",
+type="int",
+)
+parser.add_option(
+"-v",
+"--version",
+dest="version",
+default=False,
+action="store_true",
+help="Show version and quit",
+)
+options, args = parser.parse_args()
+if options.version:
+# Galaxy seems to invert the order of the two lines
+print("BAM coverage statistics v0.1.0")
+cmd = "samtools 2>&1 | grep -i ^Version"
+sys.exit(os.system(cmd))
+bam_filename = options.input_bam
+bai_filename = options.input_bai
+tabular_filename = options.output_tabular
+max_depth = options.max_depth
+if args:
+sys.exit("Sorry, the legacy API has been dropped.")
+if not bam_filename:
+sys.exit("Input BAM filename is required.")
+if not os.path.isfile(bam_filename):
+sys.exit("Input BAM file not found: %s" % bam_filename)
+if bai_filename == "-":
+if os.path.isfile(bam_filename + ".bai"):
+bai_filename = bam_filename + ".bai"
+if not os.path.isfile(bai_filename):
+if bai_filename == "None":
+sys.exit("Error: Galaxy did not index your BAM file")
+sys.exit("Input BAI file not found: %s" % bai_filename)
+try:
+max_depth = int(max_depth)
+except ValueError:
+sys.exit("Bad argument for max depth: %r" % max_depth)
+if max_depth < 0:
+sys.exit("Bad argument for max depth: %r" % max_depth)
+# fuzz factor to ensure can reach max_depth, e.g. region with
+# many reads having a deletion present could underestimate the
+# coverage by capping the number of reads considered
+depth_margin = 100
+# Assign sensible names with real extensions, and setup symlinks:
+tmp_dir = tempfile.mkdtemp()
+bam_file = os.path.join(tmp_dir, "temp.bam")
+bai_file = os.path.join(tmp_dir, "temp.bam.bai")
+idxstats_filename = os.path.join(tmp_dir, "idxstats.tsv")
+depth_filename = os.path.join(tmp_dir, "depth.tsv")
+os.symlink(os.path.abspath(bam_filename), bam_file)
+os.symlink(os.path.abspath(bai_filename), bai_file)
+assert os.path.isfile(bam_file), bam_file
+assert os.path.isfile(bai_file), bai_file
+assert os.path.isfile(bam_file + ".bai"), bam_file
+def clean_up():
+"""Remove our temporary files and directory."""
+os.remove(idxstats_filename)
+os.remove(depth_filename)
+os.remove(bam_file)
+os.remove(bai_file)
+os.rmdir(tmp_dir)
+def samtools_depth_opt_available():
+"""Determine if samtools depth supports maximum coverage argument."""
+child = subprocess.Popen(
+["samtools", "depth"],
+universal_newlines=True,
+stdout=subprocess.PIPE,
+stderr=subprocess.STDOUT,
+)
+# Combined stdout/stderr in case samtools is ever inconsistent
+output, tmp = child.communicate()
+assert tmp is None
+# Expect to find this line in the help text, exact wording could change:
+#    -d/-m <int>         maximum coverage depth [8000]
+return " -d/-m " in output
+depth_hack = False
+if not samtools_depth_opt_available():
+if max_depth + depth_margin <= 8000:
+sys.stderr.write(
+"WARNING: The version of samtools depth installed does not "
+"support the -d option, however, the requested max-depth "
+"is safely under the default of 8000.\n"
+)
+depth_hack = True
+else:
+sys.exit(
+"The version of samtools depth installed does not support the -d option."
+)
+# Run samtools idxstats:
+cmd = 'samtools idxstats "%s" > "%s"' % (bam_file, idxstats_filename)
+return_code = os.system(cmd)
+if return_code:
+clean_up()
+sys.exit("Return code %i from command:\n%s" % (return_code, cmd))
+# Run samtools depth:
+# TODO - Parse stdout instead?
+if depth_hack:
+# Using an old samtools without the -d option, but hard coded default
+# of 8000 should be fine even allowing a margin for fuzzy output
+cmd = 'samtools depth "%s" > "%s"' % (bam_file, depth_filename)
+else:
+cmd = 'samtools depth -d %i "%s" > "%s"' % (
+max_depth + depth_margin,
+bam_file,
+depth_filename,
+)
+return_code = os.system(cmd)
+if return_code:
+clean_up()
+sys.exit("Return code %i from command:\n%s" % (return_code, cmd))
+def load_total_coverage(depth_handle, identifier, length):
+"""Parse some of the 'samtools depth' output for coverages.
+Returns min_cov (int), max_cov (int) and mean cov (float).
+Uses global variables to cache the first line of output from the
+next reference sequence.
+"""
+global depth_ref, depth_pos, depth_reads
+# print("====")
+# print("%s coverage calculation, length %i, ..." % (identifier, length))
+if depth_ref is None:
+# Right at start of file / new contig
+line = depth_handle.readline()
+# Are we at the end of the file?
+if not line:
+# Must be at the end of the file.
+# This can happen if the file contig(s) had no reads mapped
+return 0, 0, 0.0, 0
+depth_ref, depth_pos, depth_reads = line.rstrip("\n").split()
+depth_pos = int(depth_pos)
+depth_reads = min(max_depth, int(depth_reads))
+# Can now treat as later references where first line cached
+elif identifier != depth_ref:
+# Infer that identifier had coverage zero,
+# and so was not in the 'samtools depth'
+# output.
+# print("%s appears to have no coverage at all" % identifier)
+return 0, 0, 0.0, 0
+# Good, at start of expected reference
+bases = depth_reads
+if depth_pos == 1:
+min_cov = depth_reads
+else:
+# print("%s has no coverage at start" % identifier)
+min_cov = 0
+max_cov = depth_reads
+last_pos = depth_pos
+depth_ref = None
+depth_pos = 0
+depth_reads = 0
+for line in depth_handle:
+ref, pos, depth = line.rstrip("\n").split()
+pos = int(pos)
+depth = min(max_depth, int(depth))
+if ref != identifier:
+# Reached the end of this identifier's coverage
+# so cache this ready for next identifier
+depth_ref, depth_pos, depth_reads = ref, pos, depth
+break
+bases += depth
+if last_pos + 1 < pos:
+# print("%s has no coverage between %i and %i"
+#       % (identifier, last_pos, pos))
+min_cov = 0
+else:
+min_cov = min(min_cov, depth)
+max_cov = max(max_cov, depth)
+last_pos = pos
+# Reach the end of this identifier's coverage or end of file
+if last_pos < length:
+# print("%s has no coverage at end" % identifier)
+min_cov = 0
+mean_cov = bases / float(length)
+return min_cov, max_cov, mean_cov, bases
+# Parse and combine the output
+if tabular_filename == "-":
+out_handle = sys.stdout
+else:
+out_handle = open(tabular_filename, "w")
+out_handle.write("#identifer\tlength\tmapped\tplaced\tmin_cov\tmax_cov\tmean_cov\n")
+idxstats_handle = open(idxstats_filename)
+depth_handle = open(depth_filename)
+depth_ref = None
+depth_pos = 0
+depth_reads = 0
+global_bases = 0
+global_length = 0
+for line in idxstats_handle:
+identifier, length, mapped, placed = line.rstrip("\n").split()
+length = int(length)
+mapped = int(mapped)
+placed = int(placed)
+if identifier == "*":
+min_cov = 0
+max_cov = 0
+mean_cov = 0.0
+bases = 0
+else:
+min_cov, max_cov, mean_cov, bases = load_total_coverage(
+depth_handle, identifier, length
+)
+if max_cov > max_depth:
+sys.exit(
+"Using max depth %i yet saw max coverage %i for %s"
+% (max_depth, max_cov, identifier)
+)
+out_handle.write(
+"%s\t%i\t%i\t%i\t%i\t%i\t%0.2f\n"
+% (identifier, length, mapped, placed, min_cov, max_cov, mean_cov)
+)
+if not (min_cov <= mean_cov <= max_cov):
+out_handle.write("ERROR, expect min_cov <= mean_cov <= max_cov\n")
+idxstats_handle.close()
+depth_handle.close()
+out_handle.close()
+clean_up()
+sys.exit(
+"Problem with coverage for %s, expect min_cov <= mean_cov <= max_cov"
+% identifier
+)
+global_length += length
+global_bases += bases
+idxstats_handle.close()
+depth_handle.close()
+if tabular_filename != "-":
+out_handle.close()
+print(
+"Total reference length %i with overall mean coverage %0.2f"
+% (global_length, float(global_bases) / global_length)
+)
+# Remove the temp symlinks and files:
+clean_up()
+if depth_ref is not None:
+sys.exit("Left over output from 'samtools depth'? %r" % depth_ref)

Mercurial > repos > peterjc > coverage_stats

comparison coverage_stats-1b5523d3d2c2/tools/coverage_stats/coverage_stats.py @ 3:57b3ea22aff3 draft