# HG changeset patch
# User peterjc
# Date 1416580109 18000
# Node ID ca8f63f2f7d43ed2f22267ca559c734ba957c0e3
Uploaded v0.0.1
diff -r 000000000000 -r ca8f63f2f7d4 test-data/coverage_test.bam
Binary file test-data/coverage_test.bam has changed
diff -r 000000000000 -r ca8f63f2f7d4 test-data/coverage_test.coverage_stats.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/coverage_test.coverage_stats.tabular Fri Nov 21 09:28:29 2014 -0500
@@ -0,0 +1,7 @@
+#identifer length mapped placed min_cov max_cov mean_cov
+ref 100 5 0 0 2 0.50
+none 200 0 0 0 0 0.00
+full 10 3 0 2 2 2.00
+head 10 1 0 0 1 0.50
+tail 20 1 0 0 1 0.25
+* 0 0 1 0 0 0.00
diff -r 000000000000 -r ca8f63f2f7d4 test-data/ex1.bam
Binary file test-data/ex1.bam has changed
diff -r 000000000000 -r ca8f63f2f7d4 test-data/ex1.coverage_stats.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ex1.coverage_stats.tabular Fri Nov 21 09:28:29 2014 -0500
@@ -0,0 +1,4 @@
+#identifer length mapped placed min_cov max_cov mean_cov
+chr1 1575 1446 18 0 63 32.32
+chr2 1584 1789 17 0 64 39.78
+* 0 0 0 0 0 0.00
diff -r 000000000000 -r ca8f63f2f7d4 tools/coverage_stats/README.rst
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/coverage_stats/README.rst Fri Nov 21 09:28:29 2014 -0500
@@ -0,0 +1,102 @@
+BAM coverage statistics using samtools idxstats and depth
+=========================================================
+
+This tool is copyright 2014 by Peter Cock, The James Hutton Institute
+(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
+See the licence text below.
+
+Internally this tool uses the command-line samtools suite.
+
+This tool is available from the Galaxy Tool Shed at:
+http://toolshed.g2.bx.psu.edu/view/peterjc/coverage_stats
+
+
+Automated Installation
+======================
+
+This should be straightforward, Galaxy should automatically download and install
+samtools 0.1.19 if required.
+
+
+Manual Installation
+===================
+
+This expects samtools to be on the ``$PATH``, and was tested using v0.1.19.
+
+To install the wrapper copy or move the following files under the Galaxy tools
+folder, e.g. in a ``tools/coverage_stats`` folder:
+
+* ``coverage_stats.xml`` (the Galaxy tool definition)
+* ``coverage_stats.py`` (the Python wrapper script)
+* ``README.rst`` (this file)
+
+You will also need to modify the ``tools_conf.xml`` file to tell Galaxy to offer
+the tool. Just add the line, perhaps under the NGS tools section::
+
+
+
+If you wish to run the unit tests, also move/copy the ``test-data/`` files
+under Galaxy's ``test-data/`` folder. Then::
+
+ $ ./run_tests.sh -id coverage_stats
+
+That's it.
+
+
+History
+=======
+
+======= ======================================================================
+Version Changes
+------- ----------------------------------------------------------------------
+v0.0.1 - Initial public release
+======= ======================================================================
+
+
+Developers
+==========
+
+Development is on this GitHub repository:
+https://github.com/peterjc/pico_galaxy/tree/master/tools/coverage_stats
+
+For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
+the following command from the Galaxy root folder::
+
+ $ tar -czf coverage_stats.tar.gz tools/coverage_stats/README.rst tools/coverage_stats/coverage_stats.xml tools/coverage_stats/coverage_stats.py tools/coverage_stats/tool_dependencies.xml test-data/ex1.bam test-data/ex1.coverage_stats.tabular test-data/coverage_test.bam test-data/coverage_test.coverage_stats.tabular
+
+Check this worked::
+
+ $ tar -tzf coverage_stats.tar.gz
+ tools/coverage_stats/README.rst
+ tools/coverage_stats/coverage_stats.xml
+ tools/coverage_stats/coverage_stats.py
+ tools/coverage_stats/tool_dependencies.xml
+ test-data/ex1.bam
+ test-data/ex1.coverage_stats.tabular
+ test-data/coverage_test.bam
+ test-data/coverage_test.coverage_stats.tabular
+
+
+Licence (MIT)
+=============
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+NOTE: This is the licence for the Galaxy Wrapper only.
+samtools is available and licenced separately.
diff -r 000000000000 -r ca8f63f2f7d4 tools/coverage_stats/coverage_stats.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/coverage_stats/coverage_stats.py Fri Nov 21 09:28:29 2014 -0500
@@ -0,0 +1,182 @@
+#!/usr/bin/env python
+"""BAM coverage statistics using samtools idxstats and depth.
+
+This script takes exactly three command line arguments:
+ * Input BAM filename
+ * Input BAI filename (via Galaxy metadata)
+ * Output tabular filename
+
+This messes about with the filenames to make samtools happy, then
+runs "samtools idxstats" and "samtools depth", captures and combines
+the output to the desired output tabular file.
+"""
+import sys
+import os
+import subprocess
+import tempfile
+
+if "-v" in sys.argv or "--version" in sys.argv:
+ #Galaxy seems to invert the order of the two lines
+ print("BAM coverage statistics v0.0.1")
+ cmd = "samtools 2>&1 | grep -i ^Version"
+ sys.exit(os.system(cmd))
+
+def stop_err(msg, error_level=1):
+ """Print error message to stdout and quit with given error level."""
+ sys.stderr.write("%s\n" % msg)
+ sys.exit(error_level)
+
+if len(sys.argv) != 4:
+ stop_err("Require three arguments: BAM, BAI, tabular filenames")
+
+bam_filename, bai_filename, tabular_filename = sys.argv[1:]
+
+if not os.path.isfile(bam_filename):
+ stop_err("Input BAM file not found: %s" % bam_filename)
+if not os.path.isfile(bai_filename):
+ if bai_filename == "None":
+ stop_err("Error: Galaxy did not index your BAM file")
+ stop_err("Input BAI file not found: %s" % bai_filename)
+
+#Assign sensible names with real extensions, and setup symlinks:
+tmp_dir = tempfile.mkdtemp()
+bam_file = os.path.join(tmp_dir, "temp.bam")
+bai_file = os.path.join(tmp_dir, "temp.bam.bai")
+idxstats_filename = os.path.join(tmp_dir, "idxstats.tsv")
+depth_filename = os.path.join(tmp_dir, "depth.tsv")
+os.symlink(os.path.abspath(bam_filename), bam_file)
+os.symlink(os.path.abspath(bai_filename), bai_file)
+assert os.path.isfile(bam_file), bam_file
+assert os.path.isfile(bai_file), bai_file
+assert os.path.isfile(bam_file + ".bai"), bam_file
+
+def clean_up():
+ os.remove(idxstats_filename)
+ os.remove(depth_filename)
+ os.remove(bam_file)
+ os.remove(bai_file)
+ os.rmdir(tmp_dir)
+
+# Run samtools idxstats:
+cmd = 'samtools idxstats "%s" > "%s"' % (bam_file, idxstats_filename)
+return_code = os.system(cmd)
+if return_code:
+ clean_up()
+ stop_err("Return code %i from command:\n%s" % (return_code, cmd))
+
+# Run samtools depth:
+# TODO - Parse stdout instead?
+cmd = 'samtools depth "%s" > "%s"' % (bam_file, depth_filename)
+return_code = os.system(cmd)
+if return_code:
+ clean_up()
+ stop_err("Return code %i from command:\n%s" % (return_code, cmd))
+
+def load_total_coverage(depth_handle, identifier, length):
+ """Parse some of the 'samtools depth' output for coverages.
+
+ Returns min_cov (int), max_cov (int) and mean cov (float).
+
+ Uses global variables to cache the first line of output from the
+ next reference sequence.
+ """
+ global depth_ref, depth_pos, depth_reads
+
+ # print("====")
+ # print("%s coverage calculation, length %i, ..." % (identifier, length))
+
+ if depth_ref is None:
+ # Right at start of file!
+ line = depth_handle.readline()
+ depth_ref, depth_pos, depth_reads = line.rstrip("\n").split()
+ depth_pos = int(depth_pos)
+ depth_reads = int(depth_reads)
+ # Can now treat as later references where first line cached
+ elif identifier != depth_ref:
+ # Infer that identifier had coverage zero,
+ # and so was not in the 'samtools depth'
+ # output.
+ # print("%s appears to have no coverage at all" % identifier)
+ return 0, 0, 0.0
+
+ # Good, at start of expected reference
+ bases = depth_reads
+ if depth_pos == 1:
+ min_cov = depth_reads
+ else:
+ # print("%s has no coverage at start" % identifier)
+ min_cov = 0
+ max_cov = depth_reads
+
+ last_pos = depth_pos
+ depth_ref = None
+ depth_pos = 0
+ depth_reads = 0
+ for line in depth_handle:
+ ref, pos, depth = line.rstrip("\n").split()
+ pos = int(pos)
+ depth = int(depth)
+ if ref != identifier:
+ # Reached the end of this identifier's coverage
+ # so cache this ready for next identifier
+ depth_ref, depth_pos, depth_reads = ref, pos, depth
+ break
+ bases += depth
+ if last_pos + 1 < pos:
+ # print("%s has no coverage between %i and %i" % (identifier, last_pos, pos))
+ min_cov = 0
+ else:
+ min_cov = min(min_cov, depth)
+ max_cov = max(max_cov, depth)
+ last_pos = pos
+
+ # Reach the end of this identifier's coverage or end of file
+ if last_pos < length:
+ # print("%s has no coverage at end" % identifier)
+ min_cov = 0
+ mean_cov = bases / float(length)
+ return min_cov, max_cov, mean_cov
+
+# Parse and combine the output
+out_handle = open(tabular_filename, "w")
+out_handle.write("#identifer\tlength\tmapped\tplaced\tmin_cov\tmax_cov\tmean_cov\n")
+
+idxstats_handle = open(idxstats_filename)
+depth_handle = open(depth_filename)
+
+depth_ref = None
+depth_pos = 0
+depth_reads = 0
+
+for line in idxstats_handle:
+ identifier, length, mapped, placed = line.rstrip("\n").split()
+ length = int(length)
+ mapped = int(mapped)
+ placed = int(placed)
+ if identifier == "*":
+ min_cov = 0
+ max_cov = 0
+ mean_cov = 0.0
+ else:
+ min_cov, max_cov, mean_cov = load_total_coverage(depth_handle, identifier, length)
+ out_handle.write("%s\t%i\t%i\t%i\t%i\t%i\t%0.2f\n"
+ % (identifier, length, mapped, placed,
+ min_cov, max_cov, mean_cov))
+ if not (min_cov <= mean_cov <= max_cov):
+ out_handle.write("ERROR, expect min_cov <= mean_cov <= max_cov\n")
+ idxstats_handle.close()
+ depth_handle.close()
+ out_handle.close()
+ clean_up()
+ stop_err("Problem with coverage for %s, expect min_cov <= mean_cov <= max_cov"
+ % identifier)
+
+idxstats_handle.close()
+depth_handle.close()
+out_handle.close()
+
+# Remove the temp symlinks and files:
+clean_up()
+
+if depth_ref is not None:
+ stop_err("Left over output from 'samtools depth'? %r" % depth_ref)
diff -r 000000000000 -r ca8f63f2f7d4 tools/coverage_stats/coverage_stats.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/coverage_stats/coverage_stats.xml Fri Nov 21 09:28:29 2014 -0500
@@ -0,0 +1,97 @@
+
+ using samtools idxstats and depth
+
+ samtools
+ samtools
+
+ coverage_stats.py --version
+ coverage_stats.py "$input_bam" "${input_bam.metadata.bam_index}" "$out_tabular"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool runs the commands ``samtools idxstats`` and ``samtools depth`` from the
+SAMtools toolkit, and parses their output to produce a consise summary of the
+coverage information for each reference sequence.
+
+Input is a sorted and indexed BAM file, the output is tabular. The first four
+columns match the output from ``samtools idxstats``, the additional columns are
+calculated from the ``samtools depth`` output. The final row with a star as the
+reference identifier represents unmapped reads, and will have zeros in every
+column except columns one and four.
+
+====== =================================================================================
+Column Description
+------ ---------------------------------------------------------------------------------
+ 1 Reference sequence identifier
+ 2 Reference sequence length
+ 3 Number of mapped reads
+ 4 Number of placed but unmapped reads (typically unmapped partners of mapped reads)
+ 5 Minimum coverage
+ 6 Maximum coverage
+ 7 Mean coverage (given to 2 dp)
+====== =================================================================================
+
+Example output from a *de novo* assembly:
+
+========== ====== ====== ====== ======= ======= ========
+identiifer length mapped placed min_cov max_cov mean_cov
+---------- ------ ------ ------ ------- ------- --------
+contig_1 833604 436112 0 1 157 71.95
+contig_2 14820 9954 0 1 152 91.27
+contig_3 272099 142958 0 1 150 72.31
+contig_4 135519 73288 0 1 149 75.23
+contig_5 91245 46759 0 1 157 70.92
+contig_6 175604 95744 0 1 146 75.99
+contig_7 90586 48158 0 1 151 72.93
+contig_9 234347 126458 0 1 159 75.40
+contig_10 121515 60211 0 1 152 68.12
+... ... ... ... ... ... ...
+contig_604 712 85 0 1 49 21.97
+\* 0 0 950320 0 0 0.00
+========== ====== ====== ====== ======= ======= ========
+
+In this example there were 604 contigs, each with one line in the output table,
+plus the final row (labelled with an asterisk) representing 950320 unmapped reads.
+In this BAM file, the fourth column was otherwise zero.
+
+
+**Citation**
+
+If you use this Galaxy tool in work leading to a scientific publication please
+cite:
+
+Heng Li et al (2009). The Sequence Alignment/Map format and SAMtools.
+Bioinformatics 25(16), 2078-9.
+http://dx.doi.org/10.1093/bioinformatics/btp352
+
+Peter J.A. Cock (2013), BAM coverage statistics using samtools idxstats and depth.
+http://toolshed.g2.bx.psu.edu/view/peterjc/coverage_stats
+
+This wrapper is available to install into other Galaxy Instances via the Galaxy
+Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/coverage_stats
+
+
+ 10.1093/bioinformatics/btp352
+
+
diff -r 000000000000 -r ca8f63f2f7d4 tools/coverage_stats/tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/coverage_stats/tool_dependencies.xml Fri Nov 21 09:28:29 2014 -0500
@@ -0,0 +1,6 @@
+
+
+
+
+
+