Mercurial > repos > brad-chapman > bam_to_bigwig
changeset 6:9163e1db4c16 draft default tip
planemo upload for repository https://github.com/lparsons/galaxy_tools/tree/master/tools/bam_to_bigwig/ commit b5c3df21127fc27f66a97761173c07e161f2fa8e
author | lparsons |
---|---|
date | Thu, 12 May 2016 11:39:13 -0400 |
parents | 52bcd04ee0d6 |
children | |
files | README.txt bam_to_bigwig.py bam_to_bigwig.xml bam_to_bigwig/README.txt bam_to_bigwig/bam_to_bigwig.py bam_to_bigwig/bam_to_bigwig.xml test-data/bam_to_bigwig_test.bam test-data/bam_to_bigwig_test.bigwig tool_dependencies.xml |
diffstat | 9 files changed, 240 insertions(+), 176 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.txt Thu May 12 11:39:13 2016 -0400 @@ -0,0 +1,48 @@ +Convert a BAM file into a BigWig coverage file. This can be used directly from +Galaxy for display at UCSC. The advantage over standard Wiggle format is that +the data is stored in a compressed format and can be retrieved by genome +region. This allows you to view regions of arbitrarily large Wiggle file data +at UCSC while avoiding the upload costs. + +History +------- + +v0.2.0 add a sort step after genomeCoverageBed which is required in some +instances otherwise bedGraphToBigWig will complain. This version also uses +Galaxy's dependency mechanism, added some tests, and updated some formatting. +By Lance Parsons. + +v0.1.1 passes the forgotten split argument and moves to using the new +sub-command enabled bedtools. Thanks to David Leader. + +As of v0.1.0, the Galaxy tools uses a revised bam_to_bigwig.py script using +genomeCoverageBed and bedGraphToBigWig - this approach allows gaps/skpis to +be excluded from the coverage calculation, which is important for RNA-Seq. + +Until v0.0.2, this Galaxy tool used the bam_to_wiggle.py script from +https://github.com/chapmanb/bcbb/blob/master/nextgen/scripts/bam_to_wiggle.py +which internally used pysam (and thus samtools) and wigToBigWig from UCSC. + +Requirements +------------ + +If you are installing this tool manually, place the Python script in the +same directory as the XML configuration file, or provide a soft link to it. +Ensure the following command line tools are on the system path: + +pysam - Python interface to samtools (http://code.google.com/p/pysam/) +genomeCoverageBed - part of BedTools (http://code.google.com/p/bedtools/) +bedGraphToBigWig - from UCSC (http://hgdownload.cse.ucsc.edu/admin/exe/) + +Credits +------- + +Original script by Brad Chapman, revisions from Peter Cock including the +switch to using genomeCoverageBed and bedGraphToBigWig based on the work +of Lance Parsons. + +License +------ + +The code is freely available under the MIT license: +http://www.opensource.org/licenses/mit-license.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bam_to_bigwig.py Thu May 12 11:39:13 2016 -0400 @@ -0,0 +1,122 @@ +#!/usr/bin/env python +"""Convert BAM files to BigWig file format in a specified region. + +Original version copyright Brad Chapman with revisions from Peter Cock +and ideas from Lance Parsons + +Usage: + bam_to_bigwig.py <BAM file> [--outfile=<output file name>] [--split] + +The --split argument is passed to bedtools genomecov + +The script requires: + pysam (http://code.google.com/p/pysam/) + bedtools genomecov (http://code.google.com/p/bedtools/) + bedGraphToBigWig from UCSC (http://hgdownload.cse.ucsc.edu/admin/exe/) +""" +import os +import sys +import subprocess +import tempfile +from optparse import OptionParser +from contextlib import contextmanager, closing + +import pysam + + +def main(bam_file, outfile=None, split=False): + config = {"program": {"ucsc_bedGraphToBigWig": ["bedGraphToBigWig"], + "bedtools_genomeCoverageBed": + ["bedtools", "genomecov"]}} + if outfile is None: + outfile = "%s.bigwig" % os.path.splitext(bam_file)[0] + if os.path.abspath(bam_file) == os.path.abspath(outfile): + sys.stderr.write("Bad arguments, " + "input and output files are the same.\n") + sys.exit(1) + if os.path.exists(outfile) and os.path.getsize(outfile) > 0: + sys.stderr.write("Warning, output file already exists.\n") + + sizes = get_sizes(bam_file, config) + print "Have %i references" % len(sizes) + if not sizes: + sys.stderr.write("Problem reading BAM header.\n") + sys.exit(1) + + # Use a temp file to avoid any possiblity of not having write permission + temp_handle = tempfile.NamedTemporaryFile(delete=False) + temp_file = temp_handle.name + with closing(temp_handle): + print "Calculating coverage..." + convert_to_graph(bam_file, split, config, temp_handle) + try: + print("Converting %i MB graph file to bigwig..." % + (os.path.getsize(temp_file) // (1024 * 1024))) + # Can't pipe this as stdin due to converter design, + # https://lists.soe.ucsc.edu/pipermail/genome/2011-March/025455.html + convert_to_bigwig(temp_file, sizes, config, outfile) + finally: + if os.path.isfile(temp_file): + os.remove(temp_file) + print "Done" + + +@contextmanager +def indexed_bam(bam_file, config): + if not os.path.exists(bam_file + ".bai"): + pysam.index(bam_file) + sam_reader = pysam.Samfile(bam_file, "rb") + yield sam_reader + sam_reader.close() + + +def get_sizes(bam_file, config): + with indexed_bam(bam_file, config) as work_bam: + sizes = zip(work_bam.references, work_bam.lengths) + return sizes + + +def convert_to_graph(bam_file, split, config, out_handle): + cl = config["program"]["bedtools_genomeCoverageBed"] + \ + ["-ibam", bam_file, "-bg"] + if split: + cl.append("-split") + new_env = os.environ.copy() + new_env['LC_COLLATE'] = 'C' + p1 = subprocess.Popen(cl, stdout=subprocess.PIPE) + p2 = subprocess.Popen(["sort", "-k1,1", "-k2,2n"], + env=new_env, + stdin=p1.stdout, + stdout=out_handle) + p1.stdout.close() + p2.communicate() + + +def convert_to_bigwig(bedgraph_file, chr_sizes, config, bw_file): + # This will be fine under Galaxy, but could use temp folder? + size_file = "%s-sizes.txt" % (os.path.splitext(bw_file)[0]) + with open(size_file, "w") as out_handle: + for chrom, size in chr_sizes: + out_handle.write("%s\t%s\n" % (chrom, size)) + try: + cl = config["program"]["ucsc_bedGraphToBigWig"] + \ + [bedgraph_file, size_file, bw_file] + subprocess.check_call(cl) + finally: + os.remove(size_file) + return bw_file + + +if __name__ == "__main__": + parser = OptionParser() + parser.add_option("-o", "--outfile", dest="outfile") + parser.add_option("-s", "--split", action="store_true", dest="split") + (options, args) = parser.parse_args() + if len(args) not in [1, 2]: + print "Incorrect arguments" + print __doc__ + sys.exit() + kwargs = dict( + outfile=options.outfile, + split=options.split) + main(*args, **kwargs)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bam_to_bigwig.xml Thu May 12 11:39:13 2016 -0400 @@ -0,0 +1,58 @@ +<tool id="bam_to_bigwig" name="BAM to BigWig" version="0.2.0"> + + <description>Calculates coverage from a BAM alignment file</description> + + <requirements> + <requirement type="package" version="0.8.3">pysam</requirement> + <requirement type="package" version="2.24">bedtools</requirement> + <requirement type="package" version="312">ucsc_tools</requirement> + </requirements> + + <command detect_errors="aggressive" interpreter="python"><![CDATA[ + bam_to_bigwig.py $align --outfile=$out --split + ]]></command> + + <inputs> + <param format="bam" name="align" type="data" label="BAM alignment file"/> + </inputs> + + <outputs> + <data format="bigwig" name="out" /> + </outputs> + + <tests> + <test> + <param name="align" value="bam_to_bigwig_test.bam"/> + <output name="out" file="bam_to_bigwig_test.bigwig"/> + </test> + </tests> + +<help><![CDATA[ +**What it does** + +Creates a coverage file in BigWig format, given a BAM alignment file. + +Gaps or skips (CIGAR D or N operators) are not counted towards the coverage +calculation, which is important when mapping RNA Seq reads to genes with +introns. + +**Input** + +A BAM alignment file. This needs to have the genome database build used in +alignment annotated. If your file has '?' for the database build, click on the +pencil icon to edit the alignment attributes, and specify the organism used to +align against. + +**Output** + +BigWig files can be loaded directly from Galaxy into the UCSC browser. They can +be loaded incrementally by UCSC, so a single file can be used to represent the +entire genome without having to upload the entire thing as a custom track. + +]]></help> + + <citations> + <citation type="doi">10.1093/bioinformatics/btp352</citation> + <citation type="doi">10.1093/bioinformatics/btq033</citation> + </citations> +</tool>
--- a/bam_to_bigwig/README.txt Mon Feb 24 10:02:20 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -Convert a BAM file into a BigWig coverage file. This can be used directly from -Galaxy for display at UCSC. The advantage over standard Wiggle format is that -the data is stored in a compressed format and can be retrieved by genome -region. This allows you to view regions of arbitrarily large Wiggle file data -at UCSC while avoiding the upload costs. - -History -------- - -v0.1.1 passes the forgotten split argument and moves to using the new -sub-command enabled bedtools. Thanks to David Leader. - -As of v0.1.0, the Galaxy tools uses a revised bam_to_bigwig.py script using -genomeCoverageBed and bedGraphToBigWig - this approach allows gaps/skpis to -be excluded from the coverage calculation, which is important for RNA-Seq. - -Until v0.0.2, this Galaxy tool used the bam_to_wiggle.py script from -https://github.com/chapmanb/bcbb/blob/master/nextgen/scripts/bam_to_wiggle.py -which internally used pysam (and thus samtools) and wigToBigWig from UCSC. - -Requirements ------------- - -If you are installing this tool manually, place the Python script in the -same directory as the XML configuration file, or provide a soft link to it. -Ensure the following command line tools are on the system path: - -pysam - Python interface to samtools (http://code.google.com/p/pysam/) -genomeCoverageBed - part of BedTools (http://code.google.com/p/bedtools/) -bedGraphToBigWig - from UCSC (http://hgdownload.cse.ucsc.edu/admin/exe/) - -Credits -------- - -Original script by Brad Chapman, revisions from Peter Cock including the -switch to using genomeCoverageBed and bedGraphToBigWig based on the work -of Lance Parsons. - -License ------- - -The code is freely available under the MIT license: -http://www.opensource.org/licenses/mit-license.html
--- a/bam_to_bigwig/bam_to_bigwig.py Mon Feb 24 10:02:20 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,103 +0,0 @@ -#!/usr/bin/env python -"""Convert BAM files to BigWig file format in a specified region. - -Original version copyright Brad Chapman with revisions from Peter Cock -and ideas from Lance Parsons - -Usage: - bam_to_bigwig.py <BAM file> [--outfile=<output file name>] [--split] - -The --split argument is passed to bedtools genomecov - -The script requires: - pysam (http://code.google.com/p/pysam/) - bedtools genomecov (http://code.google.com/p/bedtools/) - bedGraphToBigWig from UCSC (http://hgdownload.cse.ucsc.edu/admin/exe/) -""" -import os -import sys -import subprocess -import tempfile -from optparse import OptionParser -from contextlib import contextmanager, closing - -import pysam - -def main(bam_file, outfile=None, split=False): - config = {"program": {"ucsc_bedGraphToBigWig": ["bedGraphToBigWig"], - "bedtools_genomeCoverageBed": ["bedtools", "genomecov"]}} - if outfile is None: - outfile = "%s.bigwig" % os.path.splitext(bam_file)[0] - if os.path.abspath(bam_file) == os.path.abspath(outfile): - sys.stderr.write("Bad arguments, input and output files are the same.\n") - sys.exit(1) - if os.path.exists(outfile) and os.path.getsize(outfile) > 0: - sys.stderr.write("Warning, output file already exists.\n") - - sizes = get_sizes(bam_file, config) - print "Have %i references" % len(sizes) - if not sizes: - sys.stderr.write("Problem reading BAM header.\n") - sys.exit(1) - - #Use a temp file to avoid any possiblity of not having write permission - temp_handle = tempfile.NamedTemporaryFile(delete=False) - temp_file = temp_handle.name - with closing(temp_handle): - print "Calculating coverage..." - convert_to_graph(bam_file, split, config, temp_handle) - try: - print "Converting %i MB graph file to bigwig..." % (os.path.getsize(temp_file) // (1024 * 1024)) - #Can't pipe this as stdin due to converter design, - #https://lists.soe.ucsc.edu/pipermail/genome/2011-March/025455.html - convert_to_bigwig(temp_file, sizes, config, outfile) - finally: - if os.path.isfile(temp_file): - os.remove(temp_file) - print "Done" - -@contextmanager -def indexed_bam(bam_file, config): - if not os.path.exists(bam_file + ".bai"): - pysam.index(bam_file) - sam_reader = pysam.Samfile(bam_file, "rb") - yield sam_reader - sam_reader.close() - -def get_sizes(bam_file, config): - with indexed_bam(bam_file, config) as work_bam: - sizes = zip(work_bam.references, work_bam.lengths) - return sizes - -def convert_to_graph(bam_file, split, config, out_handle): - cl = config["program"]["bedtools_genomeCoverageBed"] + ["-ibam", bam_file, "-bg"] - if split: - cl.append("-split") - subprocess.check_call(cl, stdout=out_handle) - -def convert_to_bigwig(bedgraph_file, chr_sizes, config, bw_file): - #This will be fine under Galaxy, but could use temp folder? - size_file = "%s-sizes.txt" % (os.path.splitext(bw_file)[0]) - with open(size_file, "w") as out_handle: - for chrom, size in chr_sizes: - out_handle.write("%s\t%s\n" % (chrom, size)) - try: - cl = config["program"]["ucsc_bedGraphToBigWig"] + [bedgraph_file, size_file, bw_file] - subprocess.check_call(cl) - finally: - os.remove(size_file) - return bw_file - -if __name__ == "__main__": - parser = OptionParser() - parser.add_option("-o", "--outfile", dest="outfile") - parser.add_option("-s", "--split", action="store_true", dest="split") - (options, args) = parser.parse_args() - if len(args) not in [1, 2]: - print "Incorrect arguments" - print __doc__ - sys.exit() - kwargs = dict( - outfile=options.outfile, - split=options.split) - main(*args, **kwargs)
--- a/bam_to_bigwig/bam_to_bigwig.xml Mon Feb 24 10:02:20 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ -<tool id="bam_to_bigwig" name="BAM to BigWig" version="0.1.1"> - <description>Calculates coverage from a BAM alignment file</description> - <command interpreter="python">bam_to_bigwig.py $align --outfile=$out --split</command> - <inputs> - <param format="bam" name="align" type="data" label="BAM alignment file"/> - </inputs> - <outputs> - <data format="bigwig" name="out" /> - </outputs> - <requirements> - <requirement type="python-module">pysam</requirement> - <requirement type="binary">genomeCoverageBed</requirement> - <requirement type="binary">bedGraphToBigWig</requirement> - </requirements> -<help> -**What it does** - -Creates a coverage file in BigWig format, given a BAM alignment file. - -Gaps or skips (CIGAR D or N operators) are not counted towards the coverage calculation, which is important when mapping RNA Seq reads to genes with introns. - -**Input** - -A BAM alignment file. This needs to have the genome database build used in alignment annotated. If your file has '?' for the database build, click on the pencil icon to edit the alignment attributes, and specify the organism used to align against. - -**Output** - -BigWig files can be loaded directly from Galaxy into the UCSC browser. They can be loaded incrementally by UCSC, so a single file can be used to represent the entire genome without having to upload the entire thing as a custom track. -</help> -</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Thu May 12 11:39:13 2016 -0400 @@ -0,0 +1,12 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="pysam" version="0.8.3"> + <repository changeset_revision="08db58be052a" name="package_python_2_7_pysam_0_8_3" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + <package name="bedtools" version="2.24"> + <repository changeset_revision="39b86c1e267d" name="package_bedtools_2_24" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + <package name="ucsc_tools" version="312"> + <repository changeset_revision="2d6bafd63401" name="package_ucsc_tools_312" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>