Mercurial > repos > brad-chapman > bam_to_bigwig
changeset 2:e2edfa478eb4
Covert to use bedGraph intermediate with bedtools. Thanks to Peter Cock and Lance Parsons for the updates
author | Brad Chapman <chapmanb@50mail.com> |
---|---|
date | Wed, 05 Sep 2012 21:14:54 -0400 |
parents | 0ff100a057ef |
children | 294e9dae5a9b |
files | bam_to_bigwig/README.txt bam_to_bigwig/bam_to_bigwig.py bam_to_bigwig/bam_to_bigwig.xml bam_to_bigwig/bam_to_wiggle.py |
diffstat | 4 files changed, 148 insertions(+), 135 deletions(-) [+] |
line wrap: on
line diff
--- a/bam_to_bigwig/README.txt Tue Jun 07 16:26:46 2011 -0400 +++ b/bam_to_bigwig/README.txt Wed Sep 05 21:14:54 2012 -0400 @@ -4,15 +4,31 @@ region. This allows you to view regions of arbitrarily large Wiggle file data at UCSC while avoiding the upload costs. -The latest version of the bam_to_wiggle.py script is available at: +History +------- + +As of v0.1.0, the Galaxy tools uses a revised bam_to_bigwig.py script using +genomeCoverageBed and bedGraphToBigWig - this approach allows gaps/skpis to +be excluded from the coverage calculation, which is important for RNA-Seq. +Until v0.0.2, this Galaxy tool used the bam_to_wiggle.py script from https://github.com/chapmanb/bcbb/blob/master/nextgen/scripts/bam_to_wiggle.py +which internally used pysam (and thus samtools) and wigToBigWig from UCSC. + +Requirements +------------ -Place the script in the same directory as the XML configuration file, or -provide a soft link to it. +If you are installing this tool manually, place the Python script in the +same directory as the XML configuration file, or provide a soft link to it. +Ensure the following command line tools are on the system path: -This requires: +pysam - Python interface to samtools (http://code.google.com/p/pysam/) +genomeCoverageBed - part of BedTools (http://code.google.com/p/bedtools/) +bedGraphToBigWig - from UCSC (http://hgdownload.cse.ucsc.edu/admin/exe/) -Python2, version 2.6 or better -pysam (http://code.google.com/p/pysam/) -wigToBigWig from UCSC (http://hgdownload.cse.ucsc.edu/admin/exe/) +Credits +------- + +Original script by Brad Chapman, revisions from Peter Cock including the +switch to using genomeCoverageBed and bedGraphToBigWig based on the work +of Lance Parsons.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bam_to_bigwig/bam_to_bigwig.py Wed Sep 05 21:14:54 2012 -0400 @@ -0,0 +1,115 @@ +#!/usr/bin/env python +#Original version copyright Brad Chapman with revisions from Peter Cock +#and ideas from Lance Parsons +"""Convert BAM files to BigWig file format in a specified region. + +Usage: + bam_to_wiggle.py <BAM file> [<YAML config>] [--outfile=<output file name>] + +The optional config file (not used by the Galaxy interface) is in YAML format +and specifies the location of the binary tools. + +program: + bedtools_genomeCoverageBed: genomeCoverageBed + ucsc_bedGraphToBigWig: bedGraphToBigWig + +If not specified, these will be assumed to be present in the system path. + +The script requires: + pysam (http://code.google.com/p/pysam/) + genomeCoverageBed from BedTools (http://code.google.com/p/bedtools/) + bedGraphToBigWig from UCSC (http://hgdownload.cse.ucsc.edu/admin/exe/) +If a configuration file is used, then PyYAML is also required (http://pyyaml.org/) +""" +import os +import sys +import subprocess +import tempfile +from optparse import OptionParser +from contextlib import contextmanager, closing + +import pysam + +def main(bam_file, config_file=None, outfile=None, split=False): + if config_file: + import yaml + with open(config_file) as in_handle: + config = yaml.load(in_handle) + else: + config = {"program": {"ucsc_bedGraphToBigWig" : "bedGraphToBigWig", + "bedtools_genomeCoverageBed" : "genomeCoverageBed"}} + if outfile is None: + outfile = "%s.bigwig" % os.path.splitext(bam_file)[0] + if os.path.abspath(bam_file) == os.path.abspath(outfile): + sys.stderr.write("Bad arguments, input and output files are the same.\n") + sys.exit(1) + if os.path.exists(outfile) and os.path.getsize(outfile) > 0: + sys.stderr.write("Warning, output file already exists.\n") + + sizes = get_sizes(bam_file, config) + print "Have %i references" % len(sizes) + if not sizes: + sys.stderr.write("Problem reading BAM header.\n") + sys.exit(1) + + #Use a temp file to avoid any possiblity of not having write permission + temp_handle = tempfile.NamedTemporaryFile(delete=False) + temp_file = temp_handle.name + with closing(temp_handle): + print "Calculating coverage..." + convert_to_graph(bam_file, split, config, temp_handle) + try: + print "Converting %i MB graph file to bigwig..." % (os.path.getsize(temp_file) // (1024*1024)) + #Can't pipe this as stdin due to converter design, + #https://lists.soe.ucsc.edu/pipermail/genome/2011-March/025455.html + convert_to_bigwig(temp_file, sizes, config, outfile) + finally: + if os.path.isfile(temp_file): + os.remove(temp_file) + print "Done" + +@contextmanager +def indexed_bam(bam_file, config): + if not os.path.exists(bam_file + ".bai"): + pysam.index(bam_file) + sam_reader = pysam.Samfile(bam_file, "rb") + yield sam_reader + sam_reader.close() + +def get_sizes(bam_file, config): + with indexed_bam(bam_file, config) as work_bam: + sizes = zip(work_bam.references, work_bam.lengths) + return sizes + +def convert_to_graph(bam_file, split, config, out_handle): + cl = [config["program"]["bedtools_genomeCoverageBed"], "-ibam", bam_file, "-bg"] + if split: + cl.append("-split") + subprocess.check_call(cl, stdout=out_handle) + +def convert_to_bigwig(bedgraph_file, chr_sizes, config, bw_file): + #This will be fine under Galaxy, but could use temp folder? + size_file = "%s-sizes.txt" % (os.path.splitext(bw_file)[0]) + with open(size_file, "w") as out_handle: + for chrom, size in chr_sizes: + out_handle.write("%s\t%s\n" % (chrom, size)) + try: + cl = [config["program"]["ucsc_bedGraphToBigWig"], bedgraph_file, size_file, bw_file] + subprocess.check_call(cl) + finally: + os.remove(size_file) + return bw_file + +if __name__ == "__main__": + parser = OptionParser() + parser.add_option("-o", "--outfile", dest="outfile") + parser.add_option("-s", "--split", action="store_true", dest="split") + (options, args) = parser.parse_args() + if len(args) not in [1, 2]: + print "Incorrect arguments" + print __doc__ + sys.exit() + kwargs = dict( + outfile=options.outfile, + split=options.split) + main(*args, **kwargs)
--- a/bam_to_bigwig/bam_to_bigwig.xml Tue Jun 07 16:26:46 2011 -0400 +++ b/bam_to_bigwig/bam_to_bigwig.xml Wed Sep 05 21:14:54 2012 -0400 @@ -1,17 +1,23 @@ -<tool id="bam_to_bigwig" name="BAM to BigWig" version="0.0.2"> +<tool id="bam_to_bigwig" name="BAM to BigWig" version="0.1.0"> <description>Calculates coverage from a BAM alignment file</description> - <command interpreter="python">bam_to_wiggle.py $align --outfile=$out</command> + <command interpreter="python">bam_to_bigwig.py $align --outfile=$out</command> <inputs> <param format="bam" name="align" type="data" label="BAM alignment file"/> </inputs> <outputs> <data format="bigwig" name="out" /> </outputs> - + <requirements> + <requirement type="python-module">pysam</requirement> + <requirement type="binary">genomeCoverageBed</requirement> + <requirement type="binary">bedGraphToBigWig</requirement> + </requirements> <help> **What it does** -Creates a coverage file in BigWig format, given a BAM alignment file. +Creates a coverage file in BigWig format, given a BAM alignment file. + +Gaps or skips (CIGAR D or N operators) are not counted towards the coverage calculation, which is important when mapping RNA Seq reads to genes with introns. **Input** @@ -21,5 +27,4 @@ BigWig files can be loaded directly from Galaxy into the UCSC browser. They can be loaded incrementally by UCSC, so a single file can be used to represent the entire genome without having to upload the entire thing as a custom track. </help> - </tool>
--- a/bam_to_bigwig/bam_to_wiggle.py Tue Jun 07 16:26:46 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,123 +0,0 @@ -#!/usr/bin/env python -"""Convert BAM files to BigWig file format in a specified region. - -Usage: - bam_to_wiggle.py <BAM file> [<YAML config>] - [--outfile=<output file name> - --chrom=<chrom> - --start=<start> - --end=<end>] - -chrom start and end are optional, in which case they default to everything. - -The config file is in YAML format and specifies the location of the wigToBigWig -program from UCSC: - -program: - ucsc_bigwig: wigToBigWig - -If not specified, these will be assumed to be present in the system path. - -The script requires: - pysam (http://code.google.com/p/pysam/) - wigToBigWig from UCSC (http://hgdownload.cse.ucsc.edu/admin/exe/) -If a configuration file is used, then PyYAML is also required (http://pyyaml.org/) -""" -import os -import sys -import subprocess -import tempfile -from optparse import OptionParser -from contextlib import contextmanager, closing - -import pysam - -def main(bam_file, config_file=None, chrom='all', start=0, end=None, - outfile=None): - if config_file: - import yaml - with open(config_file) as in_handle: - config = yaml.load(in_handle) - else: - config = {"program": {"ucsc_bigwig" : "wigToBigWig"}} - if outfile is None: - outfile = "%s.bigwig" % os.path.splitext(bam_file)[0] - if start > 0: - start = int(start) - 1 - if end is not None: - end = int(end) - regions = [(chrom, start, end)] - if os.path.abspath(bam_file) == os.path.abspath(outfile): - sys.stderr.write("Bad arguments, input and output files are the same.\n") - sys.exit(1) - if not (os.path.exists(outfile) and os.path.getsize(outfile) > 0): - #Use a temp file to avoid any possiblity of not having write permission - out_handle = tempfile.NamedTemporaryFile(delete=False) - wig_file = out_handle.name - with closing(out_handle): - chr_sizes, wig_valid = write_bam_track(bam_file, regions, config, out_handle) - try: - if wig_valid: - convert_to_bigwig(wig_file, chr_sizes, config, outfile) - finally: - os.remove(wig_file) - -@contextmanager -def indexed_bam(bam_file, config): - if not os.path.exists(bam_file + ".bai"): - pysam.index(bam_file) - sam_reader = pysam.Samfile(bam_file, "rb") - yield sam_reader - sam_reader.close() - -def write_bam_track(bam_file, regions, config, out_handle): - out_handle.write("track %s\n" % " ".join(["type=wiggle_0", - "name=%s" % os.path.splitext(os.path.split(bam_file)[-1])[0], - "visibility=full", - ])) - is_valid = False - with indexed_bam(bam_file, config) as work_bam: - sizes = zip(work_bam.references, work_bam.lengths) - if len(regions) == 1 and regions[0][0] == "all": - regions = [(name, 0, length) for name, length in sizes] - for chrom, start, end in regions: - if end is None and chrom in work_bam.references: - end = work_bam.lengths[work_bam.references.index(chrom)] - assert end is not None, "Could not find %s in header" % chrom - out_handle.write("variableStep chrom=%s\n" % chrom) - for col in work_bam.pileup(chrom, start, end): - out_handle.write("%s %s\n" % (col.pos+1, col.n)) - is_valid = True - return sizes, is_valid - -def convert_to_bigwig(wig_file, chr_sizes, config, bw_file=None): - if not bw_file: - bw_file = "%s.bigwig" % (os.path.splitext(wig_file)[0]) - size_file = "%s-sizes.txt" % (os.path.splitext(wig_file)[0]) - with open(size_file, "w") as out_handle: - for chrom, size in chr_sizes: - out_handle.write("%s\t%s\n" % (chrom, size)) - try: - cl = [config["program"]["ucsc_bigwig"], wig_file, size_file, bw_file] - subprocess.check_call(cl) - finally: - os.remove(size_file) - return bw_file - -if __name__ == "__main__": - parser = OptionParser() - parser.add_option("-o", "--outfile", dest="outfile") - parser.add_option("-c", "--chrom", dest="chrom") - parser.add_option("-s", "--start", dest="start") - parser.add_option("-e", "--end", dest="end") - (options, args) = parser.parse_args() - if len(args) not in [1, 2]: - print "Incorrect arguments" - print __doc__ - sys.exit() - kwargs = dict( - outfile=options.outfile, - chrom=options.chrom or 'all', - start=options.start or 0, - end=options.end) - main(*args, **kwargs)