Mercurial > repos > thondeboer > neat_genreads
diff py/biopython_modified_bgzf.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer |
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 |
| parents | |
| children |
line wrap: on
line diff
#!/usr/bin/env python
# Copyright 2010-2013 by Peter Cock.
# All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Minimal BGZF writer, trimmed down from Biopython's bgzf code.

06/02/2015:
    - I picked out the bits and pieces of code needed to write BAM files,
      removed python 3.0 compatibility
"""

import zlib
import struct

# Fixed 16-byte gzip member header; the two-byte block-size field written by
# BgzfWriter._write_block follows it directly.
_bgzf_header = b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00"
# Complete 28-byte empty block, appended by BgzfWriter.close() as EOF marker.
_bgzf_eof = b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00"

# Largest chunk of uncompressed data handed to a single block.
_MAX_BLOCK_SIZE = 65536
# The block-size field is an unsigned 16-bit value holding
# len(compressed) + 25, so the compressed payload must not exceed this.
_MAX_COMPRESSED_SIZE = 65535 - 25


class BgzfWriter(object):
    """Write bytes to a file as a sequence of BGZF compressed blocks.

    Data passed to write() is buffered and emitted in blocks of at most
    65536 uncompressed bytes.  close() writes any remaining data followed
    by the empty-block EOF marker.  Instances can be used as context
    managers; leaving the ``with`` block calls close().
    """

    def __init__(self, filename=None, mode="w", fileobj=None, compresslevel=6):
        """Open ``filename`` for writing, or wrap an existing ``fileobj``.

        filename      -- path to open; must be None when fileobj is given.
        mode          -- must contain "w" or "a" when a filename is used
                         (the file is always opened in binary mode).
        fileobj       -- already-open binary handle to write to instead.
        compresslevel -- zlib compression level used for every block.

        Raises ValueError if a filename is used with a mode that is
        neither write nor append.
        """
        if fileobj:
            assert filename is None
            handle = fileobj
        else:
            lowered = mode.lower()
            if "w" not in lowered and "a" not in lowered:
                raise ValueError("Must use write or append mode, not %r" % mode)
            if "a" in lowered:
                handle = open(filename, "ab")
            else:
                handle = open(filename, "wb")
        self._text = "b" not in mode.lower()
        self._handle = handle
        self._buffer = b""
        self.compresslevel = compresslevel

    def _write_block(self, block):
        """Compress ``block`` and write it out as one or more BGZF blocks.

        ``block`` holds at most 65536 bytes.  If it compresses so poorly
        that the result does not fit the 16-bit block-size field, it is
        split in half and each half is written as its own block.
        """
        assert len(block) <= _MAX_BLOCK_SIZE
        # Giving a negative window bits means no gzip/zlib headers, -15 used in samtools
        compressor = zlib.compressobj(self.compresslevel,
                                      zlib.DEFLATED,
                                      -15,
                                      zlib.DEF_MEM_LEVEL,
                                      0)
        compressed = compressor.compress(block) + compressor.flush()
        if len(compressed) > _MAX_COMPRESSED_SIZE:
            # Didn't compress enough: try less data per block.  Only large,
            # poorly compressible input gets here, so both halves are
            # non-empty and strictly smaller than ``block``.
            middle = len(block) // 2
            self._write_block(block[:middle])
            self._write_block(block[middle:])
            return
        # Total block length minus one:
        # 16 (header) + 2 (this field) + payload + 4 (crc) + 4 (length) - 1.
        bsize = struct.pack("<H", len(compressed) + 25)
        # Mask so the CRC is unsigned whatever sign zlib.crc32 returns.
        crc = struct.pack("<I", zlib.crc32(block) & 0xffffffff)
        uncompressed_length = struct.pack("<I", len(block))
        self._handle.write(
            _bgzf_header + bsize + compressed + crc + uncompressed_length)

    def write(self, data):
        """Buffer ``data`` (bytes) and write out every complete block."""
        self._buffer += data
        while len(self._buffer) >= _MAX_BLOCK_SIZE:
            self._write_block(self._buffer[:_MAX_BLOCK_SIZE])
            self._buffer = self._buffer[_MAX_BLOCK_SIZE:]

    def flush(self):
        """Write all buffered data as BGZF blocks and flush the handle.

        The remaining buffer is always written as a block, so calling this
        with nothing buffered emits an empty block.
        """
        while len(self._buffer) >= _MAX_BLOCK_SIZE:
            self._write_block(self._buffer[:_MAX_BLOCK_SIZE])
            self._buffer = self._buffer[_MAX_BLOCK_SIZE:]
        self._write_block(self._buffer)
        self._buffer = b""
        self._handle.flush()

    def close(self):
        """Flush data, write 28 bytes empty BGZF EOF marker, and close the BGZF file."""
        try:
            if self._buffer:
                self.flush()
            # samtools will look for a magic EOF marker, just a 28 byte empty BGZF block,
            # and if it is missing warns the BAM file may be truncated. In addition to
            # samtools writing this block, so too does bgzip - so we should too.
            self._handle.write(_bgzf_eof)
            self._handle.flush()
        finally:
            # Release the handle even if the final writes failed.
            self._handle.close()

    def __enter__(self):
        """Return the writer itself for use in a ``with`` statement."""
        return self

    def __exit__(self, type, value, traceback):
        """Close the writer when the ``with`` block is left."""
        self.close()


if __name__ == "__main__":
    pass
