comparison ngsutils/support/bgzip.py @ 0:4e4e4093d65d draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author iuc
date Wed, 11 Nov 2015 13:04:07 -0500
parents
children 7a68005de299
comparison
equal deleted inserted replaced
-1:000000000000 0:4e4e4093d65d
1 #!/usr/bin/env python
2 '''
3 Extract the blocks from a BGZip file.
4
5 BAM files are stored as blocks in a bgzip archive. This class
6 will load the bgzip archive and output the block information.
7 '''
8
9 import sys
10 import os
11 import struct
12
13
14 class BGZip(object):
15 def __init__(self, fname):
16 self.fname = fname
17 self.pos = 0
18 self.fileobj = open(self.fname)
19 self.fsize = os.stat(self.fname).st_size
20 self.cur_chunk = 0
21
22 self.cpos = 0
23 self.cdata = 0
24
25 def close(self):
26 self.fileobj.close()
27
28 def seek(self, offset):
29 bgz_offset = offset >> 16
30 chunk_offset = offset & 0xFFFF
31
32 self.fileobj.seek(bgz_offset)
33 self.read_chunk()
34 self.chunk_pos = chunk_offset
35
36 def read(self, amount, whence=0):
37 if whence not in [0, 1]:
38 print "Bad Whence value!: %s" % whence
39 return
40
41 if whence == 0:
42 self.seek(0, 0)
43
44 ### read into chunk, if not enough data in chunk, read next chunk
45 ret = ''
46 while amount and self.pos < self.fsize:
47 if len(self.cdata) - self.cpos < amount:
48 ret += self.cdata[self.cpos:self.cpos + amount]
49 self.cpos += amount
50 return ret
51 else:
52 ret += self.cdata[self.cpos:]
53 amount = amount - len(ret)
54 self.read_chunk()
55 return ret
56
57 def read_chunk(self):
58 self.fileobj.seek(10)
59 id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH')
60 subpos = 0
61 bsize = 0
62
63 while subpos < xlen:
64 si1, si2, slen = self._read_fields('<BBH')
65 if si1 == 66 and si2 == 67:
66 bsize, = self._read_fields('<H')
67 else:
68 self.fileobj.seek(slen, 1)
69 self.pos += slen
70
71 subpos += 6 + slen
72
73 cdata_size = bsize - xlen - 19
74 self.cdata = self.fileobj.read(cdata_size) # inflate value
75 self.fileobj.seek(8)
76
77 self.cur_chunk += 1
78 self.cpos = 0
79
80 def dump(self):
81 self.fileobj.seek(0)
82 block_num = 0
83
84 while self.pos < self.fsize:
85 print "[BLOCK %s]" % block_num
86 # read header
87 id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH')
88 print 'id1: %s' % id1
89 print 'id2: %s' % id2
90 print 'cm: %s' % cm
91 print 'flg: %s' % flg
92 print 'mtime: %s' % mtime
93 print 'xfl: %s' % xfl
94 print 'os: %s' % os
95 print 'xlen: %s' % xlen
96
97 # read subfields
98 subpos = 0
99 bsize = 0
100
101 while subpos < xlen:
102 si1, si2, slen = self._read_fields('<BBH')
103 print ' si1: %s' % si1
104 print ' si1: %s' % si2
105 print ' slen: %s' % slen
106 print ' data: [%s]' % slen
107
108 if si1 == 66 and si2 == 67:
109 bsize, = self._read_fields('<H')
110 else:
111 self.fileobj.seek(slen, 1)
112 self.pos += slen
113
114 subpos += 6 + slen
115
116 cdata_size = bsize - xlen - 19
117
118 print 'bsize: %s' % bsize
119 print 'cdata: [%s]' % (cdata_size)
120
121 self.fileobj.seek(cdata_size, 1)
122 self.pos += cdata_size
123 crc, isize = self._read_fields('<II')
124
125 print "crc: %s" % crc
126 print "isize: %s" % isize
127 # print "POS: %s" % self.pos
128
129 block_num += 1
130
131 def _read_fields(self, field_types):
132 size = struct.calcsize(field_types)
133 self.pos += size
134 return struct.unpack(field_types, self.fileobj.read(size))
135
136 if __name__ == '__main__':
137 print BGZip(sys.argv[1]).dump()