Mercurial > repos > iuc > ngsutils_bam_filter
comparison ngsutils/support/bgzip.py @ 0:4e4e4093d65d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author | iuc |
---|---|
date | Wed, 11 Nov 2015 13:04:07 -0500 |
parents | |
children | 7a68005de299 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4e4e4093d65d |
---|---|
1 #!/usr/bin/env python | |
2 ''' | |
3 Extract the blocks from a BGZip file. | |
4 | |
5 BAM files are stored as blocks in a bgzip archive. This class | |
6 will load the bgzip archive and output the block information. | |
7 ''' | |
8 | |
9 import sys | |
10 import os | |
11 import struct | |
12 | |
13 | |
14 class BGZip(object): | |
15 def __init__(self, fname): | |
16 self.fname = fname | |
17 self.pos = 0 | |
18 self.fileobj = open(self.fname) | |
19 self.fsize = os.stat(self.fname).st_size | |
20 self.cur_chunk = 0 | |
21 | |
22 self.cpos = 0 | |
23 self.cdata = 0 | |
24 | |
25 def close(self): | |
26 self.fileobj.close() | |
27 | |
28 def seek(self, offset): | |
29 bgz_offset = offset >> 16 | |
30 chunk_offset = offset & 0xFFFF | |
31 | |
32 self.fileobj.seek(bgz_offset) | |
33 self.read_chunk() | |
34 self.chunk_pos = chunk_offset | |
35 | |
36 def read(self, amount, whence=0): | |
37 if whence not in [0, 1]: | |
38 print "Bad Whence value!: %s" % whence | |
39 return | |
40 | |
41 if whence == 0: | |
42 self.seek(0, 0) | |
43 | |
44 ### read into chunk, if not enough data in chunk, read next chunk | |
45 ret = '' | |
46 while amount and self.pos < self.fsize: | |
47 if len(self.cdata) - self.cpos < amount: | |
48 ret += self.cdata[self.cpos:self.cpos + amount] | |
49 self.cpos += amount | |
50 return ret | |
51 else: | |
52 ret += self.cdata[self.cpos:] | |
53 amount = amount - len(ret) | |
54 self.read_chunk() | |
55 return ret | |
56 | |
57 def read_chunk(self): | |
58 self.fileobj.seek(10) | |
59 id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH') | |
60 subpos = 0 | |
61 bsize = 0 | |
62 | |
63 while subpos < xlen: | |
64 si1, si2, slen = self._read_fields('<BBH') | |
65 if si1 == 66 and si2 == 67: | |
66 bsize, = self._read_fields('<H') | |
67 else: | |
68 self.fileobj.seek(slen, 1) | |
69 self.pos += slen | |
70 | |
71 subpos += 6 + slen | |
72 | |
73 cdata_size = bsize - xlen - 19 | |
74 self.cdata = self.fileobj.read(cdata_size) # inflate value | |
75 self.fileobj.seek(8) | |
76 | |
77 self.cur_chunk += 1 | |
78 self.cpos = 0 | |
79 | |
80 def dump(self): | |
81 self.fileobj.seek(0) | |
82 block_num = 0 | |
83 | |
84 while self.pos < self.fsize: | |
85 print "[BLOCK %s]" % block_num | |
86 # read header | |
87 id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH') | |
88 print 'id1: %s' % id1 | |
89 print 'id2: %s' % id2 | |
90 print 'cm: %s' % cm | |
91 print 'flg: %s' % flg | |
92 print 'mtime: %s' % mtime | |
93 print 'xfl: %s' % xfl | |
94 print 'os: %s' % os | |
95 print 'xlen: %s' % xlen | |
96 | |
97 # read subfields | |
98 subpos = 0 | |
99 bsize = 0 | |
100 | |
101 while subpos < xlen: | |
102 si1, si2, slen = self._read_fields('<BBH') | |
103 print ' si1: %s' % si1 | |
104 print ' si1: %s' % si2 | |
105 print ' slen: %s' % slen | |
106 print ' data: [%s]' % slen | |
107 | |
108 if si1 == 66 and si2 == 67: | |
109 bsize, = self._read_fields('<H') | |
110 else: | |
111 self.fileobj.seek(slen, 1) | |
112 self.pos += slen | |
113 | |
114 subpos += 6 + slen | |
115 | |
116 cdata_size = bsize - xlen - 19 | |
117 | |
118 print 'bsize: %s' % bsize | |
119 print 'cdata: [%s]' % (cdata_size) | |
120 | |
121 self.fileobj.seek(cdata_size, 1) | |
122 self.pos += cdata_size | |
123 crc, isize = self._read_fields('<II') | |
124 | |
125 print "crc: %s" % crc | |
126 print "isize: %s" % isize | |
127 # print "POS: %s" % self.pos | |
128 | |
129 block_num += 1 | |
130 | |
131 def _read_fields(self, field_types): | |
132 size = struct.calcsize(field_types) | |
133 self.pos += size | |
134 return struct.unpack(field_types, self.fileobj.read(size)) | |
135 | |
136 if __name__ == '__main__': | |
137 print BGZip(sys.argv[1]).dump() |