comparison commons/core/parsing/BamParser.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children
comparison
equal deleted inserted replaced
5:ea3082881bf8 6:769e306b7933
1 #
2 # Copyright INRA-URGI 2009-2012
3 #
4 # This software is governed by the CeCILL license under French law and
5 # abiding by the rules of distribution of free software. You can use,
6 # modify and/ or redistribute the software under the terms of the CeCILL
7 # license as circulated by CEA, CNRS and INRIA at the following URL
8 # "http://www.cecill.info".
9 #
10 # As a counterpart to the access to the source code and rights to copy,
11 # modify and redistribute granted by the license, users are provided only
12 # with a limited warranty and the software's author, the holder of the
13 # economic rights, and the successive licensors have only limited
14 # liability.
15 #
16 # In this respect, the user's attention is drawn to the risks associated
17 # with loading, using, modifying and/or developing or reproducing the
18 # software by the user in light of its specific status of free software,
19 # that may mean that it is complicated to manipulate, and that also
20 # therefore means that it is reserved for developers and experienced
21 # professionals having in-depth computer knowledge. Users are therefore
22 # encouraged to load and test the software's suitability as regards their
23 # requirements in conditions enabling the security of their systems and/or
24 # data to be ensured and, more generally, to use and operate it in the
25 # same conditions as regards security.
26 #
27 # The fact that you are presently reading this means that you have had
28 # knowledge of the CeCILL license and that you accept its terms.
29 #
30 import re, sys, gzip, struct
31 from commons.core.parsing.MapperParser import MapperParser
32 from SMART.Java.Python.structure.Mapping import Mapping
33 from SMART.Java.Python.structure.SubMapping import SubMapping
34 from SMART.Java.Python.structure.Interval import Interval
35
36
37 BAM_DNA_LOOKUP = "=ACMGRSVTWYHKDBN"
38
39 BAM_CIGAR_LOOKUP = "MIDNSHP=X"
40 BAM_CIGAR_SHIFT = 4
41 BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1)
42
43
44
45 def pack_int32(x):
46 return struct.pack('<i', x)
47
48 def pack_uint32(x):
49 return struct.pack('<I', x)
50
51 def unpack_int8(x):
52 return struct.unpack('<b', x)[0]
53
54 def unpack_int16(x):
55 return struct.unpack('<h', x)[0]
56
57 def unpack_int32(x):
58 return struct.unpack('<i', x)[0]
59
60 def unpack_int64(x):
61 return struct.unpack('<q', x)[0]
62
63 def unpack_uint8(x):
64 return struct.unpack('<B', x)[0]
65
66 def unpack_uint16(x):
67 return struct.unpack('<H', x)[0]
68
69 def unpack_uint32(x):
70 return struct.unpack('<I', x)[0]
71
72 def unpack_uint64(x):
73 return struct.unpack('<Q', x)[0]
74
75 def unpack_float(x):
76 return struct.unpack('<f', x)[0]
77
78 def unpack_string(x):
79 length = len(x)
80 format_string = "<{0}s".format(length)
81 string = struct.unpack(format_string, x)[0]
82 if string[-1] == '\0':
83 return string[:-1]
84 else:
85 return string
86
87
88 BAM_TAG_CODE = {"c": unpack_int8, \
89 "C": unpack_uint8, \
90 "s": unpack_int16, \
91 "S": unpack_uint16, \
92 "i": unpack_int32, \
93 "I": unpack_uint32, \
94 "f": unpack_float, \
95 #"A": unpack_int8, \
96 "A": lambda x: x, \
97 "Z": unpack_int8, \
98 "H": unpack_int8}
99
100 BAM_TAG_VALUE = {"c": int, \
101 "C": int, \
102 "s": int, \
103 "S": int, \
104 "i": int, \
105 "I": int, \
106 "f": float, \
107 "A": lambda x: x}
108
109 BAM_TAG_SIZE = {"c": 1, \
110 "C": 1, \
111 "s": 2, \
112 "S": 2, \
113 "i": 4, \
114 "I": 4, \
115 "f": 4, \
116 "A": 1}
117
118
119 class CigarOp(object):
120 def __init__(self, data):
121 self._length = data >> BAM_CIGAR_SHIFT
122 self._type = BAM_CIGAR_LOOKUP[ data & BAM_CIGAR_MASK ]
123
124
125 class CigarData(object):
126 def __init__(self, data, num_ops):
127 self._ops = []
128 for i in range(num_ops):
129 cigar_data = unpack_uint32(data[i*4: (i+1)*4])
130 self._ops.append(CigarOp(cigar_data))
131
132 def getCigarData(self):
133 return self._ops
134
135 def __str__(self):
136 return "".join(["%d%s" % (op._length, op._type) for op in self._ops])
137
138
139 class TagsData(object):
140 def __init__(self):
141 self._tags = {}
142
143 def add(self, tag):
144 self._tags[tag._tag] = tag
145
146 def getTags(self):
147 return self._tags
148
149 def __str__(self):
150 return " ".join([self._tags[tag] for tag in sorted(self._tags.keys())])
151
152
153 class TagData(object):
154 def __init__(self, tag, type, value):
155 self._tag = tag
156 self._type = type
157 self._value = value
158
159 def __str__(self):
160 if self._type in "AZHB":
161 return "%s:%s:%s" % (self._tag, self._type, self._value)
162 if self._type in "cCsSiI":
163 return "%s:%s:%d" % (self._tag, self._type, self._value)
164 return "%s:%s:%f" % (self._tag, self._type, self._value)
165
166
167 class TagParser(object):
168 def __init__(self, data):
169 self._data = data
170 self._tags = TagsData()
171 self._parse()
172
173 def _parse(self):
174 while self._data:
175 tag = "%s%s" % (chr(unpack_int8(self._data[0])), chr(unpack_int8(self._data[1])))
176 type = chr(unpack_int8(self._data[2]))
177 self._data = self._data[3:]
178 if type in BAM_TAG_VALUE:
179 value = self._parseUnique(type)
180 elif type == "Z":
181 value = self._parseString()
182 elif type == "H":
183 size = unpack_int8(self._data[0])
184 self._data = self._data[1:]
185 value = self._parseSeveral("C", size)
186 elif type == "B":
187 secondType = unpack_int8(self._data[0])
188 size = unpack_int8(self._data[1]) + unpack_int8(self._data[2]) * 16 + unpack_int8(self._data[3]) * 16 * 16 + unpack_int8(self._data[4]) * 16 * 16 * 16
189 self._data = self._data[5:]
190 value = self._parseSeveral(secondType, size)
191 else:
192 raise Exception("Cannot parse type '%s'." % (type))
193 fullTag = TagData(tag, type, value)
194 self._tags.add(fullTag)
195
196 def _parseUnique(self, type):
197 value = BAM_TAG_CODE[type](self._data[:BAM_TAG_SIZE[type]])
198 self._data = self._data[BAM_TAG_SIZE[type]:]
199 return value
200
201 def _parseSeveral(self, type, size):
202 value = []
203 for i in range(size):
204 value.append(self._parseUnique(type))
205 return value
206
207 def _parseString(self):
208 value = ""
209 char = self._data[0]
210 self._data = self._data[1:]
211 while unpack_int8(char) != 0:
212 value += char
213 char = self._data[0]
214 self._data = self._data[1:]
215 return value
216
217 def getTags(self):
218 return self._tags.getTags()
219
220 def __str__(self):
221 return self._tags
222
223
224 class AlignedRead(object):
225 def __init__(self, data, refs):
226 self._data = data
227 self._refs = refs
228
229 def parse(self):
230 self._parse_common()
231 self._parse_flag_nc()
232 self._parse_bin_mq_nl()
233 self._parse_name()
234 self._parse_cigar()
235 self._parse_sequence()
236 self._parse_quality()
237 self._parse_tags()
238
239 def _parse_common(self):
240 ref_id = unpack_int32(self._data[0:4])
241 self._chromosome = self._refs[ref_id]
242 self._pos = unpack_int32(self._data[4:8]) + 1
243 mate_ref_id = unpack_int32(self._data[20:24])
244 if mate_ref_id == -1:
245 self._rnext = "*"
246 else:
247 self._rnext = self._refs[mate_ref_id]
248 if self._rnext == self._chromosome:
249 self._rnext = "="
250 self._pnext = unpack_int32(self._data[24:28]) + 1
251 self._tlen = unpack_int32(self._data[28:32])
252
253 def _parse_bin_mq_nl(self):
254 bin_mq_nl = unpack_uint32(self._data[8:12])
255 self._bin = bin_mq_nl >> 16
256 self._mappingQuality = bin_mq_nl >> 8 & 0xff
257 self._query_name_length = bin_mq_nl & 0xff
258
259 def _parse_flag_nc(self):
260 flag_nc = unpack_uint32(self._data[12:16])
261 self._flag = flag_nc >> 16
262 self._num_cigar_ops = flag_nc & 0xffff
263
264 def _parse_name(self):
265 start = 32
266 stop = start + self._query_name_length
267 self._name = unpack_string(self._data[start:stop])
268
269 def _parse_cigar(self):
270 start = 32 + self._query_name_length
271 stop = start + (self._num_cigar_ops * 4)
272 _buffer = self._data[start:stop]
273 cigar = CigarData(_buffer, self._num_cigar_ops)
274 self._cigar = cigar.getCigarData()
275
276 def _parse_sequence(self):
277 seq_length = unpack_int32(self._data[16:20])
278 start = 32 + self._query_name_length + (self._num_cigar_ops * 4)
279 stop = start + (seq_length + 1) / 2
280 _buffer = self._data[start:stop]
281 self._sequence = ""
282 for i in range(seq_length):
283 x = unpack_uint8(_buffer[(i / 2)])
284 index = (x >> (4 * (1 - (i % 2)))) & 0xf
285 base = BAM_DNA_LOOKUP[index]
286 self._sequence += base
287
288 def _parse_quality(self):
289 seq_length = unpack_int32(self._data[16:20])
290 start = 32 + self._query_name_length + (self._num_cigar_ops * 4) + (seq_length + 1) / 2
291 stop = start + seq_length
292 _buffer = self._data[start:stop]
293 self._quality = "".join(["%s" % (chr(unpack_int8(x) + 33)) for x in _buffer])
294
295 def _parse_tags(self):
296 seq_length = unpack_int32(self._data[16:20])
297 start = 32 + self._query_name_length + (self._num_cigar_ops * 4) + (seq_length + 1) / 2 + (seq_length + 1) - 1
298 stop = start + seq_length
299 _buffer = self._data[start:]
300 tagParser = TagParser(_buffer)
301 self._tags = tagParser.getTags()
302
303
304 class FileReader(object):
305
306 def __init__(self, handle):
307 self._handle = handle
308 self._readHeader()
309
310 def _readHeader(self):
311 magic = unpack_string(self._handle.read(4))
312 if magic != "BAM\1":
313 raise Exception("File should start with 'BAM\1', starting with '%s' instead." % (magic))
314 tlen = unpack_int32(self._handle.read(4))
315 text = unpack_string(self._handle.read(tlen))
316 nrefs = unpack_int32(self._handle.read(4))
317 self._refs = []
318 for i in range(nrefs):
319 sizeName = unpack_int32(self._handle.read(4))
320 name = unpack_string(self._handle.read(sizeName))
321 size = unpack_int32(self._handle.read(4))
322 self._refs.append(name)
323 self._startPos = self._handle.tell()
324
325 def reset(self):
326 self._handle.seek(self._startPos)
327
328 def getNextAlignment(self):
329 try:
330 blockSize = unpack_int32(self._handle.read(4))
331 except struct.error:
332 return False
333 block = self._handle.read(blockSize)
334 currentRead = AlignedRead(block, self._refs)
335 return currentRead
336
337
338
339 def parseAlignedRead(read):
340 if (read._flag & 0x4) == 0x4:
341 return None
342
343 mapping = Mapping()
344 direction = 1 if (read._flag & 0x10) == 0x0 else -1
345 genomeStart = read._pos
346 nbOccurrences = 1
347 nbMismatches = 0
348 nbMatches = 0
349 nbGaps = 0
350 subMapping = None
351 queryOffset = 0
352 targetOffset = 0
353 readStart = None
354
355 for tag, value in read._tags.iteritems():
356 if tag == "X0":
357 nbOccurrences = value._value
358 elif tag == "X1":
359 nbOccurrences += value._value
360 elif tag == "XM":
361 nbMismatches = value._value
362 mapping.setTagValue("nbOccurrences", nbOccurrences)
363 mapping.setTagValue("quality", read._mappingQuality)
364
365 for operation in read._cigar:
366 if operation._type == "M":
367 if readStart == None:
368 readStart = queryOffset
369 if subMapping == None:
370 subMapping = SubMapping()
371 subMapping.setSize(operation._length)
372 subMapping.setDirection(direction)
373 subMapping.queryInterval.setName(read._name)
374 subMapping.queryInterval.setStart(queryOffset)
375 subMapping.queryInterval.setDirection(direction)
376 subMapping.targetInterval.setChromosome(read._chromosome)
377 subMapping.targetInterval.setStart(genomeStart + targetOffset)
378 subMapping.targetInterval.setDirection(1)
379 nbMatches += operation._length
380 targetOffset += operation._length
381 queryOffset += operation._length
382 currentNumber = 0
383 continue
384 if operation._type == "I":
385 nbGaps += 1
386 queryOffset += operation._length
387 currentNumber = 0
388 continue
389 if operation._type == "D":
390 if subMapping != None:
391 subMapping.queryInterval.setEnd(queryOffset - 1)
392 subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
393 mapping.addSubMapping(subMapping)
394 subMapping = None
395 nbGaps += 1
396 targetOffset += operation._length
397 currentNumber = 0
398 continue
399 if operation._type == "N":
400 if subMapping != None:
401 subMapping.queryInterval.setEnd(queryOffset - 1)
402 subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
403 mapping.addSubMapping(subMapping)
404 subMapping = None
405 targetOffset += operation._length
406 currentNumber = 0
407 continue
408 if operation._type == "S":
409 nbMismatches += operation._length
410 targetOffset += operation._length
411 queryOffset += operation._length
412 currentNumber = 0
413 continue
414 if operation._type == "H":
415 targetOffset += operation._length
416 queryOffset += operation._length
417 currentNumber = 0
418 continue
419 if operation._type == "P":
420 continue
421 raise Exception("Do not understand parameter '%s'" % (operation._type))
422
423 if subMapping != None:
424 subMapping.queryInterval.setEnd(queryOffset - 1)
425 subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
426 mapping.addSubMapping(subMapping)
427 mapping.queryInterval.setStart(readStart)
428 mapping.queryInterval.setEnd(queryOffset - 1)
429 mapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
430 mapping.setNbMismatches(nbMismatches)
431 mapping.setNbGaps(nbGaps)
432 mapping.queryInterval.setName(read._name)
433 mapping.queryInterval.setDirection(direction)
434 mapping.targetInterval.setChromosome(read._chromosome)
435 mapping.targetInterval.setStart(genomeStart)
436 mapping.targetInterval.setDirection(direction)
437 mapping.setSize(len(read._sequence))
438 mapping.setDirection(direction)
439 return mapping
440
441
442 class BamParser(MapperParser):
443 """A class that parses BAM format"""
444
445 def __init__(self, fileName, verbosity = 0):
446 self.verbosity = verbosity
447 self.handle = gzip.open(fileName, "rb")
448 self.reader = FileReader(self.handle)
449 self.nbMappings = None
450 self.fileName = fileName
451
452
453 def __del__(self):
454 self.handle.close()
455
456
457 def getFileFormats():
458 return ["bam"]
459 getFileFormats = staticmethod(getFileFormats)
460
461
462 def reset(self):
463 self.reader.reset()
464
465
466 def getNextMapping(self):
467 self.currentMapping = None
468 while self.currentMapping == None:
469 read = self.reader.getNextAlignment()
470 if not read:
471 self.currentMapping = False
472 return False
473 read.parse()
474 self.currentMapping = parseAlignedRead(read)
475 return self.currentMapping
476
477
478 def setDefaultTagValue(self, name, value):
479 pass
480
481
482 def skipFirstLines(self):
483 pass