Mercurial > repos > yufei-luo > s_mart
comparison commons/core/parsing/BamParser.py @ 6:769e306b7933
Change the repository level.
author | yufei-luo |
---|---|
date | Fri, 18 Jan 2013 04:54:14 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:ea3082881bf8 | 6:769e306b7933 |
---|---|
1 # | |
2 # Copyright INRA-URGI 2009-2012 | |
3 # | |
4 # This software is governed by the CeCILL license under French law and | |
5 # abiding by the rules of distribution of free software. You can use, | |
6 # modify and/ or redistribute the software under the terms of the CeCILL | |
7 # license as circulated by CEA, CNRS and INRIA at the following URL | |
8 # "http://www.cecill.info". | |
9 # | |
10 # As a counterpart to the access to the source code and rights to copy, | |
11 # modify and redistribute granted by the license, users are provided only | |
12 # with a limited warranty and the software's author, the holder of the | |
13 # economic rights, and the successive licensors have only limited | |
14 # liability. | |
15 # | |
16 # In this respect, the user's attention is drawn to the risks associated | |
17 # with loading, using, modifying and/or developing or reproducing the | |
18 # software by the user in light of its specific status of free software, | |
19 # that may mean that it is complicated to manipulate, and that also | |
20 # therefore means that it is reserved for developers and experienced | |
21 # professionals having in-depth computer knowledge. Users are therefore | |
22 # encouraged to load and test the software's suitability as regards their | |
23 # requirements in conditions enabling the security of their systems and/or | |
24 # data to be ensured and, more generally, to use and operate it in the | |
25 # same conditions as regards security. | |
26 # | |
27 # The fact that you are presently reading this means that you have had | |
28 # knowledge of the CeCILL license and that you accept its terms. | |
29 # | |
30 import re, sys, gzip, struct | |
31 from commons.core.parsing.MapperParser import MapperParser | |
32 from SMART.Java.Python.structure.Mapping import Mapping | |
33 from SMART.Java.Python.structure.SubMapping import SubMapping | |
34 from SMART.Java.Python.structure.Interval import Interval | |
35 | |
36 | |
37 BAM_DNA_LOOKUP = "=ACMGRSVTWYHKDBN" | |
38 | |
39 BAM_CIGAR_LOOKUP = "MIDNSHP=X" | |
40 BAM_CIGAR_SHIFT = 4 | |
41 BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1) | |
42 | |
43 | |
44 | |
45 def pack_int32(x): | |
46 return struct.pack('<i', x) | |
47 | |
48 def pack_uint32(x): | |
49 return struct.pack('<I', x) | |
50 | |
51 def unpack_int8(x): | |
52 return struct.unpack('<b', x)[0] | |
53 | |
54 def unpack_int16(x): | |
55 return struct.unpack('<h', x)[0] | |
56 | |
57 def unpack_int32(x): | |
58 return struct.unpack('<i', x)[0] | |
59 | |
60 def unpack_int64(x): | |
61 return struct.unpack('<q', x)[0] | |
62 | |
63 def unpack_uint8(x): | |
64 return struct.unpack('<B', x)[0] | |
65 | |
66 def unpack_uint16(x): | |
67 return struct.unpack('<H', x)[0] | |
68 | |
69 def unpack_uint32(x): | |
70 return struct.unpack('<I', x)[0] | |
71 | |
72 def unpack_uint64(x): | |
73 return struct.unpack('<Q', x)[0] | |
74 | |
75 def unpack_float(x): | |
76 return struct.unpack('<f', x)[0] | |
77 | |
78 def unpack_string(x): | |
79 length = len(x) | |
80 format_string = "<{0}s".format(length) | |
81 string = struct.unpack(format_string, x)[0] | |
82 if string[-1] == '\0': | |
83 return string[:-1] | |
84 else: | |
85 return string | |
86 | |
87 | |
88 BAM_TAG_CODE = {"c": unpack_int8, \ | |
89 "C": unpack_uint8, \ | |
90 "s": unpack_int16, \ | |
91 "S": unpack_uint16, \ | |
92 "i": unpack_int32, \ | |
93 "I": unpack_uint32, \ | |
94 "f": unpack_float, \ | |
95 #"A": unpack_int8, \ | |
96 "A": lambda x: x, \ | |
97 "Z": unpack_int8, \ | |
98 "H": unpack_int8} | |
99 | |
100 BAM_TAG_VALUE = {"c": int, \ | |
101 "C": int, \ | |
102 "s": int, \ | |
103 "S": int, \ | |
104 "i": int, \ | |
105 "I": int, \ | |
106 "f": float, \ | |
107 "A": lambda x: x} | |
108 | |
109 BAM_TAG_SIZE = {"c": 1, \ | |
110 "C": 1, \ | |
111 "s": 2, \ | |
112 "S": 2, \ | |
113 "i": 4, \ | |
114 "I": 4, \ | |
115 "f": 4, \ | |
116 "A": 1} | |
117 | |
118 | |
119 class CigarOp(object): | |
120 def __init__(self, data): | |
121 self._length = data >> BAM_CIGAR_SHIFT | |
122 self._type = BAM_CIGAR_LOOKUP[ data & BAM_CIGAR_MASK ] | |
123 | |
124 | |
125 class CigarData(object): | |
126 def __init__(self, data, num_ops): | |
127 self._ops = [] | |
128 for i in range(num_ops): | |
129 cigar_data = unpack_uint32(data[i*4: (i+1)*4]) | |
130 self._ops.append(CigarOp(cigar_data)) | |
131 | |
132 def getCigarData(self): | |
133 return self._ops | |
134 | |
135 def __str__(self): | |
136 return "".join(["%d%s" % (op._length, op._type) for op in self._ops]) | |
137 | |
138 | |
139 class TagsData(object): | |
140 def __init__(self): | |
141 self._tags = {} | |
142 | |
143 def add(self, tag): | |
144 self._tags[tag._tag] = tag | |
145 | |
146 def getTags(self): | |
147 return self._tags | |
148 | |
149 def __str__(self): | |
150 return " ".join([self._tags[tag] for tag in sorted(self._tags.keys())]) | |
151 | |
152 | |
153 class TagData(object): | |
154 def __init__(self, tag, type, value): | |
155 self._tag = tag | |
156 self._type = type | |
157 self._value = value | |
158 | |
159 def __str__(self): | |
160 if self._type in "AZHB": | |
161 return "%s:%s:%s" % (self._tag, self._type, self._value) | |
162 if self._type in "cCsSiI": | |
163 return "%s:%s:%d" % (self._tag, self._type, self._value) | |
164 return "%s:%s:%f" % (self._tag, self._type, self._value) | |
165 | |
166 | |
167 class TagParser(object): | |
168 def __init__(self, data): | |
169 self._data = data | |
170 self._tags = TagsData() | |
171 self._parse() | |
172 | |
173 def _parse(self): | |
174 while self._data: | |
175 tag = "%s%s" % (chr(unpack_int8(self._data[0])), chr(unpack_int8(self._data[1]))) | |
176 type = chr(unpack_int8(self._data[2])) | |
177 self._data = self._data[3:] | |
178 if type in BAM_TAG_VALUE: | |
179 value = self._parseUnique(type) | |
180 elif type == "Z": | |
181 value = self._parseString() | |
182 elif type == "H": | |
183 size = unpack_int8(self._data[0]) | |
184 self._data = self._data[1:] | |
185 value = self._parseSeveral("C", size) | |
186 elif type == "B": | |
187 secondType = unpack_int8(self._data[0]) | |
188 size = unpack_int8(self._data[1]) + unpack_int8(self._data[2]) * 16 + unpack_int8(self._data[3]) * 16 * 16 + unpack_int8(self._data[4]) * 16 * 16 * 16 | |
189 self._data = self._data[5:] | |
190 value = self._parseSeveral(secondType, size) | |
191 else: | |
192 raise Exception("Cannot parse type '%s'." % (type)) | |
193 fullTag = TagData(tag, type, value) | |
194 self._tags.add(fullTag) | |
195 | |
196 def _parseUnique(self, type): | |
197 value = BAM_TAG_CODE[type](self._data[:BAM_TAG_SIZE[type]]) | |
198 self._data = self._data[BAM_TAG_SIZE[type]:] | |
199 return value | |
200 | |
201 def _parseSeveral(self, type, size): | |
202 value = [] | |
203 for i in range(size): | |
204 value.append(self._parseUnique(type)) | |
205 return value | |
206 | |
207 def _parseString(self): | |
208 value = "" | |
209 char = self._data[0] | |
210 self._data = self._data[1:] | |
211 while unpack_int8(char) != 0: | |
212 value += char | |
213 char = self._data[0] | |
214 self._data = self._data[1:] | |
215 return value | |
216 | |
217 def getTags(self): | |
218 return self._tags.getTags() | |
219 | |
220 def __str__(self): | |
221 return self._tags | |
222 | |
223 | |
224 class AlignedRead(object): | |
225 def __init__(self, data, refs): | |
226 self._data = data | |
227 self._refs = refs | |
228 | |
229 def parse(self): | |
230 self._parse_common() | |
231 self._parse_flag_nc() | |
232 self._parse_bin_mq_nl() | |
233 self._parse_name() | |
234 self._parse_cigar() | |
235 self._parse_sequence() | |
236 self._parse_quality() | |
237 self._parse_tags() | |
238 | |
239 def _parse_common(self): | |
240 ref_id = unpack_int32(self._data[0:4]) | |
241 self._chromosome = self._refs[ref_id] | |
242 self._pos = unpack_int32(self._data[4:8]) + 1 | |
243 mate_ref_id = unpack_int32(self._data[20:24]) | |
244 if mate_ref_id == -1: | |
245 self._rnext = "*" | |
246 else: | |
247 self._rnext = self._refs[mate_ref_id] | |
248 if self._rnext == self._chromosome: | |
249 self._rnext = "=" | |
250 self._pnext = unpack_int32(self._data[24:28]) + 1 | |
251 self._tlen = unpack_int32(self._data[28:32]) | |
252 | |
253 def _parse_bin_mq_nl(self): | |
254 bin_mq_nl = unpack_uint32(self._data[8:12]) | |
255 self._bin = bin_mq_nl >> 16 | |
256 self._mappingQuality = bin_mq_nl >> 8 & 0xff | |
257 self._query_name_length = bin_mq_nl & 0xff | |
258 | |
259 def _parse_flag_nc(self): | |
260 flag_nc = unpack_uint32(self._data[12:16]) | |
261 self._flag = flag_nc >> 16 | |
262 self._num_cigar_ops = flag_nc & 0xffff | |
263 | |
264 def _parse_name(self): | |
265 start = 32 | |
266 stop = start + self._query_name_length | |
267 self._name = unpack_string(self._data[start:stop]) | |
268 | |
269 def _parse_cigar(self): | |
270 start = 32 + self._query_name_length | |
271 stop = start + (self._num_cigar_ops * 4) | |
272 _buffer = self._data[start:stop] | |
273 cigar = CigarData(_buffer, self._num_cigar_ops) | |
274 self._cigar = cigar.getCigarData() | |
275 | |
276 def _parse_sequence(self): | |
277 seq_length = unpack_int32(self._data[16:20]) | |
278 start = 32 + self._query_name_length + (self._num_cigar_ops * 4) | |
279 stop = start + (seq_length + 1) / 2 | |
280 _buffer = self._data[start:stop] | |
281 self._sequence = "" | |
282 for i in range(seq_length): | |
283 x = unpack_uint8(_buffer[(i / 2)]) | |
284 index = (x >> (4 * (1 - (i % 2)))) & 0xf | |
285 base = BAM_DNA_LOOKUP[index] | |
286 self._sequence += base | |
287 | |
288 def _parse_quality(self): | |
289 seq_length = unpack_int32(self._data[16:20]) | |
290 start = 32 + self._query_name_length + (self._num_cigar_ops * 4) + (seq_length + 1) / 2 | |
291 stop = start + seq_length | |
292 _buffer = self._data[start:stop] | |
293 self._quality = "".join(["%s" % (chr(unpack_int8(x) + 33)) for x in _buffer]) | |
294 | |
295 def _parse_tags(self): | |
296 seq_length = unpack_int32(self._data[16:20]) | |
297 start = 32 + self._query_name_length + (self._num_cigar_ops * 4) + (seq_length + 1) / 2 + (seq_length + 1) - 1 | |
298 stop = start + seq_length | |
299 _buffer = self._data[start:] | |
300 tagParser = TagParser(_buffer) | |
301 self._tags = tagParser.getTags() | |
302 | |
303 | |
304 class FileReader(object): | |
305 | |
306 def __init__(self, handle): | |
307 self._handle = handle | |
308 self._readHeader() | |
309 | |
310 def _readHeader(self): | |
311 magic = unpack_string(self._handle.read(4)) | |
312 if magic != "BAM\1": | |
313 raise Exception("File should start with 'BAM\1', starting with '%s' instead." % (magic)) | |
314 tlen = unpack_int32(self._handle.read(4)) | |
315 text = unpack_string(self._handle.read(tlen)) | |
316 nrefs = unpack_int32(self._handle.read(4)) | |
317 self._refs = [] | |
318 for i in range(nrefs): | |
319 sizeName = unpack_int32(self._handle.read(4)) | |
320 name = unpack_string(self._handle.read(sizeName)) | |
321 size = unpack_int32(self._handle.read(4)) | |
322 self._refs.append(name) | |
323 self._startPos = self._handle.tell() | |
324 | |
325 def reset(self): | |
326 self._handle.seek(self._startPos) | |
327 | |
328 def getNextAlignment(self): | |
329 try: | |
330 blockSize = unpack_int32(self._handle.read(4)) | |
331 except struct.error: | |
332 return False | |
333 block = self._handle.read(blockSize) | |
334 currentRead = AlignedRead(block, self._refs) | |
335 return currentRead | |
336 | |
337 | |
338 | |
339 def parseAlignedRead(read): | |
340 if (read._flag & 0x4) == 0x4: | |
341 return None | |
342 | |
343 mapping = Mapping() | |
344 direction = 1 if (read._flag & 0x10) == 0x0 else -1 | |
345 genomeStart = read._pos | |
346 nbOccurrences = 1 | |
347 nbMismatches = 0 | |
348 nbMatches = 0 | |
349 nbGaps = 0 | |
350 subMapping = None | |
351 queryOffset = 0 | |
352 targetOffset = 0 | |
353 readStart = None | |
354 | |
355 for tag, value in read._tags.iteritems(): | |
356 if tag == "X0": | |
357 nbOccurrences = value._value | |
358 elif tag == "X1": | |
359 nbOccurrences += value._value | |
360 elif tag == "XM": | |
361 nbMismatches = value._value | |
362 mapping.setTagValue("nbOccurrences", nbOccurrences) | |
363 mapping.setTagValue("quality", read._mappingQuality) | |
364 | |
365 for operation in read._cigar: | |
366 if operation._type == "M": | |
367 if readStart == None: | |
368 readStart = queryOffset | |
369 if subMapping == None: | |
370 subMapping = SubMapping() | |
371 subMapping.setSize(operation._length) | |
372 subMapping.setDirection(direction) | |
373 subMapping.queryInterval.setName(read._name) | |
374 subMapping.queryInterval.setStart(queryOffset) | |
375 subMapping.queryInterval.setDirection(direction) | |
376 subMapping.targetInterval.setChromosome(read._chromosome) | |
377 subMapping.targetInterval.setStart(genomeStart + targetOffset) | |
378 subMapping.targetInterval.setDirection(1) | |
379 nbMatches += operation._length | |
380 targetOffset += operation._length | |
381 queryOffset += operation._length | |
382 currentNumber = 0 | |
383 continue | |
384 if operation._type == "I": | |
385 nbGaps += 1 | |
386 queryOffset += operation._length | |
387 currentNumber = 0 | |
388 continue | |
389 if operation._type == "D": | |
390 if subMapping != None: | |
391 subMapping.queryInterval.setEnd(queryOffset - 1) | |
392 subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1) | |
393 mapping.addSubMapping(subMapping) | |
394 subMapping = None | |
395 nbGaps += 1 | |
396 targetOffset += operation._length | |
397 currentNumber = 0 | |
398 continue | |
399 if operation._type == "N": | |
400 if subMapping != None: | |
401 subMapping.queryInterval.setEnd(queryOffset - 1) | |
402 subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1) | |
403 mapping.addSubMapping(subMapping) | |
404 subMapping = None | |
405 targetOffset += operation._length | |
406 currentNumber = 0 | |
407 continue | |
408 if operation._type == "S": | |
409 nbMismatches += operation._length | |
410 targetOffset += operation._length | |
411 queryOffset += operation._length | |
412 currentNumber = 0 | |
413 continue | |
414 if operation._type == "H": | |
415 targetOffset += operation._length | |
416 queryOffset += operation._length | |
417 currentNumber = 0 | |
418 continue | |
419 if operation._type == "P": | |
420 continue | |
421 raise Exception("Do not understand parameter '%s'" % (operation._type)) | |
422 | |
423 if subMapping != None: | |
424 subMapping.queryInterval.setEnd(queryOffset - 1) | |
425 subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1) | |
426 mapping.addSubMapping(subMapping) | |
427 mapping.queryInterval.setStart(readStart) | |
428 mapping.queryInterval.setEnd(queryOffset - 1) | |
429 mapping.targetInterval.setEnd(genomeStart + targetOffset - 1) | |
430 mapping.setNbMismatches(nbMismatches) | |
431 mapping.setNbGaps(nbGaps) | |
432 mapping.queryInterval.setName(read._name) | |
433 mapping.queryInterval.setDirection(direction) | |
434 mapping.targetInterval.setChromosome(read._chromosome) | |
435 mapping.targetInterval.setStart(genomeStart) | |
436 mapping.targetInterval.setDirection(direction) | |
437 mapping.setSize(len(read._sequence)) | |
438 mapping.setDirection(direction) | |
439 return mapping | |
440 | |
441 | |
442 class BamParser(MapperParser): | |
443 """A class that parses BAM format""" | |
444 | |
445 def __init__(self, fileName, verbosity = 0): | |
446 self.verbosity = verbosity | |
447 self.handle = gzip.open(fileName, "rb") | |
448 self.reader = FileReader(self.handle) | |
449 self.nbMappings = None | |
450 self.fileName = fileName | |
451 | |
452 | |
453 def __del__(self): | |
454 self.handle.close() | |
455 | |
456 | |
457 def getFileFormats(): | |
458 return ["bam"] | |
459 getFileFormats = staticmethod(getFileFormats) | |
460 | |
461 | |
462 def reset(self): | |
463 self.reader.reset() | |
464 | |
465 | |
466 def getNextMapping(self): | |
467 self.currentMapping = None | |
468 while self.currentMapping == None: | |
469 read = self.reader.getNextAlignment() | |
470 if not read: | |
471 self.currentMapping = False | |
472 return False | |
473 read.parse() | |
474 self.currentMapping = parseAlignedRead(read) | |
475 return self.currentMapping | |
476 | |
477 | |
478 def setDefaultTagValue(self, name, value): | |
479 pass | |
480 | |
481 | |
482 def skipFirstLines(self): | |
483 pass |