Mercurial > repos > yufei-luo > s_mart
comparison commons/core/parsing/BamParser.py @ 6:769e306b7933
Change the repository level.
| author | yufei-luo |
|---|---|
| date | Fri, 18 Jan 2013 04:54:14 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 5:ea3082881bf8 | 6:769e306b7933 |
|---|---|
| 1 # | |
| 2 # Copyright INRA-URGI 2009-2012 | |
| 3 # | |
| 4 # This software is governed by the CeCILL license under French law and | |
| 5 # abiding by the rules of distribution of free software. You can use, | |
| 6 # modify and/ or redistribute the software under the terms of the CeCILL | |
| 7 # license as circulated by CEA, CNRS and INRIA at the following URL | |
| 8 # "http://www.cecill.info". | |
| 9 # | |
| 10 # As a counterpart to the access to the source code and rights to copy, | |
| 11 # modify and redistribute granted by the license, users are provided only | |
| 12 # with a limited warranty and the software's author, the holder of the | |
| 13 # economic rights, and the successive licensors have only limited | |
| 14 # liability. | |
| 15 # | |
| 16 # In this respect, the user's attention is drawn to the risks associated | |
| 17 # with loading, using, modifying and/or developing or reproducing the | |
| 18 # software by the user in light of its specific status of free software, | |
| 19 # that may mean that it is complicated to manipulate, and that also | |
| 20 # therefore means that it is reserved for developers and experienced | |
| 21 # professionals having in-depth computer knowledge. Users are therefore | |
| 22 # encouraged to load and test the software's suitability as regards their | |
| 23 # requirements in conditions enabling the security of their systems and/or | |
| 24 # data to be ensured and, more generally, to use and operate it in the | |
| 25 # same conditions as regards security. | |
| 26 # | |
| 27 # The fact that you are presently reading this means that you have had | |
| 28 # knowledge of the CeCILL license and that you accept its terms. | |
| 29 # | |
| 30 import re, sys, gzip, struct | |
| 31 from commons.core.parsing.MapperParser import MapperParser | |
| 32 from SMART.Java.Python.structure.Mapping import Mapping | |
| 33 from SMART.Java.Python.structure.SubMapping import SubMapping | |
| 34 from SMART.Java.Python.structure.Interval import Interval | |
| 35 | |
| 36 | |
| 37 BAM_DNA_LOOKUP = "=ACMGRSVTWYHKDBN" | |
| 38 | |
| 39 BAM_CIGAR_LOOKUP = "MIDNSHP=X" | |
| 40 BAM_CIGAR_SHIFT = 4 | |
| 41 BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1) | |
| 42 | |
| 43 | |
| 44 | |
| 45 def pack_int32(x): | |
| 46 return struct.pack('<i', x) | |
| 47 | |
| 48 def pack_uint32(x): | |
| 49 return struct.pack('<I', x) | |
| 50 | |
| 51 def unpack_int8(x): | |
| 52 return struct.unpack('<b', x)[0] | |
| 53 | |
| 54 def unpack_int16(x): | |
| 55 return struct.unpack('<h', x)[0] | |
| 56 | |
| 57 def unpack_int32(x): | |
| 58 return struct.unpack('<i', x)[0] | |
| 59 | |
| 60 def unpack_int64(x): | |
| 61 return struct.unpack('<q', x)[0] | |
| 62 | |
| 63 def unpack_uint8(x): | |
| 64 return struct.unpack('<B', x)[0] | |
| 65 | |
| 66 def unpack_uint16(x): | |
| 67 return struct.unpack('<H', x)[0] | |
| 68 | |
| 69 def unpack_uint32(x): | |
| 70 return struct.unpack('<I', x)[0] | |
| 71 | |
| 72 def unpack_uint64(x): | |
| 73 return struct.unpack('<Q', x)[0] | |
| 74 | |
| 75 def unpack_float(x): | |
| 76 return struct.unpack('<f', x)[0] | |
| 77 | |
| 78 def unpack_string(x): | |
| 79 length = len(x) | |
| 80 format_string = "<{0}s".format(length) | |
| 81 string = struct.unpack(format_string, x)[0] | |
| 82 if string[-1] == '\0': | |
| 83 return string[:-1] | |
| 84 else: | |
| 85 return string | |
| 86 | |
| 87 | |
| 88 BAM_TAG_CODE = {"c": unpack_int8, \ | |
| 89 "C": unpack_uint8, \ | |
| 90 "s": unpack_int16, \ | |
| 91 "S": unpack_uint16, \ | |
| 92 "i": unpack_int32, \ | |
| 93 "I": unpack_uint32, \ | |
| 94 "f": unpack_float, \ | |
| 95 #"A": unpack_int8, \ | |
| 96 "A": lambda x: x, \ | |
| 97 "Z": unpack_int8, \ | |
| 98 "H": unpack_int8} | |
| 99 | |
| 100 BAM_TAG_VALUE = {"c": int, \ | |
| 101 "C": int, \ | |
| 102 "s": int, \ | |
| 103 "S": int, \ | |
| 104 "i": int, \ | |
| 105 "I": int, \ | |
| 106 "f": float, \ | |
| 107 "A": lambda x: x} | |
| 108 | |
| 109 BAM_TAG_SIZE = {"c": 1, \ | |
| 110 "C": 1, \ | |
| 111 "s": 2, \ | |
| 112 "S": 2, \ | |
| 113 "i": 4, \ | |
| 114 "I": 4, \ | |
| 115 "f": 4, \ | |
| 116 "A": 1} | |
| 117 | |
| 118 | |
| 119 class CigarOp(object): | |
| 120 def __init__(self, data): | |
| 121 self._length = data >> BAM_CIGAR_SHIFT | |
| 122 self._type = BAM_CIGAR_LOOKUP[ data & BAM_CIGAR_MASK ] | |
| 123 | |
| 124 | |
| 125 class CigarData(object): | |
| 126 def __init__(self, data, num_ops): | |
| 127 self._ops = [] | |
| 128 for i in range(num_ops): | |
| 129 cigar_data = unpack_uint32(data[i*4: (i+1)*4]) | |
| 130 self._ops.append(CigarOp(cigar_data)) | |
| 131 | |
| 132 def getCigarData(self): | |
| 133 return self._ops | |
| 134 | |
| 135 def __str__(self): | |
| 136 return "".join(["%d%s" % (op._length, op._type) for op in self._ops]) | |
| 137 | |
| 138 | |
| 139 class TagsData(object): | |
| 140 def __init__(self): | |
| 141 self._tags = {} | |
| 142 | |
| 143 def add(self, tag): | |
| 144 self._tags[tag._tag] = tag | |
| 145 | |
| 146 def getTags(self): | |
| 147 return self._tags | |
| 148 | |
| 149 def __str__(self): | |
| 150 return " ".join([self._tags[tag] for tag in sorted(self._tags.keys())]) | |
| 151 | |
| 152 | |
| 153 class TagData(object): | |
| 154 def __init__(self, tag, type, value): | |
| 155 self._tag = tag | |
| 156 self._type = type | |
| 157 self._value = value | |
| 158 | |
| 159 def __str__(self): | |
| 160 if self._type in "AZHB": | |
| 161 return "%s:%s:%s" % (self._tag, self._type, self._value) | |
| 162 if self._type in "cCsSiI": | |
| 163 return "%s:%s:%d" % (self._tag, self._type, self._value) | |
| 164 return "%s:%s:%f" % (self._tag, self._type, self._value) | |
| 165 | |
| 166 | |
| 167 class TagParser(object): | |
| 168 def __init__(self, data): | |
| 169 self._data = data | |
| 170 self._tags = TagsData() | |
| 171 self._parse() | |
| 172 | |
| 173 def _parse(self): | |
| 174 while self._data: | |
| 175 tag = "%s%s" % (chr(unpack_int8(self._data[0])), chr(unpack_int8(self._data[1]))) | |
| 176 type = chr(unpack_int8(self._data[2])) | |
| 177 self._data = self._data[3:] | |
| 178 if type in BAM_TAG_VALUE: | |
| 179 value = self._parseUnique(type) | |
| 180 elif type == "Z": | |
| 181 value = self._parseString() | |
| 182 elif type == "H": | |
| 183 size = unpack_int8(self._data[0]) | |
| 184 self._data = self._data[1:] | |
| 185 value = self._parseSeveral("C", size) | |
| 186 elif type == "B": | |
| 187 secondType = unpack_int8(self._data[0]) | |
| 188 size = unpack_int8(self._data[1]) + unpack_int8(self._data[2]) * 16 + unpack_int8(self._data[3]) * 16 * 16 + unpack_int8(self._data[4]) * 16 * 16 * 16 | |
| 189 self._data = self._data[5:] | |
| 190 value = self._parseSeveral(secondType, size) | |
| 191 else: | |
| 192 raise Exception("Cannot parse type '%s'." % (type)) | |
| 193 fullTag = TagData(tag, type, value) | |
| 194 self._tags.add(fullTag) | |
| 195 | |
| 196 def _parseUnique(self, type): | |
| 197 value = BAM_TAG_CODE[type](self._data[:BAM_TAG_SIZE[type]]) | |
| 198 self._data = self._data[BAM_TAG_SIZE[type]:] | |
| 199 return value | |
| 200 | |
| 201 def _parseSeveral(self, type, size): | |
| 202 value = [] | |
| 203 for i in range(size): | |
| 204 value.append(self._parseUnique(type)) | |
| 205 return value | |
| 206 | |
| 207 def _parseString(self): | |
| 208 value = "" | |
| 209 char = self._data[0] | |
| 210 self._data = self._data[1:] | |
| 211 while unpack_int8(char) != 0: | |
| 212 value += char | |
| 213 char = self._data[0] | |
| 214 self._data = self._data[1:] | |
| 215 return value | |
| 216 | |
| 217 def getTags(self): | |
| 218 return self._tags.getTags() | |
| 219 | |
| 220 def __str__(self): | |
| 221 return self._tags | |
| 222 | |
| 223 | |
| 224 class AlignedRead(object): | |
| 225 def __init__(self, data, refs): | |
| 226 self._data = data | |
| 227 self._refs = refs | |
| 228 | |
| 229 def parse(self): | |
| 230 self._parse_common() | |
| 231 self._parse_flag_nc() | |
| 232 self._parse_bin_mq_nl() | |
| 233 self._parse_name() | |
| 234 self._parse_cigar() | |
| 235 self._parse_sequence() | |
| 236 self._parse_quality() | |
| 237 self._parse_tags() | |
| 238 | |
| 239 def _parse_common(self): | |
| 240 ref_id = unpack_int32(self._data[0:4]) | |
| 241 self._chromosome = self._refs[ref_id] | |
| 242 self._pos = unpack_int32(self._data[4:8]) + 1 | |
| 243 mate_ref_id = unpack_int32(self._data[20:24]) | |
| 244 if mate_ref_id == -1: | |
| 245 self._rnext = "*" | |
| 246 else: | |
| 247 self._rnext = self._refs[mate_ref_id] | |
| 248 if self._rnext == self._chromosome: | |
| 249 self._rnext = "=" | |
| 250 self._pnext = unpack_int32(self._data[24:28]) + 1 | |
| 251 self._tlen = unpack_int32(self._data[28:32]) | |
| 252 | |
| 253 def _parse_bin_mq_nl(self): | |
| 254 bin_mq_nl = unpack_uint32(self._data[8:12]) | |
| 255 self._bin = bin_mq_nl >> 16 | |
| 256 self._mappingQuality = bin_mq_nl >> 8 & 0xff | |
| 257 self._query_name_length = bin_mq_nl & 0xff | |
| 258 | |
| 259 def _parse_flag_nc(self): | |
| 260 flag_nc = unpack_uint32(self._data[12:16]) | |
| 261 self._flag = flag_nc >> 16 | |
| 262 self._num_cigar_ops = flag_nc & 0xffff | |
| 263 | |
| 264 def _parse_name(self): | |
| 265 start = 32 | |
| 266 stop = start + self._query_name_length | |
| 267 self._name = unpack_string(self._data[start:stop]) | |
| 268 | |
| 269 def _parse_cigar(self): | |
| 270 start = 32 + self._query_name_length | |
| 271 stop = start + (self._num_cigar_ops * 4) | |
| 272 _buffer = self._data[start:stop] | |
| 273 cigar = CigarData(_buffer, self._num_cigar_ops) | |
| 274 self._cigar = cigar.getCigarData() | |
| 275 | |
| 276 def _parse_sequence(self): | |
| 277 seq_length = unpack_int32(self._data[16:20]) | |
| 278 start = 32 + self._query_name_length + (self._num_cigar_ops * 4) | |
| 279 stop = start + (seq_length + 1) / 2 | |
| 280 _buffer = self._data[start:stop] | |
| 281 self._sequence = "" | |
| 282 for i in range(seq_length): | |
| 283 x = unpack_uint8(_buffer[(i / 2)]) | |
| 284 index = (x >> (4 * (1 - (i % 2)))) & 0xf | |
| 285 base = BAM_DNA_LOOKUP[index] | |
| 286 self._sequence += base | |
| 287 | |
| 288 def _parse_quality(self): | |
| 289 seq_length = unpack_int32(self._data[16:20]) | |
| 290 start = 32 + self._query_name_length + (self._num_cigar_ops * 4) + (seq_length + 1) / 2 | |
| 291 stop = start + seq_length | |
| 292 _buffer = self._data[start:stop] | |
| 293 self._quality = "".join(["%s" % (chr(unpack_int8(x) + 33)) for x in _buffer]) | |
| 294 | |
| 295 def _parse_tags(self): | |
| 296 seq_length = unpack_int32(self._data[16:20]) | |
| 297 start = 32 + self._query_name_length + (self._num_cigar_ops * 4) + (seq_length + 1) / 2 + (seq_length + 1) - 1 | |
| 298 stop = start + seq_length | |
| 299 _buffer = self._data[start:] | |
| 300 tagParser = TagParser(_buffer) | |
| 301 self._tags = tagParser.getTags() | |
| 302 | |
| 303 | |
| 304 class FileReader(object): | |
| 305 | |
| 306 def __init__(self, handle): | |
| 307 self._handle = handle | |
| 308 self._readHeader() | |
| 309 | |
| 310 def _readHeader(self): | |
| 311 magic = unpack_string(self._handle.read(4)) | |
| 312 if magic != "BAM\1": | |
| 313 raise Exception("File should start with 'BAM\1', starting with '%s' instead." % (magic)) | |
| 314 tlen = unpack_int32(self._handle.read(4)) | |
| 315 text = unpack_string(self._handle.read(tlen)) | |
| 316 nrefs = unpack_int32(self._handle.read(4)) | |
| 317 self._refs = [] | |
| 318 for i in range(nrefs): | |
| 319 sizeName = unpack_int32(self._handle.read(4)) | |
| 320 name = unpack_string(self._handle.read(sizeName)) | |
| 321 size = unpack_int32(self._handle.read(4)) | |
| 322 self._refs.append(name) | |
| 323 self._startPos = self._handle.tell() | |
| 324 | |
| 325 def reset(self): | |
| 326 self._handle.seek(self._startPos) | |
| 327 | |
| 328 def getNextAlignment(self): | |
| 329 try: | |
| 330 blockSize = unpack_int32(self._handle.read(4)) | |
| 331 except struct.error: | |
| 332 return False | |
| 333 block = self._handle.read(blockSize) | |
| 334 currentRead = AlignedRead(block, self._refs) | |
| 335 return currentRead | |
| 336 | |
| 337 | |
| 338 | |
| 339 def parseAlignedRead(read): | |
| 340 if (read._flag & 0x4) == 0x4: | |
| 341 return None | |
| 342 | |
| 343 mapping = Mapping() | |
| 344 direction = 1 if (read._flag & 0x10) == 0x0 else -1 | |
| 345 genomeStart = read._pos | |
| 346 nbOccurrences = 1 | |
| 347 nbMismatches = 0 | |
| 348 nbMatches = 0 | |
| 349 nbGaps = 0 | |
| 350 subMapping = None | |
| 351 queryOffset = 0 | |
| 352 targetOffset = 0 | |
| 353 readStart = None | |
| 354 | |
| 355 for tag, value in read._tags.iteritems(): | |
| 356 if tag == "X0": | |
| 357 nbOccurrences = value._value | |
| 358 elif tag == "X1": | |
| 359 nbOccurrences += value._value | |
| 360 elif tag == "XM": | |
| 361 nbMismatches = value._value | |
| 362 mapping.setTagValue("nbOccurrences", nbOccurrences) | |
| 363 mapping.setTagValue("quality", read._mappingQuality) | |
| 364 | |
| 365 for operation in read._cigar: | |
| 366 if operation._type == "M": | |
| 367 if readStart == None: | |
| 368 readStart = queryOffset | |
| 369 if subMapping == None: | |
| 370 subMapping = SubMapping() | |
| 371 subMapping.setSize(operation._length) | |
| 372 subMapping.setDirection(direction) | |
| 373 subMapping.queryInterval.setName(read._name) | |
| 374 subMapping.queryInterval.setStart(queryOffset) | |
| 375 subMapping.queryInterval.setDirection(direction) | |
| 376 subMapping.targetInterval.setChromosome(read._chromosome) | |
| 377 subMapping.targetInterval.setStart(genomeStart + targetOffset) | |
| 378 subMapping.targetInterval.setDirection(1) | |
| 379 nbMatches += operation._length | |
| 380 targetOffset += operation._length | |
| 381 queryOffset += operation._length | |
| 382 currentNumber = 0 | |
| 383 continue | |
| 384 if operation._type == "I": | |
| 385 nbGaps += 1 | |
| 386 queryOffset += operation._length | |
| 387 currentNumber = 0 | |
| 388 continue | |
| 389 if operation._type == "D": | |
| 390 if subMapping != None: | |
| 391 subMapping.queryInterval.setEnd(queryOffset - 1) | |
| 392 subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1) | |
| 393 mapping.addSubMapping(subMapping) | |
| 394 subMapping = None | |
| 395 nbGaps += 1 | |
| 396 targetOffset += operation._length | |
| 397 currentNumber = 0 | |
| 398 continue | |
| 399 if operation._type == "N": | |
| 400 if subMapping != None: | |
| 401 subMapping.queryInterval.setEnd(queryOffset - 1) | |
| 402 subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1) | |
| 403 mapping.addSubMapping(subMapping) | |
| 404 subMapping = None | |
| 405 targetOffset += operation._length | |
| 406 currentNumber = 0 | |
| 407 continue | |
| 408 if operation._type == "S": | |
| 409 nbMismatches += operation._length | |
| 410 targetOffset += operation._length | |
| 411 queryOffset += operation._length | |
| 412 currentNumber = 0 | |
| 413 continue | |
| 414 if operation._type == "H": | |
| 415 targetOffset += operation._length | |
| 416 queryOffset += operation._length | |
| 417 currentNumber = 0 | |
| 418 continue | |
| 419 if operation._type == "P": | |
| 420 continue | |
| 421 raise Exception("Do not understand parameter '%s'" % (operation._type)) | |
| 422 | |
| 423 if subMapping != None: | |
| 424 subMapping.queryInterval.setEnd(queryOffset - 1) | |
| 425 subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1) | |
| 426 mapping.addSubMapping(subMapping) | |
| 427 mapping.queryInterval.setStart(readStart) | |
| 428 mapping.queryInterval.setEnd(queryOffset - 1) | |
| 429 mapping.targetInterval.setEnd(genomeStart + targetOffset - 1) | |
| 430 mapping.setNbMismatches(nbMismatches) | |
| 431 mapping.setNbGaps(nbGaps) | |
| 432 mapping.queryInterval.setName(read._name) | |
| 433 mapping.queryInterval.setDirection(direction) | |
| 434 mapping.targetInterval.setChromosome(read._chromosome) | |
| 435 mapping.targetInterval.setStart(genomeStart) | |
| 436 mapping.targetInterval.setDirection(direction) | |
| 437 mapping.setSize(len(read._sequence)) | |
| 438 mapping.setDirection(direction) | |
| 439 return mapping | |
| 440 | |
| 441 | |
| 442 class BamParser(MapperParser): | |
| 443 """A class that parses BAM format""" | |
| 444 | |
| 445 def __init__(self, fileName, verbosity = 0): | |
| 446 self.verbosity = verbosity | |
| 447 self.handle = gzip.open(fileName, "rb") | |
| 448 self.reader = FileReader(self.handle) | |
| 449 self.nbMappings = None | |
| 450 self.fileName = fileName | |
| 451 | |
| 452 | |
| 453 def __del__(self): | |
| 454 self.handle.close() | |
| 455 | |
| 456 | |
| 457 def getFileFormats(): | |
| 458 return ["bam"] | |
| 459 getFileFormats = staticmethod(getFileFormats) | |
| 460 | |
| 461 | |
| 462 def reset(self): | |
| 463 self.reader.reset() | |
| 464 | |
| 465 | |
| 466 def getNextMapping(self): | |
| 467 self.currentMapping = None | |
| 468 while self.currentMapping == None: | |
| 469 read = self.reader.getNextAlignment() | |
| 470 if not read: | |
| 471 self.currentMapping = False | |
| 472 return False | |
| 473 read.parse() | |
| 474 self.currentMapping = parseAlignedRead(read) | |
| 475 return self.currentMapping | |
| 476 | |
| 477 | |
| 478 def setDefaultTagValue(self, name, value): | |
| 479 pass | |
| 480 | |
| 481 | |
| 482 def skipFirstLines(self): | |
| 483 pass |
