6
|
1 #
|
|
2 # Copyright INRA-URGI 2009-2012
|
|
3 #
|
|
4 # This software is governed by the CeCILL license under French law and
|
|
5 # abiding by the rules of distribution of free software. You can use,
|
|
6 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
7 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
8 # "http://www.cecill.info".
|
|
9 #
|
|
10 # As a counterpart to the access to the source code and rights to copy,
|
|
11 # modify and redistribute granted by the license, users are provided only
|
|
12 # with a limited warranty and the software's author, the holder of the
|
|
13 # economic rights, and the successive licensors have only limited
|
|
14 # liability.
|
|
15 #
|
|
16 # In this respect, the user's attention is drawn to the risks associated
|
|
17 # with loading, using, modifying and/or developing or reproducing the
|
|
18 # software by the user in light of its specific status of free software,
|
|
19 # that may mean that it is complicated to manipulate, and that also
|
|
20 # therefore means that it is reserved for developers and experienced
|
|
21 # professionals having in-depth computer knowledge. Users are therefore
|
|
22 # encouraged to load and test the software's suitability as regards their
|
|
23 # requirements in conditions enabling the security of their systems and/or
|
|
24 # data to be ensured and, more generally, to use and operate it in the
|
|
25 # same conditions as regards security.
|
|
26 #
|
|
27 # The fact that you are presently reading this means that you have had
|
|
28 # knowledge of the CeCILL license and that you accept its terms.
|
|
29 #
|
|
30 import re, sys, gzip, struct
|
|
31 from commons.core.parsing.MapperParser import MapperParser
|
|
32 from SMART.Java.Python.structure.Mapping import Mapping
|
|
33 from SMART.Java.Python.structure.SubMapping import SubMapping
|
|
34 from SMART.Java.Python.structure.Interval import Interval
|
|
35
|
|
36
|
|
37 BAM_DNA_LOOKUP = "=ACMGRSVTWYHKDBN"
|
|
38
|
|
39 BAM_CIGAR_LOOKUP = "MIDNSHP=X"
|
|
40 BAM_CIGAR_SHIFT = 4
|
|
41 BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1)
|
|
42
|
|
43
|
|
44
|
|
45 def pack_int32(x):
|
|
46 return struct.pack('<i', x)
|
|
47
|
|
48 def pack_uint32(x):
|
|
49 return struct.pack('<I', x)
|
|
50
|
|
51 def unpack_int8(x):
|
|
52 return struct.unpack('<b', x)[0]
|
|
53
|
|
54 def unpack_int16(x):
|
|
55 return struct.unpack('<h', x)[0]
|
|
56
|
|
57 def unpack_int32(x):
|
|
58 return struct.unpack('<i', x)[0]
|
|
59
|
|
60 def unpack_int64(x):
|
|
61 return struct.unpack('<q', x)[0]
|
|
62
|
|
63 def unpack_uint8(x):
|
|
64 return struct.unpack('<B', x)[0]
|
|
65
|
|
66 def unpack_uint16(x):
|
|
67 return struct.unpack('<H', x)[0]
|
|
68
|
|
69 def unpack_uint32(x):
|
|
70 return struct.unpack('<I', x)[0]
|
|
71
|
|
72 def unpack_uint64(x):
|
|
73 return struct.unpack('<Q', x)[0]
|
|
74
|
|
75 def unpack_float(x):
|
|
76 return struct.unpack('<f', x)[0]
|
|
77
|
|
78 def unpack_string(x):
|
|
79 length = len(x)
|
|
80 format_string = "<{0}s".format(length)
|
|
81 string = struct.unpack(format_string, x)[0]
|
|
82 if string[-1] == '\0':
|
|
83 return string[:-1]
|
|
84 else:
|
|
85 return string
|
|
86
|
|
87
|
|
88 BAM_TAG_CODE = {"c": unpack_int8, \
|
|
89 "C": unpack_uint8, \
|
|
90 "s": unpack_int16, \
|
|
91 "S": unpack_uint16, \
|
|
92 "i": unpack_int32, \
|
|
93 "I": unpack_uint32, \
|
|
94 "f": unpack_float, \
|
|
95 #"A": unpack_int8, \
|
|
96 "A": lambda x: x, \
|
|
97 "Z": unpack_int8, \
|
|
98 "H": unpack_int8}
|
|
99
|
|
100 BAM_TAG_VALUE = {"c": int, \
|
|
101 "C": int, \
|
|
102 "s": int, \
|
|
103 "S": int, \
|
|
104 "i": int, \
|
|
105 "I": int, \
|
|
106 "f": float, \
|
|
107 "A": lambda x: x}
|
|
108
|
|
109 BAM_TAG_SIZE = {"c": 1, \
|
|
110 "C": 1, \
|
|
111 "s": 2, \
|
|
112 "S": 2, \
|
|
113 "i": 4, \
|
|
114 "I": 4, \
|
|
115 "f": 4, \
|
|
116 "A": 1}
|
|
117
|
|
118
|
|
119 class CigarOp(object):
|
|
120 def __init__(self, data):
|
|
121 self._length = data >> BAM_CIGAR_SHIFT
|
|
122 self._type = BAM_CIGAR_LOOKUP[ data & BAM_CIGAR_MASK ]
|
|
123
|
|
124
|
|
125 class CigarData(object):
|
|
126 def __init__(self, data, num_ops):
|
|
127 self._ops = []
|
|
128 for i in range(num_ops):
|
|
129 cigar_data = unpack_uint32(data[i*4: (i+1)*4])
|
|
130 self._ops.append(CigarOp(cigar_data))
|
|
131
|
|
132 def getCigarData(self):
|
|
133 return self._ops
|
|
134
|
|
135 def __str__(self):
|
|
136 return "".join(["%d%s" % (op._length, op._type) for op in self._ops])
|
|
137
|
|
138
|
|
139 class TagsData(object):
|
|
140 def __init__(self):
|
|
141 self._tags = {}
|
|
142
|
|
143 def add(self, tag):
|
|
144 self._tags[tag._tag] = tag
|
|
145
|
|
146 def getTags(self):
|
|
147 return self._tags
|
|
148
|
|
149 def __str__(self):
|
|
150 return " ".join([self._tags[tag] for tag in sorted(self._tags.keys())])
|
|
151
|
|
152
|
|
153 class TagData(object):
|
|
154 def __init__(self, tag, type, value):
|
|
155 self._tag = tag
|
|
156 self._type = type
|
|
157 self._value = value
|
|
158
|
|
159 def __str__(self):
|
|
160 if self._type in "AZHB":
|
|
161 return "%s:%s:%s" % (self._tag, self._type, self._value)
|
|
162 if self._type in "cCsSiI":
|
|
163 return "%s:%s:%d" % (self._tag, self._type, self._value)
|
|
164 return "%s:%s:%f" % (self._tag, self._type, self._value)
|
|
165
|
|
166
|
|
167 class TagParser(object):
|
|
168 def __init__(self, data):
|
|
169 self._data = data
|
|
170 self._tags = TagsData()
|
|
171 self._parse()
|
|
172
|
|
173 def _parse(self):
|
|
174 while self._data:
|
|
175 tag = "%s%s" % (chr(unpack_int8(self._data[0])), chr(unpack_int8(self._data[1])))
|
|
176 type = chr(unpack_int8(self._data[2]))
|
|
177 self._data = self._data[3:]
|
|
178 if type in BAM_TAG_VALUE:
|
|
179 value = self._parseUnique(type)
|
|
180 elif type == "Z":
|
|
181 value = self._parseString()
|
|
182 elif type == "H":
|
|
183 size = unpack_int8(self._data[0])
|
|
184 self._data = self._data[1:]
|
|
185 value = self._parseSeveral("C", size)
|
|
186 elif type == "B":
|
|
187 secondType = unpack_int8(self._data[0])
|
|
188 size = unpack_int8(self._data[1]) + unpack_int8(self._data[2]) * 16 + unpack_int8(self._data[3]) * 16 * 16 + unpack_int8(self._data[4]) * 16 * 16 * 16
|
|
189 self._data = self._data[5:]
|
|
190 value = self._parseSeveral(secondType, size)
|
|
191 else:
|
|
192 raise Exception("Cannot parse type '%s'." % (type))
|
|
193 fullTag = TagData(tag, type, value)
|
|
194 self._tags.add(fullTag)
|
|
195
|
|
196 def _parseUnique(self, type):
|
|
197 value = BAM_TAG_CODE[type](self._data[:BAM_TAG_SIZE[type]])
|
|
198 self._data = self._data[BAM_TAG_SIZE[type]:]
|
|
199 return value
|
|
200
|
|
201 def _parseSeveral(self, type, size):
|
|
202 value = []
|
|
203 for i in range(size):
|
|
204 value.append(self._parseUnique(type))
|
|
205 return value
|
|
206
|
|
207 def _parseString(self):
|
|
208 value = ""
|
|
209 char = self._data[0]
|
|
210 self._data = self._data[1:]
|
|
211 while unpack_int8(char) != 0:
|
|
212 value += char
|
|
213 char = self._data[0]
|
|
214 self._data = self._data[1:]
|
|
215 return value
|
|
216
|
|
217 def getTags(self):
|
|
218 return self._tags.getTags()
|
|
219
|
|
220 def __str__(self):
|
|
221 return self._tags
|
|
222
|
|
223
|
|
224 class AlignedRead(object):
|
|
225 def __init__(self, data, refs):
|
|
226 self._data = data
|
|
227 self._refs = refs
|
|
228
|
|
229 def parse(self):
|
|
230 self._parse_common()
|
|
231 self._parse_flag_nc()
|
|
232 self._parse_bin_mq_nl()
|
|
233 self._parse_name()
|
|
234 self._parse_cigar()
|
|
235 self._parse_sequence()
|
|
236 self._parse_quality()
|
|
237 self._parse_tags()
|
|
238
|
|
239 def _parse_common(self):
|
|
240 ref_id = unpack_int32(self._data[0:4])
|
|
241 self._chromosome = self._refs[ref_id]
|
|
242 self._pos = unpack_int32(self._data[4:8]) + 1
|
|
243 mate_ref_id = unpack_int32(self._data[20:24])
|
|
244 if mate_ref_id == -1:
|
|
245 self._rnext = "*"
|
|
246 else:
|
|
247 self._rnext = self._refs[mate_ref_id]
|
|
248 if self._rnext == self._chromosome:
|
|
249 self._rnext = "="
|
|
250 self._pnext = unpack_int32(self._data[24:28]) + 1
|
|
251 self._tlen = unpack_int32(self._data[28:32])
|
|
252
|
|
253 def _parse_bin_mq_nl(self):
|
|
254 bin_mq_nl = unpack_uint32(self._data[8:12])
|
|
255 self._bin = bin_mq_nl >> 16
|
|
256 self._mappingQuality = bin_mq_nl >> 8 & 0xff
|
|
257 self._query_name_length = bin_mq_nl & 0xff
|
|
258
|
|
259 def _parse_flag_nc(self):
|
|
260 flag_nc = unpack_uint32(self._data[12:16])
|
|
261 self._flag = flag_nc >> 16
|
|
262 self._num_cigar_ops = flag_nc & 0xffff
|
|
263
|
|
264 def _parse_name(self):
|
|
265 start = 32
|
|
266 stop = start + self._query_name_length
|
|
267 self._name = unpack_string(self._data[start:stop])
|
|
268
|
|
269 def _parse_cigar(self):
|
|
270 start = 32 + self._query_name_length
|
|
271 stop = start + (self._num_cigar_ops * 4)
|
|
272 _buffer = self._data[start:stop]
|
|
273 cigar = CigarData(_buffer, self._num_cigar_ops)
|
|
274 self._cigar = cigar.getCigarData()
|
|
275
|
|
276 def _parse_sequence(self):
|
|
277 seq_length = unpack_int32(self._data[16:20])
|
|
278 start = 32 + self._query_name_length + (self._num_cigar_ops * 4)
|
|
279 stop = start + (seq_length + 1) / 2
|
|
280 _buffer = self._data[start:stop]
|
|
281 self._sequence = ""
|
|
282 for i in range(seq_length):
|
|
283 x = unpack_uint8(_buffer[(i / 2)])
|
|
284 index = (x >> (4 * (1 - (i % 2)))) & 0xf
|
|
285 base = BAM_DNA_LOOKUP[index]
|
|
286 self._sequence += base
|
|
287
|
|
288 def _parse_quality(self):
|
|
289 seq_length = unpack_int32(self._data[16:20])
|
|
290 start = 32 + self._query_name_length + (self._num_cigar_ops * 4) + (seq_length + 1) / 2
|
|
291 stop = start + seq_length
|
|
292 _buffer = self._data[start:stop]
|
|
293 self._quality = "".join(["%s" % (chr(unpack_int8(x) + 33)) for x in _buffer])
|
|
294
|
|
295 def _parse_tags(self):
|
|
296 seq_length = unpack_int32(self._data[16:20])
|
|
297 start = 32 + self._query_name_length + (self._num_cigar_ops * 4) + (seq_length + 1) / 2 + (seq_length + 1) - 1
|
|
298 stop = start + seq_length
|
|
299 _buffer = self._data[start:]
|
|
300 tagParser = TagParser(_buffer)
|
|
301 self._tags = tagParser.getTags()
|
|
302
|
|
303
|
|
304 class FileReader(object):
|
|
305
|
|
306 def __init__(self, handle):
|
|
307 self._handle = handle
|
|
308 self._readHeader()
|
|
309
|
|
310 def _readHeader(self):
|
|
311 magic = unpack_string(self._handle.read(4))
|
|
312 if magic != "BAM\1":
|
|
313 raise Exception("File should start with 'BAM\1', starting with '%s' instead." % (magic))
|
|
314 tlen = unpack_int32(self._handle.read(4))
|
|
315 text = unpack_string(self._handle.read(tlen))
|
|
316 nrefs = unpack_int32(self._handle.read(4))
|
|
317 self._refs = []
|
|
318 for i in range(nrefs):
|
|
319 sizeName = unpack_int32(self._handle.read(4))
|
|
320 name = unpack_string(self._handle.read(sizeName))
|
|
321 size = unpack_int32(self._handle.read(4))
|
|
322 self._refs.append(name)
|
|
323 self._startPos = self._handle.tell()
|
|
324
|
|
325 def reset(self):
|
|
326 self._handle.seek(self._startPos)
|
|
327
|
|
328 def getNextAlignment(self):
|
|
329 try:
|
|
330 blockSize = unpack_int32(self._handle.read(4))
|
|
331 except struct.error:
|
|
332 return False
|
|
333 block = self._handle.read(blockSize)
|
|
334 currentRead = AlignedRead(block, self._refs)
|
|
335 return currentRead
|
|
336
|
|
337
|
|
338
|
|
339 def parseAlignedRead(read):
|
|
340 if (read._flag & 0x4) == 0x4:
|
|
341 return None
|
|
342
|
|
343 mapping = Mapping()
|
|
344 direction = 1 if (read._flag & 0x10) == 0x0 else -1
|
|
345 genomeStart = read._pos
|
|
346 nbOccurrences = 1
|
|
347 nbMismatches = 0
|
|
348 nbMatches = 0
|
|
349 nbGaps = 0
|
|
350 subMapping = None
|
|
351 queryOffset = 0
|
|
352 targetOffset = 0
|
|
353 readStart = None
|
|
354
|
|
355 for tag, value in read._tags.iteritems():
|
|
356 if tag == "X0":
|
|
357 nbOccurrences = value._value
|
|
358 elif tag == "X1":
|
|
359 nbOccurrences += value._value
|
|
360 elif tag == "XM":
|
|
361 nbMismatches = value._value
|
|
362 mapping.setTagValue("nbOccurrences", nbOccurrences)
|
|
363 mapping.setTagValue("quality", read._mappingQuality)
|
|
364
|
|
365 for operation in read._cigar:
|
|
366 if operation._type == "M":
|
|
367 if readStart == None:
|
|
368 readStart = queryOffset
|
|
369 if subMapping == None:
|
|
370 subMapping = SubMapping()
|
|
371 subMapping.setSize(operation._length)
|
|
372 subMapping.setDirection(direction)
|
|
373 subMapping.queryInterval.setName(read._name)
|
|
374 subMapping.queryInterval.setStart(queryOffset)
|
|
375 subMapping.queryInterval.setDirection(direction)
|
|
376 subMapping.targetInterval.setChromosome(read._chromosome)
|
|
377 subMapping.targetInterval.setStart(genomeStart + targetOffset)
|
|
378 subMapping.targetInterval.setDirection(1)
|
|
379 nbMatches += operation._length
|
|
380 targetOffset += operation._length
|
|
381 queryOffset += operation._length
|
|
382 currentNumber = 0
|
|
383 continue
|
|
384 if operation._type == "I":
|
|
385 nbGaps += 1
|
|
386 queryOffset += operation._length
|
|
387 currentNumber = 0
|
|
388 continue
|
|
389 if operation._type == "D":
|
|
390 if subMapping != None:
|
|
391 subMapping.queryInterval.setEnd(queryOffset - 1)
|
|
392 subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
|
|
393 mapping.addSubMapping(subMapping)
|
|
394 subMapping = None
|
|
395 nbGaps += 1
|
|
396 targetOffset += operation._length
|
|
397 currentNumber = 0
|
|
398 continue
|
|
399 if operation._type == "N":
|
|
400 if subMapping != None:
|
|
401 subMapping.queryInterval.setEnd(queryOffset - 1)
|
|
402 subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
|
|
403 mapping.addSubMapping(subMapping)
|
|
404 subMapping = None
|
|
405 targetOffset += operation._length
|
|
406 currentNumber = 0
|
|
407 continue
|
|
408 if operation._type == "S":
|
|
409 nbMismatches += operation._length
|
|
410 targetOffset += operation._length
|
|
411 queryOffset += operation._length
|
|
412 currentNumber = 0
|
|
413 continue
|
|
414 if operation._type == "H":
|
|
415 targetOffset += operation._length
|
|
416 queryOffset += operation._length
|
|
417 currentNumber = 0
|
|
418 continue
|
|
419 if operation._type == "P":
|
|
420 continue
|
|
421 raise Exception("Do not understand parameter '%s'" % (operation._type))
|
|
422
|
|
423 if subMapping != None:
|
|
424 subMapping.queryInterval.setEnd(queryOffset - 1)
|
|
425 subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
|
|
426 mapping.addSubMapping(subMapping)
|
|
427 mapping.queryInterval.setStart(readStart)
|
|
428 mapping.queryInterval.setEnd(queryOffset - 1)
|
|
429 mapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
|
|
430 mapping.setNbMismatches(nbMismatches)
|
|
431 mapping.setNbGaps(nbGaps)
|
|
432 mapping.queryInterval.setName(read._name)
|
|
433 mapping.queryInterval.setDirection(direction)
|
|
434 mapping.targetInterval.setChromosome(read._chromosome)
|
|
435 mapping.targetInterval.setStart(genomeStart)
|
|
436 mapping.targetInterval.setDirection(direction)
|
|
437 mapping.setSize(len(read._sequence))
|
|
438 mapping.setDirection(direction)
|
|
439 return mapping
|
|
440
|
|
441
|
|
442 class BamParser(MapperParser):
|
|
443 """A class that parses BAM format"""
|
|
444
|
|
445 def __init__(self, fileName, verbosity = 0):
|
|
446 self.verbosity = verbosity
|
|
447 self.handle = gzip.open(fileName, "rb")
|
|
448 self.reader = FileReader(self.handle)
|
|
449 self.nbMappings = None
|
|
450 self.fileName = fileName
|
|
451
|
|
452
|
|
453 def __del__(self):
|
|
454 self.handle.close()
|
|
455
|
|
456
|
|
457 def getFileFormats():
|
|
458 return ["bam"]
|
|
459 getFileFormats = staticmethod(getFileFormats)
|
|
460
|
|
461
|
|
462 def reset(self):
|
|
463 self.reader.reset()
|
|
464
|
|
465
|
|
466 def getNextMapping(self):
|
|
467 self.currentMapping = None
|
|
468 while self.currentMapping == None:
|
|
469 read = self.reader.getNextAlignment()
|
|
470 if not read:
|
|
471 self.currentMapping = False
|
|
472 return False
|
|
473 read.parse()
|
|
474 self.currentMapping = parseAlignedRead(read)
|
|
475 return self.currentMapping
|
|
476
|
|
477
|
|
478 def setDefaultTagValue(self, name, value):
|
|
479 pass
|
|
480
|
|
481
|
|
482 def skipFirstLines(self):
|
|
483 pass
|