diff commons/core/parsing/BamParser.py @ 36:44d5973c188c

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 15:02:29 -0400
parents 769e306b7933
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/core/parsing/BamParser.py	Tue Apr 30 15:02:29 2013 -0400
@@ -0,0 +1,483 @@
+#
+# Copyright INRA-URGI 2009-2012
+# 
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# 
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+# 
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+# 
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+import re, sys, gzip, struct
+from commons.core.parsing.MapperParser import MapperParser
+from SMART.Java.Python.structure.Mapping import Mapping
+from SMART.Java.Python.structure.SubMapping import SubMapping
+from SMART.Java.Python.structure.Interval import Interval
+
+
+BAM_DNA_LOOKUP = "=ACMGRSVTWYHKDBN"
+
+BAM_CIGAR_LOOKUP = "MIDNSHP=X"
+BAM_CIGAR_SHIFT = 4
+BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1)
+
+
+
+def pack_int32(x):
+	return struct.pack('<i', x)
+
+def pack_uint32(x):
+	return struct.pack('<I', x)
+
+def unpack_int8(x):
+	return struct.unpack('<b', x)[0]
+
+def unpack_int16(x):
+	return struct.unpack('<h', x)[0]
+
+def unpack_int32(x):
+	return struct.unpack('<i', x)[0]
+
+def unpack_int64(x):
+	return struct.unpack('<q', x)[0]
+
+def unpack_uint8(x):
+	return struct.unpack('<B', x)[0]
+
+def unpack_uint16(x):
+	return struct.unpack('<H', x)[0]
+
+def unpack_uint32(x):
+	return struct.unpack('<I', x)[0]
+
+def unpack_uint64(x):
+	return struct.unpack('<Q', x)[0]
+
+def unpack_float(x):
+	return struct.unpack('<f', x)[0]
+
+def unpack_string(x):
+	length = len(x)
+	format_string = "<{0}s".format(length)
+	string = struct.unpack(format_string, x)[0]
+	if string[-1] == '\0':
+		return string[:-1]
+	else:
+		return string
+
+
+BAM_TAG_CODE = {"c": unpack_int8, \
+				"C": unpack_uint8, \
+				"s": unpack_int16, \
+				"S": unpack_uint16, \
+				"i": unpack_int32, \
+				"I": unpack_uint32, \
+				"f": unpack_float, \
+				#"A": unpack_int8, \
+				"A": lambda x: x, \
+				"Z": unpack_int8, \
+				"H": unpack_int8}
+
+BAM_TAG_VALUE = {"c": int, \
+				 "C": int, \
+				 "s": int, \
+				 "S": int, \
+				 "i": int, \
+				 "I": int, \
+				 "f": float, \
+				 "A": lambda x: x}
+
+BAM_TAG_SIZE = {"c": 1, \
+				"C": 1, \
+				"s": 2, \
+				"S": 2, \
+				"i": 4, \
+				"I": 4, \
+				"f": 4, \
+				"A": 1}
+
+
+class CigarOp(object):
+	def __init__(self, data):
+		self._length = data >> BAM_CIGAR_SHIFT
+		self._type   = BAM_CIGAR_LOOKUP[ data & BAM_CIGAR_MASK ]
+
+
+class CigarData(object):
+	def __init__(self, data, num_ops):
+		self._ops = []
+		for i in range(num_ops):
+			cigar_data = unpack_uint32(data[i*4: (i+1)*4])
+			self._ops.append(CigarOp(cigar_data))		
+
+	def getCigarData(self):
+		return self._ops
+	
+	def __str__(self):
+		return "".join(["%d%s" % (op._length, op._type) for op in self._ops])
+
+
+class TagsData(object):
+	def __init__(self):
+		self._tags = {}
+
+	def add(self, tag):
+		self._tags[tag._tag] = tag
+
+	def getTags(self):
+		return self._tags
+
+	def __str__(self):
+		return " ".join([self._tags[tag] for tag in sorted(self._tags.keys())])
+		
+
+class TagData(object):
+	def __init__(self, tag, type, value):
+		self._tag = tag
+		self._type = type
+		self._value = value
+
+	def __str__(self):
+		if self._type in "AZHB":
+			return "%s:%s:%s" % (self._tag, self._type, self._value)
+		if self._type in "cCsSiI":
+			return "%s:%s:%d" % (self._tag, self._type, self._value)
+		return "%s:%s:%f" % (self._tag, self._type, self._value)
+
+
+class TagParser(object):
+	def __init__(self, data):
+		self._data = data
+		self._tags = TagsData()
+		self._parse()
+
+	def _parse(self):
+		while self._data:
+			tag  = "%s%s" % (chr(unpack_int8(self._data[0])), chr(unpack_int8(self._data[1])))
+			type = chr(unpack_int8(self._data[2]))
+			self._data = self._data[3:]
+			if type in BAM_TAG_VALUE:
+				value = self._parseUnique(type)
+			elif type == "Z":
+				value = self._parseString()
+			elif type == "H":
+				size = unpack_int8(self._data[0])
+				self._data = self._data[1:]
+				value = self._parseSeveral("C", size)
+			elif type == "B":
+				secondType = unpack_int8(self._data[0])
+				size       = unpack_int8(self._data[1]) + unpack_int8(self._data[2]) * 16 + unpack_int8(self._data[3]) * 16 * 16 + unpack_int8(self._data[4]) * 16 * 16 * 16
+				self._data = self._data[5:]
+				value      = self._parseSeveral(secondType, size)
+			else:
+				raise Exception("Cannot parse type '%s'." % (type))
+			fullTag = TagData(tag, type, value)
+			self._tags.add(fullTag)
+
+	def _parseUnique(self, type):
+		value = BAM_TAG_CODE[type](self._data[:BAM_TAG_SIZE[type]])
+		self._data = self._data[BAM_TAG_SIZE[type]:]
+		return value
+
+	def _parseSeveral(self, type, size):
+		value = []
+		for i in range(size):
+			value.append(self._parseUnique(type))
+		return value
+
+	def _parseString(self):
+		value      = ""
+		char       = self._data[0]
+		self._data = self._data[1:]
+		while unpack_int8(char) != 0:
+			value     += char
+			char       = self._data[0]
+			self._data = self._data[1:]
+		return value
+			
+	def getTags(self):
+		return self._tags.getTags()
+
+	def __str__(self):
+		return self._tags
+
+
+class AlignedRead(object):
+	def __init__(self, data, refs):
+		self._data = data
+		self._refs = refs
+
+	def parse(self):
+		self._parse_common()
+		self._parse_flag_nc()
+		self._parse_bin_mq_nl()
+		self._parse_name()
+		self._parse_cigar()
+		self._parse_sequence()
+		self._parse_quality()
+		self._parse_tags()
+	
+	def _parse_common(self):
+		ref_id           = unpack_int32(self._data[0:4])
+		self._chromosome = self._refs[ref_id]
+		self._pos        = unpack_int32(self._data[4:8]) + 1
+		mate_ref_id      = unpack_int32(self._data[20:24])
+		if mate_ref_id == -1:
+			self._rnext = "*"
+		else:
+			self._rnext = self._refs[mate_ref_id]
+		if self._rnext == self._chromosome:
+			self._rnext = "="
+		self._pnext = unpack_int32(self._data[24:28]) + 1
+		self._tlen  = unpack_int32(self._data[28:32])
+	
+	def _parse_bin_mq_nl(self):
+		bin_mq_nl = unpack_uint32(self._data[8:12])
+		self._bin = bin_mq_nl >> 16
+		self._mappingQuality = bin_mq_nl >> 8 & 0xff
+		self._query_name_length = bin_mq_nl & 0xff
+
+	def _parse_flag_nc(self):
+		flag_nc = unpack_uint32(self._data[12:16])
+		self._flag = flag_nc >> 16
+		self._num_cigar_ops = flag_nc & 0xffff
+		
+	def _parse_name(self):
+		start = 32
+		stop = start + self._query_name_length
+		self._name = unpack_string(self._data[start:stop])
+	
+	def _parse_cigar(self):
+		start = 32 + self._query_name_length
+		stop = start + (self._num_cigar_ops * 4)
+		_buffer = self._data[start:stop]
+		cigar = CigarData(_buffer, self._num_cigar_ops)
+		self._cigar = cigar.getCigarData()
+	
+	def _parse_sequence(self):
+		seq_length = unpack_int32(self._data[16:20])
+		start = 32 + self._query_name_length + (self._num_cigar_ops * 4)
+		stop = start + (seq_length + 1) / 2
+		_buffer = self._data[start:stop]
+		self._sequence = ""
+		for i in range(seq_length):
+			x = unpack_uint8(_buffer[(i / 2)])
+			index = (x >> (4 * (1 - (i % 2)))) & 0xf 
+			base = BAM_DNA_LOOKUP[index]
+			self._sequence += base
+
+	def _parse_quality(self):
+		seq_length = unpack_int32(self._data[16:20])
+		start = 32 + self._query_name_length + (self._num_cigar_ops * 4) + (seq_length + 1) / 2
+		stop = start + seq_length
+		_buffer = self._data[start:stop]
+		self._quality = "".join(["%s" % (chr(unpack_int8(x) + 33)) for x in _buffer])
+
+	def _parse_tags(self):
+		seq_length = unpack_int32(self._data[16:20])
+		start = 32 + self._query_name_length + (self._num_cigar_ops * 4) + (seq_length + 1) / 2 + (seq_length + 1) - 1
+		stop = start + seq_length
+		_buffer = self._data[start:]
+		tagParser = TagParser(_buffer)
+		self._tags = tagParser.getTags()
+		
+
+class FileReader(object):
+
+	def __init__(self, handle):
+		self._handle = handle
+		self._readHeader()
+
+	def _readHeader(self):
+		magic = unpack_string(self._handle.read(4))
+		if magic != "BAM\1":
+			raise Exception("File should start with 'BAM\1', starting with '%s' instead." % (magic))
+		tlen       = unpack_int32(self._handle.read(4))
+		text       = unpack_string(self._handle.read(tlen))
+		nrefs      = unpack_int32(self._handle.read(4))
+		self._refs = []
+		for i in range(nrefs):
+			sizeName = unpack_int32(self._handle.read(4))
+			name     = unpack_string(self._handle.read(sizeName))
+			size     = unpack_int32(self._handle.read(4))
+			self._refs.append(name)
+		self._startPos = self._handle.tell()
+
+	def reset(self):
+		self._handle.seek(self._startPos)
+
+	def getNextAlignment(self):
+		try:
+			blockSize = unpack_int32(self._handle.read(4))
+		except struct.error:
+			return False
+		block       = self._handle.read(blockSize)
+		currentRead = AlignedRead(block, self._refs)
+		return currentRead
+			
+
+
+def parseAlignedRead(read):
+	if (read._flag & 0x4) == 0x4:
+		return None
+
+	mapping	      = Mapping()
+	direction     = 1 if (read._flag & 0x10) == 0x0 else -1
+	genomeStart   = read._pos
+	nbOccurrences = 1
+	nbMismatches  = 0
+	nbMatches	  = 0
+	nbGaps		  = 0
+	subMapping	  = None
+	queryOffset   = 0
+	targetOffset  = 0
+	readStart	  = None
+
+	for tag, value in read._tags.iteritems():
+		if tag == "X0":
+			nbOccurrences = value._value
+		elif tag == "X1":
+			nbOccurrences += value._value
+		elif tag == "XM":
+			nbMismatches = value._value
+	mapping.setTagValue("nbOccurrences", nbOccurrences)
+	mapping.setTagValue("quality", read._mappingQuality)
+
+	for operation in read._cigar:
+		if operation._type == "M":
+			if readStart == None:
+				readStart = queryOffset
+			if subMapping == None:
+				subMapping = SubMapping()
+				subMapping.setSize(operation._length)
+				subMapping.setDirection(direction)
+				subMapping.queryInterval.setName(read._name)
+				subMapping.queryInterval.setStart(queryOffset)
+				subMapping.queryInterval.setDirection(direction)
+				subMapping.targetInterval.setChromosome(read._chromosome)
+				subMapping.targetInterval.setStart(genomeStart + targetOffset)
+				subMapping.targetInterval.setDirection(1)
+			nbMatches	 += operation._length
+			targetOffset += operation._length
+			queryOffset  += operation._length
+			currentNumber = 0
+			continue
+		if operation._type == "I":
+			nbGaps	   += 1
+			queryOffset  += operation._length
+			currentNumber = 0
+			continue
+		if operation._type == "D":
+			if subMapping != None:
+				subMapping.queryInterval.setEnd(queryOffset - 1)
+				subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
+				mapping.addSubMapping(subMapping)
+			subMapping	  = None
+			nbGaps	     += 1
+			targetOffset += operation._length
+			currentNumber = 0
+			continue
+		if operation._type == "N":
+			if subMapping != None:
+				subMapping.queryInterval.setEnd(queryOffset - 1)
+				subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
+				mapping.addSubMapping(subMapping)
+			subMapping	= None
+			targetOffset += operation._length
+			currentNumber = 0
+			continue
+		if operation._type == "S":
+			nbMismatches += operation._length
+			targetOffset += operation._length
+			queryOffset  += operation._length
+			currentNumber = 0
+			continue
+		if operation._type == "H":
+			targetOffset += operation._length
+			queryOffset  += operation._length
+			currentNumber = 0
+			continue
+		if operation._type == "P":
+			continue
+		raise Exception("Do not understand parameter '%s'" % (operation._type))
+
+	if subMapping != None:
+		subMapping.queryInterval.setEnd(queryOffset - 1)
+		subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
+		mapping.addSubMapping(subMapping)
+	mapping.queryInterval.setStart(readStart)
+	mapping.queryInterval.setEnd(queryOffset - 1)
+	mapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
+	mapping.setNbMismatches(nbMismatches)
+	mapping.setNbGaps(nbGaps)
+	mapping.queryInterval.setName(read._name)
+	mapping.queryInterval.setDirection(direction)
+	mapping.targetInterval.setChromosome(read._chromosome)
+	mapping.targetInterval.setStart(genomeStart)
+	mapping.targetInterval.setDirection(direction)
+	mapping.setSize(len(read._sequence))
+	mapping.setDirection(direction)
+	return mapping
+
+	
+class BamParser(MapperParser):
+	"""A class that parses BAM format"""
+
+	def __init__(self, fileName, verbosity = 0):
+		self.verbosity = verbosity
+		self.handle = gzip.open(fileName, "rb")
+		self.reader = FileReader(self.handle)
+		self.nbMappings = None
+		self.fileName   = fileName
+
+
+	def __del__(self):
+		self.handle.close()
+
+
+	def getFileFormats():
+		return ["bam"]
+	getFileFormats = staticmethod(getFileFormats)
+
+
+	def reset(self):
+		self.reader.reset()
+
+
+	def getNextMapping(self):
+		self.currentMapping = None
+		while self.currentMapping == None:
+			read = self.reader.getNextAlignment()
+			if not read:
+				self.currentMapping = False
+				return False
+			read.parse()
+			self.currentMapping = parseAlignedRead(read)
+		return self.currentMapping
+		
+		
+	def setDefaultTagValue(self, name, value):
+		pass
+
+
+	def skipFirstLines(self):
+		pass