diff commons/core/parsing/WigParser.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children 94ab73e8a190
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/core/parsing/WigParser.py	Fri Jan 18 04:54:14 2013 -0500
@@ -0,0 +1,324 @@
+#
+# Copyright INRA-URGI 2009-2010
+# 
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# 
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+# 
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+# 
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+import re
+import sys
+import os.path
+import struct
+from commons.core.parsing.TranscriptListParser import TranscriptListParser
+from SMART.Java.Python.structure.Transcript import Transcript
+
+STRANDTOSTR = {1: "(+)", 0: "(=)", None: "(=)", -1: "(-)"}
+
+nbOpenHandles = 30
+
+
+class WigParser(TranscriptListParser):
+	"""A class that parses a big WIG file, creates an index and make it possible to quickly retrieve some data"""
+
+	def __init__(self, fileName, verbosity = 1):
+		self.fileName		   = fileName
+		self.filler			   = "\xFF" * struct.calcsize('Q')
+		self.strands		   = False
+		self.indexFiles	   	   = {}
+		self.indexBuilt		   = False
+		self.defaultValue	   = 0.0
+		self.currentChromosome = None
+		self.currentStrand	   = 1
+		self.verbosity         = verbosity
+		super(WigParser, self).__init__(fileName, verbosity)
+
+
+	def __def__(self):
+		for file in self.indexFiles.values():
+			file.close()
+
+
+	def setStrands(self, strands):
+		self.strands = strands
+
+
+	def setDefaultValue(self, value):
+		self.defaultValue = value
+
+
+	def getFileFormats():
+		return ["wig"]
+	getFileFormats = staticmethod(getFileFormats)
+
+
+	def setStrands(self, strands):
+		"""
+		Consider both strands separately
+		"""
+		self.strands = strands
+
+
+	def makeIndexName(self, chromosome, strand = None):
+		"""
+		Create an index name for a file
+		"""
+		directoryName = os.path.dirname(self.fileName)
+		if strand == None:
+			strandName = ""
+		else:
+			strandName = "+" if strand == 1 else "-"
+		indexName = os.path.join(directoryName, ".%s%s.index" % (chromosome, strandName))
+		return indexName
+	
+	
+	def findIndexFile(self, chromosome, strand = None):
+		"""
+		Check if the index of a file exists
+		""" 
+		indexName = self.makeIndexName(chromosome, strand)
+		if os.path.exists(indexName):
+			return indexName
+		return False
+	
+	
+	def makeIndexFile(self):
+		"""
+		Create the index for a file
+		"""
+		if self.indexBuilt:
+			return
+
+		inputFile  = open(self.fileName)
+		outputFile = None
+		index	  = 0
+		mark	   = inputFile.tell()
+		line	   = inputFile.readline().strip()
+		chromosome = None
+
+		while line != "":
+			m1 = re.search(r"^\s*-?\d+\.?\d*\s*$", line)
+			m2 = re.search(r"^\s*(\d+)\s+-?\d+\.?\d*\s*$", line)
+			m3 = re.search(r"^\s*fixedStep\s+chrom=(\S+)\s+start=(\d+)\s+step=1\s*$", line)
+			m4 = re.search(r"^\s*fixedStep\s+chrom=\S+\s+start=\d+\s+step=\d+\s+span=\d+\s*$", line)
+			m5 = re.search(r"^\s*variableStep\s+chrom=(\S+)\s*$", line)
+			m6 = re.search(r"^\s*variableStep\s+chrom=(\S+)span=(\d+)\s*$", line)
+						
+			if m1 != None:
+				outputFile.write(struct.pack("Q", mark))
+				index += 1
+			elif m2 != None:
+				nextIndex = int(m2.group(1))
+				if index < nextIndex - 1:
+					outputFile.write(self.filler * (nextIndex - index - 1))
+				outputFile.write(struct.pack("Q", mark))
+				index = nextIndex
+			elif m3 != None:
+				newChromosome = m3.group(1)
+				if newChromosome != chromosome:
+					if outputFile != None:
+						outputFile.close()
+					outputFile = open(self.makeIndexName(newChromosome), "wb")
+				chromosome = newChromosome
+				nextIndex = int(m3.group(2))
+				outputFile.write(self.filler * (nextIndex - index))
+				index = nextIndex
+			elif m4 != None:
+				raise Exception("Error! Cannot parse fixed step WIG files with step > 1 or span > 1")
+			elif m5 != None:
+				newChromosome = m5.group(1)
+				if newChromosome != chromosome:
+					if outputFile != None:
+						outputFile.close()
+					outputFile = open(self.makeIndexName(newChromosome), "wb")
+					index = 0
+					outputFile.write(self.filler)
+				chromosome = newChromosome
+			elif m6 != None:
+				raise Exception("Error! Cannot parse variable step WIG files with step > 1 or span > 1")
+			elif (len(line) == 0) or line[0] == "#" or line.startswith("track"):
+				pass
+			else:
+				raise Exception("Error! Cannot understand line '%s' of WIG file while creating index file! Aborting." % (line))
+			
+			mark = inputFile.tell()
+			line = inputFile.readline().strip()
+
+		inputFile.close
+		if outputFile != None:
+			outputFile.close()
+		self.indexBuilt = True
+	
+	
+	def getIndexFileHandle(self, chromosome, strand = None):
+		"""
+		Get the handle of an index file
+		"""
+		indexFileKey = chromosome
+		if strand != None:
+			indexFileKey += "+" if strand == 1 else "-"
+		if indexFileKey in self.indexFiles:
+			return self.indexFiles[indexFileKey]
+
+		indexFileName = self.makeIndexName(chromosome, strand)
+		if not self.findIndexFile(chromosome, strand):
+			self.makeIndexFile()
+	
+		if not os.path.exists(indexFileName):
+			print "Warning! Index for chromosome %s, strand %s does not exist." % (chromosome, STRANDTOSTR[strand])
+			return False
+		indexFile = open(indexFileName, "rb")
+
+		if len(self.indexFiles.keys()) > nbOpenHandles:
+			removedKey = set(self.indexFiles.keys()).pop()
+			self.indexFiles[removedKey].close()
+			del self.indexFiles[removedKey]
+		self.indexFiles[indexFileKey] = indexFile
+		return indexFile
+		
+
+	
+	def findIndex(self, chromosome, start, strand = None):
+		"""
+		Find the point where to start reading file
+		"""
+
+		sizeOfLong = struct.calcsize("Q")
+		empty	  = int(struct.unpack("Q", self.filler)[0])
+		offset	 = empty
+		indexFile  = self.getIndexFileHandle(chromosome, strand)
+	
+		if not indexFile:
+			return (None, None)
+		
+		while offset == empty:
+			address = start * sizeOfLong
+			indexFile.seek(address, os.SEEK_SET)
+			
+			buffer = indexFile.read(sizeOfLong)
+			if len(buffer) != sizeOfLong:
+				if buffer == "":
+					print "Warning! Index position %d of chromosome %s on strand %s seems out of range!" % (start, chromosome, STRANDTOSTR[strand])
+					return (None, None)
+				else:
+					raise Exception("Problem fetching position %d of chromosome %s on strand %s seems out of range!" % (start, chromosome, STRANDTOSTR[strand]))
+			
+			offset = int(struct.unpack("Q", buffer)[0])
+			start += 1
+			
+		start -= 1
+		return (offset, start)
+	
+	
+
+	def getRange(self, chromosome, start, end):
+		"""
+		Parse a wig file and output a range
+		"""
+		arrays  = {}
+		strands = {1: "+", -1: "-"} if self.strands else {0: ""}
+
+		for strand in strands:
+
+			array = [self.defaultValue] * (end - start + 1)
+			file  = open(self.fileName)
+			offset, index = self.findIndex(chromosome, start, strand if self.strands else None)
+			if offset == None:
+				arrays[strand] = array
+				continue
+			file.seek(offset, os.SEEK_SET)
+
+			for line in file:
+				line = line.strip()
+
+				m1 = re.search(r"^\s*(-?\d+\.?\d*)\s*$", line)
+				m2 = re.search(r"^\s*(\d+)\s+(-?\d+\.?\d*)\s*$", line)
+				m3 = re.search(r"^\s*fixedStep\s+chrom=(\S+)\s+start=(\d+)\s+step=\d+\s*$", line)
+				m4 = re.search(r"^\s*variableStep\s+chrom=(\S+)\s*$", line)
+
+				if m1 != None:
+					if index > end:
+						break
+					if index >= start:
+						array[index - start] = float(m1.group(1))
+					index += 1
+				elif m2 != None:
+					index = int(m2.group(1))
+					if index > end:
+						break
+					if index >= start:
+						array[index - start] = float(m2.group(2))
+					index += 1
+				elif m3 != None:
+					if m3.group(1) != "%s%s" % (chromosome, strands[strand]):
+						break
+					index = int(m3.group(2))
+				elif m4 != None:
+					if m4.group(1) != "%s%s" % (chromosome, strands[strand]):
+						break
+				elif (len(line) == 0) or (line[0] == "#") or line.startswith("track"):
+					pass
+				else:
+					raise Exception("Error! Cannot read line '%s' of wig file" % (line))
+
+			file.close()
+	
+			arrays[strand] = array
+			
+		if self.strands:
+			return arrays
+		return array
+	
+
+	def skipFirstLines(self):
+		return
+
+	
+	def parseLine(self, line):
+		if line.startswith("track"):
+			return None
+		m = re.search(r"^\s*variableStep\s+chrom=(\S+)", line)
+		if m != None:
+			chromosome = m.group(1)
+			if chromosome.endswith("+"):
+				self.currentStrand = 1
+				self.currentChromosome = chromosome[:-1]
+			elif chromosome.endswith("-"):
+				self.currentStrand = -1
+				self.currentChromosome = chromosome[:-1]
+			else:
+				self.currentStrand = 1
+				self.currentChromosome = chromosome
+			return None
+		position, value = line.split()
+		position = int(position)
+		value	= float(value)
+		transcript = Transcript()
+		transcript.setChromosome(self.currentChromosome)
+		transcript.setStart(position)
+		transcript.setEnd(position)
+		transcript.setDirection(self.currentStrand)
+		transcript.setTagValue("ID", "wig_%s_%d_%d" % (self.currentChromosome, self.currentStrand, position))
+		transcript.setTagValue("nbElements", value)
+		return transcript