Mercurial > repos > yufei-luo > s_mart
diff commons/core/parsing/WigParser.py @ 36:44d5973c188c
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 15:02:29 -0400 |
parents | |
children | 169d364ddd91 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/WigParser.py Tue Apr 30 15:02:29 2013 -0400 @@ -0,0 +1,333 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import re +import sys +import os.path +import struct +from commons.core.parsing.TranscriptListParser import TranscriptListParser +from SMART.Java.Python.structure.Transcript import Transcript + +STRANDTOSTR = {1: "(+)", 0: "(=)", None: "(=)", -1: "(-)"} + +nbOpenHandles = 30 + + +class WigParser(TranscriptListParser): + """A class that parses a big WIG file, creates an index and make it possible to quickly retrieve some data""" + + def __init__(self, fileName, verbosity = 1): + self.fileName = fileName + self.filler = "\xFF" * struct.calcsize('Q') + self.strands = False + self.indexFiles = {} + self.indexBuilt = False + self.defaultValue = 0.0 + self.currentChromosome = None + self.currentStrand = 1 + self.verbosity = verbosity + super(WigParser, self).__init__(fileName, verbosity) + + + def __def__(self): + for file in self.indexFiles.values(): + file.close() + + + def setStrands(self, strands): + self.strands = strands + + + def setDefaultValue(self, value): + self.defaultValue = value + + + def getFileFormats(): + return ["wig"] + getFileFormats = staticmethod(getFileFormats) + + + def setStrands(self, strands): + """ + Consider both strands separately + """ + self.strands = strands + + + def makeIndexName(self, chromosome, strand = None): + """ + Create an index name for a file + """ + directoryName = os.path.dirname(self.fileName) + if strand == None: + strandName = "" + else: + strandName = "+" if strand == 1 else "-" + indexName = os.path.join(directoryName, ".%s%s.index" % (chromosome, strandName)) + return indexName + + + def findIndexFile(self, chromosome, strand = None): + """ + Check if the index of a file exists + """ + indexName = self.makeIndexName(chromosome, strand) + if os.path.exists(indexName): + return indexName + return False + + + def makeIndexFile(self): + """ + Create the index for a file + """ + if self.indexBuilt: + return + + inputFile = open(self.fileName) + outputFile = None + index = 0 + mark = inputFile.tell() + line = inputFile.readline().strip() + chromosome = None + + while line != "": + m1 = re.search(r"^\s*-?\d+\.?\d*\s*$", line) + m2 = re.search(r"^\s*(\d+)\s+-?\d+\.?\d*\s*$", line) + m3 = re.search(r"^\s*fixedStep\s+chrom=(\S+)\s+start=(\d+)\s+step=1\s*$", line) + m4 = re.search(r"^\s*fixedStep\s+chrom=\S+\s+start=\d+\s+step=\d+\s+span=\d+\s*$", line) + m5 = re.search(r"^\s*variableStep\s+chrom=(\S+)\s*$", line) + m6 = re.search(r"^\s*variableStep\s+chrom=(\S+)\s+span=(\d+)\s*$", line) + + if m1 != None: + outputFile.write(struct.pack("Q", mark)) + index += 1 + elif m2 != None: + nextIndex = int(m2.group(1)) + if index < nextIndex - 1: + outputFile.write(self.filler * (nextIndex - index - 1)) + outputFile.write(struct.pack("Q", mark)) + index = nextIndex + elif m3 != None: + newChromosome = m3.group(1) + if newChromosome != chromosome: + if outputFile != None: + outputFile.close() + outputFile = open(self.makeIndexName(newChromosome), "wb") + chromosome = newChromosome + nextIndex = int(m3.group(2)) + outputFile.write(self.filler * (nextIndex - index)) + index = nextIndex + elif m4 != None: + raise Exception("Error! Cannot parse fixed step WIG files with step > 1 or span > 1") + elif m5 != None: + newChromosome = m5.group(1) + if newChromosome != chromosome: + if outputFile != None: + outputFile.close() + outputFile = open(self.makeIndexName(newChromosome), "wb") + index = 0 + outputFile.write(self.filler) + chromosome = newChromosome + elif m6 != None: + if m6.group(2) != "1": + raise Exception("Error! Cannot parse variable step WIG files with step > 1 or span > 1") + newChromosome = m6.group(1) + if newChromosome != chromosome: + if outputFile != None: + outputFile.close() + outputFile = open(self.makeIndexName(newChromosome), "wb") + index = 0 + outputFile.write(self.filler) + chromosome = newChromosome + elif (len(line) == 0) or line[0] == "#" or line.startswith("track"): + pass + else: + raise Exception("Error! Cannot understand line '%s' of WIG file while creating index file! Aborting." % (line)) + + mark = inputFile.tell() + line = inputFile.readline().strip() + + inputFile.close + if outputFile != None: + outputFile.close() + self.indexBuilt = True + + + def getIndexFileHandle(self, chromosome, strand = None): + """ + Get the handle of an index file + """ + indexFileKey = chromosome + if strand != None: + indexFileKey += "+" if strand == 1 else "-" + if indexFileKey in self.indexFiles: + return self.indexFiles[indexFileKey] + + indexFileName = self.makeIndexName(chromosome, strand) + if not self.findIndexFile(chromosome, strand): + self.makeIndexFile() + + if not os.path.exists(indexFileName): + print "Warning! Index for chromosome %s, strand %s does not exist." % (chromosome, STRANDTOSTR[strand]) + return False + indexFile = open(indexFileName, "rb") + + if len(self.indexFiles.keys()) > nbOpenHandles: + removedKey = set(self.indexFiles.keys()).pop() + self.indexFiles[removedKey].close() + del self.indexFiles[removedKey] + self.indexFiles[indexFileKey] = indexFile + return indexFile + + + + def findIndex(self, chromosome, start, strand = None): + """ + Find the point where to start reading file + """ + + sizeOfLong = struct.calcsize("Q") + empty = int(struct.unpack("Q", self.filler)[0]) + offset = empty + indexFile = self.getIndexFileHandle(chromosome, strand) + + if not indexFile: + return (None, None) + + while offset == empty: + address = start * sizeOfLong + indexFile.seek(address, os.SEEK_SET) + + buffer = indexFile.read(sizeOfLong) + if len(buffer) != sizeOfLong: + if buffer == "": + print "Warning! Index position %d of chromosome %s on strand %s seems out of range!" % (start, chromosome, STRANDTOSTR[strand]) + return (None, None) + else: + raise Exception("Problem fetching position %d of chromosome %s on strand %s seems out of range!" % (start, chromosome, STRANDTOSTR[strand])) + + offset = int(struct.unpack("Q", buffer)[0]) + start += 1 + + start -= 1 + return (offset, start) + + + + def getRange(self, chromosome, start, end): + """ + Parse a wig file and output a range + """ + arrays = {} + strands = {1: "+", -1: "-"} if self.strands else {0: ""} + + for strand in strands: + + array = [self.defaultValue] * (end - start + 1) + file = open(self.fileName) + offset, index = self.findIndex(chromosome, start, strand if self.strands else None) + if offset == None: + arrays[strand] = array + continue + file.seek(offset, os.SEEK_SET) + + for line in file: + line = line.strip() + + m1 = re.search(r"^\s*(-?\d+\.?\d*)\s*$", line) + m2 = re.search(r"^\s*(\d+)\s+(-?\d+\.?\d*)\s*$", line) + m3 = re.search(r"^\s*fixedStep\s+chrom=(\S+)\s+start=(\d+)\s+step=\d+\s*$", line) + m4 = re.search(r"^\s*variableStep\s+chrom=(\S+)\s*$", line) + + if m1 != None: + if index > end: + break + if index >= start: + array[index - start] = float(m1.group(1)) + index += 1 + elif m2 != None: + index = int(m2.group(1)) + if index > end: + break + if index >= start: + array[index - start] = float(m2.group(2)) + index += 1 + elif m3 != None: + if m3.group(1) != "%s%s" % (chromosome, strands[strand]): + break + index = int(m3.group(2)) + elif m4 != None: + if m4.group(1) != "%s%s" % (chromosome, strands[strand]): + break + elif (len(line) == 0) or (line[0] == "#") or line.startswith("track"): + pass + else: + raise Exception("Error! Cannot read line '%s' of wig file" % (line)) + + file.close() + + arrays[strand] = array + + if self.strands: + return arrays + return array + + + def skipFirstLines(self): + return + + + def parseLine(self, line): + if line.startswith("track"): + return None + m = re.search(r"^\s*variableStep\s+chrom=(\S+)", line) + if m != None: + chromosome = m.group(1) + if chromosome.endswith("+"): + self.currentStrand = 1 + self.currentChromosome = chromosome[:-1] + elif chromosome.endswith("-"): + self.currentStrand = -1 + self.currentChromosome = chromosome[:-1] + else: + self.currentStrand = 1 + self.currentChromosome = chromosome + return None + position, value = line.split() + position = int(position) + value = float(value) + transcript = Transcript() + transcript.setChromosome(self.currentChromosome) + transcript.setStart(position) + transcript.setEnd(position) + transcript.setDirection(self.currentStrand) + transcript.setTagValue("ID", "wig_%s_%d_%d" % (self.currentChromosome, self.currentStrand, position)) + transcript.setTagValue("nbElements", value) + return transcript