Mercurial > repos > yufei-luo > s_mart
view commons/core/parsing/WigParser.py @ 60:90f4b29d884f
Uploaded
author | m-zytnicki |
---|---|
date | Fri, 21 Feb 2014 08:32:36 -0500 |
parents | 169d364ddd91 |
children |
line wrap: on
line source
# # Copyright INRA-URGI 2009-2010 # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. # import re import sys import os.path import struct from commons.core.parsing.TranscriptListParser import TranscriptListParser from SMART.Java.Python.structure.Transcript import Transcript STRANDTOSTR = {1: "(+)", 0: "(=)", None: "(=)", -1: "(-)"} nbOpenHandles = 30 class WigParser(TranscriptListParser): """A class that parses a big WIG file, creates an index and make it possible to quickly retrieve some data""" def __init__(self, fileName, verbosity = 1): self.fileName = fileName self.filler = "\xFF" * struct.calcsize('Q') self.strands = False self.indexFiles = {} self.indexBuilt = False self.defaultValue = 0.0 self.currentChromosome = None self.currentStrand = 1 self.verbosity = verbosity super(WigParser, self).__init__(fileName, verbosity) def __def__(self): for file in self.indexFiles.values(): file.close() def setStrands(self, strands): self.strands = strands def setDefaultValue(self, value): self.defaultValue = value def getFileFormats(): return ["wig"] getFileFormats = staticmethod(getFileFormats) def setStrands(self, strands): """ Consider both strands separately """ self.strands = strands def makeIndexName(self, chromosome, strand = None): """ Create an index name for a file """ directoryName = os.path.dirname(self.fileName) baseName = os.path.splitext(os.path.basename(self.fileName))[0] if strand == None: strandName = "" else: strandName = "+" if strand == 1 else "-" indexName = os.path.join(directoryName, ".%s.%s%s.index" % (baseName, chromosome, strandName)) return indexName def findIndexFile(self, chromosome, strand = None): """ Check if the index of a file exists """ indexName = self.makeIndexName(chromosome, strand) if os.path.exists(indexName): return indexName return False def makeIndexFile(self): """ Create the index for a file """ if self.indexBuilt: return inputFile = open(self.fileName) outputFile = None index = 0 mark = inputFile.tell() line = inputFile.readline().strip() chromosome = None while line != "": m1 = re.search(r"^\s*-?\d+\.?\d*\s*$", line) m2 = re.search(r"^\s*(\d+)\s+-?\d+\.?\d*\s*$", line) m3 = re.search(r"^\s*fixedStep\s+chrom=(\S+)\s+start=(\d+)\s+step=1\s*$", line) m4 = re.search(r"^\s*fixedStep\s+chrom=\S+\s+start=\d+\s+step=\d+\s+span=\d+\s*$", line) m5 = re.search(r"^\s*variableStep\s+chrom=(\S+)\s*$", line) m6 = re.search(r"^\s*variableStep\s+chrom=(\S+)\s+span=(\d+)\s*$", line) if m1 != None: outputFile.write(struct.pack("Q", mark)) index += 1 elif m2 != None: nextIndex = int(m2.group(1)) if index < nextIndex - 1: outputFile.write(self.filler * (nextIndex - index - 1)) outputFile.write(struct.pack("Q", mark)) index = nextIndex elif m3 != None: newChromosome = m3.group(1) if newChromosome != chromosome: if outputFile != None: outputFile.close() outputFile = open(self.makeIndexName(newChromosome), "wb") chromosome = newChromosome nextIndex = int(m3.group(2)) outputFile.write(self.filler * (nextIndex - index)) index = nextIndex elif m4 != None: raise Exception("Error! Cannot parse fixed step WIG files with step > 1 or span > 1") elif m5 != None: newChromosome = m5.group(1) if newChromosome != chromosome: if outputFile != None: outputFile.close() outputFile = open(self.makeIndexName(newChromosome), "wb") index = 0 outputFile.write(self.filler) chromosome = newChromosome elif m6 != None: if m6.group(2) != "1": raise Exception("Error! Cannot parse variable step WIG files with step > 1 or span > 1") newChromosome = m6.group(1) if newChromosome != chromosome: if outputFile != None: outputFile.close() outputFile = open(self.makeIndexName(newChromosome), "wb") index = 0 outputFile.write(self.filler) chromosome = newChromosome elif (len(line) == 0) or line[0] == "#" or line.startswith("track"): pass else: raise Exception("Error! Cannot understand line '%s' of WIG file while creating index file! Aborting." % (line)) mark = inputFile.tell() line = inputFile.readline().strip() inputFile.close if outputFile != None: outputFile.close() self.indexBuilt = True def getIndexFileHandle(self, chromosome, strand = None): """ Get the handle of an index file """ indexFileKey = chromosome if strand != None: indexFileKey += "+" if strand == 1 else "-" if indexFileKey in self.indexFiles: return self.indexFiles[indexFileKey] indexFileName = self.makeIndexName(chromosome, strand) if not self.findIndexFile(chromosome, strand): self.makeIndexFile() if not os.path.exists(indexFileName): print "Warning! Index for chromosome %s, strand %s does not exist." % (chromosome, STRANDTOSTR[strand]) return False indexFile = open(indexFileName, "rb") if len(self.indexFiles.keys()) > nbOpenHandles: removedKey = set(self.indexFiles.keys()).pop() self.indexFiles[removedKey].close() del self.indexFiles[removedKey] self.indexFiles[indexFileKey] = indexFile return indexFile def findIndex(self, chromosome, start, strand = None): """ Find the point where to start reading file """ sizeOfLong = struct.calcsize("Q") empty = int(struct.unpack("Q", self.filler)[0]) offset = empty indexFile = self.getIndexFileHandle(chromosome, strand) if not indexFile: return (None, None) while offset == empty: address = start * sizeOfLong indexFile.seek(address, os.SEEK_SET) buffer = indexFile.read(sizeOfLong) if len(buffer) != sizeOfLong: if buffer == "": print "Warning! Index position %d of chromosome %s on strand %s seems out of range!" % (start, chromosome, STRANDTOSTR[strand]) return (None, None) else: raise Exception("Problem fetching position %d of chromosome %s on strand %s seems out of range!" % (start, chromosome, STRANDTOSTR[strand])) offset = int(struct.unpack("Q", buffer)[0]) start += 1 start -= 1 return (offset, start) def getRange(self, chromosome, start, end): """ Parse a wig file and output a range """ arrays = {} strands = {1: "+", -1: "-"} if self.strands else {0: ""} for strand in strands: array = [self.defaultValue] * (end - start + 1) file = open(self.fileName) offset, index = self.findIndex(chromosome, start, strand if self.strands else None) if offset == None: arrays[strand] = array continue file.seek(offset, os.SEEK_SET) for line in file: line = line.strip() m1 = re.search(r"^\s*(-?\d+\.?\d*)\s*$", line) m2 = re.search(r"^\s*(\d+)\s+(-?\d+\.?\d*)\s*$", line) m3 = re.search(r"^\s*fixedStep\s+chrom=(\S+)\s+start=(\d+)\s+step=\d+\s*$", line) m4 = re.search(r"^\s*variableStep\s+chrom=(\S+)\s*$", line) if m1 != None: if index > end: break if index >= start: array[index - start] = float(m1.group(1)) index += 1 elif m2 != None: index = int(m2.group(1)) if index > end: break if index >= start: array[index - start] = float(m2.group(2)) index += 1 elif m3 != None: if m3.group(1) != "%s%s" % (chromosome, strands[strand]): break index = int(m3.group(2)) elif m4 != None: if m4.group(1) != "%s%s" % (chromosome, strands[strand]): break elif (len(line) == 0) or (line[0] == "#") or line.startswith("track"): pass else: raise Exception("Error! Cannot read line '%s' of wig file" % (line)) file.close() arrays[strand] = array if self.strands: return arrays return array def skipFirstLines(self): return def parseLine(self, line): if line.startswith("track"): return None m = re.search(r"^\s*variableStep\s+chrom=(\S+)", line) if m != None: chromosome = m.group(1) if chromosome.endswith("+"): self.currentStrand = 1 self.currentChromosome = chromosome[:-1] elif chromosome.endswith("-"): self.currentStrand = -1 self.currentChromosome = chromosome[:-1] else: self.currentStrand = 1 self.currentChromosome = chromosome return None position, value = line.split() position = int(position) value = float(value) transcript = Transcript() transcript.setChromosome(self.currentChromosome) transcript.setStart(position) transcript.setEnd(position) transcript.setDirection(self.currentStrand) transcript.setTagValue("ID", "wig_%s_%d_%d" % (self.currentChromosome, self.currentStrand, position)) transcript.setTagValue("nbElements", value) return transcript