Mercurial > repos > yufei-luo > s_mart
view smart_toolShed/SMART/Java/Python/clusterizeBySlidingWindows.py @ 4:1fc014126d55
Uploaded
author | yufei-luo |
---|---|
date | Fri, 18 Jan 2013 04:45:50 -0500 |
parents | e0f8dcca02ed |
children |
line wrap: on
line source
#! /usr/bin/env python # # Copyright INRA-URGI 2009-2010 # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. # import re from commons.core.writer.WriterChooser import WriterChooser """ Cluster the data into regions (defined by size and overlap with next region) and keep only highest peaks. """ import os, os.path from optparse import OptionParser from SMART.Java.Python.structure.Transcript import Transcript from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer from SMART.Java.Python.misc.RPlotter import RPlotter from SMART.Java.Python.misc.Progress import Progress from commons.core.writer.Gff3Writer import Gff3Writer class ClusterizeBySlidingWindows(object): def __init__(self, verbosity = 0): self.verbosity = verbosity self.strands = (0, ) self.normalize = False self.plot = None self.excel = None self.outputFileName = '' self.defaultValue = None def __del__(self): pass def setInputFile(self, fileName, format): self.parser = TranscriptContainer(fileName, format, self.verbosity) def setOutputFileName(self, fileName, format="gff", title="S-MART", feature="transcript", featurePart="exon"): writerChooser = WriterChooser(self.verbosity) writerChooser.findFormat(format) self.writer = writerChooser.getWriter(fileName) self.writer.setTitle(title) self.writer.setFeature(feature) self.writer.setFeaturePart(featurePart) # self.outputFileName = fileName # self.outputFormat = format def setWindowSize(self, size): self.size = size def setWindowOverlap(self, overlap): self.overlap = overlap def setTag(self, tag): self.tag = tag def setOperation(self, operation): self.operation = operation def setBothStrands(self, bothStrands): if bothStrands: self.strands = (-1, 1) def setNormalize(self, normalize): self.normalize = normalize def setPlot(self, plot): self.plot = plot def setExcel(self, excel): self.excel = excel def setOutputTag(self, tag): self.outputTagName = tag def setDefaultValue(self, defaultValue): self.defaultValue = defaultValue def checkOptions(self): # if self.operation != None: # raise Exception("Trying to combine the values without specifying tag! Aborting...") if self.operation != None and self.operation not in ("sum", "avg", "med", "min", "max"): raise Exception("Do not understand tag '%s'! Aborting..." % (self.operation)) def getChromosomeSizes(self): self.sizes = {} progress = Progress(self.parser.getNbTranscripts(), "Getting sizes in genome", self.verbosity) for transcript in self.parser.getIterator(): self.sizes[transcript.getChromosome()] = max(transcript.getStart(), self.sizes.get(transcript.getChromosome(), 0)) progress.inc() progress.done() def getBinsFromPos(self, pos): bin = (pos - 1) / (self.size - self.overlap) if bin >= 1 and pos <= bin * (self.size - self.overlap) + self.overlap: return (bin - 1, bin) return (bin, ) def getPosFromBin(self, bin): return (bin * (self.size - self.overlap) + 1, bin * (self.size - self.overlap) + self.size) def initializeBins(self): self.binsPerStrand = {} self.sumsPerStrand = {} self.valuesPerStrand = {} self.toBePlottedPerStrand = {} for strand in self.strands: self.binsPerStrand[strand] = {} self.sumsPerStrand[strand] = {} self.valuesPerStrand[strand] = {} self.toBePlottedPerStrand[strand] = {} for chromosome in self.sizes: binRange = range(self.getBinsFromPos(self.sizes[chromosome])[-1] + 1) self.binsPerStrand[strand][chromosome] = dict([[i, 0] for i in binRange]) self.sumsPerStrand[strand][chromosome] = dict([[i, 0.0] for i in binRange]) self.valuesPerStrand[strand][chromosome] = dict([[i, []] for i in binRange]) self.toBePlottedPerStrand[strand][chromosome] = dict([[i, 0] for i in binRange]) def getNbElements(self, transcript): nbOccurrences = 1 if "nbOccurrences" not in transcript.getTagNames() else transcript.getTagValue("nbOccurrences") nbElements = 1 if "nbElements" not in transcript.getTagNames() else transcript.getTagValue("nbElements") nbOccurrences = float(nbOccurrences) nbElements = float(nbElements) nbElements /= float(nbOccurrences) return nbElements def setBins(self): progress = Progress(self.parser.getNbTranscripts(), "Setting bins", self.verbosity) for transcript in self.parser.getIterator(): nbElements = self.getNbElements(transcript) strand = transcript.getDirection() if len(self.strands) == 2 else 0 for bin in self.getBinsFromPos(transcript.getStart()): self.binsPerStrand[strand][transcript.getChromosome()][bin] += nbElements if self.tag != None: if self.tag not in transcript.getTagNames(): if self.defaultValue is None: raise Exception("Tag %s undefined in transcript %s" % (self.tag, transcript)) value = self.defaultValue else: value = float(transcript.getTagValue(self.tag)) self.sumsPerStrand[strand][transcript.getChromosome()][bin] += value self.valuesPerStrand[strand][transcript.getChromosome()][bin].append(value) progress.inc() progress.done() def aggregateData(self): if self.operation == "sum": self.computeSumData() elif self.operation == "avg": self.computeAvgData() elif self.operation == "med": self.computeMedData() elif self.operation == "min": self.computeMinData() elif self.operation == "max": self.computeMaxData() elif self.operation == "GCpercent": self.computeGCPercent() else: self.toBePlottedPerStrand = self.binsPerStrand def computeSumData(self): self.toBePlottedPerStrand = self.sumsPerStrand def computeAvgData(self): for strand in self.strands: for chromosome in self.binsPerStrand[strand]: for bin in self.binsPerStrand[strand][chromosome]: if self.binsPerStrand[strand][chromosome][bin] != 0: self.toBePlottedPerStrand[strand][chromosome][bin] = float(self.sumsPerStrand[strand][chromosome][bin]) / self.binsPerStrand[strand][chromosome][bin] def computeMedData(self): for strand in self.strands: for chromosome in self.binsPerStrand[strand]: for bin in self.binsPerStrand[strand][chromosome]: if self.valuesPerStrand[strand][chromosome][bin]: self.valuesPerStrand[strand][chromosome][bin].sort() size = len(self.valuesPerStrand[strand][chromosome][bin]) if size % 2 == 1: self.toBePlottedPerStrand[strand][chromosome][bin] = self.valuesPerStrand[strand][chromosome][bin][(size - 1) / 2] else: self.toBePlottedPerStrand[strand][chromosome][bin] = (self.valuesPerStrand[strand][chromosome][bin][size / 2 - 1] + self.valuesPerStrand[strand][chromosome][bin][size / 2]) / 2.0 def computeMinData(self): for strand in self.strands: for chromosome in self.binsPerStrand[strand]: for bin in self.binsPerStrand[strand][chromosome]: if self.valuesPerStrand[strand][chromosome][bin]: self.toBePlottedPerStrand[strand][chromosome][bin] = min(self.valuesPerStrand[strand][chromosome][bin]) def computeMaxData(self): for strand in self.strands: for chromosome in self.binsPerStrand[strand]: for bin in self.binsPerStrand[strand][chromosome]: if self.valuesPerStrand[strand][chromosome][bin]: self.toBePlottedPerStrand[strand][chromosome][bin] = max(self.valuesPerStrand[strand][chromosome][bin]) def computeGCPercent(self): for strand in self.strands: for chromosome in self.binsPerStrand[strand]: for bin in self.binsPerStrand[strand][chromosome]: if self.valuesPerStrand[strand][chromosome][bin]: subSequence = self.valuesPerStrand[strand][chromosome][bin] NPercent = 100 * (subSequence.countNt("N") / float(subSequence.getSize())) if NPercent >= 50: currentGCpercent = "NA" else: currentGCpercent = subSequence.getGCpercentageInSequenceWithoutCountNInLength() self.toBePlottedPerStrand[strand][chromosome][bin] = currentGCpercent #TODO: see if a map method could be used for the various "compute" methods #return currentGCpercent, NPercent def plotData(self): if self.plot != None: for strand in self.strands: adjunct = "" if strand != 0: adjunct = "Strand%d" % (strand) for chromosome in self.toBePlottedPerStrand[strand]: if len(self.toBePlottedPerStrand[strand][chromosome].keys()) > 0: plotter = RPlotter(self.plot, self.verbosity) plotter.setFill(0) plotter.addLine(self.toBePlottedPerStrand[strand][chromosome], chromosome) plotter.plot() def writeExcel(self): if self.excel != None: excelFile = open(self.excel, "w") for strand in self.strands: maxBin = max([max(self.toBePlottedPerStrand[strand][chromosome].keys()) for chromosome in self.binsPerStrand[strand]]) for bin in range(0, maxBin + 1): excelFile.write(",%d-%d" % self.getPosFromBin(bin)) excelFile.write("\n") for chromosome in self.toBePlottedPerStrand[strand]: excelFile.write("%s" % (chromosome)) for bin in self.toBePlottedPerStrand[strand][chromosome]: excelFile.write(",%f" % (self.toBePlottedPerStrand[strand][chromosome][bin])) excelFile.write("\n") excelFile.close() def printRegions(self): cpt = 1 tagOp = "nb" tagName = "Elements" outputTagName = "nbElements" if self.operation != None: tagOp = self.operation.lower() if self.tag != None: tagName = self.tag.title() if self.outputTagName != None: outputTagName = self.outputTagName #writer = Gff3Writer(self.outputFileName, self.verbosity) for strand in self.strands: for chromosome in self.toBePlottedPerStrand[strand]: for bin in self.toBePlottedPerStrand[strand][chromosome]: transcript = Transcript() transcript.setName("region%d" % cpt) transcript.setChromosome(chromosome) transcript.setStart(self.getPosFromBin(bin)[0]) transcript.setEnd(self.getPosFromBin(bin)[1]) transcript.setDirection(1 if strand == 0 else strand) transcript.setTagValue(outputTagName, self.binsPerStrand[strand][chromosome][bin]) transcript.setTagValue("%s%s" % (tagOp, tagName), str(self.toBePlottedPerStrand[strand][chromosome][bin])) self.writer.addTranscript(transcript) cpt += 1 self.writer.close() def run(self): self.checkOptions() self.getChromosomeSizes() self.initializeBins() self.setBins() self.aggregateData() if self.excel: self.writeExcel() if self.plot: self.plotData() self.printRegions() if __name__ == "__main__": # parse command line description = "Clusterize by Sliding Windows v1.0.1: Produces a GFF3 file that clusters a list of transcripts using a sliding window. [Category: Sliding Windows]" parser = OptionParser(description = description) parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]") parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]") parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="format of the output file [format: transcript file format]") parser.add_option("-s", "--size", dest="size", action="store", type="int", help="size of the regions [compulsory] [format: int]") parser.add_option("-e", "--overlap", dest="overlap", action="store", type="int", help="overlap between two consecutive regions [compulsory] [format: int]") parser.add_option("-m", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]") parser.add_option("-g", "--tag", dest="tag", action="store", default=None, type="string", help="use a given tag as input (instead of summing number of features) [format: string]") parser.add_option("-r", "--operation", dest="operation", action="store", default=None, type="string", help="combine tag value with given operation [format: choice (sum, avg, med, min, max)]") parser.add_option("-d", "--defaultValue",dest="defaultValue", action="store", type="float", help="default value for input tag [format: float]") parser.add_option("-w", "--write", dest="writeTag", action="store", default=None, type="string", help="print the result in the given tag (default usually is 'nbElements') [format: string]") parser.add_option("-2", "--strands", dest="strands", action="store_true", default=False, help="consider the two strands separately [format: bool] [default: false]") parser.add_option("-p", "--plot", dest="plot", action="store", default=None, type="string", help="plot regions to the given file [format: output file in PNG format]") parser.add_option("-x", "--excel", dest="excel", action="store", default=None, type="string", help="write an Excel file to the given file [format: output file in Excel format]") parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]") (options, args) = parser.parse_args() cbsw = ClusterizeBySlidingWindows(options.verbosity) cbsw.setInputFile(options.inputFileName, options.inputFormat) cbsw.setOutputFileName(options.outputFileName, options.outputFormat) cbsw.setWindowSize(options.size) cbsw.setWindowOverlap(options.overlap) cbsw.setTag(options.tag) cbsw.setDefaultValue(options.defaultValue) cbsw.setOperation(options.operation) cbsw.setOutputTag(options.writeTag) cbsw.setBothStrands(options.strands) cbsw.setPlot(options.plot) cbsw.setExcel(options.excel) cbsw.run()