view smart_toolShed/SMART/Java/Python/clusterizeBySlidingWindows.py @ 4:1fc014126d55

Uploaded
author yufei-luo
date Fri, 18 Jan 2013 04:45:50 -0500
parents e0f8dcca02ed
children
line wrap: on
line source

#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
# 
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
# 
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
# 
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
# 
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import re
from commons.core.writer.WriterChooser import WriterChooser
"""
Cluster the data into regions (defined by size and overlap with next region) and keep only highest peaks.
"""

import os, os.path
from optparse import OptionParser
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from SMART.Java.Python.misc.RPlotter import RPlotter
from SMART.Java.Python.misc.Progress import Progress
from commons.core.writer.Gff3Writer import Gff3Writer

class ClusterizeBySlidingWindows(object):

    def __init__(self, verbosity = 0):
        self.verbosity = verbosity
        self.strands   = (0, )
        self.normalize = False
        self.plot      = None
        self.excel     = None
        self.outputFileName = ''
        self.defaultValue = None

    def __del__(self):
        pass

    def setInputFile(self, fileName, format):
        self.parser = TranscriptContainer(fileName, format, self.verbosity)

    def setOutputFileName(self, fileName, format="gff", title="S-MART", feature="transcript", featurePart="exon"):
        writerChooser = WriterChooser(self.verbosity)
        writerChooser.findFormat(format)
        self.writer = writerChooser.getWriter(fileName)
        self.writer.setTitle(title)
        self.writer.setFeature(feature)
        self.writer.setFeaturePart(featurePart)
#        self.outputFileName = fileName
#        self.outputFormat = format

    def setWindowSize(self, size):
        self.size = size

    def setWindowOverlap(self, overlap):
        self.overlap = overlap

    def setTag(self, tag):
        self.tag = tag

    def setOperation(self, operation):
        self.operation = operation

    def setBothStrands(self, bothStrands):
        if bothStrands:
            self.strands = (-1, 1)

    def setNormalize(self, normalize):
        self.normalize = normalize

    def setPlot(self, plot):
        self.plot = plot

    def setExcel(self, excel):
        self.excel = excel

    def setOutputTag(self, tag):
        self.outputTagName = tag
        
    def setDefaultValue(self, defaultValue):
        self.defaultValue = defaultValue

    def checkOptions(self):
#        if self.operation != None:
#            raise Exception("Trying to combine the values without specifying tag! Aborting...")
        if self.operation != None and self.operation not in ("sum", "avg", "med", "min", "max"):
            raise Exception("Do not understand tag '%s'! Aborting..." % (self.operation))

    def getChromosomeSizes(self):
        self.sizes = {}
        progress = Progress(self.parser.getNbTranscripts(), "Getting sizes in genome", self.verbosity)
        for transcript in self.parser.getIterator():
            self.sizes[transcript.getChromosome()] = max(transcript.getStart(), self.sizes.get(transcript.getChromosome(), 0))
            progress.inc()
        progress.done()

    def getBinsFromPos(self, pos):
        bin = (pos - 1) / (self.size - self.overlap)
        if bin >= 1 and pos <= bin * (self.size - self.overlap) + self.overlap:
            return (bin - 1, bin)
        return (bin, )

    def getPosFromBin(self, bin):
        return (bin * (self.size - self.overlap) + 1, bin * (self.size - self.overlap) + self.size)

    def initializeBins(self):
        self.binsPerStrand        = {}
        self.sumsPerStrand        = {}
        self.valuesPerStrand      = {}
        self.toBePlottedPerStrand = {}
        for strand in self.strands:
            self.binsPerStrand[strand]        = {}
            self.sumsPerStrand[strand]        = {}
            self.valuesPerStrand[strand]      = {}
            self.toBePlottedPerStrand[strand] = {}
            for chromosome in self.sizes:
                binRange = range(self.getBinsFromPos(self.sizes[chromosome])[-1] + 1)
                self.binsPerStrand[strand][chromosome]        = dict([[i, 0]   for i in binRange])
                self.sumsPerStrand[strand][chromosome]        = dict([[i, 0.0] for i in binRange])
                self.valuesPerStrand[strand][chromosome]      = dict([[i, []]  for i in binRange])
                self.toBePlottedPerStrand[strand][chromosome] = dict([[i, 0] for i in binRange])

    def getNbElements(self, transcript):
        nbOccurrences = 1 if "nbOccurrences" not in transcript.getTagNames() else transcript.getTagValue("nbOccurrences")
        nbElements    = 1 if "nbElements"    not in transcript.getTagNames() else transcript.getTagValue("nbElements")
        nbOccurrences = float(nbOccurrences)
        nbElements = float(nbElements)
        nbElements /= float(nbOccurrences)
        return nbElements

    def setBins(self):
        progress = Progress(self.parser.getNbTranscripts(), "Setting bins", self.verbosity)
        for transcript in self.parser.getIterator():
            nbElements = self.getNbElements(transcript)
            strand     = transcript.getDirection() if len(self.strands) == 2 else 0
            for bin in self.getBinsFromPos(transcript.getStart()):
                self.binsPerStrand[strand][transcript.getChromosome()][bin] += nbElements
                if self.tag != None:
                    if self.tag not in transcript.getTagNames():
                        if self.defaultValue is None:
                            raise Exception("Tag %s undefined in transcript %s" % (self.tag, transcript))
                        value = self.defaultValue
                    else:
                        value = float(transcript.getTagValue(self.tag))
                    self.sumsPerStrand[strand][transcript.getChromosome()][bin] += value
                    self.valuesPerStrand[strand][transcript.getChromosome()][bin].append(value)
            progress.inc()
        progress.done()

    def aggregateData(self):
        if self.operation == "sum":
            self.computeSumData()
        elif self.operation == "avg":
            self.computeAvgData()
        elif self.operation == "med":
            self.computeMedData()
        elif self.operation == "min":
            self.computeMinData()
        elif self.operation == "max":
            self.computeMaxData()
        elif self.operation == "GCpercent":
            self.computeGCPercent()
        else:
            self.toBePlottedPerStrand = self.binsPerStrand

    def computeSumData(self):
        self.toBePlottedPerStrand = self.sumsPerStrand

    def computeAvgData(self):
        for strand in self.strands:
            for chromosome in self.binsPerStrand[strand]:
                for bin in self.binsPerStrand[strand][chromosome]:
                    if self.binsPerStrand[strand][chromosome][bin] != 0:
                        self.toBePlottedPerStrand[strand][chromosome][bin] = float(self.sumsPerStrand[strand][chromosome][bin]) / self.binsPerStrand[strand][chromosome][bin]

    def computeMedData(self):
        for strand in self.strands:
            for chromosome in self.binsPerStrand[strand]:
                for bin in self.binsPerStrand[strand][chromosome]:
                    if self.valuesPerStrand[strand][chromosome][bin]:
                        self.valuesPerStrand[strand][chromosome][bin].sort()
                        size = len(self.valuesPerStrand[strand][chromosome][bin])
                        if size % 2 == 1:
                            self.toBePlottedPerStrand[strand][chromosome][bin] = self.valuesPerStrand[strand][chromosome][bin][(size - 1) / 2]
                        else:
                            self.toBePlottedPerStrand[strand][chromosome][bin] = (self.valuesPerStrand[strand][chromosome][bin][size / 2 - 1] + self.valuesPerStrand[strand][chromosome][bin][size / 2]) / 2.0

    def computeMinData(self):
        for strand in self.strands:
            for chromosome in self.binsPerStrand[strand]:
                for bin in self.binsPerStrand[strand][chromosome]:
                    if self.valuesPerStrand[strand][chromosome][bin]:
                        self.toBePlottedPerStrand[strand][chromosome][bin] = min(self.valuesPerStrand[strand][chromosome][bin])

    def computeMaxData(self):
        for strand in self.strands:
            for chromosome in self.binsPerStrand[strand]:
                for bin in self.binsPerStrand[strand][chromosome]:
                    if self.valuesPerStrand[strand][chromosome][bin]:
                        self.toBePlottedPerStrand[strand][chromosome][bin] = max(self.valuesPerStrand[strand][chromosome][bin])
                        
    def computeGCPercent(self):
        for strand in self.strands:
            for chromosome in self.binsPerStrand[strand]:
                for bin in self.binsPerStrand[strand][chromosome]:
                    if self.valuesPerStrand[strand][chromosome][bin]:
                        subSequence = self.valuesPerStrand[strand][chromosome][bin]
                        NPercent = 100 * (subSequence.countNt("N") / float(subSequence.getSize()))
                        if NPercent >= 50:
                            currentGCpercent = "NA"
                        else:
                            currentGCpercent = subSequence.getGCpercentageInSequenceWithoutCountNInLength()
                        
                        self.toBePlottedPerStrand[strand][chromosome][bin] = currentGCpercent
        #TODO: see if a map method could be used for the various "compute" methods 
        #return currentGCpercent, NPercent
        
    def plotData(self):
        if self.plot != None:
            for strand in self.strands:
                adjunct = ""
                if strand != 0:
                    adjunct = "Strand%d" % (strand)
                for chromosome in self.toBePlottedPerStrand[strand]:
                    if len(self.toBePlottedPerStrand[strand][chromosome].keys()) > 0:
                        plotter = RPlotter(self.plot, self.verbosity)
                        plotter.setFill(0)
                        plotter.addLine(self.toBePlottedPerStrand[strand][chromosome], chromosome)
                        plotter.plot()

    def writeExcel(self):
        if self.excel != None:
            excelFile = open(self.excel, "w")
            for strand in self.strands:
                maxBin = max([max(self.toBePlottedPerStrand[strand][chromosome].keys()) for chromosome in self.binsPerStrand[strand]])
                for bin in range(0, maxBin + 1):
                    excelFile.write(",%d-%d" % self.getPosFromBin(bin))
                excelFile.write("\n")
                for chromosome in self.toBePlottedPerStrand[strand]:
                    excelFile.write("%s" % (chromosome))
                    for bin in self.toBePlottedPerStrand[strand][chromosome]:
                        excelFile.write(",%f" % (self.toBePlottedPerStrand[strand][chromosome][bin]))
                    excelFile.write("\n")
            excelFile.close()

    def printRegions(self):
        cpt           = 1
        tagOp         = "nb"
        tagName       = "Elements"
        outputTagName = "nbElements"
        if self.operation != None:
            tagOp = self.operation.lower()
        if self.tag != None:
            tagName = self.tag.title()
        if self.outputTagName != None:
            outputTagName = self.outputTagName
            
     
        #writer = Gff3Writer(self.outputFileName, self.verbosity)
        
        for strand in self.strands:
            for chromosome in self.toBePlottedPerStrand[strand]:
                for bin in self.toBePlottedPerStrand[strand][chromosome]:
                    transcript = Transcript()
                    transcript.setName("region%d" % cpt)
                    transcript.setChromosome(chromosome)
                    transcript.setStart(self.getPosFromBin(bin)[0])
                    transcript.setEnd(self.getPosFromBin(bin)[1])
                    transcript.setDirection(1 if strand == 0 else strand)
                    transcript.setTagValue(outputTagName, self.binsPerStrand[strand][chromosome][bin])
                    transcript.setTagValue("%s%s" % (tagOp, tagName), str(self.toBePlottedPerStrand[strand][chromosome][bin]))
                    self.writer.addTranscript(transcript)
                    cpt += 1
        self.writer.close()

    def run(self):
        self.checkOptions()
        self.getChromosomeSizes()
        self.initializeBins()
        self.setBins()
        self.aggregateData()
        if self.excel:
            self.writeExcel()
        if self.plot:
            self.plotData()
        self.printRegions()


if __name__ == "__main__":
    
    # parse command line
    description = "Clusterize by Sliding Windows v1.0.1: Produces a GFF3 file that clusters a list of transcripts using a sliding window. [Category: Sliding Windows]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input",       dest="inputFileName",  action="store",                     type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat", dest="inputFormat",    action="store",                     type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output",      dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
    parser.add_option("-u", "--outputFormat", dest="outputFormat",  action="store",     default="gff",  type="string", help="format of the output file [format: transcript file format]")
    parser.add_option("-s", "--size",        dest="size",           action="store",                     type="int",    help="size of the regions [compulsory] [format: int]")
    parser.add_option("-e", "--overlap",     dest="overlap",        action="store",                     type="int",    help="overlap between two consecutive regions [compulsory] [format: int]")
    parser.add_option("-m", "--normalize",   dest="normalize",      action="store_true", default=False,                help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
    parser.add_option("-g", "--tag",         dest="tag",            action="store",      default=None,  type="string", help="use a given tag as input (instead of summing number of features) [format: string]")    
    parser.add_option("-r", "--operation",   dest="operation",      action="store",      default=None,  type="string", help="combine tag value with given operation [format: choice (sum, avg, med, min, max)]")
    parser.add_option("-d", "--defaultValue",dest="defaultValue",   action="store",                     type="float",    help="default value for input tag [format: float]")
    parser.add_option("-w", "--write",       dest="writeTag",       action="store",      default=None,  type="string", help="print the result in the given tag (default usually is 'nbElements') [format: string]")    
    parser.add_option("-2", "--strands",     dest="strands",        action="store_true", default=False,                help="consider the two strands separately [format: bool] [default: false]")
    parser.add_option("-p", "--plot",        dest="plot",           action="store",      default=None,  type="string", help="plot regions to the given file [format: output file in PNG format]")
    parser.add_option("-x", "--excel",       dest="excel",          action="store",      default=None,  type="string", help="write an Excel file to the given file [format: output file in Excel format]")
    parser.add_option("-v", "--verbosity",   dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int] [default: 1]")
    (options, args) = parser.parse_args()

    cbsw = ClusterizeBySlidingWindows(options.verbosity)
    cbsw.setInputFile(options.inputFileName, options.inputFormat)
    cbsw.setOutputFileName(options.outputFileName, options.outputFormat)
    cbsw.setWindowSize(options.size)
    cbsw.setWindowOverlap(options.overlap)
    cbsw.setTag(options.tag)
    cbsw.setDefaultValue(options.defaultValue)
    cbsw.setOperation(options.operation)
    cbsw.setOutputTag(options.writeTag)
    cbsw.setBothStrands(options.strands)
    cbsw.setPlot(options.plot)
    cbsw.setExcel(options.excel)
    cbsw.run()