#! /usr/bin/env python
import re
from commons.core.writer.WriterChooser import WriterChooser
# Cluster the data into regions (defined by size and overlap with next region) and keep only the highest peaks.

import os, os.path
from optparse import OptionParser
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from SMART.Java.Python.misc.RPlotter import RPlotter
from SMART.Java.Python.misc.Progress import Progress
from commons.core.writer.Gff3Writer import Gff3Writer

class ClusterizeBySlidingWindows(object):
    """Count or aggregate features over fixed-size, overlapping windows.

    Every feature is assigned, by its start position, to each window ("bin")
    containing that position. Bin ``i`` spans positions
    ``i * (size - overlap) + 1`` to ``i * (size - overlap) + size``.
    For each strand, chromosome and bin the class keeps a weighted feature
    count, the sum of a chosen tag and the individual tag values;
    ``aggregateData`` reduces these to one figure per bin.
    """

    def __init__(self, verbosity = 0):
        """Create a clusterizer with every optional setting switched off.

        verbosity -- trace level passed on to the parser, writer chooser,
                     progress and plotter helpers.
        """
        self.verbosity      = verbosity
        # A single pseudo-strand 0 means "strands merged" (see setBothStrands).
        self.strands        = (0, )
        self.normalize      = False
        self.plot           = None
        self.excel          = None
        self.outputFileName = ''
        self.defaultValue   = None
        # The methods below test these three against None; define them here so
        # that skipping the matching setter does not end in an AttributeError.
        self.tag            = None
        self.operation      = None
        self.outputTagName  = None

    def __del__(self):
        """No explicit clean-up is performed."""
        pass

    def setInputFile(self, fileName, format):
        """Open fileName, in the given format, as the transcript source."""
        self.parser = TranscriptContainer(fileName, format, self.verbosity)

    def setOutputFileName(self, fileName, format="gff", title="S-MART", feature="transcript", featurePart="exon"):
        """Create the writer for fileName through a WriterChooser.

        NOTE(review): format, title, feature and featurePart are accepted but
        not used in this method -- confirm whether they should be forwarded
        to the chooser/writer.
        """
        writerChooser = WriterChooser(self.verbosity)
        self.writer = writerChooser.getWriter(fileName)

    def setWindowSize(self, size):
        """Set the window length."""
        self.size = size

    def setWindowOverlap(self, overlap):
        """Set how many positions two consecutive windows share."""
        self.overlap = overlap

    def setTag(self, tag):
        """Set the name of the input tag to aggregate (None: count only)."""
        self.tag = tag

    def setOperation(self, operation):
        """Set the aggregation applied to the tag values (None: count only)."""
        self.operation = operation

    def setBothStrands(self, bothStrands):
        """Handle strands -1 and 1 separately when bothStrands is true."""
        if bothStrands:
            self.strands = (-1, 1)

    def setNormalize(self, normalize):
        """Store the normalisation flag.

        NOTE(review): no method of this class reads the flag.
        """
        self.normalize = normalize

    def setPlot(self, plot):
        """Set the plot output path (None: no plot)."""
        self.plot = plot

    def setExcel(self, excel):
        """Set the spreadsheet output path (None: no spreadsheet)."""
        self.excel = excel

    def setOutputTag(self, tag):
        """Set the tag name under which printRegions stores the bin count."""
        self.outputTagName = tag

    def setDefaultValue(self, defaultValue):
        """Set the value used for features lacking the input tag."""
        self.defaultValue = defaultValue

    def checkOptions(self):
        """Raise Exception when the operation is set but not supported.

        Accepted operations: "sum", "avg", "med", "min", "max".
        """
        # NOTE(review): aggregateData also knows "GCpercent", which is
        # rejected here -- confirm which of the two is intended.
        if self.operation is not None and self.operation not in ("sum", "avg", "med", "min", "max"):
            raise Exception("Do not understand tag '%s'! Aborting..." % (self.operation))

    def getChromosomeSizes(self):
        """Record, per chromosome, the largest feature start seen in the input.

        The result is stored in self.sizes (chromosome -> position).
        """
        self.sizes = {}
        # NOTE(review): the progress object is created but never advanced or
        # closed in this method.
        progress = Progress(self.parser.getNbTranscripts(), "Getting sizes in genome", self.verbosity)
        for transcript in self.parser.getIterator():
            chromosome = transcript.getChromosome()
            self.sizes[chromosome] = max(transcript.getStart(), self.sizes.get(chromosome, 0))

    def getBinsFromPos(self, pos):
        """Return the tuple of bin indices whose window contains pos.

        The tuple has two entries when pos lies in the part of its bin that
        is shared with the previous bin, one entry otherwise.
        """
        step  = self.size - self.overlap
        # Floor division keeps the index an int on Python 2 and Python 3.
        index = (pos - 1) // step
        if index >= 1 and pos <= index * step + self.overlap:
            return (index - 1, index)
        return (index, )

    def getPosFromBin(self, bin):
        """Return the (first, last) positions covered by the given bin."""
        offset = bin * (self.size - self.overlap)
        return (offset + 1, offset + self.size)

    def initializeBins(self):
        """Create empty per-strand, per-chromosome, per-bin accumulators.

        Requires self.sizes (see getChromosomeSizes). Bins run from 0 to the
        last bin containing the recorded chromosome size.
        """
        self.binsPerStrand        = {}
        self.sumsPerStrand        = {}
        self.valuesPerStrand      = {}
        self.toBePlottedPerStrand = {}
        for strand in self.strands:
            self.binsPerStrand[strand]        = {}
            self.sumsPerStrand[strand]        = {}
            self.valuesPerStrand[strand]      = {}
            self.toBePlottedPerStrand[strand] = {}
            for chromosome in self.sizes:
                binRange = range(self.getBinsFromPos(self.sizes[chromosome])[-1] + 1)
                self.binsPerStrand[strand][chromosome]        = dict.fromkeys(binRange, 0)
                self.sumsPerStrand[strand][chromosome]        = dict.fromkeys(binRange, 0.0)
                # One fresh list per bin: fromkeys would share a single list.
                self.valuesPerStrand[strand][chromosome]      = dict((i, []) for i in binRange)
                self.toBePlottedPerStrand[strand][chromosome] = dict.fromkeys(binRange, 0)

    def getNbElements(self, transcript):
        """Return the weight of a feature: nbElements / nbOccurrences.

        Either tag counts as 1 when the feature does not carry it.
        NOTE(review): the division is applied whatever self.normalize is.
        """
        tagNames      = transcript.getTagNames()
        nbOccurrences = transcript.getTagValue("nbOccurrences") if "nbOccurrences" in tagNames else 1
        nbElements    = transcript.getTagValue("nbElements") if "nbElements" in tagNames else 1
        return float(nbElements) / float(nbOccurrences)

    def setBins(self):
        """Distribute every input feature into the bins covering its start.

        Requires initializeBins to have been called. Each bin's count grows
        by the feature weight; when an input tag is set, its value (or the
        default value for features lacking the tag) is added to the bin's sum
        and appended to the bin's value list.

        Raises Exception when a feature lacks the tag and no default is set.
        """
        # NOTE(review): the progress object is created but never advanced or
        # closed in this method.
        progress = Progress(self.parser.getNbTranscripts(), "Setting bins", self.verbosity)
        for transcript in self.parser.getIterator():
            nbElements = self.getNbElements(transcript)
            strand     = transcript.getDirection() if len(self.strands) == 2 else 0
            chromosome = transcript.getChromosome()
            useTag     = self.tag is not None
            if useTag:
                if self.tag in transcript.getTagNames():
                    value = float(transcript.getTagValue(self.tag))
                elif self.defaultValue is None:
                    raise Exception("Tag %s undefined in transcript %s" % (self.tag, transcript))
                else:
                    value = self.defaultValue
            for binId in self.getBinsFromPos(transcript.getStart()):
                self.binsPerStrand[strand][chromosome][binId] += nbElements
                if useTag:
                    self.sumsPerStrand[strand][chromosome][binId] += value
                    # Kept individually so that med/min/max have data to work on.
                    self.valuesPerStrand[strand][chromosome][binId].append(value)

    def aggregateData(self):
        """Fill self.toBePlottedPerStrand according to self.operation.

        Without a recognised operation the per-bin counts are used as they
        are.
        """
        handlers = {
            "sum":       self.computeSumData,
            "avg":       self.computeAvgData,
            "med":       self.computeMedData,
            "min":       self.computeMinData,
            "max":       self.computeMaxData,
            "GCpercent": self.computeGCPercent,
        }
        handler = handlers.get(self.operation)
        if handler is None:
            self.toBePlottedPerStrand = self.binsPerStrand
        else:
            handler()

    def _reduceBinValues(self, reducer):
        """Store reducer(values) for every bin whose value entry is non-empty."""
        for strand in self.strands:
            for chromosome, bins in self.valuesPerStrand[strand].items():
                for binId, values in bins.items():
                    if values:
                        self.toBePlottedPerStrand[strand][chromosome][binId] = reducer(values)

    @staticmethod
    def _median(values):
        """Return the median of a non-empty list without modifying it."""
        ordered = sorted(values)
        middle  = len(ordered) // 2
        if len(ordered) % 2 == 1:
            return ordered[middle]
        return (ordered[middle - 1] + ordered[middle]) / 2.0

    @staticmethod
    def _gcPercent(subSequence):
        """Return the GC percentage of subSequence, or "NA" when N >= 50%.

        NOTE(review): expects an object offering countNt, getSize and
        getGCpercentageInSequenceWithoutCountNInLength; nothing in this class
        stores such objects in valuesPerStrand -- confirm the producer.
        """
        nPercent = 100 * (subSequence.countNt("N") / float(subSequence.getSize()))
        if nPercent >= 50:
            return "NA"
        return subSequence.getGCpercentageInSequenceWithoutCountNInLength()

    def computeSumData(self):
        """Use the per-bin tag sums as the result."""
        self.toBePlottedPerStrand = self.sumsPerStrand

    def computeAvgData(self):
        """Store sum / count for every bin with a non-zero count."""
        for strand in self.strands:
            for chromosome, counts in self.binsPerStrand[strand].items():
                for binId, count in counts.items():
                    if count != 0:
                        self.toBePlottedPerStrand[strand][chromosome][binId] = float(self.sumsPerStrand[strand][chromosome][binId]) / count

    def computeMedData(self):
        """Store the median tag value of every non-empty bin."""
        self._reduceBinValues(self._median)

    def computeMinData(self):
        """Store the smallest tag value of every non-empty bin."""
        self._reduceBinValues(min)

    def computeMaxData(self):
        """Store the largest tag value of every non-empty bin."""
        self._reduceBinValues(max)

    def computeGCPercent(self):
        """Store the GC percentage (or "NA") of every bin holding a sequence."""
        self._reduceBinValues(self._gcPercent)

    def plotData(self):
        """Feed each non-empty chromosome series to an RPlotter.

        Does nothing when no plot path is set.
        NOTE(review): every strand/chromosome pair builds its plotter on the
        same self.plot path, and only addLine is called on it; if RPlotter
        needs an explicit rendering call, it is missing here -- confirm.
        """
        if self.plot is None:
            return
        for strand in self.strands:
            for chromosome, series in self.toBePlottedPerStrand[strand].items():
                if series:
                    plotter = RPlotter(self.plot, self.verbosity)
                    plotter.addLine(series, chromosome)

    def writeExcel(self):
        """Write the results as comma-separated text to self.excel.

        For each strand: a header line listing the "first-last" range of every
        bin, then one line per chromosome with its values in bin order.
        Does nothing when no spreadsheet path is set.
        """
        if self.excel is None:
            return
        excelFile = open(self.excel, "w")
        try:
            for strand in self.strands:
                perChromosome = self.toBePlottedPerStrand[strand]
                lastBins = [max(bins) for bins in perChromosome.values() if bins]
                if not lastBins:
                    # No chromosome (or no bin) for this strand: nothing to write.
                    continue
                header = "".join([",%d-%d" % self.getPosFromBin(binId) for binId in range(max(lastBins) + 1)])
                excelFile.write(header + "\n")
                for chromosome, bins in perChromosome.items():
                    # Sorted so that the columns line up with the header.
                    row = "".join([",%f" % (bins[binId]) for binId in sorted(bins)])
                    excelFile.write("%s%s\n" % (chromosome, row))
        finally:
            excelFile.close()

    def printRegions(self):
        """Build one named, tagged Transcript per strand, chromosome and bin.

        Each region carries the bin count under the output tag name
        ("nbElements" unless setOutputTag was used) and the aggregated value
        under "<operation><Tag>" ("nbElements" when neither is set).

        NOTE(review): the transcript built below receives a name, a direction
        and two tags only; its chromosome and coordinates are never set and
        it is never passed to self.writer, so as written no region reaches
        the output file. Restoring that needs the Transcript/writer API,
        which is not visible from this file -- TODO confirm and complete.
        """
        cpt           = 1
        tagOp         = "nb"
        tagName       = "Elements"
        outputTagName = "nbElements"
        if self.operation is not None:
            tagOp = self.operation.lower()
        if self.tag is not None:
            tagName = self.tag.title()
        if self.outputTagName is not None:
            outputTagName = self.outputTagName
        for strand in self.strands:
            for chromosome, bins in self.toBePlottedPerStrand[strand].items():
                for binId in bins:
                    transcript = Transcript()
                    transcript.setName("region%d" % cpt)
                    # Merged strands (pseudo-strand 0) are reported as forward.
                    transcript.setDirection(1 if strand == 0 else strand)
                    transcript.setTagValue(outputTagName, self.binsPerStrand[strand][chromosome][binId])
                    transcript.setTagValue("%s%s" % (tagOp, tagName), str(bins[binId]))
                    cpt += 1

    def run(self):
        """Run the whole analysis: check, size, bin, aggregate, then output.

        The steps are ordered by their data dependencies: initializeBins needs
        the sizes, setBins needs the bins, aggregateData needs the filled
        bins, and the outputs need the aggregated values.
        """
        self.checkOptions()
        self.getChromosomeSizes()
        self.initializeBins()
        self.setBins()
        self.aggregateData()
        if self.excel:
            self.writeExcel()
        if self.plot:
            self.plotData()
        self.printRegions()

if __name__ == "__main__":
    # parse command line
    description = "Clusterize by Sliding Windows v1.0.1: Produces a GFF3 file that clusters a list of transcripts using a sliding window. [Category: Sliding Windows]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input",       dest="inputFileName",  action="store",                     type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat", dest="inputFormat",    action="store",                     type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output",      dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
    parser.add_option("-u", "--outputFormat", dest="outputFormat",  action="store",     default="gff",  type="string", help="format of the output file [format: transcript file format]")
    parser.add_option("-s", "--size",        dest="size",           action="store",                     type="int",    help="size of the regions [compulsory] [format: int]")
    parser.add_option("-e", "--overlap",     dest="overlap",        action="store",                     type="int",    help="overlap between two consecutive regions [compulsory] [format: int]")
    parser.add_option("-m", "--normalize",   dest="normalize",      action="store_true", default=False,                help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
    parser.add_option("-g", "--tag",         dest="tag",            action="store",      default=None,  type="string", help="use a given tag as input (instead of summing number of features) [format: string]")    
    parser.add_option("-r", "--operation",   dest="operation",      action="store",      default=None,  type="string", help="combine tag value with given operation [format: choice (sum, avg, med, min, max)]")
    parser.add_option("-d", "--defaultValue",dest="defaultValue",   action="store",                     type="float",    help="default value for input tag [format: float]")
    parser.add_option("-w", "--write",       dest="writeTag",       action="store",      default=None,  type="string", help="print the result in the given tag (default usually is 'nbElements') [format: string]")    
    parser.add_option("-2", "--strands",     dest="strands",        action="store_true", default=False,                help="consider the two strands separately [format: bool] [default: false]")
    parser.add_option("-p", "--plot",        dest="plot",           action="store",      default=None,  type="string", help="plot regions to the given file [format: output file in PNG format]")
    parser.add_option("-x", "--excel",       dest="excel",          action="store",      default=None,  type="string", help="write an Excel file to the given file [format: output file in Excel format]")
    parser.add_option("-v", "--verbosity",   dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int] [default: 1]")
    (options, args) = parser.parse_args()

    cbsw = ClusterizeBySlidingWindows(options.verbosity)
    cbsw.setInputFile(options.inputFileName, options.inputFormat)
    cbsw.setOutputFileName(options.outputFileName, options.outputFormat)