#! /usr/bin/env python
import sys
import random
import re
import os
import MySQLdb
from structure.transcript import *
from structure.transcriptList import *
from mySql.mySqlConnection import *
from mySql.mySqlTable import *
from mySql.mySqlTranscriptTable import *
from structure.transcriptContainer import *
from misc.progress import *
from misc.utils import *
from structure import bins


class TranscriptListsComparator(object):
  """
  Compare two transcript lists, using a database for one of the lists
  Uses one TranscriptContainer for query data, 
       one TranscriptContainer exported to MySqlTranscriptTable for reference data, 
       one MySqlTranscriptTable for output data,
       one MySqlTranscriptTable for transformed reference data
  @ivar inputTranscriptContainers: parsers to the list of query transcripts
  @type inputTranscriptContainers: list of 2 L{TranscriptContainer<TranscriptContainer>}
  @ivar transcriptBases:           tables of database for the transcripts (for query/reference/output/workingReference, for each chromosome)
  @type transcriptBases:           list of 4 dict of chromosomes to L{MySqlTranscriptTable<MySqlTranscriptTable>}
  @ivar mySqlConnection:           connection to a MySQL database (to compute the overlapping efficiently)
  @type mySqlConnection:           class L{MySqlConnection<MySqlConnection>}
  @ivar introns:                   compare transcripts or exons only
  @type introns:                   list of 2 boolean
  @ivar starts:                    restrict the query transcripts to first nucleotides
  @type starts:                    list of 2 int or None
  @ivar fivePrimes:                extend a list of transcripts by their 5' end
  @type fivePrimes:                list of 2 int or None
  @ivar threePrimes:               extend a list of transcripts by their 3' end
  @type threePrimes:               list of 2 int or None
  @ivar minDistance:               min distance between two transcripts [default: 0]
  @type minDistance:               int
  @ivar maxDistance:               max distance between two transcripts [default: 0]
  @type maxDistance:               int
  @ivar upstreams:                 consider distances with elements which are upstream of the transcripts
  @type upstreams:                 boolean
  @ivar downstreams:               consider distances with elements which are downstream of the transcripts
  @type downstreams:               boolean
  @ivar colinear:                  whether transcripts should overlap in the same direction
  @type colinear:                  boolean
  @ivar antisense:                 whether transcripts should overlap in the opposite direction
  @type antisense:                 boolean
  @ivar outputDistance:            output distance between query and reference instead of query transcript
  @type outputDistance:            boolean
  @ivar absolute:                  do not consider the strand while computing distance
  @type absolute:                  boolean
  @ivar QUERY:                     constant specifying the query objects
  @type QUERY:                     int
  @ivar REFERENCE:                 constant specifying the reference objects
  @type REFERENCE:                 int
  @ivar OUTPUT:                    constant specifying the output objects
  @type OUTPUT:                    int
  @ivar INPUTTYPES:                set of input types of data (query or reference) objects
  @type INPUTTYPES:                list of 2 int
  @ivar TYPES:                     set of types of data (query, reference or output) objects
  @type TYPES:                     list of 3 int
  @ivar typeToString:              string representation of the previous types
  @type typeToString:              dict
  @ivar tableNames:                name of the transcript tables
  @type tableNames:                dict of strings
  @ivar nbTranscripts:             number of transcript in the query/reference set
  @type nbTranscripts:             list of 2 int or None
  @ivar nbNucleotides:             number of nucleotides in the query/reference set
  @type nbNucleotides:             list of 2 int or None
  @ivar transcriptsToBeStored:     transcripts that will be stored into database
  @type transcriptsToBeStored:     dict of class L{TranscriptList<TranscriptList>}
  @ivar nbTranscriptsStored:       max number of transcripts to be stored
  @type nbTranscriptsStored:       int
  @ivar multiple:                  in merge mode, aggregate multiple transcripts
  @type multiple:                  boolean
  @ivar invert:                    invert the current comparison
  @type invert:                    boolean
  @ivar odds:                      whether odds about the comparison should be computed
  @type odds:                      boolean
  @ivar overlapResults:            count the number of overlaps
  @type overlapResults:            dictionary
  @ivar oddResults:                compute the number of times each transcript overlaps (or is merged with) another one
  @type oddResults:                dictionary
  @ivar outputContainer:           container of the output transcripts
  @type outputContainer:           class L{TranscriptContainer<TranscriptContainer>}
  @ivar logHandle:                 log handle
  @type logHandle:                 file
  @ivar verbosity:                 verbosity
  @type verbosity:                 int  
  """
  
  def __init__(self, logHandle = None, verbosity = 0):
    """
    Constructor
    Define the type constants, open the MySQL connection, choose the names of the temporary tables, and reset every comparison option
    @param logHandle: log handle
    @type  logHandle: file
    @param verbosity: verbosity
    @type  verbosity: int
    """
    # type constants (also used as indices in the per-type lists)
    self.QUERY, self.REFERENCE, self.OUTPUT, self.WORKING = 0, 1, 2, 3
    self.INPUTTYPES        = (self.QUERY, self.REFERENCE)
    self.INPUTWORKINGTYPES = (self.QUERY, self.REFERENCE, self.WORKING)
    self.TYPES             = (self.QUERY, self.REFERENCE, self.OUTPUT, self.WORKING)
    self.typeToString      = {
      self.QUERY:     "Query",
      self.REFERENCE: "Reference",
      self.OUTPUT:    "Output",
      self.WORKING:   "Working"
    }

    # logging and database access
    self.logHandle       = logHandle
    self.verbosity       = verbosity
    self.mySqlConnection = MySqlConnection(self.verbosity)

    # input data and tables (one random suffix per type, drawn in the order query/reference/output/working)
    self.inputTranscriptContainers = [None, None]
    self.transcriptBases           = [{}, {}, {}, {}]
    self.tableNames                = ["tmp%sTable_%d" % (label, random.randint(0, 100000)) for label in ("Query", "Reference", "Output", "Working")]

    # transformations applied to the query/reference transcripts
    self.introns     = [False, False]
    self.starts      = [None, None]
    self.ends        = [None, None]
    self.fivePrimes  = [None, None]
    self.threePrimes = [None, None]

    # comparison options
    self.minDistance    = None
    self.maxDistance    = 0
    self.colinear       = False
    self.antisense      = False
    self.downstreams    = [False, False]
    self.upstreams      = [False, False]
    self.outputDistance = False
    self.absolute       = False
    self.invert         = False
    self.multiple       = False

    # statistics and results
    self.nbTranscripts   = [None, None]
    self.nbNucleotides   = [None, None]
    self.odds            = False
    self.overlapResults  = None
    self.oddResults      = None
    self.outputContainer = None

    # transcripts waiting to be written into the database (one buffer per type)
    self.transcriptsToBeStored = dict((currentType, TranscriptList()) for currentType in self.TYPES)
    self.nbTranscriptsStored   = 10000

    
  def __del__(self):
    """
    Destructor
    Remove all temporary tables
    """
    for type in self.TYPES:
      for chromosome in self.transcriptBases[type]:
        self.transcriptBases[type][chromosome].remove()    

    
  def acceptIntrons(self, type, bool):
    """
    Compare transcripts or exons only
    @param type: whether use query/reference data
    @type  type: int
    @param bool: include introns or not
    @type  bool: boolean
    """
    self.introns[type] = bool

    
  def restrictToStart(self, type, size):
    """
    Restrict a list of transcripts to first nucleotides
    @param type: whether use query/reference data
    @type  type: int
    @param size: the size of the transcript to be considered
    @type  size: int
    """
    self.starts[type]  = size
    self.introns[type] = False
    
    
  def restrictToEnd(self, type, size):
    """
    Restrict a list of transcripts to first nucleotides
    @param type: whether use query/reference data
    @type  type: int
    @param size: the size of the transcript to be considered
    @type  size: int
    """
    self.ends[type]  = size
    self.introns[type] = False
    
    
  def extendFivePrime(self, type, size):
    """
    Extend a list of transcripts by their 5' end
    @param type: whether use query/reference data
    @type  type: int
    @param size: size of the extension
    @type  size: int
    """
    self.fivePrimes[type] = size
    

  def extendThreePrime(self, type, size):
    """
    Extend the list of query transcripts by their 3' end
    @param type: whether use query/reference data
    @type  type: int
    @param size: size of the extension
    @type  size: int
    """
    self.threePrimes[type] = size
    
    
  def setMinDistance(self, distance):
    """
    Set the min distance between two transcripts
    @param distance: distance
    @type  distance: int
    """
    self.minDistance = distance


  def setMaxDistance(self, distance):
    """
    Set the max distance between two transcripts
    @param distance: distance
    @type  distance: int
    """
    self.maxDistance = distance
    

  def setUpstream(self, type, boolean):
    """
    Consider transcripts which are upstream of some transcripts
    @param type:    whether use query/reference data
    @type  type:    int
    @param boolean: consider only these transcripts or not
    @type  boolean: boolean
    """
    self.upstreams[type] = boolean


  def setDownstream(self, type, boolean):
    """
    Consider transcripts which are downstream of some transcripts
    @param type:    whether use query/reference data
    @type  type:    int
    @param boolean: consider only these transcripts or not
    @type  boolean: boolean
    """
    self.downstreams[type] = boolean


  def setOutputDistance(self, boolean):
    """
    Output distance between query and reference instead of query transcript
    @param boolean: whether distance should be output
    @type  boolean: boolean
    """
    self.outputDistance = boolean
    

  def setAbsolute(self, boolean):
    """
    Do not consider strand when computing distance (thus, having only non-negative values)
    @param boolean: whether we should consider strands
    @type  boolean: boolean
    """
    self.absolute = boolean
    

  def getColinearOnly(self, boolean):
    """
    Only consider transcripts that overlap in the same direction
    @param boolean: whether transcripts should overlap in the same direction
    @type  boolean: boolean
    """
    self.colinear = boolean
    
        
  def getAntisenseOnly(self, boolean):
    """
    Only consider transcripts that overlap in the opposite direction
    @param boolean: whether transcripts should overlap in the opposite direction
    @type  boolean: boolean
    """
    self.antisense = boolean
    
    
  def getInvert(self, boolean):
    """
    Only consider transcripts that do not overlap
    @param boolean: whether invert the selection
    @type  boolean: boolean
    """
    self.invert = boolean
    
    
  def aggregate(self, boolean):
    """
    In merge mode, aggregate multiple transcripts
    @param boolean: aggregate multiple transcripts
    @type  boolean: boolean
    """
    self.multiple = boolean
  

  def computeOdds(self, boolean):
    """
    Compute odds
    @param boolean: whether odds should be computed
    @type  boolean: boolean
    """
    self.odds = boolean
    if self.odds:
      self.overlapResults = dict()
    
    
  def computeOddsPerTranscript(self, boolean):
    """
    Compute odds for each transcript
    @param boolean: whether odds for each transcript should be computed
    @type  boolean: boolean
    """
    self.odds = boolean
    if self.odds:
      self.overlapResults = dict()
    
    
  def removeTables(self):
    """
    Remove the temporary MySQL tables
    """
    for type in self.INPUTWORKINGTYPES:
      for chromosome in self.transcriptBases[type]:
        self.transcriptBases[type][chromosome].remove()


  def clearTables(self):
    """
    Empty the content of the databases
    """
    for type in self.INPUTWORKINGTYPES:
      if self.transcriptListParsers[type] != None:
        for chromosome in self.transcriptBases[type]:
          self.transcriptBases[type][chromosome].clear()


  def extendTranscript(self, type, transcript):
    """
    Build a copy of a transcript, transformed following the options set for its type
    The start restriction, the end restriction, the 5' extension and the 3' extension are applied in this order, each one only if it has been set.
    @param type:       the set the transcript comes from (query or reference constant)
    @type  type:       int
    @param transcript: a transcript (left unchanged)
    @type  transcript: class L{Transcript<Transcript>}
    @return:           the possibly transformed copy
    """
    result = Transcript()
    result.copy(transcript)
    operations = (
      (self.starts,      result.restrictStart),
      (self.ends,        result.restrictEnd),
      (self.fivePrimes,  result.extendStart),
      (self.threePrimes, result.extendEnd),
    )
    for sizes, operation in operations:
      size = sizes[type]
      if size is not None:
        operation(size)
    return result


  def reallyStoreTranscript(self, type, transcript):
    """
    Write a transcript into the table of its chromosome right now
    The table is created first if the chromosome has not been seen yet for this type.
    A reference transcript is also written into the working table of the same chromosome.
    @param type:       the table set to use (query/reference/output/working constant)
    @type  type:       int
    @param transcript: a transcript
    @type  transcript: class L{Transcript<Transcript>}
    """
    chromosome  = transcript.chromosome
    isReference = (type == self.REFERENCE)
    tables      = self.transcriptBases[type]
    working     = self.transcriptBases[self.WORKING]

    if chromosome not in tables:
      tables[chromosome] = MySqlTranscriptTable(self.tableNames[type], chromosome, self.verbosity)
      tables[chromosome].createTranscriptTable()
      if isReference:
        working[chromosome] = MySqlTranscriptTable(self.tableNames[self.WORKING], chromosome, self.verbosity)
        working[chromosome].createTranscriptTable()

    tables[chromosome].addTranscript(transcript)
    if isReference:
      working[chromosome].addTranscript(transcript)


  def reallyStoreTranscriptList(self, type):
    """
    Write the postponed transcripts of a given type into the database, then empty the buffer
    For each chromosome, the table is created if needed, the transcripts are written to a temporary table through a MySqlTranscriptWriter, and this temporary table is added to the table of the chromosome.
    Reference transcripts are also added to the working table of the same chromosome.
    @param type: the table set to use (query/reference/output/working constant)
    @type  type: int
    """
    isReference = (type == self.REFERENCE)
    for chromosome in self.transcriptsToBeStored[type].getChromosomes():
      transcripts = self.transcriptsToBeStored[type].getTranscriptsOnChromosome(chromosome)
      if chromosome not in self.transcriptBases[type]:
        self.transcriptBases[type][chromosome] = MySqlTranscriptTable(self.tableNames[type], chromosome, self.verbosity)
        self.transcriptBases[type][chromosome].createTranscriptTable()
        if isReference:
          self.transcriptBases[self.WORKING][chromosome] = MySqlTranscriptTable(self.tableNames[self.WORKING], chromosome, self.verbosity)
          self.transcriptBases[self.WORKING][chromosome].createTranscriptTable()

      # the transcripts must be written whether the table has just been created or not:
      # otherwise every batch but the first one of a chromosome would be lost when the buffer is emptied below
      writer = MySqlTranscriptWriter(None, self.verbosity)
      for transcript in transcripts:
        writer.addTranscript(transcript)
      writer.write()
      tmpTable = writer.getTables()[chromosome]
      self.transcriptBases[type][chromosome].add(tmpTable)
      if isReference:
        self.transcriptBases[self.WORKING][chromosome].add(tmpTable)

    self.transcriptsToBeStored[type].removeAll()
      


  def storeTranscript(self, type, transcript, now = True):
    """
    Add a transcript to a MySQL database, or postpone the store
    @param type:       whether use query/reference table
    @type  type:       int
    @param transcript: a transcript
    @type  transcript: class L{Transcript<Transcript>}
    @param now:        whether transcript should be stored now (or stored can be postponed)
    @type  now:        bool
    """
    if now:
      self.reallyStoreTranscript(type, transcript)
      return
    self.transcriptsToBeStored[type].addTranscript(transcript)
    if self.transcriptsToBeStored[type].getNbTranscripts() >= self.nbTranscriptsStored:
      self.reallyStoreTranscriptList(type)


  def flushTables(self, type = None):
    """
    Store the remaining transcripts
    @param type: whether use query/reference table (None for all)
    @type  type: int or None
    """
    if type == None:
      types = self.TYPES
    else:
      types = [type]
    for type in types:
      self.reallyStoreTranscriptList(type)
    
    
  def unstoreTranscript(self, type, transcript):
    """
    Remove a transcript from a MySQL database
    @param type:       whether use query/reference table
    @type  type:       int
    @param transcript: a transcript
    @type  transcript: class L{Transcript<Transcript>}
    """
    self.transcriptBases[type][transcript.chromosome].removeTranscript(transcript)
    if type == self.REFERENCE:
      self.transcriptBases[self.WORKING][transcript.chromosome].removeTranscript(transcript)


  def storeTranscriptList(self, type, transcriptListParser, extension):
    """
    Export a whole transcript container into the database
    The tables produced by the writer replace the tables previously registered for this type.
    @param type:                 the table set to use (query or reference constant)
    @type  type:                 int
    @param transcriptListParser: the container which provides the transcripts
    @type  transcriptListParser: class L{TranscriptContainer<TranscriptContainer>}
    @param extension:            whether the transcripts should be transformed with extendTranscript before being written
    @type  extension:            boolean
    """
    label    = "query" if type == self.QUERY else "reference"
    writer   = MySqlTranscriptWriter(self.tableNames[type], self.verbosity)
    progress = Progress(transcriptListParser.getNbTranscripts(), "Writing transcripts for %s" % (label), self.verbosity)
    for transcript in transcriptListParser.getIterator():
      writer.addTranscript(self.extendTranscript(type, transcript) if extension else transcript)
      progress.inc()
    writer.write()
    progress.done()

    self.transcriptBases[type] = writer.getTables()
    
  
  def setInputTranscriptContainer(self, type, inputTranscriptContainer):
    """
    Set an input transcript list container
    @param type:                      whether use query/reference parser
    @type  type:                      int
    @param inputTranscriptContainer:  a container
    @type  inputTranscriptContainer:  class L{TranscriptContainer<TranscriptContainer>}
    """
    self.inputTranscriptContainers[type] = inputTranscriptContainer
    self.nbTranscripts[type]             = self.inputTranscriptContainers[type].getNbTranscripts()
    self.nbNucleotides[type]             = self.inputTranscriptContainers[type].getNbNucleotides()


  def compareTranscript(self, transcript1, transcript2, includeDistance = False):
    """
    Tell whether two transcripts match, following the options of the comparator
    The tests are, in this order: overlap of the (possibly distance-extended) query with the reference, upstream/downstream filters, strand filters, and finally the intron/exon rule.
    A match is reported in the log file, if any.
    @param transcript1:     a transcript from the query set (already extended)
    @type  transcript1:     class L{Transcript<Transcript>}
    @param transcript2:     a transcript from the reference set (already extended)
    @type  transcript2:     class L{Transcript<Transcript>}
    @param includeDistance: whether the query should be enlarged by the max distance before the comparison
    @type  includeDistance: boolean
    @return:                True if the transcripts match, False otherwise
    """
    # work on a copy of the query, which may be enlarged by the max distance
    enlargedQuery = Transcript()
    enlargedQuery.copy(transcript1)
    if includeDistance and self.maxDistance > 0:
      if not self.colinear:
        enlargedQuery.extendStart(self.maxDistance)
      if not self.antisense:
        enlargedQuery.extendEnd(self.maxDistance)

    if not enlargedQuery.overlapWith(transcript2):
      return False

    # upstream/downstream filters, first for the query, then for the reference
    if self.downstreams[self.QUERY] and transcript2.start > transcript1.start:
      return False
    if self.upstreams[self.QUERY] and transcript2.end < transcript1.end:
      return False
    if self.downstreams[self.REFERENCE] and transcript1.start > transcript2.start:
      return False
    if self.upstreams[self.REFERENCE] and transcript1.end < transcript2.end:
      return False

    # strand filters
    sameDirection = (transcript1.direction == transcript2.direction)
    if self.antisense and sameDirection:
      return False
    if self.colinear and not sameDirection:
      return False

    # intron/exon rule: either both sets accept introns, or none does and the exons must overlap
    referenceIntrons = self.introns[self.REFERENCE]
    queryIntrons     = self.introns[self.QUERY]
    if referenceIntrons and queryIntrons:
      message = "%s overlaps with intron of %s\n"
    elif (not referenceIntrons) and (not queryIntrons) and enlargedQuery.overlapWithExon(transcript2):
      message = "%s overlaps with exon of %s\n"
    else:
      return False

    if self.logHandle is not None:
      self.logHandle.write(message % (str(transcript1), str(transcript2)))
    return True


  def compareTranscriptToList(self, transcript1):
    """
    Find the reference transcripts which match a query transcript
    The candidates are selected in the working table of the chromosome with an SQL query on the bins and the coordinates, then each candidate is read from the reference table and checked with compareTranscript.
    (The query transcript is not extended here, except by the max distance)
    @param transcript1: a transcript (from the query set)
    @type  transcript1: class L{Transcript<Transcript>}
    @return:            the list of the matching reference transcripts
    """
    chromosome = transcript1.chromosome

    # nothing to compare with on this chromosome
    if chromosome not in self.transcriptBases[self.REFERENCE]:
      return []

    # enlarge a copy of the query by the max distance on both sides
    extendedTranscript1 = Transcript()
    extendedTranscript1.copy(transcript1)
    if self.maxDistance > 0:
      extendedTranscript1.extendStart(self.maxDistance)
      extendedTranscript1.extendEnd(self.maxDistance)

    # select the ids of the candidates in the working table
    tableName  = self.transcriptBases[self.WORKING][chromosome].name
    binClauses = []
    for binPair in extendedTranscript1.getBins():
      firstBin, lastBin = binPair[0], binPair[1]
      if firstBin == lastBin:
        binClauses.append("bin = %i" % (firstBin))
      else:
        binClauses.append("bin BETWEEN %i AND %i" % (firstBin, lastBin))
    command = "SELECT id FROM %s WHERE (%s) AND start <= %d AND end >= %d" % (tableName, " OR ".join(binClauses), extendedTranscript1.end, extendedTranscript1.start)
    query   = self.mySqlConnection.executeQuery(command)

    # keep the candidates of the reference table which really match
    referenceTable = self.transcriptBases[self.REFERENCE][chromosome]
    matches        = []
    for line in query.getIterator():
      candidate = referenceTable.retrieveTranscriptFromId(line[0])
      if self.compareTranscript(extendedTranscript1, candidate):
        matches.append(candidate)
    return matches

  
  def compareTranscriptList(self):
    """
    Compare the query transcripts to the reference ones
    The reference container is first exported to the reference tables, and a working table is built from each of them.
    Each (extended) query transcript which matches a reference transcript (or which matches none, in invert mode) is stored in the output table; nothing is returned.
    When odds are requested, overlapResults counts, for each reference transcript name, the selected query transcripts which match it.
    """
    nbSelected = 0

    # export the container into tables
    # NOTE(review): the working table is built without name nor chromosome here, whereas compareTranscriptListDistance provides both -- confirm that this is intended
    self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)
    referenceTables = self.transcriptBases[self.REFERENCE]
    workingTables   = self.transcriptBases[self.WORKING]
    for chromosome in referenceTables:
      workingTables[chromosome] = MySqlTranscriptTable()
      workingTables[chromosome].copy(referenceTables[chromosome])

    progress = Progress(self.nbTranscripts[self.QUERY], "Analyzing chromosomes", self.verbosity)
    for transcript1 in self.inputTranscriptContainers[self.QUERY].getIterator():
      transcript1 = self.extendTranscript(self.QUERY, transcript1)
      transcripts = self.compareTranscriptToList(transcript1)
      hasMatch    = len(transcripts) > 0
      selected    = (not hasMatch) if self.invert else hasMatch
      if selected:
        # possibly output distance instead of transcript
        if self.outputDistance and not self.invert:
          transcript1.removeExons()
          transcript1.setEnd(transcript1.start)
          for transcript2 in transcripts:
            # come back to the original start
            if transcript2.direction == 1:
              extension = self.fivePrimes[self.REFERENCE]
            else:
              extension = self.threePrimes[self.REFERENCE]
            correction     = 0 if extension is None else extension
            referenceStart = transcript2.start + correction
            transcript1.setStart(min(transcript1.start, referenceStart))
            transcript1.setEnd(max(transcript1.end, referenceStart))
        self.storeTranscript(self.OUTPUT, transcript1)
        nbSelected += 1
        if self.odds:
          for transcript2 in transcripts:
            self.overlapResults[transcript2.name] = self.overlapResults.get(transcript2.name, 0) + 1
      progress.inc()
    progress.done()

    if self.verbosity > 0:
      nbQueries = self.nbTranscripts[self.QUERY]
      print("reference: %d sequences" % (self.nbTranscripts[self.REFERENCE]))
      print("query:     %d sequences" % (nbQueries))
      if nbQueries != 0:
        print("output:    %d sequences (%f%%)" % (nbSelected, nbSelected / float(nbQueries) * 100))

  
  def compareTranscriptListDistance(self):
    """
    Compute the distance between each query transcript and its closest reference transcript
    The reference container is first exported to the reference tables, which are copied to the working tables.
    Every (extended) query transcript is stored (buffered) in the output table with two tags: "distance" and "closestElement" (the ID tag, or else the name, of the closest reference transcript); both are "None" when no reference transcript is found within the max distance.
    @return: the distance distribution, as a dict of distance to number of occurrences
    """
    distanceSum  = 0
    distanceMin  = self.maxDistance
    distanceMax  = 0
    nbDistances  = 0
    distances    = dict()
    
    # export the container into tables
    self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)
    for chromosome in self.transcriptBases[self.REFERENCE]:
      self.transcriptBases[self.WORKING][chromosome] = MySqlTranscriptTable(self.tableNames[self.WORKING], chromosome)
      self.transcriptBases[self.WORKING][chromosome].copy(self.transcriptBases[self.REFERENCE][chromosome])
    
    progress = Progress(self.nbTranscripts[self.QUERY], "Analyzing chromosomes", self.verbosity)
    for transcript1 in self.inputTranscriptContainers[self.QUERY].getIterator():

      # get the distance
      transcript1       = self.extendTranscript(self.QUERY, transcript1)
      overlapping       = self.compareTranscriptToList(transcript1)
      distance          = self.maxDistance + 1   # sentinel: "nothing found yet"
      closestTranscript = None
      closestElement    = "None"
      for transcript2 in overlapping:
        if self.antisense or (not self.colinear and transcript1.direction != transcript2.direction):
          transcript2.direction *= -1
        if self.absolute:
          transcript2.direction = transcript1.direction
        if transcript2.direction == transcript1.direction:
          if self.starts[self.REFERENCE] != None:
            transcript2.restrictStart(self.starts[self.REFERENCE])
          if self.ends[self.REFERENCE] != None:
            transcript2.restrictEnd(self.ends[self.REFERENCE])
          thisDistance = transcript1.getRelativeDistance(transcript2)
          if (self.absolute):
            thisDistance = abs(thisDistance)
          if abs(thisDistance) < abs(distance):
            distance = thisDistance
            # remember which transcript gives the distance: the loop variable is merely the last candidate once the loop is over
            closestTranscript = transcript2
      if (distance <= self.maxDistance) and (self.minDistance == None or distance >= self.minDistance):
        nbDistances += 1
        distanceSum += abs(distance)
        distanceMin  = min(distanceMin, abs(distance))
        distanceMax  = max(distanceMax, abs(distance))
        distances[distance] = distances.get(distance, 0) + 1
        closestElement = closestTranscript.getTagValue("ID") if "ID" in closestTranscript.getTagNames() else closestTranscript.name

      # write transcript
      if distance == self.maxDistance + 1:
        distance = "None"
      tmpTranscript = Transcript()
      tmpTranscript.copy(transcript1)
      tmpTranscript.setTagValue("distance", distance)
      tmpTranscript.setTagValue("closestElement", closestElement)
      self.storeTranscript(self.OUTPUT, tmpTranscript, False)

      progress.inc()
    progress.done()

    if self.verbosity > 0:
      print("reference: %d sequences" % (self.nbTranscripts[self.REFERENCE]))
      print("query:     %d sequences" % (self.nbTranscripts[self.QUERY]))
      if nbDistances == 0:
        print("Nothing matches")
      else:
        print("distances: %d/%d/%d" % (distanceMin, int(float(distanceSum) / nbDistances), distanceMax))
        print("for %d distances (%.2f%%)" % (nbDistances, float(nbDistances) / self.nbTranscripts[self.QUERY] * 100))
    return distances

  
  def compareTranscriptListMerge2(self):
    """
    Merge the query transcripts with the reference transcripts they match
    Without aggregation, one merged transcript is stored in the output table for each (query, reference) match.
    With aggregation (see aggregate), all the matches of a query transcript are merged into a single output transcript, and a query transcript which bears the name of an already merged reference transcript is skipped.
    A reference transcript with the same name as the query transcript is never merged with it.
    When odds are requested, overlapResults gives, for each processed query transcript name, 1 plus its number of merges.
    The merged transcripts are stored in the output table; nothing is returned.
    """
    nbMerged               = 0
    nbMerges               = 0
    nbPrinted              = 0
    toBeSkipped            = set()
    if self.odds:
      self.overlapResults = dict()

    progress = Progress(self.nbTranscripts[self.QUERY], "Analyzing chromosomes", self.verbosity)
    for transcript1 in self.inputTranscriptContainers[self.QUERY].getIterator():
      if transcript1.name in toBeSkipped:
        toBeSkipped.remove(transcript1.name)
      else:
        transcript1  = self.extendTranscript(self.QUERY, transcript1)
        overlappings = self.compareTranscriptToList(transcript1)
        merged       = False
        if self.multiple:
          # aggregation: a single transcript accumulates all the merges
          mergedTranscript = Transcript()          
          mergedTranscript.copy(transcript1)
        if self.odds:
          self.overlapResults[transcript1.name] = 1
        for transcript2 in overlappings:
          if transcript1.name != transcript2.name:
            if not self.multiple:
              # no aggregation: start again from the query for each match
              mergedTranscript = Transcript()
              mergedTranscript.copy(transcript1)
            if self.antisense == True or (self.colinear == False and transcript1.direction != transcript2.direction):
              transcript2.setDirection(-1 * transcript2.direction)
            mergedTranscript.merge(transcript2)
            if self.antisense == True:
              mergedTranscript.removeExons()
            if self.multiple:
              toBeSkipped.add(transcript2.name)
            else:
              self.storeTranscript(self.OUTPUT, mergedTranscript)
              nbPrinted += 1
            nbMerges += 1
            merged    = True
            if self.odds:
              self.overlapResults[transcript1.name] += 1
        if merged:
          nbMerged += 1
        if self.multiple:
          self.storeTranscript(self.OUTPUT, mergedTranscript)
          nbPrinted += 1
      progress.inc()
    progress.done()
    
    if self.verbosity > 0:
      nbQueries = self.nbTranscripts[self.QUERY]

      def percentage(count):
        # an empty query set must not make the summary crash
        if not nbQueries:
          return 0.0
        return count / float(nbQueries) * 100

      print("reference: %d sequences" % (self.nbTranscripts[self.REFERENCE]))
      print("query:     %d sequences" % (nbQueries))
      print("# merged:  %d sequences (%.2f%%)" % (nbMerged, percentage(nbMerged)))
      print("# merges:  %d merges" % (nbMerges))
      print("# printed: %d sequences (%.2f%%)" % (nbPrinted, percentage(nbPrinted)))

  
  def compareTranscriptListMerge(self):
    """
    Merge the query list of transcripts with the reference list
    Both lists are stored into the database. Each chromosome which is present in both lists
    is then read chunk by chunk (in increasing coordinates): every reference transcript which
    matches a query transcript (as decided by C{compareTranscript}) is merged into this
    query transcript, and the transcripts which end before the current chunk are written
    to the output tables.
    If C{self.multiple} is set, every transcript is written, merged or not, and the chromosomes
    which are present in one list only are copied to the output; otherwise, only the query
    transcripts which have been merged with a reference transcript are written.
    @return: nothing, the merged transcripts are stored into the output tables
    """
    nbMerges   = 0
    nbPrinted  = 0
    inputTypes = (self.QUERY, self.REFERENCE)
    # Reference transcripts are read self.maxDistance ahead of the query transcripts, and kept
    # self.maxDistance behind them (the correction depends on the list which is processed)
    corrections = {self.QUERY: 0, self.REFERENCE: self.maxDistance}

    for inputType in inputTypes:
      self.storeTranscriptList(inputType, self.inputTranscriptContainers[inputType], True)

    # Loop on the chromosomes which are present in both lists
    for chromosome in self.transcriptBases[self.QUERY]:
      if chromosome not in self.transcriptBases[self.REFERENCE]:
        continue

      # Get the size of the chromosome; there are as many chunks as elements in the largest table
      maxEnd   = 0
      nbChunks = 0
      for inputType in inputTypes:
        table    = self.transcriptBases[inputType][chromosome]
        command  = "SELECT MAX(end) from %s" % (table.name)
        query    = self.mySqlConnection.executeQuery(command)
        maxEnd   = max(maxEnd, int(query.getLine()[0]))
        nbChunks = max(nbChunks, table.getNbElements())

      # Indices (in transcripts[self.QUERY]) of the query transcripts which have been merged
      mergedTranscripts = set()
      transcripts       = {self.QUERY: [], self.REFERENCE: []}
      progress          = Progress(nbChunks, "Analyzing %s" % (chromosome), self.verbosity)
      for i in range(nbChunks):
        rangeEnd = int((i+1) * (float(maxEnd) / nbChunks))

        # Get all transcripts in query and reference from chunk
        for inputType in inputTypes:
          table   = self.transcriptBases[inputType][chromosome]
          command = "SELECT * FROM %s WHERE start <= %d" % (table.name, rangeEnd + corrections[inputType])
          query   = self.mySqlConnection.executeQuery(command)
          for line in query.getIterator():
            # Load SQL transcript into Python objects
            transcript = Transcript()
            transcript.setSqlValues(line)
            for exon in table.exonsTable.retrieveExonsFromTranscriptId(transcript.id):
              transcript.addExon(exon)
            transcripts[inputType].append(transcript)

        # Merge elements between the two samples (a merged reference is replaced by None)
        for iQuery, queryTranscript in enumerate(transcripts[self.QUERY]):
          for iReference, referenceTranscript in enumerate(transcripts[self.REFERENCE]):
            if referenceTranscript is None: continue
            if self.compareTranscript(queryTranscript, referenceTranscript, True):
              if queryTranscript.direction != referenceTranscript.direction:
                referenceTranscript.setDirection(queryTranscript.direction)
              queryTranscript.merge(referenceTranscript)
              nbMerges += 1
              transcripts[self.REFERENCE][iReference] = None
              if not self.multiple:
                mergedTranscripts.add(iQuery)

        # Remove transcripts from database: the bound is the one used to load them, since every
        # loaded transcript is kept in memory until it is written, and a row which stays in the
        # table would be loaded a second time by the next chunk
        for inputType in inputTypes:
          table   = self.transcriptBases[inputType][chromosome]
          command = "DELETE FROM %s WHERE start <= %d" % (table.name, rangeEnd + corrections[inputType])
          self.mySqlConnection.executeQuery(command)

        # Just in case, self-merge the elements in the query
        # (mergedTranscripts is not used here: it is only filled when self.multiple is not set)
        if self.multiple:
          for iQuery1, queryTranscript1 in enumerate(transcripts[self.QUERY]):
            if queryTranscript1 is None: continue
            for queryTranscript2 in transcripts[self.QUERY][iQuery1+1:]:
              if queryTranscript2 is None: continue
              if queryTranscript2.overlapWith(queryTranscript1) and (queryTranscript1.direction == queryTranscript2.direction or not self.colinear):
                if queryTranscript1.direction != queryTranscript2.direction:
                  queryTranscript1.setDirection(queryTranscript2.direction)
                queryTranscript2.merge(queryTranscript1)
                transcripts[self.QUERY][iQuery1] = None
                nbMerges += 1

        # Update the sets of transcripts and write into database (also update mergedTranscripts)
        newTranscripts       = {self.QUERY: [], self.REFERENCE: []}
        newMergedTranscripts = set()
        for inputType in inputTypes:
          keptEnd = rangeEnd - corrections[inputType]
          for i, transcript in enumerate(transcripts[inputType]):
            if transcript is None: continue
            if transcript.end < keptEnd:
              # The transcript ends before the chunk: it is written, or dropped
              if self.multiple or ((inputType == self.QUERY) and (i in mergedTranscripts)):
                self.storeTranscript(self.OUTPUT, transcript, False)
                nbPrinted += 1
            else:
              # The transcript is kept for the next chunk, where it gets a new index
              if inputType == self.QUERY and i in mergedTranscripts:
                newMergedTranscripts.add(len(newTranscripts[inputType]))
              newTranscripts[inputType].append(transcript)
        transcripts       = newTranscripts
        mergedTranscripts = newMergedTranscripts

        progress.inc()
      progress.done()

      # Write the transcripts which remain after the last chunk
      for inputType in inputTypes:
        for i, transcript in enumerate(transcripts[inputType]):
          if transcript is None: continue
          if self.multiple or ((inputType == self.QUERY) and (i in mergedTranscripts)):
            self.storeTranscript(self.OUTPUT, transcript, False)
            nbPrinted += 1

    # Manage chromosomes with no corresponding data
    if self.multiple:
      for inputType in self.INPUTTYPES:
        for chromosome in self.transcriptBases[inputType]:
          if chromosome in self.transcriptBases[1 - inputType]:
            continue
          self.transcriptBases[self.OUTPUT][chromosome] = MySqlTranscriptTable(self.tableNames[self.OUTPUT], chromosome, self.verbosity)
          self.transcriptBases[self.OUTPUT][chromosome].copy(self.transcriptBases[inputType][chromosome])
          nbPrinted += self.transcriptBases[self.OUTPUT][chromosome].getNbElements()

    self.flushTables()

    if self.verbosity > 0:
      nbQueries = self.nbTranscripts[self.QUERY]
      # An empty query list should not make the summary crash
      percentage = (nbPrinted / float(nbQueries) * 100) if nbQueries else 0.0
      print("query:    %d sequences, %d nucleotides" % (nbQueries, self.nbNucleotides[self.QUERY]))
      print("# merges: %d" % (nbMerges))
      print("# printed %d (%.2f%%)" % (nbPrinted, percentage))


  def compareTranscriptListSelfMerge(self):
    """
    Merge the query list of transcripts with itself
    The query list is stored into the database, and each chromosome is processed in two passes:
     - the transcripts which are stored in the same bin are clustered together (using
       C{compareTranscript}), and the clusters are put into a temporary table;
     - each cluster is merged with the transcripts given by C{compareTranscriptToList},
       which are removed from the reference tables, and the result is stored into the
       reference tables.
    The reference tables of the chromosome are then copied into the output tables.
    @return: nothing, the merged transcripts are stored into the output tables
    """
    nbMerges  = 0
    nbPrinted = 0

    self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], False)

    # Working table, which contains the clusters found in the first pass
    tmpTable = MySqlTranscriptTable(None, None, self.verbosity)
    tmpTable.createTranscriptTable()

    # Loop on the chromosomes
    for chromosome in self.transcriptBases[self.QUERY]:
      queryTable = self.transcriptBases[self.QUERY][chromosome]
      nbPass1    = 0

      # Get all the bins
      command = "SELECT DISTINCT bin from %s" % (queryTable.name)
      query   = self.mySqlConnection.executeQuery(command)
      allBins = [int(line[0]) for line in query.getLines()]

      # 1st pass: merge transcripts which are in the same bin
      progress = Progress(len(allBins), "1st pass on %s" % (chromosome), self.verbosity)
      for currentBin in allBins:
        # Get all transcripts from bin
        command = "SELECT * from %s WHERE bin = %d" % (queryTable.name, currentBin)
        query   = self.mySqlConnection.executeQuery(command)
        clusteredTranscripts = []
        for line in query.getIterator():
          # Move SQL transcript to Python transcript
          transcript = Transcript()
          transcript.setSqlValues(line)
          for exon in queryTable.exonsTable.retrieveExonsFromTranscriptId(transcript.id):
            transcript.addExon(exon)
          transcript = self.extendTranscript(self.QUERY, transcript)

          # Merge the transcript with already clustered transcripts: the clusters which match
          # are absorbed by the transcript, the other ones are kept as they are
          remainingTranscripts = []
          for clusteredTranscript in clusteredTranscripts:
            if self.compareTranscript(transcript, clusteredTranscript, True):
              if transcript.direction != clusteredTranscript.direction:
                transcript.setDirection(clusteredTranscript.direction)
              transcript.merge(clusteredTranscript)
              nbMerges += 1
            else:
              remainingTranscripts.append(clusteredTranscript)
          remainingTranscripts.append(transcript)
          clusteredTranscripts = remainingTranscripts

        # Put transcript into working table
        for clusteredTranscript in clusteredTranscripts:
          tmpTable.addTranscript(clusteredTranscript)
          nbPass1 += 1
        progress.inc()
      progress.done()

      # 2nd pass: merge transcripts in different bins
      progress = Progress(nbPass1, "2nd pass on %s" % (chromosome), self.verbosity)
      for transcript1 in tmpTable.getIterator():
        overlappings = self.compareTranscriptToList(transcript1)
        toBeRemoved  = []

        for transcript2 in overlappings:
          if transcript1.direction != transcript2.direction:
            transcript1.setDirection(transcript2.direction)
          transcript1.merge(transcript2)
          toBeRemoved.append(transcript2)
          nbMerges += 1

        # The merged transcript replaces the transcripts it has absorbed
        for transcript2 in toBeRemoved:
          self.unstoreTranscript(self.REFERENCE, transcript2)
        self.storeTranscript(self.REFERENCE, transcript1)
        progress.inc()
      progress.done()

      # copy into output tables
      progress = Progress(self.transcriptBases[self.REFERENCE][chromosome].getNbElements(), "Writing results of %s" % (chromosome), self.verbosity)
      for transcript in self.transcriptBases[self.REFERENCE][chromosome].getIterator():
        self.storeTranscript(self.OUTPUT, transcript)
        nbPrinted += 1
        progress.inc()
      progress.done()

      # The working table is emptied before the next chromosome
      tmpTable.clear()
    tmpTable.remove()

    if self.verbosity > 0:
      nbQueries = self.nbTranscripts[self.QUERY]
      # An empty query list should not make the summary crash
      percentage = (nbPrinted / float(nbQueries) * 100) if nbQueries else 0.0
      print("query:    %d sequences, %d nucleotides" % (nbQueries, self.nbNucleotides[self.QUERY]))
      print("# merges: %d" % (nbMerges))
      print("# printed %d (%.2f%%)" % (nbPrinted, percentage))

  
  def getOutputTranscripts(self):
    """
    Return overlap transcripts
    @return a list of transcripts in a container
    """
    if self.outputContainer == None:
      if self.tableNames[self.OUTPUT]:
        self.flushTables(self.OUTPUT)
        self.outputContainer = TranscriptContainer(self.tableNames[self.OUTPUT], "sql", self.verbosity)
    return self.outputContainer
  

  def getOddsPerTranscript(self):
    """
    Return overlap results
    @return a dict of data
    """
    if not self.odds:
      sys.exit("Did not compute odds!")
    return self.overlapResults


  def getOdds(self):
    """
    Return odds about the overlap
    @return a dict of data
    """
    if not self.odds:
      sys.exit("Did not compute odds!")
    if self.oddResults != None:
      return self.oddResults
    self.oddResults = {}
    for name in self.overlapResults:
      if self.overlapResults[name] not in self.oddResults:
        self.oddResults[self.overlapResults[name]] = 1
      else:
        self.oddResults[self.overlapResults[name]] += 1
    return self.oddResults
