import re
import sys
from structure.interval import *
from parsing.transcriptListParser import *


class BedParser(TranscriptListParser):
  """A class that parses a BED file and creates a transcript list.

  An optional leading "track name=..." line is consumed by skipFirstLines(),
  which stores the track name in ``self.title``.  parseLine() converts one
  12-column BED line into a Transcript holding one Interval per BED block.
  """

  # Header line of the form "track name=<title> ...".  Only the name is read.
  # No whitespace is required after the name: the line is stripped before
  # matching, so a header made of the name alone ends right after it.
  _TRACK_PATTERN = re.compile(r"^\s*track\s+name\s*=\s*(\S+)")

  # The twelve tab-separated BED columns.  Captured, in order: chrom,
  # chromStart, chromEnd, name, strand, blockCount, blockSizes, blockStarts.
  # score, thickStart, thickEnd and itemRgb are matched but not kept; itemRgb
  # may be "0" or an "R,G,B" triplet.
  _LINE_PATTERN = re.compile(r"^\s*(\S+)\t+(\d+)\t+(\d+)\t+([^\t]+)\t+\d+\t+([+-])\t+\d+\t+\d+\t+[\d,]+\t+(\d+)\t+(\S+)\t+(\S+)\s*$")

  # Name of the form "<name>(<tags>)": the parenthesised part holds tags.
  _NAME_TAGS_PATTERN = re.compile(r"^([^\(]*)\((\S+)\)$")

  # Name of the form "<name>-<number>": the number is an occurrence index.
  _NAME_OCCURRENCE_PATTERN = re.compile(r"(.*)-(\d+)$")


  def __init__(self, fileName, verbosity = 0):
    """Create a parser for the BED file ``fileName``.

    :param fileName: name of the BED file, passed on to the parent class
    :param verbosity: verbosity level, passed on to the parent class
    """
    # Set before calling the parent constructor -- presumably because the
    # parent may already call skipFirstLines(); confirm against the parent.
    self.title = None
    super(BedParser, self).__init__(fileName, verbosity)


  def __del__(self):
    """Delegate clean-up to the parent class."""
    super(BedParser, self).__del__()


  @staticmethod
  def getFileFormats():
    """Return the list of format names handled by this parser."""
    return ["bed"]


  def skipFirstLines(self):
    """Consume the optional "track name=..." header line.

    Only the first line available from the handle is examined.  If it is a
    track line, its name is stored in ``self.title`` and the line counter is
    incremented.  Otherwise the handle is moved back to where it was, so the
    line can be read again.
    """
    mark = self.handle.tell()
    firstLine = next(iter(self.handle), None)
    if firstLine is None:
      # Nothing left to read, hence nothing to skip.
      return
    m = self._TRACK_PATTERN.search(firstLine.strip())
    if m is not None:
      self.title = m.group(1)
      self.currentLineNb += 1
    else:
      self.handle.seek(mark)


  def _parseBlocks(self, line, nbExons, sizeField, startField):
    """Return the first nbExons (relative start, size) pairs of a BED line.

    Exits the program with a diagnostic when no block is announced, or when
    the comma-separated lists are too short or hold non-integer values.
    """
    if nbExons < 1:
      sys.exit("\nLine %d '%s' should contain at least one block." % (self.currentLineNb, line.strip()))
    sizes  = sizeField.split(",")
    starts = startField.split(",")
    # A trailing comma yields an extra empty item; it is harmless because only
    # the first nbExons items are read.
    try:
      return [(int(starts[i]), int(sizes[i])) for i in range(nbExons)]
    except (IndexError, ValueError):
      sys.exit("\nLine %d '%s': cannot read %d block(s) from sizes '%s' and starts '%s'." % (self.currentLineNb, line.strip(), nbExons, sizeField, startField))


  def parseLine(self, line):
    """Convert one 12-column BED line into a Transcript.

    :param line: a line holding the twelve tab-separated BED columns
    :return: the Transcript built from the line, with one exon per block

    The program exits (sys.exit) when the line does not match the expected
    layout, when its block columns cannot be read, or when the first / last
    block does not coincide with the start / end of the transcript.
    """
    m = self._LINE_PATTERN.search(line)
    if m is None:
      sys.exit("\nLine %d '%s' does not has a BED format." % (self.currentLineNb, line))

    firstPosition = int(m.group(2))
    # The end column of a BED line is exclusive, hence the -1.
    lastPosition  = int(m.group(3)) - 1
    nbExons       = int(m.group(6))
    blocks        = self._parseBlocks(line, nbExons, m.group(7), m.group(8))

    transcript = Transcript()
    transcript.setChromosome(m.group(1))
    transcript.setStart(min(firstPosition, lastPosition))
    transcript.setEnd(max(firstPosition, lastPosition))
    transcript.setName(m.group(4))
    transcript.setDirection(m.group(5))

    # check for comment in name
    m = self._NAME_TAGS_PATTERN.search(transcript.name)
    if m is not None:
      transcript.setName(m.group(1))
      transcript.setTagValues(m.group(2), ";", "=")

    # check for nb occurrences in name
    m = self._NAME_OCCURRENCE_PATTERN.search(transcript.name)
    if m is not None:
      transcript.setName(m.group(1))
      transcript.setOccurrence(int(m.group(2)))

    # Block starts are relative to the start of the transcript.
    for relativeStart, size in blocks:
      exon = Interval(transcript)
      exon.setStart(relativeStart + transcript.start)
      exon.setEnd(transcript.start + relativeStart + size - 1)
      exon.setSize(size)
      transcript.addExon(exon)

    # The blocks must span the transcript exactly.
    if transcript.exons[0].start != transcript.start:
      sys.exit("There is something wrong with the start of transcript line '%s': transcript starts at %d whereas first exon starts at %d" % (line.strip(), transcript.start, transcript.exons[0].start))
    if transcript.exons[-1].end != transcript.end:
      sys.exit("There is something wrong with the end of transcript line '%s': transcript ends at %d whereas last exon ends at %d" % (line.strip(), transcript.end, transcript.exons[-1].end))

    return transcript

