import re
import sys
from parser.mapperParser import *


class SamParser(MapperParser):
  """A class that parses SAM format (as given by BWA).

  parseLine() turns one alignment line into a Mapping; every other
  method simply delegates to MapperParser.
  """

  # Minimal number of tab-separated columns of a SAM alignment line.
  NB_MANDATORY_FIELDS = 11
  # Bits of the SAM FLAG column that parseLine() looks at.
  FLAG_UNMAPPED = 0x4
  FLAG_REVERSE  = 0x10

  def __init__(self, fileName, verbosity = 0):
    """Forward the file name and the verbosity level to MapperParser."""
    super(SamParser, self).__init__(fileName, verbosity)


  def __del__(self):
    """Delegate the clean-up to MapperParser."""
    super(SamParser, self).__del__()


  def skipFirstLines(self):
    """Do nothing: header lines ("@...") are skipped by parseLine()."""
    pass


  def getInfos(self):
    """Delegate to MapperParser.getInfos()."""
    super(SamParser, self).getInfos()


  def parseLine(self, line):
    """Convert one line of a SAM file into a Mapping.

    Args:
      line: a raw line of the SAM file (surrounding blanks are stripped).

    Returns:
      A Mapping describing the alignment, or None when the line is
      blank, is a header line (starts with "@"), or describes a read
      whose FLAG has the 0x4 (unmapped) bit set.

    Exits through sys.exit() when the line has fewer than 11 fields,
    when the CIGAR string contains an unknown character, or when the
    CIGAR string contains no M or S operation.
    """
    line = line.strip()
    # A blank line has no first character to test: skip it the same way
    # as a header line instead of failing with an IndexError.
    if not line or line[0] == "@":
      return None

    fields = line.split("\t")
    if len(fields) < self.NB_MANDATORY_FIELDS:
      sys.exit("Line '%s' does not look like a SAM line (number of fields is %d instead of 11)" % (line, len(fields)))

    name = fields[0]
    flag = int(fields[1])

    if flag & self.FLAG_UNMAPPED:
      return None

    direction   = -1 if flag & self.FLAG_REVERSE else 1
    chromosome  = fields[2]
    genomeStart = int(fields[3])
    cigar       = fields[5]
    sequence    = fields[9]
    tags        = fields[11:]

    mapping = Mapping()
    # Column 5 (index 4) is the mapping quality.
    mapping.setTagValue("quality", int(fields[4]))

    mapping.queryInterval.setName(name)
    mapping.queryInterval.setDirection(direction)

    mapping.targetInterval.setChromosome(chromosome)
    mapping.targetInterval.setStart(genomeStart)
    mapping.targetInterval.setDirection(direction)

    mapping.setSize(len(sequence))
    mapping.setDirection(direction)

    # Optional fields look like "X0:i:3": a two-letter key, then the
    # value starting at the sixth character.
    nbOccurrences = 1
    nbMismatches  = 0
    for tag in tags:
      # Use a dedicated variable: reusing `name` here would replace the
      # read name that the sub-mappings below still need.
      tagName = tag[:2]
      if tagName == "X0":
        nbOccurrences = int(tag[5:])
      elif tagName == "X1":
        nbOccurrences += int(tag[5:])
      elif tagName == "XM":
        nbMismatches = int(tag[5:])
    mapping.setTagValue("nbOccurrences", nbOccurrences)

    # Walk through the CIGAR string one character at a time: digits
    # build up the length of the operation named by the next letter.
    nbGaps        = 0
    subMapping    = None
    queryOffset   = 0
    targetOffset  = 0
    currentNumber = 0
    readStart     = None

    for char in cigar:
      if char in "0123456789":
        currentNumber = currentNumber * 10 + int(char)
        continue

      if char == "M":
        if readStart is None:
          readStart = targetOffset

        # NOTE(review): the sub-mappings built here are never attached
        # to `mapping` in this method, and the last one never receives
        # an end -- confirm whether they should be added to `mapping`.
        subMapping = SubMapping()

        subMapping.setSize(currentNumber)
        subMapping.setDirection(direction)

        subMapping.queryInterval.setName(name)
        subMapping.queryInterval.setStart(readStart + queryOffset)
        subMapping.queryInterval.setDirection(direction)

        subMapping.targetInterval.setChromosome(chromosome)
        subMapping.targetInterval.setStart(genomeStart + targetOffset)
        subMapping.targetInterval.setDirection(1)

        targetOffset += currentNumber
        queryOffset  += currentNumber
      elif char == "I":
        # Insertion in the read: close the current sub-mapping and move
        # forward on the read only.
        if subMapping is not None:
          subMapping.queryInterval.setEnd(readStart + queryOffset)
          subMapping.targetInterval.setEnd(genomeStart + targetOffset)
        nbGaps      += 1
        queryOffset += currentNumber
      elif char in "DNHP":
        # Close the current sub-mapping and move forward on the target
        # only.
        # NOTE(review): the SAM specification states that H and P
        # consume neither the read nor the reference; they are counted
        # as reference gaps here -- confirm this is intended.
        if subMapping is not None:
          subMapping.queryInterval.setEnd(readStart + queryOffset)
          subMapping.targetInterval.setEnd(genomeStart + targetOffset)
        nbGaps       += 1
        targetOffset += currentNumber
      elif char == "S":
        # Soft-clipped bases are counted as mismatches.
        # NOTE(review): the SAM specification states that S does not
        # consume the reference, yet the target offset is advanced
        # here -- confirm this is intended.
        if readStart is None:
          readStart = targetOffset

        nbMismatches += currentNumber
        targetOffset += currentNumber
        queryOffset  += currentNumber
      else:
        sys.exit("Do not understand paramer '%s' in line %s" % (char, line))
      currentNumber = 0

    # Without any M or S operation there is no read start to report;
    # stop with an explicit message instead of a TypeError below.
    if readStart is None:
      sys.exit("CIGAR string '%s' contains no M or S operation in line %s" % (cigar, line))

    mapping.queryInterval.setStart(readStart)
    mapping.queryInterval.setEnd(readStart + queryOffset - 1)
    mapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
    mapping.setNbMismatches(nbMismatches)
    mapping.setNbGaps(nbGaps)

    return mapping



