#! /usr/bin/env python
"""
Clean a GFF file (as given by NCBI or TAIR) and output a GFF3 file.
"""

import os
import re
from optparse import OptionParser
from parsing.gffParser import *
from misc.rPlotter import *
from misc.progress import *


if __name__ == "__main__":
  # Script entry point.
  #
  # Reads the GFF file given with --input line by line, drops or rewrites some
  # feature lines (see the type lists below), normalises the 9th column into
  # "key=value" pairs separated by ";" and writes the result to
  # "<--output>.gff3".

  # sys.stdout / sys.exit are used below; import the module explicitly rather
  # than relying on one of the star-imports at the top of the file to leak it
  import sys

  # parse command line
  description = "Clean GFF: Clean a GFF file (as given by NCBI) and outputs a GFF3 file. [Category: Other]"

  parser = OptionParser(description = description)
  parser.add_option("-i", "--input",        dest="inputFileName",     action="store",                        type="string", help="input file name [compulsory] [format: file in GFF format]")
  parser.add_option("-o", "--output",       dest="outputFileName",    action="store",                        type="string", help="output file [compulsory] [format: output file in GFF3 format]")
  parser.add_option("-v", "--verbosity",    dest="verbosity",         action="store",      default=1,        type="int",    help="trace level [format: int]")
  (options, args) = parser.parse_args()

  # both file names are compulsory: fail with a usage message instead of
  # creating a stray "None.gff3" and crashing on open(None)
  if options.inputFileName is None or options.outputFileName is None:
    parser.error("options --input and --output are compulsory")

  # lines with these types are dropped from the output
  typesToBeRemoved = ("chromosome", "similarity", "start_codon", "stop_codon", "CDS", "five_prime_UTR", "three_prime_UTR")
  # lines with these types are dropped too, but their ID -> Parent link is
  # remembered so that their children can be re-attached to the grand-parent
  typesToBeSkipped = ("gene", "protein", "pseudogene", "transposable_element_gene")
  # these types will be changed to other type names (old name -> new name)
  typesToBeRenamed = {}

  cpt                  = 0      # number of input lines read so far (blank lines included)
  previousTranscriptId = None   # ID of the last "transcript" line written
  ids                  = set()  # IDs of all the lines written so far
  translation          = {}     # ID of a skipped feature -> ID of its own parent
  chromosomeNames      = {}     # original sequence name -> "chr..." name

  # context managers guarantee both files are closed, even when sys.exit is
  # called in the middle of the loop (nested form keeps Python 2.6 compatibility)
  with open(options.inputFileName) as inputFile:
    with open("%s.gff3" % (options.outputFileName), "w") as outputFile:

      for line in inputFile:

        # print count
        if cpt % 100 == 0 and options.verbosity >= 10:
          sys.stdout.write("%d line(s) read\r" % (cpt))
          sys.stdout.flush()

        cpt += 1
        line = line.strip()
        # skip blank lines, and comment / directive lines (e.g. "##gff-version 3"),
        # which do not have 9 fields and would otherwise abort the run
        if line == "" or line.startswith("#"): continue

        # split line into fields (fall back to any whitespace if there is no tab)
        splittedLine = line.split("\t")
        if len(splittedLine) == 1:
          splittedLine = line.split()

        # check number of fields; extra fields are merged back into the 9th one
        if len(splittedLine) < 9:
          sys.exit("\nLines should have exactly 9 fields. Line #%d has not:\n%s\n" % (cpt, line))
        if len(splittedLine) > 9:
          splittedLine[8] = " ".join(splittedLine[8:])
          del splittedLine[9:]

        # read options: each one is either 'key=value' or 'key value'
        parsedOptions = {}
        for option in splittedLine[8].split(";"):
          option = option.strip()
          if option == "": continue
          posSpace = option.find(" ")
          posEqual = option.find("=")
          if posEqual != -1 and (posEqual < posSpace or posSpace == -1):
            # split on the first "=" only, so that a "=" inside the value is kept
            key, value = option.split("=", 1)
          else:
            parts = option.split()
            key   = parts[0]
            value = " ".join(parts[1:])
          parsedOptions[key.strip()] = value.strip(" \"")

        # give default id, based on the line number
        if "ID" not in parsedOptions:
          parsedOptions["ID"] = "smart%d" % (cpt)

        # read type
        if splittedLine[2] in typesToBeRemoved:
          continue
        if splittedLine[2] in typesToBeSkipped:
          if "Parent" in parsedOptions:
            translation[parsedOptions["ID"]] = parsedOptions["Parent"]
          continue
        if splittedLine[2] in typesToBeRenamed:
          splittedLine[2] = typesToBeRenamed[splittedLine[2]]

        # possibly add the type with the 3rd field
        if splittedLine[2] not in ("transcript", "exon") and "Type" not in parsedOptions:
          parsedOptions["Type"] = splittedLine[2]

        # possibly update the chromosome name: a "source" line gives the
        # chromosome name of its sequence, and is not written itself
        if splittedLine[2] == "source" and "chromosome" in parsedOptions:
          chromosome = parsedOptions["chromosome"]
          if not chromosome.lower().startswith("chr"):
            chromosome = "chr%s" % (chromosome)
          chromosomeNames[splittedLine[0]] = chromosome
          continue
        if splittedLine[0] in chromosomeNames:
          splittedLine[0] = chromosomeNames[splittedLine[0]]

        # possibly change the parent ID to the grand-parent ID, or remove it
        # when the parent has not been written to the output
        if "Parent" in parsedOptions:
          if parsedOptions["Parent"] in translation:
            parsedOptions["Parent"] = translation[parsedOptions["Parent"]]
          elif parsedOptions["Parent"] not in ids:
            del parsedOptions["Parent"]

        # try to find parent id when none is provided: an orphan exon is
        # attached to the most recent transcript
        if splittedLine[2] == "exon" and "Parent" not in parsedOptions:
          if previousTranscriptId is None:
            sys.exit("\nCannot find the parent id of line #%d\n%s" % (cpt, line))
          parsedOptions["Parent"] = previousTranscriptId
        elif splittedLine[2] == "transcript":
          previousTranscriptId = parsedOptions["ID"]

        # print line
        ids.add(parsedOptions["ID"])
        splittedLine[8] = ";".join(["%s=%s" % (key, value) for key, value in parsedOptions.items()])
        outputFile.write("%s\n" % ("\t".join(splittedLine)))


  # single-argument parenthesised form prints the same text on Python 2 and 3
  print("%d lines read.    " % (cpt))

