#! /usr/bin/env python
"""Remove adaptors"""

import os
from optparse import OptionParser
from structure.sequence import *
from structure.sequenceList import *
from parsing.fastaParser import *
from misc.progress import *


def distance(string1, string2):
  """Count the positions at which two equal-length strings differ.

  Returns the number of mismatching positions, or None when the two
  strings do not have the same length.
  """
  if len(string1) != len(string2):
    return None
  return sum(1 for left, right in zip(string1, string2) if left != right)



if __name__ == "__main__":
  # Command-line entry point.
  # Reads sequences from the input file, keeps those whose 5' end matches the
  # 5' adaptor and whose 3' end matches (a prefix of) the 3' adaptor within the
  # allowed number of mismatches, strips both adaptors and appends the result
  # to "<output>.fas". Rejected reads are optionally reported in "<output>.log".
  nbRemaining = 0

  # parse command line
  description = "Adaptor Stripper: Remove the adaptor of a list of reads. [Category: Personnal]"

  parser = OptionParser(description = description)
  parser.add_option("-i", "--input",         dest="inputFileName",      action="store",                                            type="string", help="input file [compulsory] [format: file in FASTA format]")
  parser.add_option("-o", "--output",        dest="outputFileName",     action="store",                                            type="string", help="output file [compulsory] [format: output file in FASTA format]")
  parser.add_option("-5", "--5primeAdaptor", dest="fivePrimeAdaptor",   action="store",      default="CTACTAGACCTTGGCTGTCACTCAAA", type="string", help="five prime adaptor [format: string]")
  parser.add_option("-3", "--3primeAdaptor", dest="threePrimeAdaptor",  action="store",      default="TCGCAGTGAGTGACAGGCTAGTAG",   type="string", help="three prime adaptor [format: string]")
  parser.add_option("-d", "--5primeDist",    dest="fivePrimeDistance",  action="store",      default=3,                            type="int",    help="five prime distance [format: int] [default: 3]")
  parser.add_option("-e", "--3primeDist",    dest="threePrimeDistance", action="store",      default=3,                            type="int",    help="three prime distance [format: int] [default: 3]")
  parser.add_option("-m", "--3primeSize",    dest="threePrimeSize",     action="store",      default=10,                           type="int",    help="three prime size [format: int] [default: 10]")
  parser.add_option("-v", "--verbosity",     dest="verbosity",          action="store",      default=1,                            type="int",    help="trace level [format: int] [default: 1]")
  parser.add_option("-l", "--log",           dest="log",                action="store_true", default=False,                                       help="write a log file [format: bool] [default: false]")
  (options, args) = parser.parse_args()

  # both file names are compulsory: fail with a usage message instead of a
  # TypeError when building the derived file names below
  if not options.inputFileName:
    parser.error("an input file is required (option -i)")
  if not options.outputFileName:
    parser.error("an output file is required (option -o)")

  fastaFileName        = options.outputFileName + ".fas"
  fivePrimeLength      = len(options.fivePrimeAdaptor)
  threePrimeLength     = len(options.threePrimeAdaptor)
  # a negative minimal overlap makes no sense: treat it as zero
  minThreePrimeOverlap = max(0, options.threePrimeSize)

  logHandle    = None
  outputHandle = None
  try:
    if options.log:
      logHandle = open(options.outputFileName + ".log", "w")

    # remove possible existing output file
    # (the output is only created once a first read is kept)
    if os.path.exists(fastaFileName):
      os.unlink(fastaFileName)

    # check size
    sequenceParser = SequenceListParser(options.inputFileName, options.verbosity)
    nbSequences    = sequenceParser.getNbSequences()
    if options.verbosity > 0:
      print("%i lines parsed" % nbSequences)
      progress = Progress(nbSequences, "Analyzing " + options.inputFileName, options.verbosity)

    # treat sequences
    while sequenceParser.getNextSequence():
      sequence   = sequenceParser.getCurrentSequence()
      readLength = len(sequence.sequence)
      fivePrimeAdaptor  = sequence.sequence[0:fivePrimeLength]
      # clamp the start at 0: a negative start would silently select a
      # different (shorter) tail when the read is shorter than the adaptor
      threePrimeAdaptor = sequence.sequence[max(0, readLength - threePrimeLength):]

      # check 5' adaptor (None means the read is shorter than the adaptor)
      fivePrimeDistance = distance(fivePrimeAdaptor, options.fivePrimeAdaptor)

      # check 3' adaptor: compare the last i letters of the read with the
      # first i letters of the adaptor, for every allowed overlap size i, and
      # keep the best score; start from the worst case so that a read too
      # short to be compared at all is not accepted by default
      threePrimeDistance = threePrimeLength
      tailLength         = len(threePrimeAdaptor)
      for i in range(minThreePrimeOverlap, tailLength + 1):
        # explicit start index: "[-i:]" would return the whole tail for i == 0
        candidate = distance(threePrimeAdaptor[tailLength - i:], options.threePrimeAdaptor[:i])
        if candidate is not None:
          threePrimeDistance = min(threePrimeDistance, candidate)

      # sort candidates
      if fivePrimeDistance is None or fivePrimeDistance > options.fivePrimeDistance:
        if logHandle is not None:
          logHandle.write("Sequence %s does not start with the right adaptor (%s != %s)\n" % (sequence.sequence, fivePrimeAdaptor, options.fivePrimeAdaptor))
      elif threePrimeDistance > options.threePrimeDistance:
        if logHandle is not None:
          logHandle.write("Sequence %s does not end with the right adaptor (%s != %s)\n" % (sequence.sequence, threePrimeAdaptor, options.threePrimeAdaptor))
      else:
        nbRemaining += 1
        # NOTE(review): the full 3' adaptor length is trimmed even when only a
        # shorter prefix of the adaptor matched -- confirm this is intended.
        # The end is clamped so that overlapping adaptors give an empty insert
        # rather than an arbitrary slice.
        insertEnd         = max(fivePrimeLength, readLength - threePrimeLength)
        sequence.sequence = sequence.sequence[fivePrimeLength:insertEnd]
        # open the output once instead of re-opening it for every kept read
        if outputHandle is None:
          outputHandle = open(fastaFileName, "w")
        outputHandle.write(sequence.printFasta())

      if options.verbosity > 0:
        progress.inc()

    if options.verbosity > 0:
      progress.done()
  finally:
    # release the files even if parsing or writing failed
    if outputHandle is not None:
      outputHandle.close()
    if logHandle is not None:
      logHandle.close()

  # avoid a division by zero on an empty input
  if nbSequences:
    keptPercentage = float(nbRemaining) / nbSequences * 100
  else:
    keptPercentage = 0.0
  print("kept %i over %i (%f%%)" % (nbRemaining, nbSequences, keptPercentage))

