#! /usr/bin/env python
"""Trim the sequences from a 5' adaptor"""

import os
from optparse import OptionParser
from parsing.fastaParser import *
from parsing.fastqParser import *
from writer.fastaWriter import *
from writer.fastqWriter import *
from misc.progress import *
from misc import utils


if __name__ == "__main__":
  # Script entry point: read sequences from a fasta/fastq file, look for the
  # adaptor in each read, shrink the read where the adaptor is found and write
  # every read to the output file. Reads in which no adaptor was found are
  # additionally written to the optional --noAdaptor file.

  # sys.exit is used below; import it explicitly instead of relying on one of
  # the wildcard imports above happening to expose the name.
  import sys

  # parse command line
  description = "Trim Sequences: Remove the adaptor of a list of reads. [Category: Sequences]"

  optionParser = OptionParser(description = description)
  optionParser.add_option("-i", "--input",           dest="inputFileName",  action="store",                     type="string", help="input file [compulsory] [format: file in sequence format given by -f]")
  optionParser.add_option("-f", "--format",          dest="format",         action="store",                     type="string", help="format of file [compulsory] [format: sequence file format]")
  optionParser.add_option("-o", "--output",          dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in sequence format given by -f]")
  optionParser.add_option("-a", "--adaptor",         dest="adaptor",        action="store",                     type="string", help="adaptor [compulsory] [format: string]")
  optionParser.add_option("-e", "--errors",          dest="errors",         action="store",      default=0,     type="int" ,   help="number of errors in percent [format: int] [default: 0]")
  optionParser.add_option("-n", "--noAdaptor",       dest="noAdaptor",      action="store",      default=None,  type="string", help="file name where to print sequences with no adaptor [format: output file in sequence format given by -f]")
  optionParser.add_option("-v", "--verbosity",       dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int]")
  (options, args) = optionParser.parse_args()

  # Every option flagged [compulsory] in its help text must be supplied;
  # fail early with a usage message instead of an obscure error further down
  # (e.g. len(None) when the adaptor is missing).
  compulsoryOptions = (
    ("--input",   options.inputFileName),
    ("--format",  options.format),
    ("--output",  options.outputFileName),
    ("--adaptor", options.adaptor),
  )
  for flag, value in compulsoryOptions:
    if not value:
      optionParser.error("option %s is compulsory" % (flag))

  # The loop below stops minSize positions before the end of the read, so the
  # shortest read suffix compared with the adaptor has minSize + 1 nucleotides.
  minSize = 2

  # sequence parser class, writer class and output file extension per format
  formatHandlers = {
    "fasta": (FastaParser, FastaWriter, "mfa"),
    "fastq": (FastqParser, FastqWriter, "mfq"),
  }
  if options.format not in formatHandlers:
    sys.exit("Cannot handle files with '%s' format." % (options.format))
  parserClass, writerClass, extension = formatHandlers[options.format]

  sequenceParser  = parserClass(options.inputFileName, options.verbosity)
  writer          = writerClass("%s.%s" % (options.outputFileName, extension), options.verbosity)
  writerNoAdaptor = None
  if options.noAdaptor is not None:
    writerNoAdaptor = writerClass("%s.%s" % (options.noAdaptor, extension), options.verbosity)

  # loop invariants
  adaptor     = options.adaptor
  adaptorSize = len(adaptor)

  nbFound = 0

  progress = Progress(sequenceParser.getNbSequences(), "Reading %s" % (options.inputFileName), options.verbosity)
  for sequence in sequenceParser.getIterator():
    progress.inc()
    nucleotides = sequence.sequence
    found       = False
    for i in range(len(nucleotides) - minSize):
      # Compare the adaptor with the read starting at position i. When the
      # read ends before the adaptor does, only the overlapping prefix of the
      # adaptor is compared.
      overlap         = min(adaptorSize, len(nucleotides) - i)
      adaptorPart     = adaptor[:overlap]
      nucleotidesPart = nucleotides[i:i + overlap]
      # The number of tolerated mismatches is a percentage of the compared
      # length, rounded down.
      if utils.getHammingDistance(adaptorPart, nucleotidesPart) <= int(options.errors / 100.0 * overlap):
        nbFound += 1
        # presumably keeps only the i nucleotides preceding the adaptor --
        # confirm against the sequence class
        sequence.shrinkToFirstNucleotides(i)
        writer.addSequence(sequence)
        found = True
        break
    if not found:
      # untrimmed reads still go to the main output, and also to the
      # "no adaptor" file when one was requested
      writer.addSequence(sequence)
      if writerNoAdaptor is not None:
        writerNoAdaptor.addSequence(sequence)
  progress.done()

  # Guard against an empty input file, which would otherwise raise
  # ZeroDivisionError when computing the percentage.
  nbSequences = sequenceParser.getNbSequences()
  percentage  = float(nbFound) / nbSequences * 100 if nbSequences else 0.0
  # Single-argument print(...) behaves the same as the print statement on
  # Python 2 and is also valid syntax on Python 3.
  print("%d sequences with adaptors on %d (%.2f%%)" % (nbFound, nbSequences, percentage))

