view commons/core/seq/FastaUtils.py @ 59:2a4884ba3e5c

Uploaded
author m-zytnicki
date Mon, 10 Feb 2014 03:39:09 -0500
parents 44d5973c188c
children
line wrap: on
line source

# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can  use, 
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info". 
#
# As a counterpart to the access to the source code and  rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability. 
#
# In this respect, the user's attention is drawn to the risks associated
# with loading,  using,  modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also
# therefore means  that it is reserved for developers  and  experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or 
# data to be ensured and,  more generally, to use and operate it in the 
# same conditions as regards security. 
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


import os
import sys
import string
import math
import shutil
import re
import glob
from operator import itemgetter
from commons.core.seq.BioseqDB import BioseqDB
from commons.core.seq.Bioseq import Bioseq
from commons.core.coord.MapUtils import MapUtils
from commons.core.coord.Range import Range
from commons.core.checker.CheckerUtils import CheckerUtils
from commons.core.launcher.LauncherUtils import LauncherUtils
from commons.core.coord.ConvCoord import ConvCoord
from commons.core.parsing.FastaParser import FastaParser


## Static methods for fasta file manipulation
#
class FastaUtils( object ):
    
    ## Count the number of sequences in the input fasta file
    #
    # @param inFile name of the input fasta file
    #
    # @return integer number of sequences in the input fasta file
    #
    @staticmethod
    def dbSize( inFile ):
        nbSeq = 0
        inFileHandler = open( inFile, "r" )
        line = inFileHandler.readline()
        while line:
            if line[0] == ">":
                nbSeq = nbSeq + 1
            line = inFileHandler.readline()
        inFileHandler.close()
        
        return nbSeq
    
    
    ## Compute the cumulative sequence length in the input fasta file
    #
    # @param inFile handler of the input fasta file
    #
    @staticmethod
    def dbCumLength( inFile ):
        cumLength = 0
        line = inFile.readline()
        while line:
            if line[0] != ">":
                cumLength += len(string.rstrip(line))
            line = inFile.readline()
    
        return cumLength
    
    
    ## Return a list with the length of each sequence in the input fasta file
    #
    # @param inFile string name of the input fasta file
    #
    @staticmethod
    def dbLengths( inFile ):
        lLengths = []
        inFileHandler = open( inFile, "r" )
        currentLength = 0
        line = inFileHandler.readline()
        while line:
            if line[0] == ">":
                if currentLength != 0:
                    lLengths.append( currentLength )
                currentLength = 0
            else:
                currentLength += len(line[:-1])
            line = inFileHandler.readline()
        lLengths.append( currentLength )
        inFileHandler.close()
        return lLengths
    
    
    ## Retrieve the sequence headers present in the input fasta file
    #
    # @param inFile string name of the input fasta file
    # @param verbose integer level of verbosity
    #
    # @return list of sequence headers
    #
    @staticmethod
    def dbHeaders( inFile, verbose=0 ):
        lHeaders = []
        
        inFileHandler = open( inFile, "r" )
        line = inFileHandler.readline()
        while line:
            if line[0] == ">":
                lHeaders.append( string.rstrip(line[1:]) )
                if verbose > 0:
                    print string.rstrip(line[1:])
            line = inFileHandler.readline()
        inFileHandler.close()
        
        return lHeaders
    
    
    ## Cut a data bank into chunks according to the input parameters
    # If a sequence is shorter than the threshold, it is only renamed (not cut)
    #
    # @param inFileName string name of the input fasta file
    # @param chkLgth string chunk length (in bp, default=200000)
    # @param chkOver string chunk overlap (in bp, default=10000)
    # @param wordN string N stretch word length (default=11, 0 for no detection)
    # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
    # @param clean boolean remove 'cut' and 'Nstretch' files
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def dbChunks( inFileName, chkLgth="200000", chkOver="10000", wordN="11", outFilePrefix="", clean=False, verbose=0 ):
        nbSeq = FastaUtils.dbSize( inFileName )
        if verbose > 0:
            print "cut the %i input sequences with cutterDB..." % ( nbSeq )
            sys.stdout.flush()
            
        prg = "cutterDB"
        cmd = prg
        cmd += " -l %s" % ( chkLgth )
        cmd += " -o %s"  %( chkOver )
        cmd += " -w %s" % ( wordN )
        cmd += " %s" % ( inFileName )
        returnStatus = os.system( cmd )
        if returnStatus != 0:
            msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus )
            sys.stderr.write( "%s\n" % ( msg ) )
            sys.exit(1)
            
        nbChunks = FastaUtils.dbSize( "%s_cut" % ( inFileName ) )
        if verbose > 0:
            print "done (%i chunks)" % ( nbChunks )
            sys.stdout.flush()
            
        if verbose > 0:
            print "rename the headers..."
            sys.stdout.flush()
            
        if outFilePrefix == "":
            outFastaName = inFileName + "_chunks.fa"
            outMapName = inFileName + "_chunks.map"
        else:
            outFastaName = outFilePrefix + ".fa"
            outMapName = outFilePrefix + ".map"
            
        inFile = open( "%s_cut" % ( inFileName ), "r" )
        line = inFile.readline()
        
        outFasta = open( outFastaName, "w" )
        outMap = open( outMapName, "w" )
        
        # read line after line (no need for big RAM) and change the sequence headers
        while line:
            
            if line[0] == ">":
                if verbose > 1:
                    print "rename '%s'" % ( line[:-1] ); sys.stdout.flush()
                data = line[:-1].split(" ")
                seqID = data[0].split(">")[1]
                newHeader = "chunk%s" % ( str(seqID).zfill( len(str(nbChunks)) ) )
                oldHeader = data[2]
                seqStart = data[4].split("..")[0]
                seqEnd = data[4].split("..")[1]
                outMap.write( "%s\t%s\t%s\t%s\n" % ( newHeader, oldHeader, seqStart, seqEnd ) )
                outFasta.write( ">%s\n" % ( newHeader ) )
                
            else:
                outFasta.write( line.upper() )
                
            line = inFile.readline()
            
        inFile.close()
        outFasta.close()
        outMap.close()
        
        if clean == True:
            os.remove(inFileName + "_cut")
            os.remove(inFileName + ".Nstretch.map")
            
    
    ## Split the input fasta file in several output files
    #
    # @param inFile string name of the input fasta file
    # @param nbSeqPerBatch integer number of sequences per output file
    # @param newDir boolean put the sequences in a new directory called 'batches'
    # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1')
    # @param prefix prefix in output file name 
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbSplit( inFile, nbSeqPerBatch, newDir, useSeqHeader=False, prefix="batch", verbose=0 ):
        if not os.path.exists( inFile ):
            msg = "ERROR: file '%s' doesn't exist" % ( inFile )
            sys.stderr.write( "%s\n" % ( msg ) )
            sys.exit(1)
            
        nbSeq = FastaUtils.dbSize( inFile )
        
        nbBatches = int( math.ceil( nbSeq / float(nbSeqPerBatch) ) )
        if verbose > 0:
            print "save the %i input sequences into %i batches" % ( nbSeq, nbBatches )
            sys.stdout.flush()
            
        if nbSeqPerBatch > 1 and useSeqHeader:
            useSeqHeader = False
            
        if newDir == True:
            if os.path.exists( "batches" ):
                shutil.rmtree( "batches" )
            os.mkdir( "batches" )
            os.chdir( "batches" )
            os.system( "ln -s ../%s ." % ( inFile ) )
            
        inFileHandler = open( inFile, "r" )
        inFileHandler.seek( 0, 0 )
        countBatch = 0
        countSeq = 0
        line = inFileHandler.readline()
        while line:
            if line == "":
                break
            if line[0] == ">":
                countSeq += 1
                if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1:
                    if "outFile" in locals():
                        outFile.close()
                    countBatch += 1
                    if nbSeqPerBatch == 1 and useSeqHeader:
                        outFileName = "%s.fa" % ( line[1:-1].replace(" ","_") )
                    else:
                        outFileName = "%s_%s.fa" % ( prefix, str(countBatch).zfill( len(str(nbBatches)) ) )
                    outFile = open( outFileName, "w" )
                if verbose > 1:
                    print "saving seq '%s' in file '%s'..." % ( line[1:40][:-1], outFileName )
                    sys.stdout.flush()
            outFile.write( line )
            line = inFileHandler.readline()
        inFileHandler.close()
        
        if newDir == True:
            os.remove( os.path.basename( inFile ) )
            os.chdir( ".." )
            
    
    ## Split the input fasta file in several output files
    #
    # @param inFileName string name of the input fasta file
    # @param maxSize integer max cumulative length for each output file
    #
    @staticmethod
    def splitFastaFileInBatches(inFileName, maxSize = 200000):
        iBioseqDB = BioseqDB(inFileName)
        lHeadersSizeTuples = []
        for iBioseq in iBioseqDB.db:
            lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength()))
            
        lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize)
        os.mkdir("batches")
        os.chdir("batches")
        
        iterator = 0 
        for lHeader in lHeadersList :
            iterator += 1
            with open("batch_%s.fa" % iterator, 'w') as f :
                for header in lHeader :
                    iBioseq = iBioseqDB.fetch(header)
                    iBioseq.write(f)
        os.chdir("..")
        
                    
    ## Split the input fasta file in several output files according to their cluster identifier
    #
    # @param inFileName string  name of the input fasta file
    # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust)
    # @param simplifyHeader boolean simplify the headers
    # @param createDir boolean put the sequences in different directories
    # @param outPrefix string prefix of the output files (default='seqCluster')
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def splitSeqPerCluster( inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix="seqCluster", verbose=0 ):
        if not os.path.exists( inFileName ):
            print "ERROR: %s doesn't exist" % ( inFileName )
            sys.exit(1)
    
        inFile = open( inFileName, "r" )
    
        line = inFile.readline()
        if line:
            name = line.split(" ")[0]
            if "Cluster" in name:
                clusterID = name.split("Cluster")[1].split("Mb")[0]
                seqID = name.split("Mb")[1]
            else:
                clusterID = name.split("Cl")[0].split("Gr")[1]   # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust
                if "Q" in name.split("Gr")[0]:
                    seqID = name.split("Gr")[0].split("MbQ")[1]
                elif "S" in name:
                    seqID = name.split("Gr")[0].split("MbS")[1]
            sClusterIDs = set( [ clusterID ] )
            if simplifyHeader == True:
                header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
            else:
                header = line[1:-1]
            if createDir == True:
                if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID )  ):
                    os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
                os.chdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
            outFileName = "%s%s.fa" % ( outPrefix, clusterID )
            outFile = open( outFileName, "w" )
            outFile.write( ">%s\n" % ( header ) )
            prevClusterID = clusterID
        
            line = inFile.readline()
            while line:
                if line[0] == ">":
                    name = line.split(" ")[0]
                    if "Cluster" in name:
                        clusterID = name.split("Cluster")[1].split("Mb")[0]
                        seqID = name.split("Mb")[1]
                    else:
                        clusterID = name.split("Cl")[0].split("Gr")[1]
                        if "Q" in name.split("Gr")[0]:
                            seqID = name.split("Gr")[0].split("MbQ")[1]
                        elif "S" in name:
                            seqID = name.split("Gr")[0].split("MbS")[1]
        
                    if clusterID != prevClusterID:
                        outFile.close()
        
                    if simplifyHeader == True:
                        header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
                    else:
                        header = line[1:-1]
        
                    if createDir == True:
                        os.chdir( ".." )
                        if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID )  ):
                            os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
                        os.chdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
        
                    outFileName = "%s%s.fa" % ( outPrefix, clusterID )
                    if not os.path.exists( outFileName ):
                        outFile = open( outFileName, "w" )
                    else:
                        if clusterID != prevClusterID:
                            outFile.close()
                            outFile = open( outFileName, "a" )
                    outFile.write( ">%s\n" % ( header ) )
                    prevClusterID = clusterID
                    sClusterIDs.add( clusterID )
        
                else:
                    outFile.write( line )
        
                line = inFile.readline()
        
            outFile.close()
            if verbose > 0:
                print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush()
                
            if createDir == True:
                os.chdir("..")
        else:
            print "WARNING: empty input file - no cluster found"; sys.stdout.flush()
                

    ## Filter a fasta file in two fasta files using the length of each sequence as a criteron
    #
    # @param len_min integer    length sequence criterion to filter
    # @param inFileName string  name of the input fasta file
    # @param verbose integer (default = 0)      
    #
    @staticmethod
    def dbLengthFilter( len_min, inFileName, verbose=0 ):
        file_db = open( inFileName, "r" )
        file_dbInf = open( inFileName+".Inf"+str(len_min), "w" )
        file_dbSup = open( inFileName+".Sup"+str(len_min), "w" )
        seq = Bioseq()
        numseq = 0
        nbsave = 0
        
        seq.read( file_db )
        while seq.sequence:
            l = seq.getLength()
            numseq = numseq + 1
            if l >= len_min:
                seq.write( file_dbSup )
                if verbose > 0:
                        print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Sup !!'
                        nbsave=nbsave+1
            else:
                seq.write( file_dbInf )
                if verbose > 0:
                        print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Inf !!'
                        nbsave=nbsave+1
            seq.read( file_db )
                        
        file_db.close()
        file_dbInf.close()
        file_dbSup.close()
        if verbose > 0:
            print nbsave,'saved sequences in ',inFileName+".Inf"+str(len_min)," and ", inFileName+".Sup"+str(len_min)
            
    
    ## Extract the longest sequences from a fasta file
    #
    # @param num integer maximum number of sequences in the output file
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file
    # @param minThresh integer minimum length threshold (default=0)
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def dbLongestSequences( num, inFileName, outFileName="", verbose=0, minThresh=0 ):
        bsDB = BioseqDB( inFileName )
        if verbose > 0:
            print "nb of input sequences: %i" % ( bsDB.getSize() )
    
        if outFileName == "":
            outFileName = inFileName + ".best" + str(num)
        outFile = open( outFileName, "w" )
        
        if bsDB.getSize()==0:
            return 0
        
        num = int(num)
        if verbose > 0:
            print "keep the %i longest sequences" % ( num )
            if minThresh > 0:
                print "with length > %i bp" % ( minThresh )
            sys.stdout.flush()
            
        # retrieve the length of each input sequence
        tmpLSeqLgth = []
        seqNum = 0
        for bs in bsDB.db:
            seqNum += 1
            tmpLSeqLgth.append( bs.getLength() )
            if verbose > 1:
                print "%d seq %s : %d bp" % ( seqNum, bs.header[0:40], bs.getLength() )
            sys.stdout.flush()
    
        # sort the lengths
        tmpLSeqLgth.sort()
        tmpLSeqLgth.reverse()
    
        # select the longest
        lSeqLgth = []
        for i in xrange( 0, min(num,len(tmpLSeqLgth)) ):
            if tmpLSeqLgth[i] >= minThresh:
                lSeqLgth.append( tmpLSeqLgth[i] )
        if verbose > 0:
            print "selected max length: %i" % ( max(lSeqLgth) )
            print "selected min length: %i" % ( min(lSeqLgth) )
            sys.stdout.flush()
    
        # save the longest
        inFile = open( inFileName )
        seqNum = 0
        nbSave = 0
        for bs in bsDB.db:
            seqNum += 1
            if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh:
                bs.write( outFile )
                if verbose > 1:
                    print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] )
                    sys.stdout.flush()
                nbSave += 1
            if nbSave == num:
                break
        inFile.close()
        outFile.close()
        if verbose > 0:
            print nbSave, "saved sequences in ", outFileName
            sys.stdout.flush()
            
        return 0
    
    
    ## Extract all the sequence headers from a fasta file and write them in a new fasta file
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers')
    #
    @staticmethod
    def dbExtractSeqHeaders( inFileName, outFileName="" ):
        lHeaders = FastaUtils.dbHeaders( inFileName )
        
        if outFileName == "":
            outFileName = inFileName + ".headers"
            
        outFile = open( outFileName, "w" )
        for i in lHeaders:
            outFile.write( i + "\n" )
        outFile.close()
    
        return 0
    
    
    ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file
    #
    # @param pattern regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbExtractByPattern( pattern, inFileName, outFileName="", verbose=0 ):
        if pattern == "":
            return
        
        if outFileName == "":
            outFileName = inFileName + '.extracted'
        outFile = open( outFileName, 'w' )
        
        patternTosearch = re.compile( pattern )
        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        inFile = open( inFileName, "r" )
        bioseq.read( inFile )
        while bioseq.sequence:
            bioseqNb = bioseqNb + 1
            m = patternTosearch.search( bioseq.header )
            if m:
                bioseq.write( outFile )
                if verbose > 1:
                    print 'sequence num',bioseqNb,'matched on',m.group(),'[',bioseq.header[0:40],'...] saved !!'
                savedBioseqNb = savedBioseqNb + 1
            bioseq.read( inFile )
        inFile.close()
        
        outFile.close()
        
        if verbose > 0:
            print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
            
    
    ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file
    #
    # @param patternFileName string file containing regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbExtractByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
    
        if patternFileName == "":
            print "ERROR: no file of pattern"
            sys.exit(1)
    
        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        lHeaders = []

        inFile = open( inFileName, "r" )
        bioseq.read( inFile )
        while bioseq.sequence != None:
            lHeaders.append( bioseq.header )
            bioseq.read( inFile )
        inFile.close()
    
        lHeadersToKeep = []
        patternFile = open( patternFileName, "r" )
        for pattern in patternFile:
            if verbose > 0:
                print "pattern: ",pattern[:-1]; sys.stdout.flush()
                
            patternToSearch = re.compile(pattern[:-1])
            for h in lHeaders:
                if patternToSearch.search(h):
                    lHeadersToKeep.append(h)
        patternFile.close()
    
        if outFileName == "":
            outFileName = inFileName + ".extracted"
        outFile=open( outFileName, "w" )
    
        inFile = open( inFileName, "r" )
        bioseq.read(inFile)
        while bioseq.sequence:
            bioseqNb += 1
            if bioseq.header in lHeadersToKeep:
                bioseq.write(outFile)
                if verbose > 1:
                    print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
                savedBioseqNb += 1
            bioseq.read(inFile)
        inFile.close()
    
        outFile.close()
        
        if verbose > 0:
            print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
            
    
    ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file
    #
    # @param pattern regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbCleanByPattern( pattern, inFileName, outFileName="", verbose=0 ):
        if pattern == "":
            return
        
        patternToSearch = re.compile(pattern)
        
        if outFileName == "":
            outFileName = inFileName + '.cleaned'
        outFile = open(outFileName,'w')
        
        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        inFile = open(inFileName)
        bioseq.read(inFile)
        while bioseq.sequence != None:
            bioseqNb += 1
            if not patternToSearch.search(bioseq.header):
                bioseq.write(outFile)
                if verbose > 1:
                    print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'
                savedBioseqNb += 1
            bioseq.read(inFile)
        inFile.close()
        
        outFile.close()
        
        if verbose > 0:
            print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
            
    
    ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file
    #
    # @param patternFileName string file containing regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbCleanByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
        if patternFileName == "":
            print "ERROR: no file of pattern"
            sys.exit(1)
            
        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        lHeaders = []
        inFile = open( inFileName, "r" )
        bioseq.read( inFile )
        while bioseq.sequence != None:
            bioseqNb += 1
            lHeaders.append( bioseq.header )
            bioseq.read( inFile )
        inFile.close()
        
        patternFile = open( patternFileName, "r")
        lHeadersToRemove = []
        for pattern in patternFile:
            if verbose > 0:
                print "pattern: ",pattern[:-1]; sys.stdout.flush()
                
            patternToSearch = re.compile( pattern[:-1] )
            for h in lHeaders:
                if patternToSearch.search(h):
                    lHeadersToRemove.append(h)
        patternFile.close()
        
        if outFileName == "":
            outFileName = inFileName + '.cleaned'
        outFile = open( outFileName, 'w' )
    
        bioseqNum = 0
        inFile=open( inFileName )
        bioseq.read( inFile )
        while bioseq.sequence != None:
            bioseqNum += 1
            if bioseq.header not in lHeadersToRemove:
                bioseq.write( outFile )
                if verbose > 1:
                    print 'sequence num',bioseqNum,'/',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
                savedBioseqNb += 1
            bioseq.read( inFile )
        inFile.close()
        
        outFile.close()
        
        if verbose > 0:
            print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
            
    
    ## Find sequence's ORFs from a fasta file and write them in a tab file 
    # 
    # @param inFileName string name of the input fasta file
    # @param orfMaxNb integer Select orfMaxNb best ORFs
    # @param orfMinLength integer Keep ORFs with length > orfMinLength 
    # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbORF( inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose=0 ):
        if outFileName == "":
            outFileName = inFileName + ".ORF.map"
        outFile = open( outFileName, "w" )
    
        bioseq = Bioseq()
        bioseqNb = 0
    
        inFile = open( inFileName )
        bioseq.read( inFile )
        while bioseq.sequence != None:
            bioseq.upCase() 
            bioseqNb += 1
            if verbose > 0:
                print 'sequence num',bioseqNb,'=',bioseq.getLength(),'[',bioseq.header[0:40],'...]'
                
            orf = bioseq.findORF()
            bestOrf = []
            for i in orf.keys():
                orfLen = len(orf[i])
                for j in xrange(1, orfLen):
                    start = orf[i][j-1] + 4
                    end = orf[i][j] + 3
                    if end - start >= orfMinLength:
                        bestOrf.append( ( end-start, i+1, start, end ) )
    
            bioseq.reverseComplement()
            
            orf = bioseq.findORF()
            seqLen = bioseq.getLength()
            for i in orf.keys():
                orfLen = len(orf[i])
                for j in xrange(1, orfLen):
                    start = seqLen - orf[i][j-1] - 3
                    end = seqLen - orf[i][j] - 2
                    if start - end >= orfMinLength:
                        bestOrf.append( ( start-end, (i+1)*-1, start, end ) )
    
            bestOrf.sort()
            bestOrf.reverse()
            bestOrfNb = len(bestOrf)
            if orfMaxNb != 0 and orfMaxNb < bestOrfNb:
                bestOrfNb = orfMaxNb
            for i in xrange(0, bestOrfNb):
                if verbose > 0:
                    print bestOrf[i]
                outFile.write("%s\t%s\t%d\t%d\n"%("ORF|"+str(bestOrf[i][1])+\
                                   "|"+str(bestOrf[i][0]),bioseq.header,
                                   bestOrf[i][2],bestOrf[i][3]))
            bioseq.read( inFile )
    
        inFile.close()
        outFile.close()
    
        return 0


    ## Sort sequences by increasing length (write a new file)
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file
    # @param verbose integer verbosity level   
    #
    @staticmethod
    def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0):
        if verbose > 0:
            print "sort sequences by increasing length"
            sys.stdout.flush()
        if not os.path.exists( inFileName ):
            print "ERROR: file '%s' doesn't exist" % ( inFileName )
            sys.exit(1)
            
        # read each seq one by one
        # save them in distinct temporary files
        # with their length in the name
        inFileHandler = open( inFileName, "r" )
        countSeq = 0
        bs = Bioseq()
        bs.read( inFileHandler )
        while bs.header:
            countSeq += 1
            tmpFile = "%ibp_%inb" % ( bs.getLength(), countSeq )
            bs.appendBioseqInFile( tmpFile )
            if verbose > 1:
                print "%s (%i bp) saved in '%s'" % ( bs.header, bs.getLength(), tmpFile )
            bs.header = ""
            bs.sequence = ""
            bs.read( inFileHandler )
        inFileHandler.close()
        
        # sort temporary file names
        # concatenate them into the output file
        if os.path.exists( outFileName ):
            os.remove( outFileName )
        lFiles = glob.glob( "*bp_*nb" )
        lFiles.sort( key=lambda s:int(s.split("bp_")[0]) )
        for fileName in lFiles:
            cmd = "cat %s >> %s" % ( fileName, outFileName )
            returnValue = os.system( cmd )
            if returnValue != 0:
                print "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName )
                sys.exit(1)
            os.remove( fileName )
            
        return 0

    
    ## Sort sequences by header
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file
    # @param verbose integer verbosity level   
    #
    @staticmethod
    def sortSequencesByHeader(inFileName, outFileName = "", verbose = 0):
        if outFileName == "":
            outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0]
        iBioseqDB = BioseqDB(inFileName)
        f = open(outFileName, "w")
        lHeaders = sorted(iBioseqDB.getHeaderList())
        for header in lHeaders:
            iBioseq = iBioseqDB.fetch(header)
            iBioseq.write(f)
        f.close()

    
    ## Return a dictionary which keys are the headers and values the length of the sequences
    #
    # @param inFile string name of the input fasta file
    # @param verbose integer verbosity level   
    #
    @staticmethod
    def getLengthPerHeader( inFile, verbose=0 ):
        dHeader2Length = {}
        
        inFileHandler = open( inFile, "r" )
        currentSeqHeader = ""
        currentSeqLength = 0
        line = inFileHandler.readline()
        while line:
            if line[0] == ">":
                if currentSeqHeader != "":
                    dHeader2Length[ currentSeqHeader ] = currentSeqLength
                    currentSeqLength = 0
                currentSeqHeader = line[1:-1]
                if verbose > 0:
                    print "current header: %s" % ( currentSeqHeader )
                    sys.stdout.flush()
            else:
                currentSeqLength += len( line.replace("\n","") )
            line = inFileHandler.readline()
        dHeader2Length[ currentSeqHeader ] = currentSeqLength
        inFileHandler.close()
        
        return dHeader2Length
    
    
    ## Convert headers from a fasta file having chunk coordinates
    #
    # @param inFile string name of the input fasta file
    # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile):
        inFileHandler = open(inFile, "r")
        outFileHandler = open(outFile, "w")
        dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile)
        iConvCoord = ConvCoord()
        line = inFileHandler.readline()
        while line:
            if line[0] == ">":
                if "{Fragment}" in line:
                    chkName = line.split(" ")[1]
                    chrName = dChunk2Map[chkName].seqname
                    lCoordPairs = line.split(" ")[3].split(",")
                    lRangesOnChk = []
                    for i in lCoordPairs:
                        iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1]))
                        lRangesOnChk.append(iRange)
                    lRangesOnChr = []
                    for iRange in lRangesOnChk:
                        lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map))
                    newHeader = line[1:-1].split(" ")[0]
                    newHeader += " %s" % chrName
                    newHeader += " {Fragment}"
                    newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end)
                    for iRange in lRangesOnChr[1:]:
                        newHeader += ",%i..%i" % (iRange.start, iRange.end)
                    outFileHandler.write(">%s\n" % newHeader)
                else:
                    chkName = line.split("_")[1].split(" ")[0]
                    chrName = dChunk2Map[chkName].seqname
                    coords = line[line.find("[")+1 : line.find("]")]
                    start = int(coords.split(",")[0])
                    end = int(coords.split(",")[1])
                    iRangeOnChk = Range(chkName, start, end)
                    iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map)
                    newHeader = line[1:-1].split("_")[0]
                    newHeader += " %s" % chrName
                    newHeader += " %s" % line[line.find("(") : line.find(")")+1]
                    newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd())
                    outFileHandler.write(">%s\n" % newHeader)
            else:
                outFileHandler.write(line)
            line = inFileHandler.readline()
        inFileHandler.close()
        outFileHandler.close()
        
    
    ## Convert a fasta file to a length file
    #
    # @param inFile string name of the input fasta file
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaToLength(inFile, outFile = ""):
        if outFile == "":
            outFile = "%s.length" % inFile
        
        if inFile != "":
            with open(inFile, "r") as inFH:
                with open(outFile, "w") as outFH:
                    bioseq = Bioseq()
                    bioseq.read(inFH)
                    while bioseq.sequence != None:
                        seqLen = bioseq.getLength()
                        outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen))
                        bioseq.read(inFH)

    
    ## Convert a fasta file to a seq file
    #
    # @param inFile string name of the input fasta file
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaToSeq(inFile, outFile = ""):
        if outFile == "":
            outFile = "%s.seq" % inFile
        
        if inFile != "":
            with open(inFile, "r") as inFH:
                with open(outFile, "w") as outFH:
                    bioseq = Bioseq()
                    bioseq.read(inFH)
                    while bioseq.sequence != None:
                        seqLen = bioseq.getLength()
                        outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \
                                                bioseq.sequence, bioseq.header, seqLen))
                        bioseq.read(inFH)


    ## Splice an input fasta file using coordinates in a Map file
    #
    # @note the coordinates should be merged beforehand!
    #
    @staticmethod
    def spliceFromCoords( genomeFile, coordFile, obsFile ):
        genomeFileHandler = open( genomeFile, "r" )
        obsFileHandler = open( obsFile, "w" )
        dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile )
            
        bs = Bioseq()
        bs.read( genomeFileHandler )
        while bs.sequence:
            if dChr2Maps.has_key( bs.header ):
                lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] )
                splicedSeq = ""
                currentSite = 0
                for iMap in lCoords:
                    minSplice = iMap.getMin() - 1
                    if minSplice > currentSite:
                        splicedSeq += bs.sequence[ currentSite : minSplice ]
                    currentSite = iMap.getMax()
                splicedSeq += bs.sequence[ currentSite : ]
                bs.sequence = splicedSeq
            bs.write( obsFileHandler )
            bs.read( genomeFileHandler )
            
        genomeFileHandler.close()
        obsFileHandler.close()
        
    
    ## Shuffle input sequences (single file or files in a directory)
    #
    @staticmethod
    def dbShuffle( inData, outData, verbose=0 ):
        if CheckerUtils.isExecutableInUserPath("esl-shuffle"):
            prg = "esl-shuffle"
        else : prg = "shuffle"
        genericCmd = prg + " -d INPUT > OUTPUT"
        if os.path.isfile( inData ):
            if verbose > 0:
                print "shuffle input file '%s'" % inData
            cmd = genericCmd.replace("INPUT",inData).replace("OUTPUT",outData)
            print cmd
            returnStatus = os.system( cmd )
            if returnStatus != 0:
                sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus )
                sys.exit(1)
                
        elif os.path.isdir( inData ):
            if verbose > 0:
                print "shuffle files in input directory '%s'" % inData
            if os.path.exists( outData ):
                shutil.rmtree( outData )
            os.mkdir( outData )
            lInputFiles = glob.glob( "%s/*.fa" %( inData ) )
            nbFastaFiles = 0
            for inputFile in lInputFiles:
                nbFastaFiles += 1
                if verbose > 1:
                    print "%3i / %3i" % ( nbFastaFiles, len(lInputFiles) )
                fastaBaseName = os.path.basename( inputFile )
                prefix, extension = os.path.splitext( fastaBaseName )
                cmd = genericCmd.replace("INPUT",inputFile).replace("OUTPUT","%s/%s_shuffle.fa"%(outData,prefix))
                returnStatus = os.system( cmd )
                if returnStatus != 0:
                    sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus )
                    sys.exit(1)
                    
    
    ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers
    #
    # @param inClusterFileName string input cluster file name
    # @param inFastaFileName string input fasta file name
    # @param outFileName string output file name
    # @param verbosity integer verbosity
    #
    @staticmethod
    def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0):
        dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity)
        iFastaParser = FastaParser(inFastaFileName)
        with open(outFileName, "w") as f:
            for iSequence in iFastaParser.getIterator():
                
                header = iSequence.getName()
                if dHeader2ClusterClusterMember.get(header):
                    cluster = dHeader2ClusterClusterMember[header][0]
                    member = dHeader2ClusterClusterMember[header][1]
                else:
                    clusterIdForSingletonCluster +=  1
                    cluster = clusterIdForSingletonCluster
                    member = 1
                
                newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header)
                iSequence.setName(newHeader)
                f.write(iSequence.printFasta())
              
    @staticmethod
    def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0):
        dHeader2ClusterClusterMember = {}
        clusterId = 0
        with open(inClusterFileName) as f:
            line = f.readline()
            while line:
                lineWithoutLastChar = line.rstrip()
                lHeaders = lineWithoutLastChar.split("\t")
                clusterId += 1
                if verbosity > 0:
                    print "%i sequences in cluster %i" % (len(lHeaders), clusterId)
                memberId = 0
                for header in lHeaders:
                    memberId += 1
                    dHeader2ClusterClusterMember[header] = (clusterId, memberId)
                line = f.readline()
            if verbosity > 0:
                print "%i clusters" % clusterId
        return dHeader2ClusterClusterMember, clusterId
    
    @staticmethod
    def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""):
        """
        Write a map file from fasta output of clustering tool.
        Warning: only works if input fasta headers are formated like LTRharvest fasta output.
        """
        if not outMapFileName:
            outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0])
        
        fileDb = open(fastaFileNameFromClustering , "r")
        fileMap = open(outMapFileName, "w")
        seq = Bioseq()
        numseq = 0
        while 1:
            seq.read(fileDb)
            if seq.sequence == None:
                break
            numseq = numseq + 1
            ID = seq.header.split(' ')[0].split('_')[0]
            chunk = seq.header.split(' ')[0].split('_')[1]
            start = seq.header.split(' ')[-1].split(',')[0][1:]
            end = seq.header.split(' ')[-1].split(',')[1][:-1]
            line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
            fileMap.write(line + "\n")
    
        fileDb.close()
        fileMap.close()
        print "saved in %s" % outMapFileName

    @staticmethod
    def writeNstreches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"):
        outFormat = outFormat.lower()
        if outFormat in ["gff", "gff3"]:
            outFormat = "gff3"
        else:
            outFormat = "map"
            
        lTNstretches = []
        if nbN != 0:
            iBSDB = BioseqDB(fastaFileName)
            for iBS in iBSDB.db:
                nbNFound = 0
                start = 1
                pos = 1
                lastPos = 0
                
                while pos <= iBS.getLength():
                    if nbNFound == 0:
                        start = pos
                        
                    while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']:
                        nbNFound += 1
                        pos += 1
                        lastPos = pos
                    
                    if pos - lastPos >= nbN:
                        if nbNFound >= nbN:
                            lTNstretches.append((iBS.getHeader(), start, lastPos - 1))
                        nbNFound = 0
                    pos += 1
                
                if nbNFound >= nbN:
                    lTNstretches.append((iBS.getHeader(), start, lastPos - 1))
    
            lTNstretches.sort(key = itemgetter(0, 1, 2))
        
        if outFileName == "":
            outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat)
        
        with open(outFileName, "w") as fH:
            if outFormat == "gff3":
                fH.write("##gff-version 3\n")
            for item in lTNstretches:
                seq = item[0]
                start = item[1]
                end = item[2]
                if outFormat == "gff3":
                    fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end))
                else:
                    fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end))