diff commons/core/coord/AlignUtils.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/core/coord/AlignUtils.py	Fri Jan 18 04:54:14 2013 -0500
@@ -0,0 +1,359 @@
+# Copyright INRA (Institut National de la Recherche Agronomique)
+# http://www.inra.fr
+# http://urgi.versailles.inra.fr
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software.  You can  use, 
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info". 
+#
+# As a counterpart to the access to the source code and  rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty  and the software's author,  the holder of the
+# economic rights,  and the successive licensors  have only  limited
+# liability. 
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading,  using,  modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean  that it is complicated to manipulate,  and  that  also
+# therefore means  that it is reserved for developers  and  experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or 
+# data to be ensured and,  more generally, to use and operate it in the 
+# same conditions as regards security. 
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+
+
+import os
+import sys
+import shutil
+from commons.core.coord.Align import Align
+
+
+## Static methods manipulating Align instances
+#
+class AlignUtils( object ):
+    
+    ## Return a list with Align instances from the given file
+    #
+    # @param inFile name of a file in the Align format
+    #
+    def getAlignListFromFile( inFile ):
+        lAlignInstances = []
+        inFileHandler = open( inFile, "r" )
+        while True:
+            line = inFileHandler.readline()
+            if line == "":
+                break
+            a = Align()
+            a.setFromString( line )
+            lAlignInstances.append( a )
+        inFileHandler.close()
+        return lAlignInstances
+
+    getAlignListFromFile = staticmethod( getAlignListFromFile )
+    
+    
+    ## Return a list with all the scores
+    #
+    # @param lAlignInstances: list of Align instances
+    #
+    def getListOfScores( lAlignInstances ):
+        lScores = []
+        for iAlign in lAlignInstances:
+            lScores.append( iAlign.score )
+        return lScores
+    
+    getListOfScores = staticmethod( getListOfScores )
+
+    
+    ## Return a list with all the scores from the given file
+    #
+    # @param inFile name of a file in the Align format
+    #
+    def getScoreListFromFile(inFile):
+        lScores = []
+        append = lScores.append
+        with open(inFile, "r") as inFileHandler:
+            line = inFileHandler.readline()
+            while line:
+                if line != "\n":
+                    append(int(line.split('\t')[7]))
+                line = inFileHandler.readline()
+        return lScores
+    
+    getScoreListFromFile = staticmethod( getScoreListFromFile )
+    
+    
+    ## for each line of a given Align file, write the coordinates on the query and the subject as two distinct lines in a Map file
+    #
+    # @param alignFile: name of the input Align file
+    # @param mapFile: name of the output Map file
+    #
+    def convertAlignFileIntoMapFileWithQueriesAndSubjects( alignFile, mapFile ):
+        alignFileHandler = open( alignFile, "r" )
+        mapFileHandler = open( mapFile, "w" )
+        iAlign = Align()
+        while True:
+            line = alignFileHandler.readline()
+            if line == "":
+                break
+            iAlign.setFromString( line )
+            iMapQ, iMapS = iAlign.getMapsOfQueryAndSubject()
+            iMapQ.write( mapFileHandler )
+            iMapS.write( mapFileHandler )
+        alignFileHandler.close()
+        mapFileHandler.close()
+        
+    convertAlignFileIntoMapFileWithQueriesAndSubjects = staticmethod( convertAlignFileIntoMapFileWithQueriesAndSubjects )
+    
+    
+    ## for each line of a given Align file, write the coordinates of the subject on the query as one line in a Map file
+    #
+    # @param alignFile: name of the input Align file
+    # @param mapFile: name of the output Map file
+    #
+    def convertAlignFileIntoMapFileWithSubjectsOnQueries( alignFile, mapFile ):
+        alignFileHandler = open( alignFile, "r" )
+        mapFileHandler = open( mapFile, "w" )
+        iAlign = Align()
+        while True:
+            line = alignFileHandler.readline()
+            if line == "":
+                break
+            iAlign.setFromString( line )
+            iMapQ = iAlign.getSubjectAsMapOfQuery()
+            iMapQ.write( mapFileHandler )
+        alignFileHandler.close()
+        mapFileHandler.close()
+        
+    convertAlignFileIntoMapFileWithSubjectsOnQueries = staticmethod( convertAlignFileIntoMapFileWithSubjectsOnQueries )
+    
+    
+    ## return a list of Align instances sorted in decreasing order according to their score, then their length on the query and finally their initial order
+    #
+    # @param lAligns: list of Align instances
+    #
+    def getAlignListSortedByDecreasingScoreThenLength( lAligns ):
+        return sorted( lAligns, key=lambda iAlign: ( 1 / float(iAlign.getScore()), 1 / float(iAlign.getLengthOnQuery()) ) )
+    
+    getAlignListSortedByDecreasingScoreThenLength = staticmethod( getAlignListSortedByDecreasingScoreThenLength )
+    
+    
+    ## Convert an Align file into a Path file
+    #
+    # @param alignFile string name of the input Align file
+    # @param pathFile string name of the output Path file
+    #
+    def convertAlignFileIntoPathFile( alignFile, pathFile ):
+        alignFileHandler = open( alignFile, "r" )
+        pathFileHandler = open( pathFile, "w" )
+        iAlign = Align()
+        countAlign = 0
+        while True:
+            line = alignFileHandler.readline()
+            if line == "":
+                break
+            countAlign += 1
+            iAlign.setFromString( line, "\t" )
+            pathFileHandler.write( "%i\t%s\n" % ( countAlign, iAlign.toString() ) )
+        alignFileHandler.close()
+        pathFileHandler.close()
+        
+    convertAlignFileIntoPathFile = staticmethod( convertAlignFileIntoPathFile )
+    
+    
+    ## Sort an Align file
+    #
+    def sortAlignFile( inFile, outFile="" ):
+        if outFile == "":
+            outFile = "%s.sort" % ( inFile )
+        prg = "sort"
+        cmd = prg
+        cmd += " -k 1,1 -k 4,4 -k 2,2n -k 3,3n -k 5,5n -k 6,6n -k 8,8n"
+        cmd += " %s" % ( inFile )
+        cmd += " > %s" % ( outFile )
+        exitStatus = os.system( cmd )
+        if exitStatus != 0:
+            msg = "ERROR: '%s' returned '%i'" % ( prg, exitStatus )
+            sys.stderr.write( "%s\n" % ( msg ) )
+            sys.exit( exitStatus )
+            
+    sortAlignFile = staticmethod( sortAlignFile )
+    
+    
+    ## Write Align instances contained in the given list
+    #
+    # @param lAlign a list of Align instances
+    # @param fileName name of the file to write the Align instances
+    # @param mode the open mode of the file ""w"" or ""a"" 
+    #
+    def writeListInFile( lAlign, fileName, mode="w" ):
+        fileHandler = open( fileName, mode )
+        for iAlign in lAlign:
+            iAlign.write( fileHandler )
+        fileHandler.close()
+        
+    writeListInFile = staticmethod( writeListInFile )
+
+        
+    ## Split a list of Align instances according to the name of the query
+    #
+    # @param lInAlign list of align instances
+    # @return lOutAlignList list of align instances lists 
+    #
+    def splitAlignListByQueryName( lInAlign ):
+        lSortedAlign = sorted(lInAlign, key=lambda o: o.range_query.seqname)
+        lOutAlignList = []
+        if len(lSortedAlign) != 0 :
+            lAlignForCurrentQuery = [] 
+            previousQuery = lSortedAlign[0].range_query.seqname
+            for align in lSortedAlign :
+                currentQuery = align.range_query.seqname
+                if previousQuery != currentQuery :
+                    lOutAlignList.append(lAlignForCurrentQuery)
+                    previousQuery = currentQuery 
+                    lAlignForCurrentQuery = []
+                lAlignForCurrentQuery.append(align)
+                    
+            lOutAlignList.append(lAlignForCurrentQuery)         
+                
+        return lOutAlignList
+    
+    splitAlignListByQueryName = staticmethod( splitAlignListByQueryName )
+    
+    
+    ## Create an Align file from each list of Align instances in the input list
+    #
+    # @param lAlignList list of lists with Align instances
+    # @param pattern string
+    # @param dirName string 
+    #
+    def createAlignFiles( lAlignList, pattern, dirName="" ):
+        savedDir = os.getcwd()
+        nbFiles = len(lAlignList)
+        countFile = 1
+        if dirName != "" :
+            try:
+                os.makedirs(dirName)
+            except:
+                pass
+            os.chdir(dirName)
+            
+        for lAlign in lAlignList:
+            fileName = "%s_%s.align" % (pattern, str(countFile).zfill(len(str(nbFiles))))
+            AlignUtils.writeListInFile(lAlign, fileName)
+            countFile += 1
+        os.chdir(savedDir)
+            
+    createAlignFiles = staticmethod( createAlignFiles )
+    
+    
+    ## Return a list with Align instances sorted by query name, subject name, query start, query end and score
+    #
+    def sortList( lAligns ):
+        return sorted( lAligns, key=lambda iAlign: ( iAlign.getQueryName(),
+                                                     iAlign.getSubjectName(),
+                                                     iAlign.getQueryStart(),
+                                                     iAlign.getQueryEnd(),
+                                                     iAlign.getScore() ) )
+        
+    sortList = staticmethod( sortList )
+    
+    
+    ## Return a list after merging all overlapping Align instances
+    #
+    def mergeList( lAligns ):
+        lMerged = []
+        
+        lSorted = AlignUtils.sortList( lAligns )
+        
+        prev_count = 0
+        for iAlign in lSorted:
+            if prev_count != len(lSorted):
+                for i in lSorted[ prev_count + 1: ]:
+                    if iAlign.isOverlapping( i ):
+                        iAlign.merge( i )
+                IsAlreadyInList = False
+                for newAlign in lMerged:
+                    if newAlign.isOverlapping( iAlign ):
+                        IsAlreadyInList = True
+                        newAlign.merge( iAlign )
+                        lMerged [ lMerged.index( newAlign ) ] = newAlign
+                if not IsAlreadyInList:
+                    lMerged.append( iAlign )
+                prev_count += 1
+                
+        return lMerged
+    
+    mergeList = staticmethod( mergeList )
+    
+    
+    ## Merge all Align instance in a given Align file
+    #
+    def mergeFile( inFile, outFile="" ):
+        if outFile == "":
+            outFile = "%s.merged" % ( inFile )
+        if os.path.exists( outFile ):
+            os.remove( outFile )
+            
+        tmpFile = "%s.sorted" % ( inFile )
+        AlignUtils.sortAlignFile( inFile, tmpFile )
+        
+        tmpF = open( tmpFile, "r" )
+        dQrySbj2Aligns = {}
+        prevPairQrySbj = ""
+        while True:
+            line = tmpF.readline()
+            if line == "":
+                break
+            iAlign = Align()
+            iAlign.setFromString( line )
+            pairQrySbj = "%s_%s" % ( iAlign.getQueryName(), iAlign.getSubjectName() )
+            if not dQrySbj2Aligns.has_key( pairQrySbj ):
+                if prevPairQrySbj != "":
+                    lMerged = AlignUtils.mergeList( dQrySbj2Aligns[ prevPairQrySbj ] )
+                    AlignUtils.writeListInFile( lMerged, outFile, "a" )
+                    del dQrySbj2Aligns[ prevPairQrySbj ]
+                    prevPairQrySbj = pairQrySbj
+                else:
+                    prevPairQrySbj = pairQrySbj
+                dQrySbj2Aligns[ pairQrySbj ] = []
+            dQrySbj2Aligns[ pairQrySbj ].append( iAlign )
+        lMerged = []
+        if len(dQrySbj2Aligns.keys()) > 0:
+            lMerged = AlignUtils.mergeList( dQrySbj2Aligns[ prevPairQrySbj ] )
+        AlignUtils.writeListInFile( lMerged, outFile, "a" )
+        tmpF.close()
+        os.remove( tmpFile )
+        
+    mergeFile = staticmethod( mergeFile )
+
+
+    ## Update the scores of each match in the input file
+    #
+    # @note the new score is the length on the query times the percentage of identity
+    #
+    def updateScoresInFile( inFile, outFile ):
+        inHandler = open( inFile, "r" )
+        outHandler = open( outFile, "w" )
+        iAlign = Align()
+        
+        while True:
+            line = inHandler.readline()
+            if line == "":
+                break
+            iAlign.reset()
+            iAlign.setFromString( line, "\t" )
+            iAlign.updateScore()
+            iAlign.write( outHandler )
+            
+        inHandler.close()
+        outHandler.close()
+        
+    updateScoresInFile = staticmethod( updateScoresInFile )