diff commons/core/coord/MatchUtils.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/core/coord/MatchUtils.py	Fri Jan 18 04:54:14 2013 -0500
@@ -0,0 +1,288 @@
+# Copyright INRA (Institut National de la Recherche Agronomique)
+# http://www.inra.fr
+# http://urgi.versailles.inra.fr
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software.  You can  use, 
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info". 
+#
+# As a counterpart to the access to the source code and  rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty  and the software's author,  the holder of the
+# economic rights,  and the successive licensors  have only  limited
+# liability. 
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading,  using,  modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean  that it is complicated to manipulate,  and  that  also
+# therefore means  that it is reserved for developers  and  experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or 
+# data to be ensured and,  more generally, to use and operate it in the 
+# same conditions as regards security. 
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+
+import math
+import os
+import sys
+from commons.core.coord.Match import Match
+from commons.core.checker.RepetException import RepetException
+
+## Static methods for the manipulation of Match instances
+#
+class MatchUtils ( object ):
+    
+    ## Return a list with Match instances from the given file
+    #
+    # @param inFile name of a file in the Match format
+    # @return a list of Match instances
+    #
+    def getMatchListFromFile(inFile ):
+        lMatchInstances = []
+        inFileHandler = open( inFile, "r" )
+        while True:
+            line = inFileHandler.readline()
+            if line == "":
+                break
+            if line[0:10] == "query.name":
+                continue
+            m = Match()
+            m.setFromString( line )
+            lMatchInstances.append( m )
+        inFileHandler.close()
+        return lMatchInstances
+    
+    getMatchListFromFile = staticmethod( getMatchListFromFile )
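+    
+    # Usage sketch (illustrative only; "matches.tab" is a hypothetical file in
+    # the tab-delimited Match format written by Matcher):
+    #   lMatches = MatchUtils.getMatchListFromFile( "matches.tab" )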
+    
+    ##  Split a Match list into several Match lists according to the subject
+    #
+    #  @param lMatches a list of Match instances
+    #  @return a dictionary which keys are subject names and values Match lists
+    #
+    def getDictOfListsWithSubjectAsKey( lMatches ):
+        dSubject2MatchList = {}
+        for iMatch in lMatches:
+            if not dSubject2MatchList.has_key( iMatch.range_subject.seqname ):
+                dSubject2MatchList[ iMatch.range_subject.seqname ] = []
+            dSubject2MatchList[ iMatch.range_subject.seqname ].append( iMatch )
+        return dSubject2MatchList
+    
+    getDictOfListsWithSubjectAsKey = staticmethod( getDictOfListsWithSubjectAsKey )
+    
+    ##  Split a Match list into several Match lists according to the query
+    #
+    #  @param lMatches a list of Match instances
+    #  @return a dictionary which keys are query names and values Match lists
+    #
+    def getDictOfListsWithQueryAsKey ( lMatches ):
+        dQuery2MatchList = {}
+        for iMatch in lMatches:
+            if not dQuery2MatchList.has_key( iMatch.range_query.seqname ):
+                dQuery2MatchList[ iMatch.range_query.seqname ] = []
+            dQuery2MatchList[ iMatch.range_query.seqname ].append( iMatch )
+        return dQuery2MatchList
+    
+    getDictOfListsWithQueryAsKey = staticmethod( getDictOfListsWithQueryAsKey )   
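+    
+    # Usage sketch (illustrative only; lMatches is a list of Match instances,
+    # e.g. loaded with getMatchListFromFile above):
+    #   dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey( lMatches )
+    #   for qryName in dQry2Matches.keys():
+    #       print qryName, len( dQry2Matches[ qryName ] )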
+         
+    ## Write Match instances contained in the given list
+    #
+    # @param lMatches a list of Match instances
+    # @param fileName name of the file in which to write the Match instances
+    # @param mode the open mode of the file, "w" or "a"
+    # @param header an optional header string written before the matches
+    #
+    def writeListInFile( lMatches, fileName, mode="w", header=None ):
+        fileHandler = open( fileName, mode )
+        if header:
+            fileHandler.write( header )
+        for iMatch in lMatches:
+            iMatch.write( fileHandler )
+        fileHandler.close()
+        
+    writeListInFile = staticmethod( writeListInFile )
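+    
+    # Usage sketch (illustrative only; "subset.tab" is a hypothetical output
+    # file, written here without a header line):
+    #   MatchUtils.writeListInFile( lMatches, "subset.tab", "w" )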
+
+    ## Return the list of path ids from a list of Match instances
+    #
+    # @param lMatch list of Match instances
+    #
+    # @return lId list of integer path ids
+    #
+    def getIdListFromMatchList(lMatch):
+        lId = []
+        for iMatch in lMatch:
+            lId.append(iMatch.id)
+        return lId
+    
+    getIdListFromMatchList = staticmethod(getIdListFromMatchList)
+    
+    ## Remove duplicated matches from a match list
+    # ## replace old PyRepet.MatchDB.rmvDoublons()
+    # @param lMatch list of Match instances
+    #
+    # @return lMatchesUniq list of unique Match instances
+    #
+    def rmvDuplicateMatches(lMatch):
+        lMatchesUniq = []
+        for match in lMatch:
+            if len(lMatchesUniq) == 0:
+                lMatchesUniq.append( match )
+            else:
+                nbDoublons = 0
+                for m in lMatchesUniq:
+                    if match.isDoublonWith( m ):
+                        nbDoublons += 1
+                if nbDoublons == 0:
+                    lMatchesUniq.append( match )
+                    
+        for match1 in lMatchesUniq:
+            for match2 in lMatchesUniq:
+                if match1.id != match2.id:
+                    if match1.isDoublonWith( match2 ):
+                        raise RepetException ( "*** Error: doublon not removed" )
+        return lMatchesUniq
+    
+    rmvDuplicateMatches = staticmethod(rmvDuplicateMatches)
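+    
+    # Usage sketch (illustrative only; duplicates are detected with
+    # Match.isDoublonWith() and a RepetException is raised if any remain):
+    #   lMatchesUniq = MatchUtils.rmvDuplicateMatches( lMatches )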
+
+    ## Return the list of queries to keep, i.e. those not 'included' in a subject above the thresholds, when two different databanks are used.
+    ##replace old pyRepet.MatchDB.filterDiffQrySbj()
+    #
+    # @param iBioseqDB bioseqDB databank of queries
+    #
+    # @param matchFile string name of a file in the Match format
+    #
+    # @param thresIdentity float identity threshold
+    #
+    # @param thresLength float threshold on the query length percentage
+    #
+    # @param verbose int verbosity
+    #
+    # @return lQryToKeep list of query names to keep according to the identity and length thresholds
+    # TODO: should matches of a sequence against itself be ignored?
+    def filterDiffQrySbj(iBioseqDB, matchFile, thresIdentity=0.95, thresLength=0.98, verbose=0 ):
+        if verbose > 0:
+            print "filtering matches (id>=%.2f,qlgth>=%.2f)..." % ( thresIdentity, thresLength ); sys.stdout.flush()
+
+        thresIdentityPerc = math.floor( thresIdentity*100 )
+        lQryToKeep = []
+        dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(MatchUtils.getMatchListFromFile(matchFile))
+
+        for seqH in iBioseqDB.idx.keys():
+            # keep it if it has no match
+            if not dQry2Matches.has_key( seqH ):
+                if seqH not in lQryToKeep:
+                    lQryToKeep.append( seqH )
+            else:
+                isConditionsMet = False
+                for match in dQry2Matches[ seqH ]:
+                    # check if they are above the thresholds
+                    if match.identity >= thresIdentityPerc and match.query_length_perc >= thresLength:
+                        isConditionsMet = True
+                        break
+                if not isConditionsMet and  seqH not in lQryToKeep:
+                    lQryToKeep.append( seqH )
+        return lQryToKeep
+    
+    filterDiffQrySbj = staticmethod(filterDiffQrySbj)
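+    
+    # Worked example (illustrative only; "matches.tab" is a hypothetical Match
+    # file): with the defaults, thresIdentity=0.95 becomes thresIdentityPerc=95,
+    # so a query is left out of the returned list as soon as one of its matches
+    # has identity >= 95 (identity appears to be stored as a percentage) and
+    # query_length_perc >= 0.98 (stored as a fraction); queries with no match,
+    # or no match above both thresholds, are kept.
+    #   lQryToKeep = MatchUtils.filterDiffQrySbj( iBioseqDB, "matches.tab" )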
+    
+    ## Count the number of distinct sequences (queries or subjects) involved in at least one match above the thresholds.
+    ##replace old pyRepet.coord.MatchDB.getNbDistinctSbjWithThres()
+    # @param lMatches a list of Match instances
+    #
+    # @param thresIdentity float identity threshold
+    #
+    # @param thresLength float length threshold
+    #
+    # @param whatToCount string "query" or "subject", the side of the matches to count
+    #
+    # @return int number of distinct sequences
+    #
+    def getNbDistinctSequencesInsideMatchesWithThresh(lMatches, thresIdentity=0.95, thresLength=0.98, whatToCount="query" ):
+        thresIdentityPerc = math.floor( thresIdentity*100 )
+        countSbj = 0
+        if whatToCount.lower() == "query":
+            dMatches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches)
+        else:
+            dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches)
+            
+        for qry in dMatches.keys():
+            countMatch = 0
+            for match in dMatches[ qry ]:
+                
+                if match.identity >= thresIdentityPerc and getattr(match,whatToCount.lower() +"_length_perc") >= thresLength:
+                    countMatch += 1
+            if countMatch > 0:
+                countSbj += 1
+        return countSbj
+    
+    getNbDistinctSequencesInsideMatchesWithThresh = staticmethod(getNbDistinctSequencesInsideMatchesWithThresh)
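+    
+    # Usage sketch (illustrative only; lMatches is a list of Match instances):
+    #   nbQry = MatchUtils.getNbDistinctSequencesInsideMatchesWithThresh( lMatches )
+    #   nbSbj = MatchUtils.getNbDistinctSequencesInsideMatchesWithThresh( lMatches, whatToCount="subject" )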
+    
+    ## Convert a 'match' file (output from Matcher) into an 'align' file
+    ## replace old parser.tab2align
+    #
+    # @param inFileName  a string input file name
+    #
+    def convertMatchFileToAlignFile(inFileName):
+        basename = os.path.splitext(inFileName)[0]
+        outFileName = "%s.align" % basename
+        outFile = open(outFileName, "w")
+        
+        lMatches = MatchUtils.getMatchListFromFile(inFileName) 
+        
+        for match in lMatches:
+            string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( match.getQueryName(), match.getQueryStart(), match.getQueryEnd(), match.getSubjectName(), match.getSubjectStart(), match.getSubjectEnd(), match.getEvalue(), match.getScore(), match.getIdentity() )
+            outFile.write( string )
+            
+        outFile.close()
+        
+    convertMatchFileToAlignFile = staticmethod(convertMatchFileToAlignFile)
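+    
+    # Output sketch (illustrative only): for a hypothetical input "foo.tab",
+    # the method writes "foo.align", one tab-separated line per match with the
+    # columns query name, query start, query end, subject name, subject start,
+    # subject end, E-value, score and identity.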
+    
+    ## Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file)
+    # Use the coverage on the query as the arc value
+    #
+    # @param matchFileName string input match file name
+    # @param outFileName string output abc file name
+    # @param coverage float query coverage filter threshold
+    #
+    @staticmethod
+    def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0):
+        with open(matchFileName) as inF:
+            with open(outFileName, "w") as outF:
+                inF.readline()
+                inLine = inF.readline()
+                while inLine:
+                    splittedLine = inLine.split("\t")
+                    if float(splittedLine[4]) >= coverage:
+                        outLine = "\t".join([splittedLine[0], splittedLine[6], splittedLine[4]])
+                        outLine += "\n"
+                        outF.write(outLine)
+                    inLine = inF.readline()
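+
+    # Output sketch (illustrative only): following the standard 'match' column
+    # order (see the header written by generateMatchFileWithNewPathId below),
+    # column 0 is query.name, column 4 is query.length.% and column 6 is
+    # subject.name, so each kept line of the abc file is
+    # query.name<TAB>subject.name<TAB>query.length.%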
+
+    ## Adapt the path IDs when the input file is a concatenation of several 'Match' files, and remove the extra header lines.
+    ## replace old parser.tabnum2id
+    #
+    # @param fileName  a string input file name
+    # @param  outputFileName  a string output file name (optional)
+    #
+    def generateMatchFileWithNewPathId(fileName, outputFileName=None):
+        # parse the matches before opening the output file, so that rewriting
+        # the input file in place does not truncate it before it is read
+        lMatches = MatchUtils.getMatchListFromFile(fileName)
+        if outputFileName is None:
+            outFile = open(fileName, "w")
+        else:
+            outFile = open(outputFileName, "w")
+        outFile.write("query.name\tquery.start\tquery.end\tquery.length\tquery.length.%\tmatch.length.%\tsubject.name\tsubject.start\tsubject.end\tsubject.length\tsubject.length.%\tE.value\tScore\tIdentity\tpath\n")
+
+        count = 1
+        dMatchKeyIdcount = {}
+        
+        for match in lMatches:
+            key_id = str(match.getIdentifier()) + "-" + match.getQueryName() + "-" + match.getSubjectName()
+            if key_id not in dMatchKeyIdcount:
+                newPath = count
+                count += 1
+                dMatchKeyIdcount[ key_id ] = newPath
+            else:
+                newPath = dMatchKeyIdcount[ key_id ]
+                
+            match.id = newPath
+            outFile.write( match.toString()+"\n" )  
+        outFile.close()
+        
+    generateMatchFileWithNewPathId = staticmethod(generateMatchFileWithNewPathId)
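+    
+    # Usage sketch (illustrative only; "all_matches.tab" is a hypothetical
+    # concatenation of several 'Match' files, rewritten in place when no
+    # output file name is given):
+    #   MatchUtils.generateMatchFileWithNewPathId( "all_matches.tab" )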
+     
\ No newline at end of file