Mercurial > repos > yufei-luo > s_mart
view commons/core/coord/AlignUtils.py @ 62:8c42a6d7ffd4
Added simple test BED file.
author | m-zytnicki |
---|---|
date | Mon, 19 Oct 2015 11:25:11 +0200 |
parents | 769e306b7933 |
children |
line wrap: on
line source
# Copyright INRA (Institut National de la Recherche Agronomique) # http://www.inra.fr # http://urgi.versailles.inra.fr # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. import os import sys import shutil from commons.core.coord.Align import Align ## Static methods manipulating Align instances # class AlignUtils( object ): ## Return a list with Align instances from the given file # # @param inFile name of a file in the Align format # def getAlignListFromFile( inFile ): lAlignInstances = [] inFileHandler = open( inFile, "r" ) while True: line = inFileHandler.readline() if line == "": break a = Align() a.setFromString( line ) lAlignInstances.append( a ) inFileHandler.close() return lAlignInstances getAlignListFromFile = staticmethod( getAlignListFromFile ) ## Return a list with all the scores # # @param lAlignInstances: list of Align instances # def getListOfScores( lAlignInstances ): lScores = [] for iAlign in lAlignInstances: lScores.append( iAlign.score ) return lScores getListOfScores = staticmethod( getListOfScores ) ## Return a list with all the scores from the given file # # @param inFile name of a file in the Align format # def getScoreListFromFile(inFile): lScores = [] append = lScores.append with open(inFile, "r") as inFileHandler: line = inFileHandler.readline() while line: if line != "\n": append(int(line.split('\t')[7])) line = inFileHandler.readline() return lScores getScoreListFromFile = staticmethod( getScoreListFromFile ) ## for each line of a given Align file, write the coordinates on the query and the subject as two distinct lines in a Map file # # @param alignFile: name of the input Align file # @param mapFile: name of the output Map file # def convertAlignFileIntoMapFileWithQueriesAndSubjects( alignFile, mapFile ): alignFileHandler = open( alignFile, "r" ) mapFileHandler = open( mapFile, "w" ) iAlign = Align() while True: line = alignFileHandler.readline() if line == "": break iAlign.setFromString( line ) iMapQ, iMapS = iAlign.getMapsOfQueryAndSubject() iMapQ.write( mapFileHandler ) iMapS.write( mapFileHandler ) alignFileHandler.close() mapFileHandler.close() convertAlignFileIntoMapFileWithQueriesAndSubjects = staticmethod( convertAlignFileIntoMapFileWithQueriesAndSubjects ) ## for each line of a given Align file, write the coordinates of the subject on the query as one line in a Map file # # @param alignFile: name of the input Align file # @param mapFile: name of the output Map file # def convertAlignFileIntoMapFileWithSubjectsOnQueries( alignFile, mapFile ): alignFileHandler = open( alignFile, "r" ) mapFileHandler = open( mapFile, "w" ) iAlign = Align() while True: line = alignFileHandler.readline() if line == "": break iAlign.setFromString( line ) iMapQ = iAlign.getSubjectAsMapOfQuery() iMapQ.write( mapFileHandler ) alignFileHandler.close() mapFileHandler.close() convertAlignFileIntoMapFileWithSubjectsOnQueries = staticmethod( convertAlignFileIntoMapFileWithSubjectsOnQueries ) ## return a list of Align instances sorted in decreasing order according to their score, then their length on the query and finally their initial order # # @param lAligns: list of Align instances # def getAlignListSortedByDecreasingScoreThenLength( lAligns ): return sorted( lAligns, key=lambda iAlign: ( 1 / float(iAlign.getScore()), 1 / float(iAlign.getLengthOnQuery()) ) ) getAlignListSortedByDecreasingScoreThenLength = staticmethod( getAlignListSortedByDecreasingScoreThenLength ) ## Convert an Align file into a Path file # # @param alignFile string name of the input Align file # @param pathFile string name of the output Path file # def convertAlignFileIntoPathFile( alignFile, pathFile ): alignFileHandler = open( alignFile, "r" ) pathFileHandler = open( pathFile, "w" ) iAlign = Align() countAlign = 0 while True: line = alignFileHandler.readline() if line == "": break countAlign += 1 iAlign.setFromString( line, "\t" ) pathFileHandler.write( "%i\t%s\n" % ( countAlign, iAlign.toString() ) ) alignFileHandler.close() pathFileHandler.close() convertAlignFileIntoPathFile = staticmethod( convertAlignFileIntoPathFile ) ## Sort an Align file # def sortAlignFile( inFile, outFile="" ): if outFile == "": outFile = "%s.sort" % ( inFile ) prg = "sort" cmd = prg cmd += " -k 1,1 -k 4,4 -k 2,2n -k 3,3n -k 5,5n -k 6,6n -k 8,8n" cmd += " %s" % ( inFile ) cmd += " > %s" % ( outFile ) exitStatus = os.system( cmd ) if exitStatus != 0: msg = "ERROR: '%s' returned '%i'" % ( prg, exitStatus ) sys.stderr.write( "%s\n" % ( msg ) ) sys.exit( exitStatus ) sortAlignFile = staticmethod( sortAlignFile ) ## Write Align instances contained in the given list # # @param lAlign a list of Align instances # @param fileName name of the file to write the Align instances # @param mode the open mode of the file ""w"" or ""a"" # def writeListInFile( lAlign, fileName, mode="w" ): fileHandler = open( fileName, mode ) for iAlign in lAlign: iAlign.write( fileHandler ) fileHandler.close() writeListInFile = staticmethod( writeListInFile ) ## Split a list of Align instances according to the name of the query # # @param lInAlign list of align instances # @return lOutAlignList list of align instances lists # def splitAlignListByQueryName( lInAlign ): lSortedAlign = sorted(lInAlign, key=lambda o: o.range_query.seqname) lOutAlignList = [] if len(lSortedAlign) != 0 : lAlignForCurrentQuery = [] previousQuery = lSortedAlign[0].range_query.seqname for align in lSortedAlign : currentQuery = align.range_query.seqname if previousQuery != currentQuery : lOutAlignList.append(lAlignForCurrentQuery) previousQuery = currentQuery lAlignForCurrentQuery = [] lAlignForCurrentQuery.append(align) lOutAlignList.append(lAlignForCurrentQuery) return lOutAlignList splitAlignListByQueryName = staticmethod( splitAlignListByQueryName ) ## Create an Align file from each list of Align instances in the input list # # @param lAlignList list of lists with Align instances # @param pattern string # @param dirName string # def createAlignFiles( lAlignList, pattern, dirName="" ): savedDir = os.getcwd() nbFiles = len(lAlignList) countFile = 1 if dirName != "" : try: os.makedirs(dirName) except: pass os.chdir(dirName) for lAlign in lAlignList: fileName = "%s_%s.align" % (pattern, str(countFile).zfill(len(str(nbFiles)))) AlignUtils.writeListInFile(lAlign, fileName) countFile += 1 os.chdir(savedDir) createAlignFiles = staticmethod( createAlignFiles ) ## Return a list with Align instances sorted by query name, subject name, query start, query end and score # def sortList( lAligns ): return sorted( lAligns, key=lambda iAlign: ( iAlign.getQueryName(), iAlign.getSubjectName(), iAlign.getQueryStart(), iAlign.getQueryEnd(), iAlign.getScore() ) ) sortList = staticmethod( sortList ) ## Return a list after merging all overlapping Align instances # def mergeList( lAligns ): lMerged = [] lSorted = AlignUtils.sortList( lAligns ) prev_count = 0 for iAlign in lSorted: if prev_count != len(lSorted): for i in lSorted[ prev_count + 1: ]: if iAlign.isOverlapping( i ): iAlign.merge( i ) IsAlreadyInList = False for newAlign in lMerged: if newAlign.isOverlapping( iAlign ): IsAlreadyInList = True newAlign.merge( iAlign ) lMerged [ lMerged.index( newAlign ) ] = newAlign if not IsAlreadyInList: lMerged.append( iAlign ) prev_count += 1 return lMerged mergeList = staticmethod( mergeList ) ## Merge all Align instance in a given Align file # def mergeFile( inFile, outFile="" ): if outFile == "": outFile = "%s.merged" % ( inFile ) if os.path.exists( outFile ): os.remove( outFile ) tmpFile = "%s.sorted" % ( inFile ) AlignUtils.sortAlignFile( inFile, tmpFile ) tmpF = open( tmpFile, "r" ) dQrySbj2Aligns = {} prevPairQrySbj = "" while True: line = tmpF.readline() if line == "": break iAlign = Align() iAlign.setFromString( line ) pairQrySbj = "%s_%s" % ( iAlign.getQueryName(), iAlign.getSubjectName() ) if not dQrySbj2Aligns.has_key( pairQrySbj ): if prevPairQrySbj != "": lMerged = AlignUtils.mergeList( dQrySbj2Aligns[ prevPairQrySbj ] ) AlignUtils.writeListInFile( lMerged, outFile, "a" ) del dQrySbj2Aligns[ prevPairQrySbj ] prevPairQrySbj = pairQrySbj else: prevPairQrySbj = pairQrySbj dQrySbj2Aligns[ pairQrySbj ] = [] dQrySbj2Aligns[ pairQrySbj ].append( iAlign ) lMerged = [] if len(dQrySbj2Aligns.keys()) > 0: lMerged = AlignUtils.mergeList( dQrySbj2Aligns[ prevPairQrySbj ] ) AlignUtils.writeListInFile( lMerged, outFile, "a" ) tmpF.close() os.remove( tmpFile ) mergeFile = staticmethod( mergeFile ) ## Update the scores of each match in the input file # # @note the new score is the length on the query times the percentage of identity # def updateScoresInFile( inFile, outFile ): inHandler = open( inFile, "r" ) outHandler = open( outFile, "w" ) iAlign = Align() while True: line = inHandler.readline() if line == "": break iAlign.reset() iAlign.setFromString( line, "\t" ) iAlign.updateScore() iAlign.write( outHandler ) inHandler.close() outHandler.close() updateScoresInFile = staticmethod( updateScoresInFile )