6
+ − 1 # Copyright INRA (Institut National de la Recherche Agronomique)
+ − 2 # http://www.inra.fr
+ − 3 # http://urgi.versailles.inra.fr
+ − 4 #
+ − 5 # This software is governed by the CeCILL license under French law and
+ − 6 # abiding by the rules of distribution of free software. You can use,
+ − 7 # modify and/ or redistribute the software under the terms of the CeCILL
+ − 8 # license as circulated by CEA, CNRS and INRIA at the following URL
+ − 9 # "http://www.cecill.info".
+ − 10 #
+ − 11 # As a counterpart to the access to the source code and rights to copy,
+ − 12 # modify and redistribute granted by the license, users are provided only
+ − 13 # with a limited warranty and the software's author, the holder of the
+ − 14 # economic rights, and the successive licensors have only limited
+ − 15 # liability.
+ − 16 #
+ − 17 # In this respect, the user's attention is drawn to the risks associated
+ − 18 # with loading, using, modifying and/or developing or reproducing the
+ − 19 # software by the user in light of its specific status of free software,
+ − 20 # that may mean that it is complicated to manipulate, and that also
+ − 21 # therefore means that it is reserved for developers and experienced
+ − 22 # professionals having in-depth computer knowledge. Users are therefore
+ − 23 # encouraged to load and test the software's suitability as regards their
+ − 24 # requirements in conditions enabling the security of their systems and/or
+ − 25 # data to be ensured and, more generally, to use and operate it in the
+ − 26 # same conditions as regards security.
+ − 27 #
+ − 28 # The fact that you are presently reading this means that you have had
+ − 29 # knowledge of the CeCILL license and that you accept its terms.
+ − 30
+ − 31
+ − 32 import os
+ − 33 import sys
+ − 34 import shutil
+ − 35 from commons.core.coord.Align import Align
+ − 36
+ − 37
## Static methods manipulating Align instances
#
# Every method is a static method: call it as AlignUtils.method( ... ).
# Files are always opened through 'with' so that their handlers are closed
# even when parsing or writing raises an exception.
#
class AlignUtils(object):

    ## Return a list with Align instances from the given file
    #
    # One Align instance is created per line of the file and filled with
    # Align.setFromString().
    #
    # @param inFile name of a file in the Align format
    # @return list of Align instances, in the order of the file
    #
    @staticmethod
    def getAlignListFromFile(inFile):
        lAlignInstances = []
        with open(inFile, "r") as inFileHandler:
            for line in inFileHandler:
                iAlign = Align()
                iAlign.setFromString(line)
                lAlignInstances.append(iAlign)
        return lAlignInstances


    ## Return a list with all the scores
    #
    # @param lAlignInstances: list of Align instances
    # @return list with the 'score' attribute of each instance, in the same order
    #
    @staticmethod
    def getListOfScores(lAlignInstances):
        return [iAlign.score for iAlign in lAlignInstances]


    ## Return a list with all the scores from the given file
    #
    # The score is read as an integer from the 8th tab-separated field of
    # each line. Lines made of a single newline are skipped.
    #
    # @param inFile name of a file in the Align format
    # @return list of integers, in the order of the file
    #
    @staticmethod
    def getScoreListFromFile(inFile):
        with open(inFile, "r") as inFileHandler:
            return [int(line.split("\t")[7])
                    for line in inFileHandler
                    if line != "\n"]


    ## For each line of a given Align file, write the coordinates on the query and the subject as two distinct lines in a Map file
    #
    # @param alignFile: name of the input Align file
    # @param mapFile: name of the output Map file (overwritten)
    #
    @staticmethod
    def convertAlignFileIntoMapFileWithQueriesAndSubjects(alignFile, mapFile):
        # a single Align instance is re-filled for every line
        iAlign = Align()
        with open(alignFile, "r") as alignFileHandler:
            with open(mapFile, "w") as mapFileHandler:
                for line in alignFileHandler:
                    iAlign.setFromString(line)
                    iMapQ, iMapS = iAlign.getMapsOfQueryAndSubject()
                    iMapQ.write(mapFileHandler)
                    iMapS.write(mapFileHandler)


    ## For each line of a given Align file, write the coordinates of the subject on the query as one line in a Map file
    #
    # @param alignFile: name of the input Align file
    # @param mapFile: name of the output Map file (overwritten)
    #
    @staticmethod
    def convertAlignFileIntoMapFileWithSubjectsOnQueries(alignFile, mapFile):
        # a single Align instance is re-filled for every line
        iAlign = Align()
        with open(alignFile, "r") as alignFileHandler:
            with open(mapFile, "w") as mapFileHandler:
                for line in alignFileHandler:
                    iAlign.setFromString(line)
                    iMapQ = iAlign.getSubjectAsMapOfQuery()
                    iMapQ.write(mapFileHandler)


    ## Return a list of Align instances sorted in decreasing order according to their score, then their length on the query and finally their initial order
    #
    # The sort key uses negated values rather than inverses, so that a score
    # or a length equal to zero does not raise ZeroDivisionError and negative
    # scores are placed after positive ones. The initial order is kept for
    # ties because sorted() is stable.
    #
    # @param lAligns: list of Align instances
    # @return new sorted list (the input list is left untouched)
    #
    @staticmethod
    def getAlignListSortedByDecreasingScoreThenLength(lAligns):
        return sorted(lAligns,
                      key=lambda iAlign: (-float(iAlign.getScore()),
                                          -float(iAlign.getLengthOnQuery())))


    ## Convert an Align file into a Path file
    #
    # Each output line is the 1-based rank of the input line, a tabulation,
    # then the string returned by Align.toString().
    #
    # @param alignFile string name of the input Align file
    # @param pathFile string name of the output Path file (overwritten)
    #
    @staticmethod
    def convertAlignFileIntoPathFile(alignFile, pathFile):
        # a single Align instance is re-filled for every line
        iAlign = Align()
        with open(alignFile, "r") as alignFileHandler:
            with open(pathFile, "w") as pathFileHandler:
                for countAlign, line in enumerate(alignFileHandler, 1):
                    iAlign.setFromString(line, "\t")
                    pathFileHandler.write("%i\t%s\n" % (countAlign, iAlign.toString()))


    ## Sort an Align file
    #
    # Runs the external 'sort' program with the keys: field 1, field 4, then
    # fields 2, 3, 5, 6 and 8 compared numerically. The program is launched
    # with an argument list (no shell), so file names containing spaces or
    # shell metacharacters are handled literally.
    #
    # @param inFile name of the input Align file
    # @param outFile name of the output file (default: inFile + ".sort")
    # @note on failure, a message is written on stderr and the interpreter
    #       exits with the return code of 'sort' (127 if it cannot be started)
    #
    @staticmethod
    def sortAlignFile(inFile, outFile=""):
        # local import: 'subprocess' is not among the imports of this file
        import subprocess

        if outFile == "":
            outFile = "%s.sort" % (inFile)
        prg = "sort"
        lArgs = [prg,
                 "-k", "1,1", "-k", "4,4",
                 "-k", "2,2n", "-k", "3,3n",
                 "-k", "5,5n", "-k", "6,6n",
                 "-k", "8,8n",
                 inFile]
        with open(outFile, "w") as outFileHandler:
            try:
                exitStatus = subprocess.call(lArgs, stdout=outFileHandler)
            except OSError:
                # the program could not be launched; 127 is what a shell reports
                exitStatus = 127
        if exitStatus != 0:
            msg = "ERROR: '%s' returned '%i'" % (prg, exitStatus)
            sys.stderr.write("%s\n" % (msg))
            sys.exit(exitStatus)


    ## Write Align instances contained in the given list
    #
    # @param lAlign a list of Align instances
    # @param fileName name of the file to write the Align instances
    # @param mode the open mode of the file "w" or "a"
    #
    @staticmethod
    def writeListInFile(lAlign, fileName, mode="w"):
        with open(fileName, mode) as fileHandler:
            for iAlign in lAlign:
                iAlign.write(fileHandler)


    ## Split a list of Align instances according to the name of the query
    #
    # The instances are first sorted on 'range_query.seqname'; each run of
    # identical names then becomes one sub-list.
    #
    # @param lInAlign list of align instances
    # @return lOutAlignList list of align instances lists (empty if the input is empty)
    #
    @staticmethod
    def splitAlignListByQueryName(lInAlign):
        lSortedAlign = sorted(lInAlign, key=lambda iAlign: iAlign.range_query.seqname)
        lOutAlignList = []
        previousQuery = None
        for iAlign in lSortedAlign:
            currentQuery = iAlign.range_query.seqname
            # open a new group for the first instance and at each name change
            if not lOutAlignList or currentQuery != previousQuery:
                lOutAlignList.append([])
                previousQuery = currentQuery
            lOutAlignList[-1].append(iAlign)
        return lOutAlignList


    ## Create an Align file from each list of Align instances in the input list
    #
    # Files are named "<pattern>_<rank>.align", the rank starting at 1 and
    # being zero-padded to the number of digits of the number of lists.
    # The current working directory is never changed.
    #
    # @param lAlignList list of lists with Align instances
    # @param pattern string prefix of the file names
    # @param dirName string directory in which the files are written; it is
    #        created if needed (default: current directory)
    #
    @staticmethod
    def createAlignFiles(lAlignList, pattern, dirName=""):
        if dirName != "" and not os.path.isdir(dirName):
            try:
                os.makedirs(dirName)
            except OSError:
                # only tolerate the case where the directory appeared meanwhile
                if not os.path.isdir(dirName):
                    raise

        nbDigits = len(str(len(lAlignList)))
        for countFile, lAlign in enumerate(lAlignList, 1):
            fileName = "%s_%s.align" % (pattern, str(countFile).zfill(nbDigits))
            if dirName != "":
                fileName = os.path.join(dirName, fileName)
            AlignUtils.writeListInFile(lAlign, fileName)


    ## Return a list with Align instances sorted by query name, subject name, query start, query end and score
    #
    # @param lAligns list of Align instances
    # @return new sorted list (the input list is left untouched)
    #
    @staticmethod
    def sortList(lAligns):
        return sorted(lAligns,
                      key=lambda iAlign: (iAlign.getQueryName(),
                                          iAlign.getSubjectName(),
                                          iAlign.getQueryStart(),
                                          iAlign.getQueryEnd(),
                                          iAlign.getScore()))


    ## Return a list after merging all overlapping Align instances
    #
    # @param lAligns list of Align instances
    # @return list of the instances kept after merging
    # @note merge() is called on the instances of the input list themselves
    #       (presumably modifying them in place -- to be confirmed in Align)
    #
    @staticmethod
    def mergeList(lAligns):
        lMerged = []
        lSorted = AlignUtils.sortList(lAligns)

        for index, iAlign in enumerate(lSorted):
            # absorb every overlapping instance located after this one
            for iOther in lSorted[index + 1:]:
                if iAlign.isOverlapping(iOther):
                    iAlign.merge(iOther)

            # then absorb it into the already kept instances it overlaps with
            isAlreadyInList = False
            for newAlign in lMerged:
                if newAlign.isOverlapping(iAlign):
                    isAlreadyInList = True
                    newAlign.merge(iAlign)
                    # kept from the historical implementation: index() relies
                    # on the equality defined by Align
                    lMerged[lMerged.index(newAlign)] = newAlign
            if not isAlreadyInList:
                lMerged.append(iAlign)

        return lMerged


    ## Merge all Align instance in a given Align file
    #
    # The input file is first sorted into a temporary file (inFile + ".sorted"),
    # then consecutive lines sharing the same query and subject names are
    # merged with mergeList() and appended to the output file. The temporary
    # file is removed in all cases.
    #
    # @param inFile name of the input Align file
    # @param outFile name of the output file (default: inFile + ".merged");
    #        removed first if it already exists
    #
    @staticmethod
    def mergeFile(inFile, outFile=""):
        if outFile == "":
            outFile = "%s.merged" % (inFile)
        if os.path.exists(outFile):
            os.remove(outFile)

        tmpFile = "%s.sorted" % (inFile)
        try:
            AlignUtils.sortAlignFile(inFile, tmpFile)

            # a (query, subject) tuple cannot collide, unlike a joined string
            prevPairQrySbj = None
            lAlignsOfPair = []
            with open(tmpFile, "r") as tmpF:
                for line in tmpF:
                    iAlign = Align()
                    iAlign.setFromString(line)
                    pairQrySbj = (iAlign.getQueryName(), iAlign.getSubjectName())
                    if pairQrySbj != prevPairQrySbj:
                        # the previous group is complete: merge and flush it
                        if lAlignsOfPair:
                            lMerged = AlignUtils.mergeList(lAlignsOfPair)
                            AlignUtils.writeListInFile(lMerged, outFile, "a")
                        prevPairQrySbj = pairQrySbj
                        lAlignsOfPair = []
                    lAlignsOfPair.append(iAlign)
            if lAlignsOfPair:
                lMerged = AlignUtils.mergeList(lAlignsOfPair)
                AlignUtils.writeListInFile(lMerged, outFile, "a")
        finally:
            if os.path.exists(tmpFile):
                os.remove(tmpFile)


    ## Update the scores of each match in the input file
    #
    # Each line is loaded into an Align instance, Align.updateScore() is
    # called, and the instance is written to the output file.
    #
    # @param inFile name of the input Align file
    # @param outFile name of the output Align file (overwritten)
    # @note the new score is the length on the query times the percentage of identity
    #
    @staticmethod
    def updateScoresInFile(inFile, outFile):
        iAlign = Align()
        with open(inFile, "r") as inHandler:
            with open(outFile, "w") as outHandler:
                for line in inHandler:
                    iAlign.reset()
                    iAlign.setFromString(line, "\t")
                    iAlign.updateScore()
                    iAlign.write(outHandler)