Mercurial > repos > urgi-team > teiso
comparison TEisotools-1.0/commons/core/coord/AlignUtils.py @ 6:20ec0d14798e draft
Uploaded
| author | urgi-team |
|---|---|
| date | Wed, 20 Jul 2016 05:00:24 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 5:4093a2fb58be | 6:20ec0d14798e |
|---|---|
| 1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
| 2 # http://www.inra.fr | |
| 3 # http://urgi.versailles.inra.fr | |
| 4 # | |
| 5 # This software is governed by the CeCILL license under French law and | |
| 6 # abiding by the rules of distribution of free software. You can use, | |
| 7 # modify and/ or redistribute the software under the terms of the CeCILL | |
| 8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
| 9 # "http://www.cecill.info". | |
| 10 # | |
| 11 # As a counterpart to the access to the source code and rights to copy, | |
| 12 # modify and redistribute granted by the license, users are provided only | |
| 13 # with a limited warranty and the software's author, the holder of the | |
| 14 # economic rights, and the successive licensors have only limited | |
| 15 # liability. | |
| 16 # | |
| 17 # In this respect, the user's attention is drawn to the risks associated | |
| 18 # with loading, using, modifying and/or developing or reproducing the | |
| 19 # software by the user in light of its specific status of free software, | |
| 20 # that may mean that it is complicated to manipulate, and that also | |
| 21 # therefore means that it is reserved for developers and experienced | |
| 22 # professionals having in-depth computer knowledge. Users are therefore | |
| 23 # encouraged to load and test the software's suitability as regards their | |
| 24 # requirements in conditions enabling the security of their systems and/or | |
| 25 # data to be ensured and, more generally, to use and operate it in the | |
| 26 # same conditions as regards security. | |
| 27 # | |
| 28 # The fact that you are presently reading this means that you have had | |
| 29 # knowledge of the CeCILL license and that you accept its terms. | |
| 30 | |
| 31 | |
import os
import shutil
import subprocess
import sys

from commons.core.coord.Align import Align
| 36 | |
| 37 | |
## Static methods manipulating Align instances
#
# Every method is a static method: the class is only used as a namespace and
# is never instantiated by the code below.
#
class AlignUtils(object):

    ## Return a list with Align instances from the given file
    #
    # @param inFile name of a file in the Align format
    # @return list of Align instances, one per line of the file, in file order
    #
    @staticmethod
    def getAlignListFromFile(inFile):
        lAlignInstances = []
        # 'with' guarantees the handle is closed even if a line fails to parse
        with open(inFile, "r") as inFileHandler:
            for line in inFileHandler:
                iAlign = Align()
                iAlign.setFromString(line)
                lAlignInstances.append(iAlign)
        return lAlignInstances

    ## Return a list with all the scores
    #
    # @param lAlignInstances: list of Align instances
    # @return list with the 'score' attribute of each instance, in input order
    #
    @staticmethod
    def getListOfScores(lAlignInstances):
        return [iAlign.score for iAlign in lAlignInstances]

    ## Return a list with all the scores from the given file
    #
    # The score is read as an integer from the eighth tab-separated column of
    # each line, without building Align instances.
    #
    # @param inFile name of a file in the Align format
    # @return list of integers, one per non-blank line
    #
    @staticmethod
    def getScoreListFromFile(inFile):
        lScores = []
        with open(inFile, "r") as inFileHandler:
            for line in inFileHandler:
                # skip blank lines, including whitespace-only and "\r\n" ones
                if not line.strip():
                    continue
                lScores.append(int(line.split("\t")[7]))
        return lScores

    ## For each line of a given Align file, write the coordinates on the query and the subject as two distinct lines in a Map file
    #
    # @param alignFile: name of the input Align file
    # @param mapFile: name of the output Map file (overwritten)
    #
    @staticmethod
    def convertAlignFileIntoMapFileWithQueriesAndSubjects(alignFile, mapFile):
        # a single Align instance is reused: each line overwrites its content
        iAlign = Align()
        with open(alignFile, "r") as alignFileHandler:
            with open(mapFile, "w") as mapFileHandler:
                for line in alignFileHandler:
                    iAlign.setFromString(line)
                    iMapQ, iMapS = iAlign.getMapsOfQueryAndSubject()
                    iMapQ.write(mapFileHandler)
                    iMapS.write(mapFileHandler)

    ## For each line of a given Align file, write the coordinates of the subject on the query as one line in a Map file
    #
    # @param alignFile: name of the input Align file
    # @param mapFile: name of the output Map file (overwritten)
    #
    @staticmethod
    def convertAlignFileIntoMapFileWithSubjectsOnQueries(alignFile, mapFile):
        iAlign = Align()
        with open(alignFile, "r") as alignFileHandler:
            with open(mapFile, "w") as mapFileHandler:
                for line in alignFileHandler:
                    iAlign.setFromString(line)
                    iMapQ = iAlign.getSubjectAsMapOfQuery()
                    iMapQ.write(mapFileHandler)

    ## Return a list of Align instances sorted in decreasing order according to their score, then their length on the query and finally their initial order
    #
    # @param lAligns: list of Align instances
    # @return new sorted list (the input list is not modified)
    #
    @staticmethod
    def getAlignListSortedByDecreasingScoreThenLength(lAligns):
        # Negated keys give a decreasing order. Unlike the inverse (1/x) they
        # are defined for a score or length of zero and keep negative values
        # after positive ones. 'sorted' is stable, so ties keep their initial
        # order.
        return sorted(lAligns,
                      key=lambda iAlign: (-float(iAlign.getScore()),
                                          -float(iAlign.getLengthOnQuery())))

    ## Convert an Align file into a Path file
    #
    # Each output line is the input alignment prefixed with a numeric
    # identifier: the 1-based rank of the line in the input file.
    #
    # @param alignFile string name of the input Align file
    # @param pathFile string name of the output Path file (overwritten)
    #
    @staticmethod
    def convertAlignFileIntoPathFile(alignFile, pathFile):
        iAlign = Align()
        with open(alignFile, "r") as alignFileHandler:
            with open(pathFile, "w") as pathFileHandler:
                for countAlign, line in enumerate(alignFileHandler, 1):
                    iAlign.setFromString(line, "\t")
                    pathFileHandler.write("%i\t%s\n" % (countAlign, iAlign.toString()))

    ## Sort an Align file
    #
    # Runs the external 'sort' program with the keys: column 1, column 4, then
    # columns 2, 3, 5, 6 and 8 compared numerically.
    #
    # @param inFile name of the input Align file
    # @param outFile name of the output file (default: inFile + ".sort")
    #
    # @note on failure of 'sort', an error is written on stderr and the
    #       process exits with the return code of 'sort'
    # @note failing to open outFile or to launch 'sort' raises an exception
    #
    @staticmethod
    def sortAlignFile(inFile, outFile=""):
        if outFile == "":
            outFile = "%s.sort" % (inFile)
        prg = "sort"
        # An argument list (no shell) keeps file names containing spaces or
        # shell metacharacters intact; "--" protects names starting with "-".
        lArgs = [prg,
                 "-k", "1,1", "-k", "4,4",
                 "-k", "2,2n", "-k", "3,3n",
                 "-k", "5,5n", "-k", "6,6n",
                 "-k", "8,8n",
                 "--", inFile]
        with open(outFile, "w") as outFileHandler:
            # subprocess.call returns the real return code, so that
            # sys.exit() below cannot silently wrap a failure around to 0
            exitStatus = subprocess.call(lArgs, stdout=outFileHandler)
        if exitStatus != 0:
            msg = "ERROR: '%s' returned '%i'" % (prg, exitStatus)
            sys.stderr.write("%s\n" % (msg))
            sys.exit(exitStatus)

    ## Write Align instances contained in the given list
    #
    # @param lAlign a list of Align instances
    # @param fileName name of the file to write the Align instances
    # @param mode the open mode of the file "w" or "a"
    #
    @staticmethod
    def writeListInFile(lAlign, fileName, mode="w"):
        with open(fileName, mode) as fileHandler:
            for iAlign in lAlign:
                iAlign.write(fileHandler)

    ## Split a list of Align instances according to the name of the query
    #
    # @param lInAlign list of align instances
    # @return lOutAlignList list of align instances lists, one list per query
    #         name, ordered by query name
    #
    @staticmethod
    def splitAlignListByQueryName(lInAlign):
        lSortedAlign = sorted(lInAlign, key=lambda o: o.range_query.seqname)
        lOutAlignList = []
        previousQuery = None
        for iAlign in lSortedAlign:
            currentQuery = iAlign.range_query.seqname
            # open a new group on the first instance and on each name change
            if not lOutAlignList or currentQuery != previousQuery:
                lOutAlignList.append([])
                previousQuery = currentQuery
            lOutAlignList[-1].append(iAlign)
        return lOutAlignList

    ## Create an Align file from each list of Align instances in the input list
    #
    # Files are named "<pattern>_<rank>.align" where the rank starts at 1 and
    # is zero-padded to the number of digits of the number of lists.
    #
    # @param lAlignList list of lists with Align instances
    # @param pattern string prefix of the file names
    # @param dirName string output directory, created if needed (default:
    #        current working directory)
    #
    @staticmethod
    def createAlignFiles(lAlignList, pattern, dirName=""):
        if dirName != "":
            try:
                os.makedirs(dirName)
            except OSError:
                # an already existing directory is fine, anything else is not
                if not os.path.isdir(dirName):
                    raise
        nbDigits = len(str(len(lAlignList)))
        for countFile, lAlign in enumerate(lAlignList, 1):
            fileName = "%s_%s.align" % (pattern, str(countFile).zfill(nbDigits))
            # joining paths avoids changing the working directory of the
            # whole process (os.path.join ignores an empty dirName)
            AlignUtils.writeListInFile(lAlign, os.path.join(dirName, fileName))

    ## Return a list with Align instances sorted by query name, subject name, query start, query end and score
    #
    # @param lAligns list of Align instances
    # @return new sorted list (the input list is not modified)
    #
    @staticmethod
    def sortList(lAligns):
        return sorted(lAligns,
                      key=lambda iAlign: (iAlign.getQueryName(),
                                          iAlign.getSubjectName(),
                                          iAlign.getQueryStart(),
                                          iAlign.getQueryEnd(),
                                          iAlign.getScore()))

    ## Return a list after merging all overlapping Align instances
    #
    # @param lAligns list of Align instances
    # @return list of merged Align instances
    #
    # @note the instances of the input list are modified in place by merge()
    #
    @staticmethod
    def mergeList(lAligns):
        lMerged = []
        lSorted = AlignUtils.sortList(lAligns)
        for index, iAlign in enumerate(lSorted):
            # first extend the current instance with every later instance
            # it overlaps with
            for iNext in lSorted[index + 1:]:
                if iAlign.isOverlapping(iNext):
                    iAlign.merge(iNext)
            # then fold it into every already kept instance it overlaps with
            isAlreadyInList = False
            for iKept in lMerged:
                if iKept.isOverlapping(iAlign):
                    isAlreadyInList = True
                    iKept.merge(iAlign)
            if not isAlreadyInList:
                lMerged.append(iAlign)
        return lMerged

    ## Merge all Align instance in a given Align file
    #
    # The input is first sorted into a temporary file (inFile + ".sorted"),
    # then consecutive lines sharing the same query and subject names are
    # merged together and appended to the output file.
    #
    # @param inFile name of the input Align file
    # @param outFile name of the output file (default: inFile + ".merged");
    #        an existing file is removed first, and no file is created when
    #        the input is empty
    #
    @staticmethod
    def mergeFile(inFile, outFile=""):
        if outFile == "":
            outFile = "%s.merged" % (inFile)
        if os.path.exists(outFile):
            os.remove(outFile)

        tmpFile = "%s.sorted" % (inFile)
        try:
            AlignUtils.sortAlignFile(inFile, tmpFile)

            lAlignsOfPair = []
            prevPairQrySbj = None
            with open(tmpFile, "r") as tmpF:
                for line in tmpF:
                    iAlign = Align()
                    iAlign.setFromString(line)
                    # a tuple key cannot confuse ("a", "b_c") with ("a_b", "c")
                    pairQrySbj = (iAlign.getQueryName(), iAlign.getSubjectName())
                    if pairQrySbj != prevPairQrySbj and lAlignsOfPair:
                        # the pair changed: flush the previous group
                        lMerged = AlignUtils.mergeList(lAlignsOfPair)
                        AlignUtils.writeListInFile(lMerged, outFile, "a")
                        lAlignsOfPair = []
                    prevPairQrySbj = pairQrySbj
                    lAlignsOfPair.append(iAlign)
            if lAlignsOfPair:
                lMerged = AlignUtils.mergeList(lAlignsOfPair)
                AlignUtils.writeListInFile(lMerged, outFile, "a")
        finally:
            # never leave the temporary sorted file behind
            if os.path.exists(tmpFile):
                os.remove(tmpFile)

    ## Update the scores of each match in the input file
    #
    # @param inFile name of the input Align file
    # @param outFile name of the output Align file (overwritten)
    #
    # @note the new score is the length on the query times the percentage of identity
    #
    @staticmethod
    def updateScoresInFile(inFile, outFile):
        iAlign = Align()
        with open(inFile, "r") as inHandler:
            with open(outFile, "w") as outHandler:
                for line in inHandler:
                    iAlign.reset()
                    iAlign.setFromString(line, "\t")
                    iAlign.updateScore()
                    iAlign.write(outHandler)
