Mercurial > repos > yufei-luo > s_mart
comparison commons/core/seq/FastaUtils.py @ 6:769e306b7933
Change the repository level.
| author | yufei-luo |
|---|---|
| date | Fri, 18 Jan 2013 04:54:14 -0500 |
| parents | |
| children | 94ab73e8a190 |
comparison
equal
deleted
inserted
replaced
| 5:ea3082881bf8 | 6:769e306b7933 |
|---|---|
| 1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
| 2 # http://www.inra.fr | |
| 3 # http://urgi.versailles.inra.fr | |
| 4 # | |
| 5 # This software is governed by the CeCILL license under French law and | |
| 6 # abiding by the rules of distribution of free software. You can use, | |
| 7 # modify and/ or redistribute the software under the terms of the CeCILL | |
| 8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
| 9 # "http://www.cecill.info". | |
| 10 # | |
| 11 # As a counterpart to the access to the source code and rights to copy, | |
| 12 # modify and redistribute granted by the license, users are provided only | |
| 13 # with a limited warranty and the software's author, the holder of the | |
| 14 # economic rights, and the successive licensors have only limited | |
| 15 # liability. | |
| 16 # | |
| 17 # In this respect, the user's attention is drawn to the risks associated | |
| 18 # with loading, using, modifying and/or developing or reproducing the | |
| 19 # software by the user in light of its specific status of free software, | |
| 20 # that may mean that it is complicated to manipulate, and that also | |
| 21 # therefore means that it is reserved for developers and experienced | |
| 22 # professionals having in-depth computer knowledge. Users are therefore | |
| 23 # encouraged to load and test the software's suitability as regards their | |
| 24 # requirements in conditions enabling the security of their systems and/or | |
| 25 # data to be ensured and, more generally, to use and operate it in the | |
| 26 # same conditions as regards security. | |
| 27 # | |
| 28 # The fact that you are presently reading this means that you have had | |
| 29 # knowledge of the CeCILL license and that you accept its terms. | |
| 30 | |
| 31 | |
| 32 import os | |
| 33 import sys | |
| 34 import string | |
| 35 import math | |
| 36 import shutil | |
| 37 import re | |
| 38 import glob | |
| 39 from commons.core.seq.BioseqDB import BioseqDB | |
| 40 from commons.core.seq.Bioseq import Bioseq | |
| 41 from commons.core.coord.MapUtils import MapUtils | |
| 42 from commons.core.coord.Range import Range | |
| 43 from commons.core.checker.CheckerUtils import CheckerUtils | |
| 44 from commons.core.launcher.LauncherUtils import LauncherUtils | |
| 45 from commons.core.coord.ConvCoord import ConvCoord | |
| 46 from commons.core.parsing.FastaParser import FastaParser | |
| 47 | |
| 48 | |
| 49 ## Static methods for fasta file manipulation | |
| 50 # | |
| 51 class FastaUtils( object ): | |
| 52 | |
| 53 ## Count the number of sequences in the input fasta file | |
| 54 # | |
| 55 # @param inFile name of the input fasta file | |
| 56 # | |
| 57 # @return integer number of sequences in the input fasta file | |
| 58 # | |
| 59 @staticmethod | |
| 60 def dbSize( inFile ): | |
| 61 nbSeq = 0 | |
| 62 inFileHandler = open( inFile, "r" ) | |
| 63 line = inFileHandler.readline() | |
| 64 while line: | |
| 65 if line[0] == ">": | |
| 66 nbSeq = nbSeq + 1 | |
| 67 line = inFileHandler.readline() | |
| 68 inFileHandler.close() | |
| 69 | |
| 70 return nbSeq | |
| 71 | |
| 72 | |
| 73 ## Compute the cumulative sequence length in the input fasta file | |
| 74 # | |
| 75 # @param inFile handler of the input fasta file | |
| 76 # | |
| 77 @staticmethod | |
| 78 def dbCumLength( inFile ): | |
| 79 cumLength = 0 | |
| 80 line = inFile.readline() | |
| 81 while line: | |
| 82 if line[0] != ">": | |
| 83 cumLength += len(string.rstrip(line)) | |
| 84 line = inFile.readline() | |
| 85 | |
| 86 return cumLength | |
| 87 | |
| 88 | |
| 89 ## Return a list with the length of each sequence in the input fasta file | |
| 90 # | |
| 91 # @param inFile string name of the input fasta file | |
| 92 # | |
| 93 @staticmethod | |
| 94 def dbLengths( inFile ): | |
| 95 lLengths = [] | |
| 96 inFileHandler = open( inFile, "r" ) | |
| 97 currentLength = 0 | |
| 98 line = inFileHandler.readline() | |
| 99 while line: | |
| 100 if line[0] == ">": | |
| 101 if currentLength != 0: | |
| 102 lLengths.append( currentLength ) | |
| 103 currentLength = 0 | |
| 104 else: | |
| 105 currentLength += len(line[:-1]) | |
| 106 line = inFileHandler.readline() | |
| 107 lLengths.append( currentLength ) | |
| 108 inFileHandler.close() | |
| 109 return lLengths | |
| 110 | |
| 111 | |
| 112 ## Retrieve the sequence headers present in the input fasta file | |
| 113 # | |
| 114 # @param inFile string name of the input fasta file | |
| 115 # @param verbose integer level of verbosity | |
| 116 # | |
| 117 # @return list of sequence headers | |
| 118 # | |
| 119 @staticmethod | |
| 120 def dbHeaders( inFile, verbose=0 ): | |
| 121 lHeaders = [] | |
| 122 | |
| 123 inFileHandler = open( inFile, "r" ) | |
| 124 line = inFileHandler.readline() | |
| 125 while line: | |
| 126 if line[0] == ">": | |
| 127 lHeaders.append( string.rstrip(line[1:]) ) | |
| 128 if verbose > 0: | |
| 129 print string.rstrip(line[1:]) | |
| 130 line = inFileHandler.readline() | |
| 131 inFileHandler.close() | |
| 132 | |
| 133 return lHeaders | |
| 134 | |
| 135 | |
| 136 ## Cut a data bank into chunks according to the input parameters | |
| 137 # If a sequence is shorter than the threshold, it is only renamed (not cut) | |
| 138 # | |
| 139 # @param inFileName string name of the input fasta file | |
| 140 # @param chkLgth string chunk length (in bp, default=200000) | |
| 141 # @param chkOver string chunk overlap (in bp, default=10000) | |
| 142 # @param wordN string N stretch word length (default=11, 0 for no detection) | |
| 143 # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map') | |
| 144 # @param clean boolean remove 'cut' and 'Nstretch' files | |
| 145 # @param verbose integer (default = 0) | |
| 146 # | |
| 147 @staticmethod | |
| 148 def dbChunks( inFileName, chkLgth="200000", chkOver="10000", wordN="11", outFilePrefix="", clean=False, verbose=0 ): | |
| 149 nbSeq = FastaUtils.dbSize( inFileName ) | |
| 150 if verbose > 0: | |
| 151 print "cut the %i input sequences with cutterDB..." % ( nbSeq ) | |
| 152 sys.stdout.flush() | |
| 153 | |
| 154 prg = "cutterDB" | |
| 155 cmd = prg | |
| 156 cmd += " -l %s" % ( chkLgth ) | |
| 157 cmd += " -o %s" %( chkOver ) | |
| 158 cmd += " -w %s" % ( wordN ) | |
| 159 cmd += " %s" % ( inFileName ) | |
| 160 returnStatus = os.system( cmd ) | |
| 161 if returnStatus != 0: | |
| 162 msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus ) | |
| 163 sys.stderr.write( "%s\n" % ( msg ) ) | |
| 164 sys.exit(1) | |
| 165 | |
| 166 nbChunks = FastaUtils.dbSize( "%s_cut" % ( inFileName ) ) | |
| 167 if verbose > 0: | |
| 168 print "done (%i chunks)" % ( nbChunks ) | |
| 169 sys.stdout.flush() | |
| 170 | |
| 171 if verbose > 0: | |
| 172 print "rename the headers..." | |
| 173 sys.stdout.flush() | |
| 174 | |
| 175 if outFilePrefix == "": | |
| 176 outFastaName = inFileName + "_chunks.fa" | |
| 177 outMapName = inFileName + "_chunks.map" | |
| 178 else: | |
| 179 outFastaName = outFilePrefix + ".fa" | |
| 180 outMapName = outFilePrefix + ".map" | |
| 181 | |
| 182 inFile = open( "%s_cut" % ( inFileName ), "r" ) | |
| 183 line = inFile.readline() | |
| 184 | |
| 185 outFasta = open( outFastaName, "w" ) | |
| 186 outMap = open( outMapName, "w" ) | |
| 187 | |
| 188 # read line after line (no need for big RAM) and change the sequence headers | |
| 189 while line: | |
| 190 | |
| 191 if line[0] == ">": | |
| 192 if verbose > 1: | |
| 193 print "rename '%s'" % ( line[:-1] ); sys.stdout.flush() | |
| 194 data = line[:-1].split(" ") | |
| 195 seqID = data[0].split(">")[1] | |
| 196 newHeader = "chunk%s" % ( str(seqID).zfill( len(str(nbChunks)) ) ) | |
| 197 oldHeader = data[2] | |
| 198 seqStart = data[4].split("..")[0] | |
| 199 seqEnd = data[4].split("..")[1] | |
| 200 outMap.write( "%s\t%s\t%s\t%s\n" % ( newHeader, oldHeader, seqStart, seqEnd ) ) | |
| 201 outFasta.write( ">%s\n" % ( newHeader ) ) | |
| 202 | |
| 203 else: | |
| 204 outFasta.write( line.upper() ) | |
| 205 | |
| 206 line = inFile.readline() | |
| 207 | |
| 208 inFile.close() | |
| 209 outFasta.close() | |
| 210 outMap.close() | |
| 211 | |
| 212 if clean == True: | |
| 213 os.remove(inFileName + "_cut") | |
| 214 os.remove(inFileName + ".Nstretch.map") | |
| 215 | |
| 216 | |
| 217 ## Split the input fasta file in several output files | |
| 218 # | |
| 219 # @param inFile string name of the input fasta file | |
| 220 # @param nbSeqPerBatch integer number of sequences per output file | |
| 221 # @param newDir boolean put the sequences in a new directory called 'batches' | |
| 222 # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1') | |
| 223 # @param prefix prefix in output file name | |
| 224 # @param verbose integer verbosity level (default = 0) | |
| 225 # | |
| 226 @staticmethod | |
| 227 def dbSplit( inFile, nbSeqPerBatch, newDir, useSeqHeader=False, prefix="batch", verbose=0 ): | |
| 228 if not os.path.exists( inFile ): | |
| 229 msg = "ERROR: file '%s' doesn't exist" % ( inFile ) | |
| 230 sys.stderr.write( "%s\n" % ( msg ) ) | |
| 231 sys.exit(1) | |
| 232 | |
| 233 nbSeq = FastaUtils.dbSize( inFile ) | |
| 234 | |
| 235 nbBatches = int( math.ceil( nbSeq / float(nbSeqPerBatch) ) ) | |
| 236 if verbose > 0: | |
| 237 print "save the %i input sequences into %i batches" % ( nbSeq, nbBatches ) | |
| 238 sys.stdout.flush() | |
| 239 | |
| 240 if nbSeqPerBatch > 1 and useSeqHeader: | |
| 241 useSeqHeader = False | |
| 242 | |
| 243 if newDir == True: | |
| 244 if os.path.exists( "batches" ): | |
| 245 shutil.rmtree( "batches" ) | |
| 246 os.mkdir( "batches" ) | |
| 247 os.chdir( "batches" ) | |
| 248 os.system( "ln -s ../%s ." % ( inFile ) ) | |
| 249 | |
| 250 inFileHandler = open( inFile, "r" ) | |
| 251 inFileHandler.seek( 0, 0 ) | |
| 252 countBatch = 0 | |
| 253 countSeq = 0 | |
| 254 line = inFileHandler.readline() | |
| 255 while line: | |
| 256 if line == "": | |
| 257 break | |
| 258 if line[0] == ">": | |
| 259 countSeq += 1 | |
| 260 if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1: | |
| 261 if "outFile" in locals(): | |
| 262 outFile.close() | |
| 263 countBatch += 1 | |
| 264 if nbSeqPerBatch == 1 and useSeqHeader: | |
| 265 outFileName = "%s.fa" % ( line[1:-1].replace(" ","_") ) | |
| 266 else: | |
| 267 outFileName = "%s_%s.fa" % ( prefix, str(countBatch).zfill( len(str(nbBatches)) ) ) | |
| 268 outFile = open( outFileName, "w" ) | |
| 269 if verbose > 1: | |
| 270 print "saving seq '%s' in file '%s'..." % ( line[1:40][:-1], outFileName ) | |
| 271 sys.stdout.flush() | |
| 272 outFile.write( line ) | |
| 273 line = inFileHandler.readline() | |
| 274 inFileHandler.close() | |
| 275 | |
| 276 if newDir == True: | |
| 277 os.remove( os.path.basename( inFile ) ) | |
| 278 os.chdir( ".." ) | |
| 279 | |
| 280 | |
| 281 ## Split the input fasta file in several output files | |
| 282 # | |
| 283 # @param inFileName string name of the input fasta file | |
| 284 # @param maxSize integer max cumulative length for each output file | |
| 285 # | |
| 286 @staticmethod | |
| 287 def splitFastaFileInBatches(inFileName, maxSize = 200000): | |
| 288 iBioseqDB = BioseqDB(inFileName) | |
| 289 lHeadersSizeTuples = [] | |
| 290 for iBioseq in iBioseqDB.db: | |
| 291 lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength())) | |
| 292 | |
| 293 lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize) | |
| 294 os.mkdir("batches") | |
| 295 os.chdir("batches") | |
| 296 | |
| 297 iterator = 0 | |
| 298 for lHeader in lHeadersList : | |
| 299 iterator += 1 | |
| 300 with open("batch_%s.fa" % iterator, 'w') as f : | |
| 301 for header in lHeader : | |
| 302 iBioseq = iBioseqDB.fetch(header) | |
| 303 iBioseq.write(f) | |
| 304 os.chdir("..") | |
| 305 | |
| 306 | |
| 307 ## Split the input fasta file in several output files according to their cluster identifier | |
| 308 # | |
| 309 # @param inFileName string name of the input fasta file | |
| 310 # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust) | |
| 311 # @param simplifyHeader boolean simplify the headers | |
| 312 # @param createDir boolean put the sequences in different directories | |
| 313 # @param outPrefix string prefix of the output files (default='seqCluster') | |
| 314 # @param verbose integer (default = 0) | |
| 315 # | |
| 316 @staticmethod | |
| 317 def splitSeqPerCluster( inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix="seqCluster", verbose=0 ): | |
| 318 if not os.path.exists( inFileName ): | |
| 319 print "ERROR: %s doesn't exist" % ( inFileName ) | |
| 320 sys.exit(1) | |
| 321 | |
| 322 inFile = open( inFileName, "r" ) | |
| 323 | |
| 324 line = inFile.readline() | |
| 325 if line: | |
| 326 name = line.split(" ")[0] | |
| 327 if "Cluster" in name: | |
| 328 clusterID = name.split("Cluster")[1].split("Mb")[0] | |
| 329 seqID = name.split("Mb")[1] | |
| 330 else: | |
| 331 clusterID = name.split("Cl")[0].split("Gr")[1] # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust | |
| 332 if "Q" in name.split("Gr")[0]: | |
| 333 seqID = name.split("Gr")[0].split("MbQ")[1] | |
| 334 elif "S" in name: | |
| 335 seqID = name.split("Gr")[0].split("MbS")[1] | |
| 336 sClusterIDs = set( [ clusterID ] ) | |
| 337 if simplifyHeader == True: | |
| 338 header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID ) | |
| 339 else: | |
| 340 header = line[1:-1] | |
| 341 if createDir == True: | |
| 342 if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ): | |
| 343 os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) | |
| 344 os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) | |
| 345 outFileName = "%s%s.fa" % ( outPrefix, clusterID ) | |
| 346 outFile = open( outFileName, "w" ) | |
| 347 outFile.write( ">%s\n" % ( header ) ) | |
| 348 prevClusterID = clusterID | |
| 349 | |
| 350 line = inFile.readline() | |
| 351 while line: | |
| 352 if line[0] == ">": | |
| 353 name = line.split(" ")[0] | |
| 354 if "Cluster" in name: | |
| 355 clusterID = name.split("Cluster")[1].split("Mb")[0] | |
| 356 seqID = name.split("Mb")[1] | |
| 357 else: | |
| 358 clusterID = name.split("Cl")[0].split("Gr")[1] | |
| 359 if "Q" in name.split("Gr")[0]: | |
| 360 seqID = name.split("Gr")[0].split("MbQ")[1] | |
| 361 elif "S" in name: | |
| 362 seqID = name.split("Gr")[0].split("MbS")[1] | |
| 363 | |
| 364 if clusterID != prevClusterID: | |
| 365 outFile.close() | |
| 366 | |
| 367 if simplifyHeader == True: | |
| 368 header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID ) | |
| 369 else: | |
| 370 header = line[1:-1] | |
| 371 | |
| 372 if createDir == True: | |
| 373 os.chdir( ".." ) | |
| 374 if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ): | |
| 375 os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) | |
| 376 os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) | |
| 377 | |
| 378 outFileName = "%s%s.fa" % ( outPrefix, clusterID ) | |
| 379 if not os.path.exists( outFileName ): | |
| 380 outFile = open( outFileName, "w" ) | |
| 381 else: | |
| 382 if clusterID != prevClusterID: | |
| 383 outFile.close() | |
| 384 outFile = open( outFileName, "a" ) | |
| 385 outFile.write( ">%s\n" % ( header ) ) | |
| 386 prevClusterID = clusterID | |
| 387 sClusterIDs.add( clusterID ) | |
| 388 | |
| 389 else: | |
| 390 outFile.write( line ) | |
| 391 | |
| 392 line = inFile.readline() | |
| 393 | |
| 394 outFile.close() | |
| 395 if verbose > 0: | |
| 396 print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush() | |
| 397 | |
| 398 if createDir == True: | |
| 399 os.chdir("..") | |
| 400 else: | |
| 401 print "WARNING: empty input file - no cluster found"; sys.stdout.flush() | |
| 402 | |
| 403 | |
| 404 ## Filter a fasta file in two fasta files using the length of each sequence as a criteron | |
| 405 # | |
| 406 # @param len_min integer length sequence criterion to filter | |
| 407 # @param inFileName string name of the input fasta file | |
| 408 # @param verbose integer (default = 0) | |
| 409 # | |
| 410 @staticmethod | |
| 411 def dbLengthFilter( len_min, inFileName, verbose=0 ): | |
| 412 file_db = open( inFileName, "r" ) | |
| 413 file_dbInf = open( inFileName+".Inf"+str(len_min), "w" ) | |
| 414 file_dbSup = open( inFileName+".Sup"+str(len_min), "w" ) | |
| 415 seq = Bioseq() | |
| 416 numseq = 0 | |
| 417 nbsave = 0 | |
| 418 | |
| 419 seq.read( file_db ) | |
| 420 while seq.sequence: | |
| 421 l = seq.getLength() | |
| 422 numseq = numseq + 1 | |
| 423 if l >= len_min: | |
| 424 seq.write( file_dbSup ) | |
| 425 if verbose > 0: | |
| 426 print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Sup !!' | |
| 427 nbsave=nbsave+1 | |
| 428 else: | |
| 429 seq.write( file_dbInf ) | |
| 430 if verbose > 0: | |
| 431 print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Inf !!' | |
| 432 nbsave=nbsave+1 | |
| 433 seq.read( file_db ) | |
| 434 | |
| 435 file_db.close() | |
| 436 file_dbInf.close() | |
| 437 file_dbSup.close() | |
| 438 if verbose > 0: | |
| 439 print nbsave,'saved sequences in ',inFileName+".Inf"+str(len_min)," and ", inFileName+".Sup"+str(len_min) | |
| 440 | |
| 441 | |
| 442 ## Extract the longest sequences from a fasta file | |
| 443 # | |
| 444 # @param num integer maximum number of sequences in the output file | |
| 445 # @param inFileName string name of the input fasta file | |
| 446 # @param outFileName string name of the output fasta file | |
| 447 # @param minThresh integer minimum length threshold (default=0) | |
| 448 # @param verbose integer (default = 0) | |
| 449 # | |
| 450 @staticmethod | |
| 451 def dbLongestSequences( num, inFileName, outFileName="", verbose=0, minThresh=0 ): | |
| 452 bsDB = BioseqDB( inFileName ) | |
| 453 if verbose > 0: | |
| 454 print "nb of input sequences: %i" % ( bsDB.getSize() ) | |
| 455 | |
| 456 if outFileName == "": | |
| 457 outFileName = inFileName + ".best" + str(num) | |
| 458 outFile = open( outFileName, "w" ) | |
| 459 | |
| 460 if bsDB.getSize()==0: | |
| 461 return 0 | |
| 462 | |
| 463 num = int(num) | |
| 464 if verbose > 0: | |
| 465 print "keep the %i longest sequences" % ( num ) | |
| 466 if minThresh > 0: | |
| 467 print "with length > %i bp" % ( minThresh ) | |
| 468 sys.stdout.flush() | |
| 469 | |
| 470 # retrieve the length of each input sequence | |
| 471 tmpLSeqLgth = [] | |
| 472 seqNum = 0 | |
| 473 for bs in bsDB.db: | |
| 474 seqNum += 1 | |
| 475 tmpLSeqLgth.append( bs.getLength() ) | |
| 476 if verbose > 1: | |
| 477 print "%d seq %s : %d bp" % ( seqNum, bs.header[0:40], bs.getLength() ) | |
| 478 sys.stdout.flush() | |
| 479 | |
| 480 # sort the lengths | |
| 481 tmpLSeqLgth.sort() | |
| 482 tmpLSeqLgth.reverse() | |
| 483 | |
| 484 # select the longest | |
| 485 lSeqLgth = [] | |
| 486 for i in xrange( 0, min(num,len(tmpLSeqLgth)) ): | |
| 487 if tmpLSeqLgth[i] >= minThresh: | |
| 488 lSeqLgth.append( tmpLSeqLgth[i] ) | |
| 489 if verbose > 0: | |
| 490 print "selected max length: %i" % ( max(lSeqLgth) ) | |
| 491 print "selected min length: %i" % ( min(lSeqLgth) ) | |
| 492 sys.stdout.flush() | |
| 493 | |
| 494 # save the longest | |
| 495 inFile = open( inFileName ) | |
| 496 seqNum = 0 | |
| 497 nbSave = 0 | |
| 498 for bs in bsDB.db: | |
| 499 seqNum += 1 | |
| 500 if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh: | |
| 501 bs.write( outFile ) | |
| 502 if verbose > 1: | |
| 503 print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] ) | |
| 504 sys.stdout.flush() | |
| 505 nbSave += 1 | |
| 506 if nbSave == num: | |
| 507 break | |
| 508 inFile.close() | |
| 509 outFile.close() | |
| 510 if verbose > 0: | |
| 511 print nbSave, "saved sequences in ", outFileName | |
| 512 sys.stdout.flush() | |
| 513 | |
| 514 return 0 | |
| 515 | |
| 516 | |
| 517 ## Extract all the sequence headers from a fasta file and write them in a new fasta file | |
| 518 # | |
| 519 # @param inFileName string name of the input fasta file | |
| 520 # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers') | |
| 521 # | |
| 522 @staticmethod | |
| 523 def dbExtractSeqHeaders( inFileName, outFileName="" ): | |
| 524 lHeaders = FastaUtils.dbHeaders( inFileName ) | |
| 525 | |
| 526 if outFileName == "": | |
| 527 outFileName = inFileName + ".headers" | |
| 528 | |
| 529 outFile = open( outFileName, "w" ) | |
| 530 for i in lHeaders: | |
| 531 outFile.write( i + "\n" ) | |
| 532 outFile.close() | |
| 533 | |
| 534 return 0 | |
| 535 | |
| 536 | |
| 537 ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file | |
| 538 # | |
| 539 # @param pattern regular expression to search in headers | |
| 540 # @param inFileName string name of the input fasta file | |
| 541 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') | |
| 542 # @param verbose integer verbosity level (default = 0) | |
| 543 # | |
| 544 @staticmethod | |
| 545 def dbExtractByPattern( pattern, inFileName, outFileName="", verbose=0 ): | |
| 546 if pattern == "": | |
| 547 return | |
| 548 | |
| 549 if outFileName == "": | |
| 550 outFileName = inFileName + '.extracted' | |
| 551 outFile = open( outFileName, 'w' ) | |
| 552 | |
| 553 patternTosearch = re.compile( pattern ) | |
| 554 bioseq = Bioseq() | |
| 555 bioseqNb = 0 | |
| 556 savedBioseqNb = 0 | |
| 557 inFile = open( inFileName, "r" ) | |
| 558 bioseq.read( inFile ) | |
| 559 while bioseq.sequence: | |
| 560 bioseqNb = bioseqNb + 1 | |
| 561 m = patternTosearch.search( bioseq.header ) | |
| 562 if m: | |
| 563 bioseq.write( outFile ) | |
| 564 if verbose > 1: | |
| 565 print 'sequence num',bioseqNb,'matched on',m.group(),'[',bioseq.header[0:40],'...] saved !!' | |
| 566 savedBioseqNb = savedBioseqNb + 1 | |
| 567 bioseq.read( inFile ) | |
| 568 inFile.close() | |
| 569 | |
| 570 outFile.close() | |
| 571 | |
| 572 if verbose > 0: | |
| 573 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) | |
| 574 | |
| 575 | |
| 576 ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file | |
| 577 # | |
| 578 # @param patternFileName string file containing regular expression to search in headers | |
| 579 # @param inFileName string name of the input fasta file | |
| 580 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') | |
| 581 # @param verbose integer verbosity level (default = 0) | |
| 582 # | |
| 583 @staticmethod | |
| 584 def dbExtractByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ): | |
| 585 | |
| 586 if patternFileName == "": | |
| 587 print "ERROR: no file of pattern" | |
| 588 sys.exit(1) | |
| 589 | |
| 590 bioseq = Bioseq() | |
| 591 bioseqNb = 0 | |
| 592 savedBioseqNb = 0 | |
| 593 lHeaders = [] | |
| 594 | |
| 595 inFile = open( inFileName, "r" ) | |
| 596 bioseq.read( inFile ) | |
| 597 while bioseq.sequence != None: | |
| 598 lHeaders.append( bioseq.header ) | |
| 599 bioseq.read( inFile ) | |
| 600 inFile.close() | |
| 601 | |
| 602 lHeadersToKeep = [] | |
| 603 patternFile = open( patternFileName, "r" ) | |
| 604 for pattern in patternFile: | |
| 605 if verbose > 0: | |
| 606 print "pattern: ",pattern[:-1]; sys.stdout.flush() | |
| 607 | |
| 608 patternToSearch = re.compile(pattern[:-1]) | |
| 609 for h in lHeaders: | |
| 610 if patternToSearch.search(h): | |
| 611 lHeadersToKeep.append(h) | |
| 612 patternFile.close() | |
| 613 | |
| 614 if outFileName == "": | |
| 615 outFileName = inFileName + ".extracted" | |
| 616 outFile=open( outFileName, "w" ) | |
| 617 | |
| 618 inFile = open( inFileName, "r" ) | |
| 619 bioseq.read(inFile) | |
| 620 while bioseq.sequence: | |
| 621 bioseqNb += 1 | |
| 622 if bioseq.header in lHeadersToKeep: | |
| 623 bioseq.write(outFile) | |
| 624 if verbose > 1: | |
| 625 print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush() | |
| 626 savedBioseqNb += 1 | |
| 627 bioseq.read(inFile) | |
| 628 inFile.close() | |
| 629 | |
| 630 outFile.close() | |
| 631 | |
| 632 if verbose > 0: | |
| 633 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) | |
| 634 | |
| 635 | |
| 636 ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file | |
| 637 # | |
| 638 # @param pattern regular expression to search in headers | |
| 639 # @param inFileName string name of the input fasta file | |
| 640 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') | |
| 641 # @param verbose integer verbosity level (default = 0) | |
| 642 # | |
| 643 @staticmethod | |
| 644 def dbCleanByPattern( pattern, inFileName, outFileName="", verbose=0 ): | |
| 645 if pattern == "": | |
| 646 return | |
| 647 | |
| 648 patternToSearch = re.compile(pattern) | |
| 649 | |
| 650 if outFileName == "": | |
| 651 outFileName = inFileName + '.cleaned' | |
| 652 outFile = open(outFileName,'w') | |
| 653 | |
| 654 bioseq = Bioseq() | |
| 655 bioseqNb = 0 | |
| 656 savedBioseqNb = 0 | |
| 657 inFile = open(inFileName) | |
| 658 bioseq.read(inFile) | |
| 659 while bioseq.sequence != None: | |
| 660 bioseqNb += 1 | |
| 661 if not patternToSearch.search(bioseq.header): | |
| 662 bioseq.write(outFile) | |
| 663 if verbose > 1: | |
| 664 print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!' | |
| 665 savedBioseqNb += 1 | |
| 666 bioseq.read(inFile) | |
| 667 inFile.close() | |
| 668 | |
| 669 outFile.close() | |
| 670 | |
| 671 if verbose > 0: | |
| 672 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) | |
| 673 | |
| 674 | |
| 675 ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file | |
| 676 # | |
| 677 # @param patternFileName string file containing regular expression to search in headers | |
| 678 # @param inFileName string name of the input fasta file | |
| 679 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') | |
| 680 # @param verbose integer verbosity level (default = 0) | |
| 681 # | |
| 682 @staticmethod | |
| 683 def dbCleanByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ): | |
| 684 if patternFileName == "": | |
| 685 print "ERROR: no file of pattern" | |
| 686 sys.exit(1) | |
| 687 | |
| 688 bioseq = Bioseq() | |
| 689 bioseqNb = 0 | |
| 690 savedBioseqNb = 0 | |
| 691 lHeaders = [] | |
| 692 inFile = open( inFileName, "r" ) | |
| 693 bioseq.read( inFile ) | |
| 694 while bioseq.sequence != None: | |
| 695 bioseqNb += 1 | |
| 696 lHeaders.append( bioseq.header ) | |
| 697 bioseq.read( inFile ) | |
| 698 inFile.close() | |
| 699 | |
| 700 patternFile = open( patternFileName, "r") | |
| 701 lHeadersToRemove = [] | |
| 702 for pattern in patternFile: | |
| 703 if verbose > 0: | |
| 704 print "pattern: ",pattern[:-1]; sys.stdout.flush() | |
| 705 | |
| 706 patternToSearch = re.compile( pattern[:-1] ) | |
| 707 for h in lHeaders: | |
| 708 if patternToSearch.search(h): | |
| 709 lHeadersToRemove.append(h) | |
| 710 patternFile.close() | |
| 711 | |
| 712 if outFileName == "": | |
| 713 outFileName = inFileName + '.cleaned' | |
| 714 outFile = open( outFileName, 'w' ) | |
| 715 | |
| 716 bioseqNum = 0 | |
| 717 inFile=open( inFileName ) | |
| 718 bioseq.read( inFile ) | |
| 719 while bioseq.sequence != None: | |
| 720 bioseqNum += 1 | |
| 721 if bioseq.header not in lHeadersToRemove: | |
| 722 bioseq.write( outFile ) | |
| 723 if verbose > 1: | |
| 724 print 'sequence num',bioseqNum,'/',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush() | |
| 725 savedBioseqNb += 1 | |
| 726 bioseq.read( inFile ) | |
| 727 inFile.close() | |
| 728 | |
| 729 outFile.close() | |
| 730 | |
| 731 if verbose > 0: | |
| 732 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) | |
| 733 | |
| 734 | |
| 735 ## Find sequence's ORFs from a fasta file and write them in a tab file | |
| 736 # | |
| 737 # @param inFileName string name of the input fasta file | |
| 738 # @param orfMaxNb integer Select orfMaxNb best ORFs | |
| 739 # @param orfMinLength integer Keep ORFs with length > orfMinLength | |
| 740 # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map') | |
| 741 # @param verbose integer verbosity level (default = 0) | |
| 742 # | |
| 743 @staticmethod | |
| 744 def dbORF( inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose=0 ): | |
| 745 if outFileName == "": | |
| 746 outFileName = inFileName + ".ORF.map" | |
| 747 outFile = open( outFileName, "w" ) | |
| 748 | |
| 749 bioseq = Bioseq() | |
| 750 bioseqNb = 0 | |
| 751 | |
| 752 inFile = open( inFileName ) | |
| 753 bioseq.read( inFile ) | |
| 754 while bioseq.sequence != None: | |
| 755 bioseq.upCase() | |
| 756 bioseqNb += 1 | |
| 757 if verbose > 0: | |
| 758 print 'sequence num',bioseqNb,'=',bioseq.getLength(),'[',bioseq.header[0:40],'...]' | |
| 759 | |
| 760 orf = bioseq.findORF() | |
| 761 bestOrf = [] | |
| 762 for i in orf.keys(): | |
| 763 orfLen = len(orf[i]) | |
| 764 for j in xrange(1, orfLen): | |
| 765 start = orf[i][j-1] + 4 | |
| 766 end = orf[i][j] + 3 | |
| 767 if end - start >= orfMinLength: | |
| 768 bestOrf.append( ( end-start, i+1, start, end ) ) | |
| 769 | |
| 770 bioseq.reverseComplement() | |
| 771 | |
| 772 orf = bioseq.findORF() | |
| 773 seqLen = bioseq.getLength() | |
| 774 for i in orf.keys(): | |
| 775 orfLen = len(orf[i]) | |
| 776 for j in xrange(1, orfLen): | |
| 777 start = seqLen - orf[i][j-1] - 3 | |
| 778 end = seqLen - orf[i][j] - 2 | |
| 779 if start - end >= orfMinLength: | |
| 780 bestOrf.append( ( start-end, (i+1)*-1, start, end ) ) | |
| 781 | |
| 782 bestOrf.sort() | |
| 783 bestOrf.reverse() | |
| 784 bestOrfNb = len(bestOrf) | |
| 785 if orfMaxNb != 0 and orfMaxNb < bestOrfNb: | |
| 786 bestOrfNb = orfMaxNb | |
| 787 for i in xrange(0, bestOrfNb): | |
| 788 if verbose > 0: | |
| 789 print bestOrf[i] | |
| 790 outFile.write("%s\t%s\t%d\t%d\n"%("ORF|"+str(bestOrf[i][1])+\ | |
| 791 "|"+str(bestOrf[i][0]),bioseq.header, | |
| 792 bestOrf[i][2],bestOrf[i][3])) | |
| 793 bioseq.read( inFile ) | |
| 794 | |
| 795 inFile.close() | |
| 796 outFile.close() | |
| 797 | |
| 798 return 0 | |
| 799 | |
| 800 | |
| 801 ## Sort sequences by increasing length (write a new file) | |
| 802 # | |
| 803 # @param inFileName string name of the input fasta file | |
| 804 # @param outFileName string name of the output fasta file | |
| 805 # @param verbose integer verbosity level | |
| 806 # | |
| 807 @staticmethod | |
| 808 def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0): | |
| 809 if verbose > 0: | |
| 810 print "sort sequences by increasing length" | |
| 811 sys.stdout.flush() | |
| 812 if not os.path.exists( inFileName ): | |
| 813 print "ERROR: file '%s' doesn't exist" % ( inFileName ) | |
| 814 sys.exit(1) | |
| 815 | |
| 816 # read each seq one by one | |
| 817 # save them in distinct temporary files | |
| 818 # with their length in the name | |
| 819 inFileHandler = open( inFileName, "r" ) | |
| 820 countSeq = 0 | |
| 821 bs = Bioseq() | |
| 822 bs.read( inFileHandler ) | |
| 823 while bs.header: | |
| 824 countSeq += 1 | |
| 825 tmpFile = "%ibp_%inb" % ( bs.getLength(), countSeq ) | |
| 826 bs.appendBioseqInFile( tmpFile ) | |
| 827 if verbose > 1: | |
| 828 print "%s (%i bp) saved in '%s'" % ( bs.header, bs.getLength(), tmpFile ) | |
| 829 bs.header = "" | |
| 830 bs.sequence = "" | |
| 831 bs.read( inFileHandler ) | |
| 832 inFileHandler.close() | |
| 833 | |
| 834 # sort temporary file names | |
| 835 # concatenate them into the output file | |
| 836 if os.path.exists( outFileName ): | |
| 837 os.remove( outFileName ) | |
| 838 lFiles = glob.glob( "*bp_*nb" ) | |
| 839 lFiles.sort( key=lambda s:int(s.split("bp_")[0]) ) | |
| 840 for fileName in lFiles: | |
| 841 cmd = "cat %s >> %s" % ( fileName, outFileName ) | |
| 842 returnValue = os.system( cmd ) | |
| 843 if returnValue != 0: | |
| 844 print "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName ) | |
| 845 sys.exit(1) | |
| 846 os.remove( fileName ) | |
| 847 | |
| 848 return 0 | |
| 849 | |
| 850 | |
| 851 ## Sort sequences by header | |
| 852 # | |
| 853 # @param inFileName string name of the input fasta file | |
| 854 # @param outFileName string name of the output fasta file | |
| 855 # @param verbose integer verbosity level | |
| 856 # | |
| 857 @staticmethod | |
| 858 def sortSequencesByHeader(inFileName, outFileName = "", verbose = 0): | |
| 859 if outFileName == "": | |
| 860 outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0] | |
| 861 iBioseqDB = BioseqDB(inFileName) | |
| 862 f = open(outFileName, "w") | |
| 863 lHeaders = sorted(iBioseqDB.getHeaderList()) | |
| 864 for header in lHeaders: | |
| 865 iBioseq = iBioseqDB.fetch(header) | |
| 866 iBioseq.write(f) | |
| 867 f.close() | |
| 868 | |
| 869 | |
| 870 ## Return a dictionary which keys are the headers and values the length of the sequences | |
| 871 # | |
| 872 # @param inFile string name of the input fasta file | |
| 873 # @param verbose integer verbosity level | |
| 874 # | |
| 875 @staticmethod | |
| 876 def getLengthPerHeader( inFile, verbose=0 ): | |
| 877 dHeader2Length = {} | |
| 878 | |
| 879 inFileHandler = open( inFile, "r" ) | |
| 880 currentSeqHeader = "" | |
| 881 currentSeqLength = 0 | |
| 882 line = inFileHandler.readline() | |
| 883 while line: | |
| 884 if line[0] == ">": | |
| 885 if currentSeqHeader != "": | |
| 886 dHeader2Length[ currentSeqHeader ] = currentSeqLength | |
| 887 currentSeqLength = 0 | |
| 888 currentSeqHeader = line[1:-1] | |
| 889 if verbose > 0: | |
| 890 print "current header: %s" % ( currentSeqHeader ) | |
| 891 sys.stdout.flush() | |
| 892 else: | |
| 893 currentSeqLength += len( line.replace("\n","") ) | |
| 894 line = inFileHandler.readline() | |
| 895 dHeader2Length[ currentSeqHeader ] = currentSeqLength | |
| 896 inFileHandler.close() | |
| 897 | |
| 898 return dHeader2Length | |
| 899 | |
| 900 | |
| 901 ## Convert headers from a fasta file having chunk coordinates | |
| 902 # | |
| 903 # @param inFile string name of the input fasta file | |
| 904 # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes | |
| 905 # @param outFile string name of the output file | |
| 906 # | |
| 907 @staticmethod | |
| 908 def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile): | |
| 909 inFileHandler = open(inFile, "r") | |
| 910 outFileHandler = open(outFile, "w") | |
| 911 dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile) | |
| 912 iConvCoord = ConvCoord() | |
| 913 line = inFileHandler.readline() | |
| 914 while line: | |
| 915 if line[0] == ">": | |
| 916 if "{Fragment}" in line: | |
| 917 chkName = line.split(" ")[1] | |
| 918 chrName = dChunk2Map[chkName].seqname | |
| 919 lCoordPairs = line.split(" ")[3].split(",") | |
| 920 lRangesOnChk = [] | |
| 921 for i in lCoordPairs: | |
| 922 iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1])) | |
| 923 lRangesOnChk.append(iRange) | |
| 924 lRangesOnChr = [] | |
| 925 for iRange in lRangesOnChk: | |
| 926 lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map)) | |
| 927 newHeader = line[1:-1].split(" ")[0] | |
| 928 newHeader += " %s" % chrName | |
| 929 newHeader += " {Fragment}" | |
| 930 newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end) | |
| 931 for iRange in lRangesOnChr[1:]: | |
| 932 newHeader += ",%i..%i" % (iRange.start, iRange.end) | |
| 933 outFileHandler.write(">%s\n" % newHeader) | |
| 934 else: | |
| 935 chkName = line.split("_")[1].split(" ")[0] | |
| 936 chrName = dChunk2Map[chkName].seqname | |
| 937 coords = line[line.find("[")+1 : line.find("]")] | |
| 938 start = int(coords.split(",")[0]) | |
| 939 end = int(coords.split(",")[1]) | |
| 940 iRangeOnChk = Range(chkName, start, end) | |
| 941 iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map) | |
| 942 newHeader = line[1:-1].split("_")[0] | |
| 943 newHeader += " %s" % chrName | |
| 944 newHeader += " %s" % line[line.find("(") : line.find(")")+1] | |
| 945 newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd()) | |
| 946 outFileHandler.write(">%s\n" % newHeader) | |
| 947 else: | |
| 948 outFileHandler.write(line) | |
| 949 line = inFileHandler.readline() | |
| 950 inFileHandler.close() | |
| 951 outFileHandler.close() | |
| 952 | |
| 953 | |
| 954 ## Convert a fasta file to a length file | |
| 955 # | |
| 956 # @param inFile string name of the input fasta file | |
| 957 # @param outFile string name of the output file | |
| 958 # | |
| 959 @staticmethod | |
| 960 def convertFastaToLength(inFile, outFile = ""): | |
| 961 if outFile == "": | |
| 962 outFile = "%s.length" % inFile | |
| 963 | |
| 964 if inFile != "": | |
| 965 with open(inFile, "r") as inFH: | |
| 966 with open(outFile, "w") as outFH: | |
| 967 bioseq = Bioseq() | |
| 968 bioseq.read(inFH) | |
| 969 while bioseq.sequence != None: | |
| 970 seqLen = bioseq.getLength() | |
| 971 outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen)) | |
| 972 bioseq.read(inFH) | |
| 973 | |
| 974 | |
| 975 ## Convert a fasta file to a seq file | |
| 976 # | |
| 977 # @param inFile string name of the input fasta file | |
| 978 # @param outFile string name of the output file | |
| 979 # | |
| 980 @staticmethod | |
| 981 def convertFastaToSeq(inFile, outFile = ""): | |
| 982 if outFile == "": | |
| 983 outFile = "%s.seq" % inFile | |
| 984 | |
| 985 if inFile != "": | |
| 986 with open(inFile, "r") as inFH: | |
| 987 with open(outFile, "w") as outFH: | |
| 988 bioseq = Bioseq() | |
| 989 bioseq.read(inFH) | |
| 990 while bioseq.sequence != None: | |
| 991 seqLen = bioseq.getLength() | |
| 992 outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \ | |
| 993 bioseq.sequence, bioseq.header, seqLen)) | |
| 994 bioseq.read(inFH) | |
| 995 | |
| 996 | |
| 997 ## Splice an input fasta file using coordinates in a Map file | |
| 998 # | |
| 999 # @note the coordinates should be merged beforehand! | |
| 1000 # | |
| 1001 @staticmethod | |
| 1002 def spliceFromCoords( genomeFile, coordFile, obsFile ): | |
| 1003 genomeFileHandler = open( genomeFile, "r" ) | |
| 1004 obsFileHandler = open( obsFile, "w" ) | |
| 1005 dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile ) | |
| 1006 | |
| 1007 bs = Bioseq() | |
| 1008 bs.read( genomeFileHandler ) | |
| 1009 while bs.sequence: | |
| 1010 if dChr2Maps.has_key( bs.header ): | |
| 1011 lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] ) | |
| 1012 splicedSeq = "" | |
| 1013 currentSite = 0 | |
| 1014 for iMap in lCoords: | |
| 1015 minSplice = iMap.getMin() - 1 | |
| 1016 if minSplice > currentSite: | |
| 1017 splicedSeq += bs.sequence[ currentSite : minSplice ] | |
| 1018 currentSite = iMap.getMax() | |
| 1019 splicedSeq += bs.sequence[ currentSite : ] | |
| 1020 bs.sequence = splicedSeq | |
| 1021 bs.write( obsFileHandler ) | |
| 1022 bs.read( genomeFileHandler ) | |
| 1023 | |
| 1024 genomeFileHandler.close() | |
| 1025 obsFileHandler.close() | |
| 1026 | |
| 1027 | |
| 1028 ## Shuffle input sequences (single file or files in a directory) | |
| 1029 # | |
| 1030 @staticmethod | |
| 1031 def dbShuffle( inData, outData, verbose=0 ): | |
| 1032 if CheckerUtils.isExecutableInUserPath("esl-shuffle"): | |
| 1033 prg = "esl-shuffle" | |
| 1034 else : prg = "shuffle" | |
| 1035 genericCmd = prg + " -d INPUT > OUTPUT" | |
| 1036 if os.path.isfile( inData ): | |
| 1037 if verbose > 0: | |
| 1038 print "shuffle input file '%s'" % inData | |
| 1039 cmd = genericCmd.replace("INPUT",inData).replace("OUTPUT",outData) | |
| 1040 print cmd | |
| 1041 returnStatus = os.system( cmd ) | |
| 1042 if returnStatus != 0: | |
| 1043 sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus ) | |
| 1044 sys.exit(1) | |
| 1045 | |
| 1046 elif os.path.isdir( inData ): | |
| 1047 if verbose > 0: | |
| 1048 print "shuffle files in input directory '%s'" % inData | |
| 1049 if os.path.exists( outData ): | |
| 1050 shutil.rmtree( outData ) | |
| 1051 os.mkdir( outData ) | |
| 1052 lInputFiles = glob.glob( "%s/*.fa" %( inData ) ) | |
| 1053 nbFastaFiles = 0 | |
| 1054 for inputFile in lInputFiles: | |
| 1055 nbFastaFiles += 1 | |
| 1056 if verbose > 1: | |
| 1057 print "%3i / %3i" % ( nbFastaFiles, len(lInputFiles) ) | |
| 1058 fastaBaseName = os.path.basename( inputFile ) | |
| 1059 prefix, extension = os.path.splitext( fastaBaseName ) | |
| 1060 cmd = genericCmd.replace("INPUT",inputFile).replace("OUTPUT","%s/%s_shuffle.fa"%(outData,prefix)) | |
| 1061 returnStatus = os.system( cmd ) | |
| 1062 if returnStatus != 0: | |
| 1063 sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus ) | |
| 1064 sys.exit(1) | |
| 1065 | |
| 1066 | |
| 1067 ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers | |
| 1068 # | |
| 1069 # @param inClusterFileName string input cluster file name | |
| 1070 # @param inFastaFileName string input fasta file name | |
| 1071 # @param outFileName string output file name | |
| 1072 # @param verbosity integer verbosity | |
| 1073 # | |
| 1074 @staticmethod | |
| 1075 def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0): | |
| 1076 dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity) | |
| 1077 iFastaParser = FastaParser(inFastaFileName) | |
| 1078 with open(outFileName, "w") as f: | |
| 1079 for iSequence in iFastaParser.getIterator(): | |
| 1080 | |
| 1081 header = iSequence.getName() | |
| 1082 if dHeader2ClusterClusterMember.get(header): | |
| 1083 cluster = dHeader2ClusterClusterMember[header][0] | |
| 1084 member = dHeader2ClusterClusterMember[header][1] | |
| 1085 else: | |
| 1086 clusterIdForSingletonCluster += 1 | |
| 1087 cluster = clusterIdForSingletonCluster | |
| 1088 member = 1 | |
| 1089 | |
| 1090 newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header) | |
| 1091 iSequence.setName(newHeader) | |
| 1092 f.write(iSequence.printFasta()) | |
| 1093 | |
| 1094 @staticmethod | |
| 1095 def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0): | |
| 1096 dHeader2ClusterClusterMember = {} | |
| 1097 clusterId = 0 | |
| 1098 with open(inClusterFileName) as f: | |
| 1099 line = f.readline() | |
| 1100 while line: | |
| 1101 lineWithoutLastChar = line.rstrip() | |
| 1102 lHeaders = lineWithoutLastChar.split("\t") | |
| 1103 clusterId += 1 | |
| 1104 if verbosity > 0: | |
| 1105 print "%i sequences in cluster %i" % (len(lHeaders), clusterId) | |
| 1106 memberId = 0 | |
| 1107 for header in lHeaders: | |
| 1108 memberId += 1 | |
| 1109 dHeader2ClusterClusterMember[header] = (clusterId, memberId) | |
| 1110 line = f.readline() | |
| 1111 if verbosity > 0: | |
| 1112 print "%i clusters" % clusterId | |
| 1113 return dHeader2ClusterClusterMember, clusterId | |
| 1114 | |
| 1115 @staticmethod | |
| 1116 def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""): | |
| 1117 """ | |
| 1118 Write a map file from fasta output of clustering tool. | |
| 1119 Warning: only works if input fasta headers are formated like LTRharvest fasta output. | |
| 1120 """ | |
| 1121 if not outMapFileName: | |
| 1122 outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0]) | |
| 1123 | |
| 1124 fileDb = open(fastaFileNameFromClustering , "r") | |
| 1125 fileMap = open(outMapFileName, "w") | |
| 1126 seq = Bioseq() | |
| 1127 numseq = 0 | |
| 1128 while 1: | |
| 1129 seq.read(fileDb) | |
| 1130 if seq.sequence == None: | |
| 1131 break | |
| 1132 numseq = numseq + 1 | |
| 1133 ID = seq.header.split(' ')[0].split('_')[0] | |
| 1134 chunk = seq.header.split(' ')[0].split('_')[1] | |
| 1135 start = seq.header.split(' ')[-1].split(',')[0][1:] | |
| 1136 end = seq.header.split(' ')[-1].split(',')[1][:-1] | |
| 1137 line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end) | |
| 1138 fileMap.write(line + "\n") | |
| 1139 | |
| 1140 fileDb.close() | |
| 1141 fileMap.close() | |
| 1142 print "saved in %s" % outMapFileName | |
| 1143 |
