Mercurial > repos > urgi-team > teiso
comparison TEisotools-1.1.a/commons/core/seq/FastaUtils.py @ 13:feef9a0db09d draft
Uploaded
| author | urgi-team |
|---|---|
| date | Wed, 20 Jul 2016 09:04:42 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 12:22b0494ec883 | 13:feef9a0db09d |
|---|---|
| 1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
| 2 # http://www.inra.fr | |
| 3 # http://urgi.versailles.inra.fr | |
| 4 # | |
| 5 # This software is governed by the CeCILL license under French law and | |
| 6 # abiding by the rules of distribution of free software. You can use, | |
| 7 # modify and/ or redistribute the software under the terms of the CeCILL | |
| 8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
| 9 # "http://www.cecill.info". | |
| 10 # | |
| 11 # As a counterpart to the access to the source code and rights to copy, | |
| 12 # modify and redistribute granted by the license, users are provided only | |
| 13 # with a limited warranty and the software's author, the holder of the | |
| 14 # economic rights, and the successive licensors have only limited | |
| 15 # liability. | |
| 16 # | |
| 17 # In this respect, the user's attention is drawn to the risks associated | |
| 18 # with loading, using, modifying and/or developing or reproducing the | |
| 19 # software by the user in light of its specific status of free software, | |
| 20 # that may mean that it is complicated to manipulate, and that also | |
| 21 # therefore means that it is reserved for developers and experienced | |
| 22 # professionals having in-depth computer knowledge. Users are therefore | |
| 23 # encouraged to load and test the software's suitability as regards their | |
| 24 # requirements in conditions enabling the security of their systems and/or | |
| 25 # data to be ensured and, more generally, to use and operate it in the | |
| 26 # same conditions as regards security. | |
| 27 # | |
| 28 # The fact that you are presently reading this means that you have had | |
| 29 # knowledge of the CeCILL license and that you accept its terms. | |
| 30 | |
| 31 | |
| 32 import os | |
| 33 import re | |
| 34 import sys | |
| 35 import math | |
| 36 import glob | |
| 37 import string | |
| 38 import shutil | |
| 39 from commons.core.utils.FileUtils import FileUtils | |
| 40 from commons.core.seq.Bioseq import Bioseq | |
| 41 from commons.core.seq.BioseqDB import BioseqDB | |
| 42 from commons.core.coord.MapUtils import MapUtils | |
| 43 from commons.core.coord.Range import Range | |
| 44 from commons.core.coord.ConvCoord import ConvCoord | |
| 45 from commons.core.parsing.FastaParser import FastaParser | |
| 46 from commons.core.checker.CheckerUtils import CheckerUtils | |
| 47 from commons.core.launcher.LauncherUtils import LauncherUtils | |
| 48 | |
| 49 | |
| 50 ## Static methods for fasta file manipulation | |
| 51 # | |
| 52 class FastaUtils( object ): | |
| 53 | |
| 54 ## Count the number of sequences in the input fasta file | |
| 55 # | |
| 56 # @param inFile name of the input fasta file | |
| 57 # | |
| 58 # @return integer number of sequences in the input fasta file | |
| 59 # | |
| 60 @staticmethod | |
| 61 def dbSize( inFile ): | |
| 62 nbSeq = 0 | |
| 63 with open(inFile) as fH: | |
| 64 for line in fH: | |
| 65 if line[0] == ">": | |
| 66 nbSeq += 1 | |
| 67 | |
| 68 return nbSeq | |
| 69 | |
| 70 | |
| 71 ## Compute the cumulative sequence length in the input fasta file | |
| 72 # | |
| 73 # @param inFile handler of the input fasta file | |
| 74 # | |
| 75 @staticmethod | |
| 76 def dbCumLength( inFile ): | |
| 77 cumLength = 0 | |
| 78 for line in inFile: | |
| 79 if line[0] != ">": | |
| 80 cumLength += len(string.rstrip(line)) | |
| 81 | |
| 82 return cumLength | |
| 83 | |
| 84 | |
| 85 ## Return a list with the length of each sequence in the input fasta file | |
| 86 # | |
| 87 # @param inFile string name of the input fasta file | |
| 88 # | |
| 89 @staticmethod | |
| 90 def dbLengths(inFile): | |
| 91 lLengths = [] | |
| 92 currentLength = 0 | |
| 93 with open(inFile) as fH: | |
| 94 for line in fH: | |
| 95 if line[0] == ">": | |
| 96 if currentLength != 0: | |
| 97 lLengths.append(currentLength) | |
| 98 currentLength = 0 | |
| 99 else: | |
| 100 currentLength += len(line[:-1]) | |
| 101 lLengths.append(currentLength) | |
| 102 | |
| 103 return lLengths | |
| 104 | |
| 105 | |
| 106 ## Retrieve the sequence headers present in the input fasta file | |
| 107 # | |
| 108 # @param inFile string name of the input fasta file | |
| 109 # @param verbose integer level of verbosity | |
| 110 # | |
| 111 # @return list of sequence headers | |
| 112 # | |
| 113 @staticmethod | |
| 114 def dbHeaders(inFile, verbose = 0): | |
| 115 lHeaders = [line[1:].rstrip() for line in open(inFile) if line[0] == ">"] | |
| 116 if verbose: | |
| 117 for header in lHeaders: | |
| 118 print header | |
| 119 | |
| 120 return lHeaders | |
| 121 | |
| 122 | |
| 123 ## Cut a data bank into chunks according to the input parameters | |
| 124 # If a sequence is shorter than the threshold, it is only renamed (not cut) | |
| 125 # | |
| 126 # @param inFileName string name of the input fasta file | |
| 127 # @param chkLgth string chunk length (in bp, default=200000) | |
| 128 # @param chkOver string chunk overlap (in bp, default=10000) | |
| 129 # @param wordN string N stretch word length (default=11, 0 for no detection) | |
| 130 # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map') | |
| 131 # @param clean boolean remove 'cut' and 'Nstretch' files | |
| 132 # @param verbose integer (default = 0) | |
| 133 # | |
| 134 @staticmethod | |
| 135 def dbChunks(inFileName, chkLgth = "200000", chkOver = "10000", wordN = "11", outFilePrefix = "", clean = False, verbose = 0): | |
| 136 nbSeq = FastaUtils.dbSize(inFileName) | |
| 137 if verbose > 0: | |
| 138 print "cut the %i input sequences with cutterDB..." % nbSeq | |
| 139 sys.stdout.flush() | |
| 140 | |
| 141 prg = "cutterDB" | |
| 142 cmd = prg | |
| 143 cmd += " -l %s" % chkLgth | |
| 144 cmd += " -o %s" % chkOver | |
| 145 cmd += " -w %s" % wordN | |
| 146 cmd += " %s" % inFileName | |
| 147 returnStatus = os.system(cmd) | |
| 148 if returnStatus != 0: | |
| 149 msg = "ERROR: '%s' returned '%i'" % (prg, returnStatus) | |
| 150 sys.stderr.write("%s\n" % msg) | |
| 151 sys.exit(1) | |
| 152 | |
| 153 nbChunks = FastaUtils.dbSize("%s_cut" % inFileName) | |
| 154 if verbose > 0: | |
| 155 print "done (%i chunks)" % ( nbChunks ) | |
| 156 sys.stdout.flush() | |
| 157 | |
| 158 if verbose > 0: | |
| 159 print "rename the headers..." | |
| 160 sys.stdout.flush() | |
| 161 | |
| 162 if outFilePrefix == "": | |
| 163 outFastaName = inFileName + "_chunks.fa" | |
| 164 outMapName = inFileName + "_chunks.map" | |
| 165 else: | |
| 166 outFastaName = outFilePrefix + ".fa" | |
| 167 outMapName = outFilePrefix + ".map" | |
| 168 | |
| 169 with open("%s_cut" % inFileName) as inFile: | |
| 170 outFasta = open(outFastaName, "w") | |
| 171 outMap = open(outMapName, "w") | |
| 172 | |
| 173 for line in inFile: | |
| 174 if line[0] == ">": | |
| 175 if verbose > 1: | |
| 176 print "rename '%s'" % (line[:-1]); sys.stdout.flush() | |
| 177 data = line[:-1].split(" ") | |
| 178 seqID = data[0].split(">")[1] | |
| 179 newHeader = "chunk%s" % (str(seqID).zfill(len(str(nbChunks)))) | |
| 180 oldHeader = data[2] | |
| 181 seqStart = data[4].split("..")[0] | |
| 182 seqEnd = data[4].split("..")[1] | |
| 183 outMap.write("%s\t%s\t%s\t%s\n" % (newHeader, oldHeader, seqStart, seqEnd)) | |
| 184 outFasta.write(">%s\n" % newHeader) | |
| 185 | |
| 186 else: | |
| 187 outFasta.write(line.upper()) | |
| 188 | |
| 189 outFasta.close() | |
| 190 outMap.close() | |
| 191 | |
| 192 #stats on .Nstretch.map file | |
| 193 genomeLength = FastaUtils.dbCumLength(open(inFileName)) | |
| 194 NstretchMapFile = inFileName + ".Nstretch.map" | |
| 195 outNstrechStats = open('%s.NstretchStats.txt' % inFileName , "w") | |
| 196 if FileUtils.isEmpty(NstretchMapFile) or not FileUtils.isRessourceExists(NstretchMapFile): | |
| 197 outNstrechStats.write("No N in stretch length > %s\n" % wordN) | |
| 198 else: | |
| 199 with open(NstretchMapFile) as f: | |
| 200 dHeader2lLengths = {} | |
| 201 for line in f: | |
| 202 data = line.rstrip().split() | |
| 203 header = data[1] | |
| 204 length = int(data[3]) - int(data[2]) + 1 | |
| 205 if header not in dHeader2lLengths: | |
| 206 dHeader2lLengths[header] = [] | |
| 207 dHeader2lLengths[header].append(length) | |
| 208 | |
| 209 for header in sorted(dHeader2lLengths): | |
| 210 lLengths = dHeader2lLengths[header] | |
| 211 outNstrechStats.write("%s\tmin: %s\tmax: %s\tcumul: %s\n" % (header, min(lLengths), max(lLengths), sum(lLengths))) | |
| 212 | |
| 213 cumulAllStretch = sum([sum(lengths) for lengths in dHeader2lLengths.values()]) | |
| 214 | |
| 215 NstretchPrct = float(cumulAllStretch)/genomeLength*100 | |
| 216 outNstrechStats.write("Total N in stretch length > %s: %s bp represent %6.2f %% of genome\n" % (wordN, cumulAllStretch, NstretchPrct)) | |
| 217 outNstrechStats.close() | |
| 218 | |
| 219 if clean == True: | |
| 220 os.remove(inFileName + "_cut") | |
| 221 os.remove(NstretchMapFile) | |
| 222 | |
| 223 | |
| 224 ## Split the input fasta file in several output files | |
| 225 # | |
| 226 # @param inFile string name of the input fasta file | |
| 227 # @param nbSeqPerBatch integer number of sequences per output file | |
| 228 # @param newDir boolean put the sequences in a new directory called 'batches' | |
| 229 # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1') | |
| 230 # @param prefix prefix in output file name | |
| 231 # @param verbose integer verbosity level (default = 0) | |
| 232 # | |
| 233 @staticmethod | |
| 234 def dbSplit(inFile, nbSeqPerBatch, newDir, useSeqHeader = False, prefix = "batch", verbose = 0): | |
| 235 if not os.path.exists(inFile): | |
| 236 msg = "ERROR: file '%s' doesn't exist" % inFile | |
| 237 sys.stderr.write("%s\n" % msg) | |
| 238 sys.exit(1) | |
| 239 | |
| 240 nbSeq = FastaUtils.dbSize(inFile) | |
| 241 | |
| 242 nbBatches = int(math.ceil(nbSeq / float(nbSeqPerBatch))) | |
| 243 if verbose > 0: | |
| 244 print "save the %i input sequences into %i batches" % (nbSeq, nbBatches) | |
| 245 sys.stdout.flush() | |
| 246 | |
| 247 if nbSeqPerBatch != 1 and useSeqHeader: | |
| 248 useSeqHeader = False | |
| 249 | |
| 250 if newDir == True: | |
| 251 if os.path.exists("batches"): | |
| 252 shutil.rmtree("batches") | |
| 253 os.mkdir("batches") | |
| 254 os.chdir("batches") | |
| 255 os.system("ln -s ../%s ." % inFile) | |
| 256 | |
| 257 with open(inFile) as inFileHandler: | |
| 258 countBatch = 0 | |
| 259 countSeq = 0 | |
| 260 for line in inFileHandler: | |
| 261 if line[0] == ">": | |
| 262 countSeq += 1 | |
| 263 if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1: | |
| 264 try: | |
| 265 outFile.close() | |
| 266 except: pass | |
| 267 countBatch += 1 | |
| 268 if useSeqHeader: | |
| 269 outFileName = "%s.fa" % (line[1:-1].replace(" ", "_")) | |
| 270 else: | |
| 271 outFileName = "%s_%s.fa" % (prefix, str(countBatch).zfill(len(str(nbBatches)))) | |
| 272 outFile = open(outFileName, "w") | |
| 273 | |
| 274 if verbose > 1: | |
| 275 print "saving seq '%s' in file '%s'..." % (line[1:].rstrip(), outFileName) | |
| 276 sys.stdout.flush() | |
| 277 outFile.write(line) | |
| 278 | |
| 279 if newDir: | |
| 280 os.remove(os.path.basename(inFile)) | |
| 281 os.chdir("..") | |
| 282 | |
| 283 | |
| 284 ## Split the input fasta file in several output files | |
| 285 # | |
| 286 # @param inFileName string name of the input fasta file | |
| 287 # @param maxSize integer max cumulative length for each output file | |
| 288 # | |
| 289 @staticmethod | |
| 290 def splitFastaFileInBatches(inFileName, maxSize = 200000): | |
| 291 iBioseqDB = BioseqDB(inFileName) | |
| 292 lHeadersSizeTuples = [] | |
| 293 for iBioseq in iBioseqDB.db: | |
| 294 lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength())) | |
| 295 | |
| 296 lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize) | |
| 297 os.mkdir("batches") | |
| 298 os.chdir("batches") | |
| 299 | |
| 300 iterator = 0 | |
| 301 for lHeader in lHeadersList: | |
| 302 iterator += 1 | |
| 303 with open("batch_%s.fa" % iterator, 'w') as f: | |
| 304 for header in lHeader : | |
| 305 iBioseq = iBioseqDB.fetch(header) | |
| 306 iBioseq.write(f) | |
| 307 os.chdir("..") | |
| 308 | |
| 309 | |
| 310 ## Split the input fasta file in several output files according to their cluster identifier | |
| 311 # | |
| 312 # @param inFileName string name of the input fasta file | |
| 313 # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust) | |
| 314 # @param simplifyHeader boolean simplify the headers | |
| 315 # @param createDir boolean put the sequences in different directories | |
| 316 # @param outPrefix string prefix of the output files (default='seqCluster') | |
| 317 # @param verbose integer (default = 0) | |
| 318 # | |
| 319 @staticmethod | |
| 320 def splitSeqPerCluster(inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix = "seqCluster", verbose = 0): | |
| 321 if not os.path.exists(inFileName): | |
| 322 print "ERROR: %s doesn't exist" % inFileName | |
| 323 sys.exit(1) | |
| 324 | |
| 325 inFile = open(inFileName) | |
| 326 | |
| 327 line = inFile.readline() | |
| 328 if line: | |
| 329 name = line.split(" ")[0] | |
| 330 if "Cluster" in name: | |
| 331 clusterID = name.split("Cluster")[1].split("Mb")[0] | |
| 332 seqID = name.split("Mb")[1] | |
| 333 else: | |
| 334 clusterID = name.split("Cl")[0].split("Gr")[1] # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust | |
| 335 if "Q" in name.split("Gr")[0]: | |
| 336 seqID = name.split("Gr")[0].split("MbQ")[1] | |
| 337 elif "S" in name: | |
| 338 seqID = name.split("Gr")[0].split("MbS")[1] | |
| 339 sClusterIDs = set( [ clusterID ] ) | |
| 340 if simplifyHeader == True: | |
| 341 header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID ) | |
| 342 else: | |
| 343 header = line[1:-1] | |
| 344 if createDir == True: | |
| 345 if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ): | |
| 346 os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) | |
| 347 os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) | |
| 348 outFileName = "%s%s.fa" % ( outPrefix, clusterID ) | |
| 349 outFile = open( outFileName, "w" ) | |
| 350 outFile.write( ">%s\n" % ( header ) ) | |
| 351 prevClusterID = clusterID | |
| 352 | |
| 353 line = inFile.readline() | |
| 354 while line: | |
| 355 if line[0] == ">": | |
| 356 name = line.split(" ")[0] | |
| 357 if "Cluster" in name: | |
| 358 clusterID = name.split("Cluster")[1].split("Mb")[0] | |
| 359 seqID = name.split("Mb")[1] | |
| 360 else: | |
| 361 clusterID = name.split("Cl")[0].split("Gr")[1] | |
| 362 if "Q" in name.split("Gr")[0]: | |
| 363 seqID = name.split("Gr")[0].split("MbQ")[1] | |
| 364 elif "S" in name: | |
| 365 seqID = name.split("Gr")[0].split("MbS")[1] | |
| 366 | |
| 367 if clusterID != prevClusterID: | |
| 368 outFile.close() | |
| 369 | |
| 370 if simplifyHeader == True: | |
| 371 header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID ) | |
| 372 else: | |
| 373 header = line[1:-1] | |
| 374 | |
| 375 if createDir == True: | |
| 376 os.chdir( ".." ) | |
| 377 if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ): | |
| 378 os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) | |
| 379 os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) | |
| 380 | |
| 381 outFileName = "%s%s.fa" % ( outPrefix, clusterID ) | |
| 382 if not os.path.exists( outFileName ): | |
| 383 outFile = open( outFileName, "w" ) | |
| 384 else: | |
| 385 if clusterID != prevClusterID: | |
| 386 outFile.close() | |
| 387 outFile = open( outFileName, "a" ) | |
| 388 outFile.write( ">%s\n" % ( header ) ) | |
| 389 prevClusterID = clusterID | |
| 390 sClusterIDs.add( clusterID ) | |
| 391 | |
| 392 else: | |
| 393 outFile.write( line ) | |
| 394 | |
| 395 line = inFile.readline() | |
| 396 | |
| 397 outFile.close() | |
| 398 if verbose > 0: | |
| 399 print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush() | |
| 400 | |
| 401 if createDir == True: | |
| 402 os.chdir("..") | |
| 403 else: | |
| 404 print "WARNING: empty input file - no cluster found"; sys.stdout.flush() | |
| 405 | |
| 406 | |
| 407 ## Filter a fasta file in two fasta files using the length of each sequence as a criterion | |
| 408 # | |
| 409 # @param len_min integer length sequence criterion to filter | |
| 410 # @param inFileName string name of the input fasta file | |
| 411 # @param verbose integer (default = 0) | |
| 412 # | |
| 413 @staticmethod | |
| 414 def dbLengthFilter(len_min, inFileName, verbose = 0): | |
| 415 file_db = open(inFileName,) | |
| 416 file_dbInf = open(inFileName + ".Inf" + str(len_min), "w") | |
| 417 file_dbSup = open(inFileName + ".Sup" + str(len_min), "w") | |
| 418 seq = Bioseq() | |
| 419 numseq = 0 | |
| 420 nbsaveInf = 0 | |
| 421 nbsaveSup = 0 | |
| 422 seq.read(file_db) | |
| 423 while seq.getHeader(): | |
| 424 l = seq.getLength() | |
| 425 numseq = numseq + 1 | |
| 426 if l >= len_min: | |
| 427 seq.write(file_dbSup) | |
| 428 if verbose > 0: | |
| 429 nbsaveSup = nbsaveSup + 1 | |
| 430 else: | |
| 431 seq.write(file_dbInf) | |
| 432 if verbose > 0: | |
| 433 nbsaveInf = nbsaveInf + 1 | |
| 434 seq.read(file_db) | |
| 435 | |
| 436 file_db.close() | |
| 437 file_dbInf.close() | |
| 438 file_dbSup.close() | |
| 439 if verbose > 0: | |
| 440 print "%i saved sequences in %s: %i sequences for %s.Inf%s and %i sequences for %s.Sup%s" % (nbsaveInf + nbsaveSup, inFileName, nbsaveInf, inFileName, str(len_min), nbsaveSup, inFileName, str(len_min)) | |
| 441 | |
| 442 | |
| 443 ## Extract the longest sequences from a fasta file | |
| 444 # | |
| 445 # @param num integer maximum number of sequences in the output file | |
| 446 # @param inFileName string name of the input fasta file | |
| 447 # @param outFileName string name of the output fasta file | |
| 448 # @param minThresh integer minimum length threshold (default=0) | |
| 449 # @param verbose integer (default = 0) | |
| 450 # | |
| 451 @staticmethod | |
| 452 def dbLongestSequences(num, inFileName, outFileName = "", verbose = 0, minThresh = 0): | |
| 453 bsDB = BioseqDB(inFileName) | |
| 454 if verbose > 0: | |
| 455 print "nb of input sequences: %i" % bsDB.getSize() | |
| 456 | |
| 457 if outFileName == "": | |
| 458 outFileName = inFileName + ".best" + str(num) | |
| 459 outFile = open( outFileName, "w" ) | |
| 460 | |
| 461 if bsDB.getSize()==0: | |
| 462 return 0 | |
| 463 | |
| 464 num = int(num) | |
| 465 if verbose > 0: | |
| 466 print "keep the %i longest sequences" % num | |
| 467 if minThresh > 0: | |
| 468 print "with length > %i bp" % minThresh | |
| 469 sys.stdout.flush() | |
| 470 | |
| 471 tmpLSeqLgth = bsDB.getListOfSequencesLength() | |
| 472 tmpLSeqLgth.sort(reverse = True) | |
| 473 | |
| 474 # select the longests | |
| 475 lSeqLgth = [] | |
| 476 for i in xrange( 0, min(num,len(tmpLSeqLgth)) ): | |
| 477 if tmpLSeqLgth[i] >= minThresh: | |
| 478 lSeqLgth.append( tmpLSeqLgth[i] ) | |
| 479 if verbose > 0: | |
| 480 print "selected max length: %i" % max(lSeqLgth) | |
| 481 print "selected min length: %i" % min(lSeqLgth) | |
| 482 sys.stdout.flush() | |
| 483 | |
| 484 # save the longest | |
| 485 inFile = open( inFileName ) | |
| 486 seqNum = 0 | |
| 487 nbSave = 0 | |
| 488 for bs in bsDB.db: | |
| 489 seqNum += 1 | |
| 490 if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh: | |
| 491 bs.write( outFile ) | |
| 492 if verbose > 1: | |
| 493 print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] ) | |
| 494 sys.stdout.flush() | |
| 495 nbSave += 1 | |
| 496 if nbSave == num: | |
| 497 break | |
| 498 inFile.close() | |
| 499 outFile.close() | |
| 500 if verbose > 0: | |
| 501 print nbSave, "saved sequences in ", outFileName | |
| 502 sys.stdout.flush() | |
| 503 | |
| 504 return 0 | |
| 505 | |
| 506 | |
| 507 ## Extract all the sequence headers from a fasta file and write them in a new file | |
| 508 # | |
| 509 # @param inFileName string name of the input fasta file | |
| 510 # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers') | |
| 511 # | |
| 512 @staticmethod | |
| 513 def dbExtractSeqHeaders(inFileName, outFileName = ""): | |
| 514 if not outFileName: | |
| 515 outFileName = inFileName + ".headers" | |
| 516 | |
| 517 with open(outFileName, "w") as f: | |
| 518 for header in FastaUtils.dbHeaders(inFileName): | |
| 519 f.write("%s\n" % header) | |
| 520 | |
| 521 return 0 | |
| 522 | |
| 523 | |
| 524 ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file | |
| 525 # | |
| 526 # @param pattern regular expression to search in headers | |
| 527 # @param inFileName string name of the input fasta file | |
| 528 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') | |
| 529 # @param verbose integer verbosity level (default = 0) | |
| 530 # | |
| 531 @staticmethod | |
| 532 def dbExtractByPattern(pattern, inFileName, outFileName = "", verbose = 0): | |
| 533 if not pattern: | |
| 534 return | |
| 535 | |
| 536 if not outFileName: | |
| 537 outFileName = inFileName + '.extracted' | |
| 538 outFile = open(outFileName, 'w') | |
| 539 | |
| 540 patternTosearch = re.compile(pattern) | |
| 541 bioseq = Bioseq() | |
| 542 bioseqNb = 0 | |
| 543 savedBioseqNb = 0 | |
| 544 inFile = open(inFileName) | |
| 545 bioseq.read(inFile) | |
| 546 while bioseq.sequence: | |
| 547 bioseqNb = bioseqNb + 1 | |
| 548 m = patternTosearch.search(bioseq.header) | |
| 549 if m: | |
| 550 bioseq.write(outFile) | |
| 551 if verbose > 1: | |
| 552 print 'sequence num', bioseqNb, 'matched on', m.group(), '[', bioseq.header[0:40], '...] saved !!' | |
| 553 savedBioseqNb = savedBioseqNb + 1 | |
| 554 bioseq.read(inFile) | |
| 555 inFile.close() | |
| 556 | |
| 557 outFile.close() | |
| 558 | |
| 559 if verbose > 0: | |
| 560 print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName) | |
| 561 | |
| 562 | |
| 563 ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file | |
| 564 # | |
| 565 # @param patternFileName string file containing regular expression to search in headers | |
| 566 # @param inFileName string name of the input fasta file | |
| 567 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') | |
| 568 # @param verbose integer verbosity level (default = 0) | |
| 569 # | |
| 570 @staticmethod | |
| 571 def dbExtractByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0): | |
| 572 if not patternFileName: | |
| 573 print "ERROR: no file of pattern" | |
| 574 sys.exit(1) | |
| 575 | |
| 576 bioseq = Bioseq() | |
| 577 bioseqNb = 0 | |
| 578 savedBioseqNb = 0 | |
| 579 lHeaders = [] | |
| 580 | |
| 581 with open(inFileName) as inFile: | |
| 582 bioseq.read(inFile) | |
| 583 while bioseq.sequence: | |
| 584 lHeaders.append(bioseq.header) | |
| 585 bioseq.read(inFile) | |
| 586 | |
| 587 lHeadersToKeep = [] | |
| 588 with open(patternFileName) as patternFile: | |
| 589 for pattern in patternFile: | |
| 590 pattern = pattern.rstrip() | |
| 591 if verbose > 0: | |
| 592 print "pattern: ", pattern; sys.stdout.flush() | |
| 593 | |
| 594 patternToSearch = re.compile(pattern) | |
| 595 lHeadersToKeep.extend([h for h in lHeaders if patternToSearch.search(h)]) | |
| 596 | |
| 597 if not outFileName: | |
| 598 outFileName = inFileName + ".extracted" | |
| 599 | |
| 600 with open(outFileName, "w") as outFile: | |
| 601 with open(inFileName) as inFile: | |
| 602 bioseq.read(inFile) | |
| 603 while bioseq.sequence: | |
| 604 bioseqNb += 1 | |
| 605 if bioseq.header in lHeadersToKeep: | |
| 606 bioseq.write(outFile) | |
| 607 savedBioseqNb += 1 | |
| 608 if verbose > 1: | |
| 609 print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush() | |
| 610 bioseq.read(inFile) | |
| 611 | |
| 612 if verbose > 0: | |
| 613 print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName) | |
| 614 | |
| 615 | |
| 616 ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file | |
| 617 # | |
| 618 # @param pattern regular expression to search in headers | |
| 619 # @param inFileName string name of the input fasta file | |
| 620 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') | |
| 621 # @param verbose integer verbosity level (default = 0) | |
| 622 # | |
| 623 @staticmethod | |
| 624 def dbCleanByPattern(pattern, inFileName, outFileName = "", verbose = 0): | |
| 625 if not pattern: | |
| 626 return | |
| 627 | |
| 628 patternToSearch = re.compile(pattern) | |
| 629 | |
| 630 if outFileName == "": | |
| 631 outFileName = inFileName + '.cleaned' | |
| 632 | |
| 633 with open(outFileName, 'w') as outFile: | |
| 634 bioseq = Bioseq() | |
| 635 bioseqNb = 0 | |
| 636 savedBioseqNb = 0 | |
| 637 with open(inFileName) as inFile: | |
| 638 bioseq.read(inFile) | |
| 639 while bioseq.sequence: | |
| 640 bioseqNb += 1 | |
| 641 if not patternToSearch.search(bioseq.header): | |
| 642 bioseq.write(outFile) | |
| 643 savedBioseqNb += 1 | |
| 644 if verbose > 1: | |
| 645 print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!' | |
| 646 bioseq.read(inFile) | |
| 647 | |
| 648 if verbose > 0: | |
| 649 print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName) | |
| 650 | |
| 651 | |
| 652 ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file | |
| 653 # | |
| 654 # @param patternFileName string file containing regular expression to search in headers | |
| 655 # @param inFileName string name of the input fasta file | |
| 656 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') | |
| 657 # @param verbose integer verbosity level (default = 0) | |
| 658 # | |
| 659 @staticmethod | |
| 660 def dbCleanByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0): | |
| 661 if not patternFileName: | |
| 662 print "ERROR: no file of pattern" | |
| 663 sys.exit(1) | |
| 664 | |
| 665 bioseq = Bioseq() | |
| 666 bioseqNb = 0 | |
| 667 savedBioseqNb = 0 | |
| 668 lHeaders = [] | |
| 669 with open(inFileName) as inFile: | |
| 670 bioseq.read(inFile) | |
| 671 while bioseq.sequence: | |
| 672 lHeaders.append(bioseq.header) | |
| 673 bioseq.read(inFile) | |
| 674 | |
| 675 with open(patternFileName) as patternFile: | |
| 676 lHeadersToRemove = [] | |
| 677 for pattern in patternFile: | |
| 678 pattern = pattern.rstrip() | |
| 679 if verbose > 0: | |
| 680 print "pattern: ", pattern; sys.stdout.flush() | |
| 681 | |
| 682 patternToSearch = re.compile(pattern) | |
| 683 lHeadersToRemove.extend([h for h in lHeaders if patternToSearch.search(h)]) | |
| 684 | |
| 685 if not outFileName: | |
| 686 outFileName = inFileName + '.cleaned' | |
| 687 | |
| 688 with open(outFileName, 'w') as outFile: | |
| 689 bioseqNum = 0 | |
| 690 with open(inFileName) as inFile: | |
| 691 bioseq.read(inFile) | |
| 692 while bioseq.sequence: | |
| 693 bioseqNum += 1 | |
| 694 if bioseq.header not in lHeadersToRemove: | |
| 695 bioseq.write(outFile) | |
| 696 savedBioseqNb += 1 | |
| 697 if verbose > 1: | |
| 698 print 'sequence num', bioseqNum, '/', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush() | |
| 699 bioseq.read(inFile) | |
| 700 | |
| 701 if verbose > 0: | |
| 702 print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName) | |
| 703 | |
| 704 | |
| 705 ## Find sequence's ORFs from a fasta file and write them in a tab file | |
| 706 # | |
| 707 # @param inFileName string name of the input fasta file | |
| 708 # @param orfMaxNb integer Select orfMaxNb best ORFs | |
| 709 # @param orfMinLength integer Keep ORFs with length > orfMinLength | |
| 710 # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map') | |
| 711 # @param verbose integer verbosity level (default = 0) | |
| 712 # | |
| 713 @staticmethod | |
| 714 def dbORF(inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose = 0): | |
| 715 if not outFileName: | |
| 716 outFileName = inFileName + ".ORF.map" | |
| 717 outFile = open(outFileName, "w") | |
| 718 | |
| 719 bioseq = Bioseq() | |
| 720 bioseqNb = 0 | |
| 721 | |
| 722 inFile = open(inFileName) | |
| 723 bioseq.read(inFile) | |
| 724 while bioseq.sequence: | |
| 725 bioseq.upCase() | |
| 726 bioseqNb += 1 | |
| 727 if verbose > 0: | |
| 728 print 'sequence num', bioseqNb, '=', bioseq.getLength(), '[', bioseq.header[0:40], '...]' | |
| 729 | |
| 730 orf = bioseq.findORF() | |
| 731 bestOrf = [] | |
| 732 for i in orf.keys(): | |
| 733 orfLen = len(orf[i]) | |
| 734 for j in xrange(1, orfLen): | |
| 735 start = orf[i][j - 1] + 4 | |
| 736 end = orf[i][j] + 3 | |
| 737 if end - start >= orfMinLength: | |
| 738 bestOrf.append((end - start, i + 1, start, end)) | |
| 739 | |
| 740 bioseq.reverseComplement() | |
| 741 | |
| 742 orf = bioseq.findORF() | |
| 743 seqLen = bioseq.getLength() | |
| 744 for i in orf.keys(): | |
| 745 orfLen = len(orf[i]) | |
| 746 for j in xrange(1, orfLen): | |
| 747 start = seqLen - orf[i][j - 1] - 3 | |
| 748 end = seqLen - orf[i][j] - 2 | |
| 749 if start - end >= orfMinLength: | |
| 750 bestOrf.append((start - end, (i + 1) * -1, start, end)) | |
| 751 | |
| 752 bestOrf.sort(reverse = True) | |
| 753 bestOrfNb = len(bestOrf) | |
| 754 if orfMaxNb != 0 and orfMaxNb < bestOrfNb: | |
| 755 bestOrfNb = orfMaxNb | |
| 756 for i in xrange(0, bestOrfNb): | |
| 757 if verbose > 0: | |
| 758 print bestOrf[i] | |
| 759 outFile.write("%s\t%s\t%d\t%d\n" % ("ORF|" + str(bestOrf[i][1]) + \ | |
| 760 "|" + str(bestOrf[i][0]), bioseq.header, | |
| 761 bestOrf[i][2], bestOrf[i][3])) | |
| 762 bioseq.read(inFile) | |
| 763 | |
| 764 inFile.close() | |
| 765 outFile.close() | |
| 766 | |
| 767 return 0 | |
| 768 | |
| 769 | |
| 770 ## Sort sequences by increasing length (write a new file) | |
| 771 # | |
| 772 # @param inFileName string name of the input fasta file | |
| 773 # @param outFileName string name of the output fasta file | |
| 774 # @param verbose integer verbosity level | |
| 775 # | |
| 776 @staticmethod | |
| 777 def sortSequencesByIncreasingLength(inFileName, outFileName, verbose = 0): | |
| 778 if verbose > 0: | |
| 779 print "sort sequences by increasing length" | |
| 780 sys.stdout.flush() | |
| 781 if not os.path.exists(inFileName): | |
| 782 print "ERROR: file '%s' doesn't exist" % (inFileName) | |
| 783 sys.exit(1) | |
| 784 | |
| 785 # read each seq one by one | |
| 786 # save them in distinct temporary files | |
| 787 # with their length in the name | |
| 788 inFileHandler = open(inFileName, "r") | |
| 789 countSeq = 0 | |
| 790 bs = Bioseq() | |
| 791 bs.read(inFileHandler) | |
| 792 while bs.header: | |
| 793 countSeq += 1 | |
| 794 tmpFile = "%ibp_%inb" % (bs.getLength(), countSeq) | |
| 795 bs.appendBioseqInFile(tmpFile) | |
| 796 if verbose > 1: | |
| 797 print "%s (%i bp) saved in '%s'" % (bs.header, bs.getLength(), tmpFile) | |
| 798 bs.header = "" | |
| 799 bs.sequence = "" | |
| 800 bs.read(inFileHandler) | |
| 801 inFileHandler.close() | |
| 802 | |
| 803 # sort temporary file names | |
| 804 # concatenate them into the output file | |
| 805 if os.path.exists(outFileName): | |
| 806 os.remove(outFileName) | |
| 807 lFiles = glob.glob("*bp_*nb") | |
| 808 lFiles.sort(key = lambda s:int(s.split("bp_")[0])) | |
| 809 for fileName in lFiles: | |
| 810 cmd = "cat %s >> %s" % (fileName, outFileName) | |
| 811 returnValue = os.system(cmd) | |
| 812 if returnValue != 0: | |
| 813 print "ERROR while concatenating '%s' with '%s'" % (fileName, outFileName) | |
| 814 sys.exit(1) | |
| 815 os.remove(fileName) | |
| 816 | |
| 817 return 0 | |
| 818 | |
| 819 | |
| 820 ## Sort sequences by header | |
| 821 # | |
| 822 # @param inFileName string name of the input fasta file | |
| 823 # @param outFileName string name of the output fasta file | |
| 824 # @param verbose integer verbosity level | |
| 825 # | |
| 826 @staticmethod | |
| 827 def sortSequencesByHeader(inFileName, outFileName = ""): | |
| 828 if outFileName == "": | |
| 829 outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0] | |
| 830 iBioseqDB = BioseqDB(inFileName) | |
| 831 with open(outFileName, "w") as f: | |
| 832 for header in sorted(iBioseqDB.getHeaderList()): | |
| 833 iBioseq = iBioseqDB.fetch(header) | |
| 834 iBioseq.write(f) | |
| 835 | |
| 836 | |
| 837 ## Return a dictionary which keys are the headers and values the length of the sequences | |
| 838 # | |
| 839 # @param inFile string name of the input fasta file | |
| 840 # @param verbose integer verbosity level | |
| 841 # | |
| 842 @staticmethod | |
| 843 def getLengthPerHeader(inFile, verbose = 0): | |
| 844 dHeader2Length = {} | |
| 845 | |
| 846 with open(inFile) as inFileHandler: | |
| 847 currentSeqHeader = "" | |
| 848 currentSeqLength = 0 | |
| 849 for line in inFileHandler: | |
| 850 if line[0] == ">": | |
| 851 if currentSeqHeader != "": | |
| 852 dHeader2Length[currentSeqHeader] = currentSeqLength | |
| 853 currentSeqLength = 0 | |
| 854 currentSeqHeader = line[1:-1] | |
| 855 if verbose > 0: | |
| 856 print "current header: %s" % currentSeqHeader | |
| 857 sys.stdout.flush() | |
| 858 else: | |
| 859 currentSeqLength += len(line.replace("\n", "")) | |
| 860 dHeader2Length[currentSeqHeader] = currentSeqLength | |
| 861 | |
| 862 return dHeader2Length | |
| 863 | |
| 864 | |
| 865 ## Convert headers from a fasta file having chunk coordinates | |
| 866 # | |
| 867 # @param inFile string name of the input fasta file | |
| 868 # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes | |
| 869 # @param outFile string name of the output file | |
| 870 # | |
| 871 @staticmethod | |
| 872 def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile): | |
| 873 inFileHandler = open(inFile, "r") | |
| 874 outFileHandler = open(outFile, "w") | |
| 875 dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile) | |
| 876 iConvCoord = ConvCoord() | |
| 877 for line in inFileHandler: | |
| 878 if line[0] == ">": | |
| 879 if "{Fragment}" in line: | |
| 880 chkName = line.split(" ")[1] | |
| 881 chrName = dChunk2Map[chkName].seqname | |
| 882 lCoordPairs = line.split(" ")[3].split(",") | |
| 883 lRangesOnChk = [] | |
| 884 for i in lCoordPairs: | |
| 885 iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1])) | |
| 886 lRangesOnChk.append(iRange) | |
| 887 lRangesOnChr = [] | |
| 888 for iRange in lRangesOnChk: | |
| 889 lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map)) | |
| 890 newHeader = line[1:-1].split(" ")[0] | |
| 891 newHeader += " %s" % chrName | |
| 892 newHeader += " {Fragment}" | |
| 893 newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end) | |
| 894 for iRange in lRangesOnChr[1:]: | |
| 895 newHeader += ",%i..%i" % (iRange.start, iRange.end) | |
| 896 outFileHandler.write(">%s\n" % newHeader) | |
| 897 else: | |
| 898 chkName = line.split("_")[1].split(" ")[0] | |
| 899 chrName = dChunk2Map[chkName].seqname | |
| 900 coords = line[line.find("[")+1 : line.find("]")] | |
| 901 start = int(coords.split(",")[0]) | |
| 902 end = int(coords.split(",")[1]) | |
| 903 iRangeOnChk = Range(chkName, start, end) | |
| 904 iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map) | |
| 905 newHeader = line[1:-1].split("_")[0] | |
| 906 newHeader += " %s" % chrName | |
| 907 newHeader += " %s" % line[line.find("(") : line.find(")")+1] | |
| 908 newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd()) | |
| 909 outFileHandler.write(">%s\n" % newHeader) | |
| 910 else: | |
| 911 outFileHandler.write(line) | |
| 912 inFileHandler.close() | |
| 913 outFileHandler.close() | |
| 914 | |
| 915 | |
| 916 ## Convert a fasta file to a length file | |
| 917 # | |
| 918 # @param inFile string name of the input fasta file | |
| 919 # @param outFile string name of the output file | |
| 920 # | |
| 921 @staticmethod | |
| 922 def convertFastaToLength(inFile, outFile = ""): | |
| 923 if not outFile: | |
| 924 outFile = "%s.length" % inFile | |
| 925 | |
| 926 if inFile: | |
| 927 with open(inFile) as inFH: | |
| 928 with open(outFile, "w") as outFH: | |
| 929 bioseq = Bioseq() | |
| 930 bioseq.read(inFH) | |
| 931 while bioseq.sequence: | |
| 932 seqLen = bioseq.getLength() | |
| 933 outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen)) | |
| 934 bioseq.read(inFH) | |
| 935 | |
| 936 | |
| 937 ## Convert a fasta file to a seq file | |
| 938 # | |
| 939 # @param inFile string name of the input fasta file | |
| 940 # @param outFile string name of the output file | |
| 941 # | |
| 942 @staticmethod | |
| 943 def convertFastaToSeq(inFile, outFile = ""): | |
| 944 if not outFile: | |
| 945 outFile = "%s.seq" % inFile | |
| 946 | |
| 947 if inFile: | |
| 948 with open(inFile) as inFH: | |
| 949 with open(outFile, "w") as outFH: | |
| 950 bioseq = Bioseq() | |
| 951 bioseq.read(inFH) | |
| 952 while bioseq.sequence: | |
| 953 seqLen = bioseq.getLength() | |
| 954 outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \ | |
| 955 bioseq.sequence, bioseq.header, seqLen)) | |
| 956 bioseq.read(inFH) | |
| 957 | |
| 958 | |
| 959 ## Splice an input fasta file using coordinates in a Map file | |
| 960 # | |
| 961 # @note the coordinates should be merged beforehand! | |
| 962 # | |
| 963 @staticmethod | |
| 964 def spliceFromCoords(genomeFile, coordFile, obsFile): | |
| 965 genomeFileHandler = open(genomeFile) | |
| 966 obsFileHandler = open(obsFile, "w") | |
| 967 dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile(coordFile) | |
| 968 | |
| 969 bs = Bioseq() | |
| 970 bs.read(genomeFileHandler) | |
| 971 while bs.sequence: | |
| 972 | |
| 973 if dChr2Maps.has_key(bs.header): | |
| 974 lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax(dChr2Maps[bs.header]) | |
| 975 splicedSeq = [] | |
| 976 currentSite = 0 | |
| 977 for iMap in lCoords: | |
| 978 minSplice = iMap.getMin() - 1 | |
| 979 if minSplice > currentSite: | |
| 980 splicedSeq += bs.sequence[currentSite : minSplice] | |
| 981 if currentSite <= iMap.getMax(): | |
| 982 currentSite = iMap.getMax() | |
| 983 splicedSeq += bs.sequence[currentSite:] | |
| 984 bs.sequence = "".join(splicedSeq) | |
| 985 bs.write(obsFileHandler) | |
| 986 bs.read(genomeFileHandler) | |
| 987 | |
| 988 genomeFileHandler.close() | |
| 989 obsFileHandler.close() | |
| 990 | |
| 991 | |
| 992 ## Shuffle input sequences (single file or files in a directory) | |
| 993 # | |
| 994 @staticmethod | |
| 995 def dbShuffle(inData, outData, verbose = 0): | |
| 996 if CheckerUtils.isExecutableInUserPath("esl-shuffle"): | |
| 997 prg = "esl-shuffle" | |
| 998 else : prg = "shuffle" | |
| 999 genericCmd = prg + " --seed 1 -d INPUT > OUTPUT" | |
| 1000 if os.path.isfile(inData): | |
| 1001 if verbose > 0: | |
| 1002 print "shuffle input file '%s'" % inData | |
| 1003 cmd = genericCmd.replace("INPUT", inData).replace("OUTPUT", outData) | |
| 1004 print cmd | |
| 1005 returnStatus = os.system(cmd) | |
| 1006 if returnStatus: | |
| 1007 sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus) | |
| 1008 sys.exit(1) | |
| 1009 | |
| 1010 elif os.path.isdir(inData): | |
| 1011 if verbose > 0: | |
| 1012 print "shuffle files in input directory '%s'" % inData | |
| 1013 if os.path.exists(outData): | |
| 1014 shutil.rmtree(outData) | |
| 1015 os.mkdir(outData) | |
| 1016 lInputFiles = glob.glob("%s/*.fa" % inData) | |
| 1017 nbFastaFiles = 0 | |
| 1018 for inputFile in lInputFiles: | |
| 1019 nbFastaFiles += 1 | |
| 1020 if verbose > 1: | |
| 1021 print "%3i / %3i" % (nbFastaFiles, len(lInputFiles)) | |
| 1022 fastaBaseName = os.path.basename(inputFile) | |
| 1023 prefix = os.path.splitext(fastaBaseName)[0] | |
| 1024 cmd = genericCmd.replace("INPUT", inputFile).replace("OUTPUT", "%s/%s_shuffle.fa" % (outData, prefix)) | |
| 1025 returnStatus = os.system(cmd) | |
| 1026 if returnStatus: | |
| 1027 sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus) | |
| 1028 sys.exit(1) | |
| 1029 | |
| 1030 | |
| 1031 ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers | |
| 1032 # | |
| 1033 # @param inClusterFileName string input cluster file name | |
| 1034 # @param inFastaFileName string input fasta file name | |
| 1035 # @param outFileName string output file name | |
| 1036 # @param verbosity integer verbosity | |
| 1037 # | |
| 1038 @staticmethod | |
| 1039 def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0): | |
| 1040 dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity) | |
| 1041 iFastaParser = FastaParser(inFastaFileName) | |
| 1042 with open(outFileName, "w") as f: | |
| 1043 for iSequence in iFastaParser.getIterator(): | |
| 1044 | |
| 1045 header = iSequence.getName() | |
| 1046 if dHeader2ClusterClusterMember.get(header): | |
| 1047 cluster = dHeader2ClusterClusterMember[header][0] | |
| 1048 member = dHeader2ClusterClusterMember[header][1] | |
| 1049 else: | |
| 1050 clusterIdForSingletonCluster += 1 | |
| 1051 cluster = clusterIdForSingletonCluster | |
| 1052 member = 1 | |
| 1053 | |
| 1054 newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header) | |
| 1055 iSequence.setName(newHeader) | |
| 1056 f.write(iSequence.printFasta()) | |
| 1057 | |
| 1058 @staticmethod | |
| 1059 def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0): | |
| 1060 dHeader2ClusterClusterMember = {} | |
| 1061 clusterId = 0 | |
| 1062 with open(inClusterFileName) as f: | |
| 1063 for line in f: | |
| 1064 lineWithoutLastChar = line.rstrip() | |
| 1065 lHeaders = lineWithoutLastChar.split("\t") | |
| 1066 clusterId += 1 | |
| 1067 if verbosity > 0: | |
| 1068 print "%i sequences in cluster %i" % (len(lHeaders), clusterId) | |
| 1069 memberId = 0 | |
| 1070 for header in lHeaders: | |
| 1071 memberId += 1 | |
| 1072 dHeader2ClusterClusterMember[header] = (clusterId, memberId) | |
| 1073 if verbosity > 0: | |
| 1074 print "%i clusters" % clusterId | |
| 1075 return dHeader2ClusterClusterMember, clusterId | |
| 1076 | |
| 1077 @staticmethod | |
| 1078 def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""): | |
| 1079 """ | |
| 1080 Write a map file from fasta output of clustering tool. | |
| 1081 Warning: only works if input fasta headers are formated like LTRharvest fasta output. | |
| 1082 """ | |
| 1083 if not outMapFileName: | |
| 1084 outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0]) | |
| 1085 | |
| 1086 fileDb = open(fastaFileNameFromClustering) | |
| 1087 fileMap = open(outMapFileName, "w") | |
| 1088 seq = Bioseq() | |
| 1089 numseq = 0 | |
| 1090 seq.read(fileDb) | |
| 1091 while seq.sequence: | |
| 1092 numseq = numseq + 1 | |
| 1093 ID = seq.header.split(' ')[0].split('_')[0] | |
| 1094 chunk = seq.header.split(' ')[0].split('_')[1] | |
| 1095 start = seq.header.split(' ')[-1].split(',')[0][1:] | |
| 1096 end = seq.header.split(' ')[-1].split(',')[1][:-1] | |
| 1097 line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end) | |
| 1098 fileMap.write("%s\n" % line) | |
| 1099 seq.read(fileDb) | |
| 1100 | |
| 1101 fileDb.close() | |
| 1102 fileMap.close() | |
| 1103 print "saved in %s" % outMapFileName | |
| 1104 | |
| 1105 @staticmethod | |
| 1106 def getNstretchesRangesList(fastaFileName, nbN = 2): | |
| 1107 lNstretchesRanges = [] | |
| 1108 if nbN != 0: | |
| 1109 iBSDB = BioseqDB(fastaFileName) | |
| 1110 for iBS in iBSDB.db: | |
| 1111 nbNFound = 0 | |
| 1112 start = 1 | |
| 1113 pos = 1 | |
| 1114 lastPos = 0 | |
| 1115 | |
| 1116 while pos <= iBS.getLength(): | |
| 1117 if nbNFound == 0: | |
| 1118 start = pos | |
| 1119 | |
| 1120 while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']: | |
| 1121 nbNFound += 1 | |
| 1122 pos += 1 | |
| 1123 lastPos = pos | |
| 1124 | |
| 1125 if pos - lastPos >= nbN: | |
| 1126 if nbNFound >= nbN: | |
| 1127 lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1)) | |
| 1128 nbNFound = 0 | |
| 1129 pos += 1 | |
| 1130 | |
| 1131 if nbNFound >= nbN: | |
| 1132 lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1)) | |
| 1133 | |
| 1134 lNstretchesRanges.sort(key = lambda iRange: (iRange.getSeqname(), iRange.getStart(), iRange.getEnd()), reverse = False) | |
| 1135 | |
| 1136 return lNstretchesRanges | |
| 1137 | |
| 1138 | |
| 1139 @staticmethod | |
| 1140 def writeNstretches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"): | |
| 1141 lNstretchesRanges = FastaUtils.getNstretchesRangesList(fastaFileName, nbN) | |
| 1142 | |
| 1143 outFormat = outFormat.lower() | |
| 1144 if outFormat in ["gff", "gff3"]: | |
| 1145 outFormat = "gff3" | |
| 1146 else: | |
| 1147 outFormat = "map" | |
| 1148 | |
| 1149 if outFileName == "": | |
| 1150 outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat) | |
| 1151 | |
| 1152 with open(outFileName, "w") as fH: | |
| 1153 if outFormat == "gff3": | |
| 1154 fH.write("##gff-version 3\n") | |
| 1155 for iRange in lNstretchesRanges: | |
| 1156 seq = iRange.getSeqname() | |
| 1157 start = iRange.getStart() | |
| 1158 end = iRange.getEnd() | |
| 1159 if outFormat == "gff3": | |
| 1160 fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end)) | |
| 1161 else: | |
| 1162 fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end)) | |
| 1163 |
