comparison commons/core/seq/FastaUtils.py @ 36:44d5973c188c

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 15:02:29 -0400
parents
children
comparison
equal deleted inserted replaced
35:d94018ca4ada 36:44d5973c188c
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31
32 import os
33 import sys
34 import string
35 import math
36 import shutil
37 import re
38 import glob
39 from operator import itemgetter
40 from commons.core.seq.BioseqDB import BioseqDB
41 from commons.core.seq.Bioseq import Bioseq
42 from commons.core.coord.MapUtils import MapUtils
43 from commons.core.coord.Range import Range
44 from commons.core.checker.CheckerUtils import CheckerUtils
45 from commons.core.launcher.LauncherUtils import LauncherUtils
46 from commons.core.coord.ConvCoord import ConvCoord
47 from commons.core.parsing.FastaParser import FastaParser
48
49
50 ## Static methods for fasta file manipulation
51 #
52 class FastaUtils( object ):
53
54 ## Count the number of sequences in the input fasta file
55 #
56 # @param inFile name of the input fasta file
57 #
58 # @return integer number of sequences in the input fasta file
59 #
60 @staticmethod
61 def dbSize( inFile ):
62 nbSeq = 0
63 inFileHandler = open( inFile, "r" )
64 line = inFileHandler.readline()
65 while line:
66 if line[0] == ">":
67 nbSeq = nbSeq + 1
68 line = inFileHandler.readline()
69 inFileHandler.close()
70
71 return nbSeq
72
73
74 ## Compute the cumulative sequence length in the input fasta file
75 #
76 # @param inFile handler of the input fasta file
77 #
78 @staticmethod
79 def dbCumLength( inFile ):
80 cumLength = 0
81 line = inFile.readline()
82 while line:
83 if line[0] != ">":
84 cumLength += len(string.rstrip(line))
85 line = inFile.readline()
86
87 return cumLength
88
89
90 ## Return a list with the length of each sequence in the input fasta file
91 #
92 # @param inFile string name of the input fasta file
93 #
94 @staticmethod
95 def dbLengths( inFile ):
96 lLengths = []
97 inFileHandler = open( inFile, "r" )
98 currentLength = 0
99 line = inFileHandler.readline()
100 while line:
101 if line[0] == ">":
102 if currentLength != 0:
103 lLengths.append( currentLength )
104 currentLength = 0
105 else:
106 currentLength += len(line[:-1])
107 line = inFileHandler.readline()
108 lLengths.append( currentLength )
109 inFileHandler.close()
110 return lLengths
111
112
113 ## Retrieve the sequence headers present in the input fasta file
114 #
115 # @param inFile string name of the input fasta file
116 # @param verbose integer level of verbosity
117 #
118 # @return list of sequence headers
119 #
120 @staticmethod
121 def dbHeaders( inFile, verbose=0 ):
122 lHeaders = []
123
124 inFileHandler = open( inFile, "r" )
125 line = inFileHandler.readline()
126 while line:
127 if line[0] == ">":
128 lHeaders.append( string.rstrip(line[1:]) )
129 if verbose > 0:
130 print string.rstrip(line[1:])
131 line = inFileHandler.readline()
132 inFileHandler.close()
133
134 return lHeaders
135
136
137 ## Cut a data bank into chunks according to the input parameters
138 # If a sequence is shorter than the threshold, it is only renamed (not cut)
139 #
140 # @param inFileName string name of the input fasta file
141 # @param chkLgth string chunk length (in bp, default=200000)
142 # @param chkOver string chunk overlap (in bp, default=10000)
143 # @param wordN string N stretch word length (default=11, 0 for no detection)
144 # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
145 # @param clean boolean remove 'cut' and 'Nstretch' files
146 # @param verbose integer (default = 0)
147 #
148 @staticmethod
149 def dbChunks( inFileName, chkLgth="200000", chkOver="10000", wordN="11", outFilePrefix="", clean=False, verbose=0 ):
150 nbSeq = FastaUtils.dbSize( inFileName )
151 if verbose > 0:
152 print "cut the %i input sequences with cutterDB..." % ( nbSeq )
153 sys.stdout.flush()
154
155 prg = "cutterDB"
156 cmd = prg
157 cmd += " -l %s" % ( chkLgth )
158 cmd += " -o %s" %( chkOver )
159 cmd += " -w %s" % ( wordN )
160 cmd += " %s" % ( inFileName )
161 returnStatus = os.system( cmd )
162 if returnStatus != 0:
163 msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus )
164 sys.stderr.write( "%s\n" % ( msg ) )
165 sys.exit(1)
166
167 nbChunks = FastaUtils.dbSize( "%s_cut" % ( inFileName ) )
168 if verbose > 0:
169 print "done (%i chunks)" % ( nbChunks )
170 sys.stdout.flush()
171
172 if verbose > 0:
173 print "rename the headers..."
174 sys.stdout.flush()
175
176 if outFilePrefix == "":
177 outFastaName = inFileName + "_chunks.fa"
178 outMapName = inFileName + "_chunks.map"
179 else:
180 outFastaName = outFilePrefix + ".fa"
181 outMapName = outFilePrefix + ".map"
182
183 inFile = open( "%s_cut" % ( inFileName ), "r" )
184 line = inFile.readline()
185
186 outFasta = open( outFastaName, "w" )
187 outMap = open( outMapName, "w" )
188
189 # read line after line (no need for big RAM) and change the sequence headers
190 while line:
191
192 if line[0] == ">":
193 if verbose > 1:
194 print "rename '%s'" % ( line[:-1] ); sys.stdout.flush()
195 data = line[:-1].split(" ")
196 seqID = data[0].split(">")[1]
197 newHeader = "chunk%s" % ( str(seqID).zfill( len(str(nbChunks)) ) )
198 oldHeader = data[2]
199 seqStart = data[4].split("..")[0]
200 seqEnd = data[4].split("..")[1]
201 outMap.write( "%s\t%s\t%s\t%s\n" % ( newHeader, oldHeader, seqStart, seqEnd ) )
202 outFasta.write( ">%s\n" % ( newHeader ) )
203
204 else:
205 outFasta.write( line.upper() )
206
207 line = inFile.readline()
208
209 inFile.close()
210 outFasta.close()
211 outMap.close()
212
213 if clean == True:
214 os.remove(inFileName + "_cut")
215 os.remove(inFileName + ".Nstretch.map")
216
217
218 ## Split the input fasta file in several output files
219 #
220 # @param inFile string name of the input fasta file
221 # @param nbSeqPerBatch integer number of sequences per output file
222 # @param newDir boolean put the sequences in a new directory called 'batches'
223 # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1')
224 # @param prefix prefix in output file name
225 # @param verbose integer verbosity level (default = 0)
226 #
227 @staticmethod
228 def dbSplit( inFile, nbSeqPerBatch, newDir, useSeqHeader=False, prefix="batch", verbose=0 ):
229 if not os.path.exists( inFile ):
230 msg = "ERROR: file '%s' doesn't exist" % ( inFile )
231 sys.stderr.write( "%s\n" % ( msg ) )
232 sys.exit(1)
233
234 nbSeq = FastaUtils.dbSize( inFile )
235
236 nbBatches = int( math.ceil( nbSeq / float(nbSeqPerBatch) ) )
237 if verbose > 0:
238 print "save the %i input sequences into %i batches" % ( nbSeq, nbBatches )
239 sys.stdout.flush()
240
241 if nbSeqPerBatch > 1 and useSeqHeader:
242 useSeqHeader = False
243
244 if newDir == True:
245 if os.path.exists( "batches" ):
246 shutil.rmtree( "batches" )
247 os.mkdir( "batches" )
248 os.chdir( "batches" )
249 os.system( "ln -s ../%s ." % ( inFile ) )
250
251 inFileHandler = open( inFile, "r" )
252 inFileHandler.seek( 0, 0 )
253 countBatch = 0
254 countSeq = 0
255 line = inFileHandler.readline()
256 while line:
257 if line == "":
258 break
259 if line[0] == ">":
260 countSeq += 1
261 if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1:
262 if "outFile" in locals():
263 outFile.close()
264 countBatch += 1
265 if nbSeqPerBatch == 1 and useSeqHeader:
266 outFileName = "%s.fa" % ( line[1:-1].replace(" ","_") )
267 else:
268 outFileName = "%s_%s.fa" % ( prefix, str(countBatch).zfill( len(str(nbBatches)) ) )
269 outFile = open( outFileName, "w" )
270 if verbose > 1:
271 print "saving seq '%s' in file '%s'..." % ( line[1:40][:-1], outFileName )
272 sys.stdout.flush()
273 outFile.write( line )
274 line = inFileHandler.readline()
275 inFileHandler.close()
276
277 if newDir == True:
278 os.remove( os.path.basename( inFile ) )
279 os.chdir( ".." )
280
281
282 ## Split the input fasta file in several output files
283 #
284 # @param inFileName string name of the input fasta file
285 # @param maxSize integer max cumulative length for each output file
286 #
287 @staticmethod
288 def splitFastaFileInBatches(inFileName, maxSize = 200000):
289 iBioseqDB = BioseqDB(inFileName)
290 lHeadersSizeTuples = []
291 for iBioseq in iBioseqDB.db:
292 lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength()))
293
294 lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize)
295 os.mkdir("batches")
296 os.chdir("batches")
297
298 iterator = 0
299 for lHeader in lHeadersList :
300 iterator += 1
301 with open("batch_%s.fa" % iterator, 'w') as f :
302 for header in lHeader :
303 iBioseq = iBioseqDB.fetch(header)
304 iBioseq.write(f)
305 os.chdir("..")
306
307
308 ## Split the input fasta file in several output files according to their cluster identifier
309 #
310 # @param inFileName string name of the input fasta file
311 # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust)
312 # @param simplifyHeader boolean simplify the headers
313 # @param createDir boolean put the sequences in different directories
314 # @param outPrefix string prefix of the output files (default='seqCluster')
315 # @param verbose integer (default = 0)
316 #
317 @staticmethod
318 def splitSeqPerCluster( inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix="seqCluster", verbose=0 ):
319 if not os.path.exists( inFileName ):
320 print "ERROR: %s doesn't exist" % ( inFileName )
321 sys.exit(1)
322
323 inFile = open( inFileName, "r" )
324
325 line = inFile.readline()
326 if line:
327 name = line.split(" ")[0]
328 if "Cluster" in name:
329 clusterID = name.split("Cluster")[1].split("Mb")[0]
330 seqID = name.split("Mb")[1]
331 else:
332 clusterID = name.split("Cl")[0].split("Gr")[1] # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust
333 if "Q" in name.split("Gr")[0]:
334 seqID = name.split("Gr")[0].split("MbQ")[1]
335 elif "S" in name:
336 seqID = name.split("Gr")[0].split("MbS")[1]
337 sClusterIDs = set( [ clusterID ] )
338 if simplifyHeader == True:
339 header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
340 else:
341 header = line[1:-1]
342 if createDir == True:
343 if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
344 os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
345 os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
346 outFileName = "%s%s.fa" % ( outPrefix, clusterID )
347 outFile = open( outFileName, "w" )
348 outFile.write( ">%s\n" % ( header ) )
349 prevClusterID = clusterID
350
351 line = inFile.readline()
352 while line:
353 if line[0] == ">":
354 name = line.split(" ")[0]
355 if "Cluster" in name:
356 clusterID = name.split("Cluster")[1].split("Mb")[0]
357 seqID = name.split("Mb")[1]
358 else:
359 clusterID = name.split("Cl")[0].split("Gr")[1]
360 if "Q" in name.split("Gr")[0]:
361 seqID = name.split("Gr")[0].split("MbQ")[1]
362 elif "S" in name:
363 seqID = name.split("Gr")[0].split("MbS")[1]
364
365 if clusterID != prevClusterID:
366 outFile.close()
367
368 if simplifyHeader == True:
369 header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
370 else:
371 header = line[1:-1]
372
373 if createDir == True:
374 os.chdir( ".." )
375 if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
376 os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
377 os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
378
379 outFileName = "%s%s.fa" % ( outPrefix, clusterID )
380 if not os.path.exists( outFileName ):
381 outFile = open( outFileName, "w" )
382 else:
383 if clusterID != prevClusterID:
384 outFile.close()
385 outFile = open( outFileName, "a" )
386 outFile.write( ">%s\n" % ( header ) )
387 prevClusterID = clusterID
388 sClusterIDs.add( clusterID )
389
390 else:
391 outFile.write( line )
392
393 line = inFile.readline()
394
395 outFile.close()
396 if verbose > 0:
397 print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush()
398
399 if createDir == True:
400 os.chdir("..")
401 else:
402 print "WARNING: empty input file - no cluster found"; sys.stdout.flush()
403
404
405 ## Filter a fasta file in two fasta files using the length of each sequence as a criteron
406 #
407 # @param len_min integer length sequence criterion to filter
408 # @param inFileName string name of the input fasta file
409 # @param verbose integer (default = 0)
410 #
411 @staticmethod
412 def dbLengthFilter( len_min, inFileName, verbose=0 ):
413 file_db = open( inFileName, "r" )
414 file_dbInf = open( inFileName+".Inf"+str(len_min), "w" )
415 file_dbSup = open( inFileName+".Sup"+str(len_min), "w" )
416 seq = Bioseq()
417 numseq = 0
418 nbsave = 0
419
420 seq.read( file_db )
421 while seq.sequence:
422 l = seq.getLength()
423 numseq = numseq + 1
424 if l >= len_min:
425 seq.write( file_dbSup )
426 if verbose > 0:
427 print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Sup !!'
428 nbsave=nbsave+1
429 else:
430 seq.write( file_dbInf )
431 if verbose > 0:
432 print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Inf !!'
433 nbsave=nbsave+1
434 seq.read( file_db )
435
436 file_db.close()
437 file_dbInf.close()
438 file_dbSup.close()
439 if verbose > 0:
440 print nbsave,'saved sequences in ',inFileName+".Inf"+str(len_min)," and ", inFileName+".Sup"+str(len_min)
441
442
443 ## Extract the longest sequences from a fasta file
444 #
445 # @param num integer maximum number of sequences in the output file
446 # @param inFileName string name of the input fasta file
447 # @param outFileName string name of the output fasta file
448 # @param minThresh integer minimum length threshold (default=0)
449 # @param verbose integer (default = 0)
450 #
451 @staticmethod
452 def dbLongestSequences( num, inFileName, outFileName="", verbose=0, minThresh=0 ):
453 bsDB = BioseqDB( inFileName )
454 if verbose > 0:
455 print "nb of input sequences: %i" % ( bsDB.getSize() )
456
457 if outFileName == "":
458 outFileName = inFileName + ".best" + str(num)
459 outFile = open( outFileName, "w" )
460
461 if bsDB.getSize()==0:
462 return 0
463
464 num = int(num)
465 if verbose > 0:
466 print "keep the %i longest sequences" % ( num )
467 if minThresh > 0:
468 print "with length > %i bp" % ( minThresh )
469 sys.stdout.flush()
470
471 # retrieve the length of each input sequence
472 tmpLSeqLgth = []
473 seqNum = 0
474 for bs in bsDB.db:
475 seqNum += 1
476 tmpLSeqLgth.append( bs.getLength() )
477 if verbose > 1:
478 print "%d seq %s : %d bp" % ( seqNum, bs.header[0:40], bs.getLength() )
479 sys.stdout.flush()
480
481 # sort the lengths
482 tmpLSeqLgth.sort()
483 tmpLSeqLgth.reverse()
484
485 # select the longest
486 lSeqLgth = []
487 for i in xrange( 0, min(num,len(tmpLSeqLgth)) ):
488 if tmpLSeqLgth[i] >= minThresh:
489 lSeqLgth.append( tmpLSeqLgth[i] )
490 if verbose > 0:
491 print "selected max length: %i" % ( max(lSeqLgth) )
492 print "selected min length: %i" % ( min(lSeqLgth) )
493 sys.stdout.flush()
494
495 # save the longest
496 inFile = open( inFileName )
497 seqNum = 0
498 nbSave = 0
499 for bs in bsDB.db:
500 seqNum += 1
501 if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh:
502 bs.write( outFile )
503 if verbose > 1:
504 print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] )
505 sys.stdout.flush()
506 nbSave += 1
507 if nbSave == num:
508 break
509 inFile.close()
510 outFile.close()
511 if verbose > 0:
512 print nbSave, "saved sequences in ", outFileName
513 sys.stdout.flush()
514
515 return 0
516
517
518 ## Extract all the sequence headers from a fasta file and write them in a new fasta file
519 #
520 # @param inFileName string name of the input fasta file
521 # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers')
522 #
523 @staticmethod
524 def dbExtractSeqHeaders( inFileName, outFileName="" ):
525 lHeaders = FastaUtils.dbHeaders( inFileName )
526
527 if outFileName == "":
528 outFileName = inFileName + ".headers"
529
530 outFile = open( outFileName, "w" )
531 for i in lHeaders:
532 outFile.write( i + "\n" )
533 outFile.close()
534
535 return 0
536
537
538 ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file
539 #
540 # @param pattern regular expression to search in headers
541 # @param inFileName string name of the input fasta file
542 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
543 # @param verbose integer verbosity level (default = 0)
544 #
545 @staticmethod
546 def dbExtractByPattern( pattern, inFileName, outFileName="", verbose=0 ):
547 if pattern == "":
548 return
549
550 if outFileName == "":
551 outFileName = inFileName + '.extracted'
552 outFile = open( outFileName, 'w' )
553
554 patternTosearch = re.compile( pattern )
555 bioseq = Bioseq()
556 bioseqNb = 0
557 savedBioseqNb = 0
558 inFile = open( inFileName, "r" )
559 bioseq.read( inFile )
560 while bioseq.sequence:
561 bioseqNb = bioseqNb + 1
562 m = patternTosearch.search( bioseq.header )
563 if m:
564 bioseq.write( outFile )
565 if verbose > 1:
566 print 'sequence num',bioseqNb,'matched on',m.group(),'[',bioseq.header[0:40],'...] saved !!'
567 savedBioseqNb = savedBioseqNb + 1
568 bioseq.read( inFile )
569 inFile.close()
570
571 outFile.close()
572
573 if verbose > 0:
574 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
575
576
577 ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file
578 #
579 # @param patternFileName string file containing regular expression to search in headers
580 # @param inFileName string name of the input fasta file
581 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
582 # @param verbose integer verbosity level (default = 0)
583 #
584 @staticmethod
585 def dbExtractByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
586
587 if patternFileName == "":
588 print "ERROR: no file of pattern"
589 sys.exit(1)
590
591 bioseq = Bioseq()
592 bioseqNb = 0
593 savedBioseqNb = 0
594 lHeaders = []
595
596 inFile = open( inFileName, "r" )
597 bioseq.read( inFile )
598 while bioseq.sequence != None:
599 lHeaders.append( bioseq.header )
600 bioseq.read( inFile )
601 inFile.close()
602
603 lHeadersToKeep = []
604 patternFile = open( patternFileName, "r" )
605 for pattern in patternFile:
606 if verbose > 0:
607 print "pattern: ",pattern[:-1]; sys.stdout.flush()
608
609 patternToSearch = re.compile(pattern[:-1])
610 for h in lHeaders:
611 if patternToSearch.search(h):
612 lHeadersToKeep.append(h)
613 patternFile.close()
614
615 if outFileName == "":
616 outFileName = inFileName + ".extracted"
617 outFile=open( outFileName, "w" )
618
619 inFile = open( inFileName, "r" )
620 bioseq.read(inFile)
621 while bioseq.sequence:
622 bioseqNb += 1
623 if bioseq.header in lHeadersToKeep:
624 bioseq.write(outFile)
625 if verbose > 1:
626 print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
627 savedBioseqNb += 1
628 bioseq.read(inFile)
629 inFile.close()
630
631 outFile.close()
632
633 if verbose > 0:
634 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
635
636
637 ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file
638 #
639 # @param pattern regular expression to search in headers
640 # @param inFileName string name of the input fasta file
641 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
642 # @param verbose integer verbosity level (default = 0)
643 #
644 @staticmethod
645 def dbCleanByPattern( pattern, inFileName, outFileName="", verbose=0 ):
646 if pattern == "":
647 return
648
649 patternToSearch = re.compile(pattern)
650
651 if outFileName == "":
652 outFileName = inFileName + '.cleaned'
653 outFile = open(outFileName,'w')
654
655 bioseq = Bioseq()
656 bioseqNb = 0
657 savedBioseqNb = 0
658 inFile = open(inFileName)
659 bioseq.read(inFile)
660 while bioseq.sequence != None:
661 bioseqNb += 1
662 if not patternToSearch.search(bioseq.header):
663 bioseq.write(outFile)
664 if verbose > 1:
665 print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'
666 savedBioseqNb += 1
667 bioseq.read(inFile)
668 inFile.close()
669
670 outFile.close()
671
672 if verbose > 0:
673 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
674
675
676 ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file
677 #
678 # @param patternFileName string file containing regular expression to search in headers
679 # @param inFileName string name of the input fasta file
680 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
681 # @param verbose integer verbosity level (default = 0)
682 #
683 @staticmethod
684 def dbCleanByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
685 if patternFileName == "":
686 print "ERROR: no file of pattern"
687 sys.exit(1)
688
689 bioseq = Bioseq()
690 bioseqNb = 0
691 savedBioseqNb = 0
692 lHeaders = []
693 inFile = open( inFileName, "r" )
694 bioseq.read( inFile )
695 while bioseq.sequence != None:
696 bioseqNb += 1
697 lHeaders.append( bioseq.header )
698 bioseq.read( inFile )
699 inFile.close()
700
701 patternFile = open( patternFileName, "r")
702 lHeadersToRemove = []
703 for pattern in patternFile:
704 if verbose > 0:
705 print "pattern: ",pattern[:-1]; sys.stdout.flush()
706
707 patternToSearch = re.compile( pattern[:-1] )
708 for h in lHeaders:
709 if patternToSearch.search(h):
710 lHeadersToRemove.append(h)
711 patternFile.close()
712
713 if outFileName == "":
714 outFileName = inFileName + '.cleaned'
715 outFile = open( outFileName, 'w' )
716
717 bioseqNum = 0
718 inFile=open( inFileName )
719 bioseq.read( inFile )
720 while bioseq.sequence != None:
721 bioseqNum += 1
722 if bioseq.header not in lHeadersToRemove:
723 bioseq.write( outFile )
724 if verbose > 1:
725 print 'sequence num',bioseqNum,'/',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
726 savedBioseqNb += 1
727 bioseq.read( inFile )
728 inFile.close()
729
730 outFile.close()
731
732 if verbose > 0:
733 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
734
735
736 ## Find sequence's ORFs from a fasta file and write them in a tab file
737 #
738 # @param inFileName string name of the input fasta file
739 # @param orfMaxNb integer Select orfMaxNb best ORFs
740 # @param orfMinLength integer Keep ORFs with length > orfMinLength
741 # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map')
742 # @param verbose integer verbosity level (default = 0)
743 #
744 @staticmethod
745 def dbORF( inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose=0 ):
746 if outFileName == "":
747 outFileName = inFileName + ".ORF.map"
748 outFile = open( outFileName, "w" )
749
750 bioseq = Bioseq()
751 bioseqNb = 0
752
753 inFile = open( inFileName )
754 bioseq.read( inFile )
755 while bioseq.sequence != None:
756 bioseq.upCase()
757 bioseqNb += 1
758 if verbose > 0:
759 print 'sequence num',bioseqNb,'=',bioseq.getLength(),'[',bioseq.header[0:40],'...]'
760
761 orf = bioseq.findORF()
762 bestOrf = []
763 for i in orf.keys():
764 orfLen = len(orf[i])
765 for j in xrange(1, orfLen):
766 start = orf[i][j-1] + 4
767 end = orf[i][j] + 3
768 if end - start >= orfMinLength:
769 bestOrf.append( ( end-start, i+1, start, end ) )
770
771 bioseq.reverseComplement()
772
773 orf = bioseq.findORF()
774 seqLen = bioseq.getLength()
775 for i in orf.keys():
776 orfLen = len(orf[i])
777 for j in xrange(1, orfLen):
778 start = seqLen - orf[i][j-1] - 3
779 end = seqLen - orf[i][j] - 2
780 if start - end >= orfMinLength:
781 bestOrf.append( ( start-end, (i+1)*-1, start, end ) )
782
783 bestOrf.sort()
784 bestOrf.reverse()
785 bestOrfNb = len(bestOrf)
786 if orfMaxNb != 0 and orfMaxNb < bestOrfNb:
787 bestOrfNb = orfMaxNb
788 for i in xrange(0, bestOrfNb):
789 if verbose > 0:
790 print bestOrf[i]
791 outFile.write("%s\t%s\t%d\t%d\n"%("ORF|"+str(bestOrf[i][1])+\
792 "|"+str(bestOrf[i][0]),bioseq.header,
793 bestOrf[i][2],bestOrf[i][3]))
794 bioseq.read( inFile )
795
796 inFile.close()
797 outFile.close()
798
799 return 0
800
801
802 ## Sort sequences by increasing length (write a new file)
803 #
804 # @param inFileName string name of the input fasta file
805 # @param outFileName string name of the output fasta file
806 # @param verbose integer verbosity level
807 #
808 @staticmethod
809 def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0):
810 if verbose > 0:
811 print "sort sequences by increasing length"
812 sys.stdout.flush()
813 if not os.path.exists( inFileName ):
814 print "ERROR: file '%s' doesn't exist" % ( inFileName )
815 sys.exit(1)
816
817 # read each seq one by one
818 # save them in distinct temporary files
819 # with their length in the name
820 inFileHandler = open( inFileName, "r" )
821 countSeq = 0
822 bs = Bioseq()
823 bs.read( inFileHandler )
824 while bs.header:
825 countSeq += 1
826 tmpFile = "%ibp_%inb" % ( bs.getLength(), countSeq )
827 bs.appendBioseqInFile( tmpFile )
828 if verbose > 1:
829 print "%s (%i bp) saved in '%s'" % ( bs.header, bs.getLength(), tmpFile )
830 bs.header = ""
831 bs.sequence = ""
832 bs.read( inFileHandler )
833 inFileHandler.close()
834
835 # sort temporary file names
836 # concatenate them into the output file
837 if os.path.exists( outFileName ):
838 os.remove( outFileName )
839 lFiles = glob.glob( "*bp_*nb" )
840 lFiles.sort( key=lambda s:int(s.split("bp_")[0]) )
841 for fileName in lFiles:
842 cmd = "cat %s >> %s" % ( fileName, outFileName )
843 returnValue = os.system( cmd )
844 if returnValue != 0:
845 print "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName )
846 sys.exit(1)
847 os.remove( fileName )
848
849 return 0
850
851
852 ## Sort sequences by header
853 #
854 # @param inFileName string name of the input fasta file
855 # @param outFileName string name of the output fasta file
856 # @param verbose integer verbosity level
857 #
858 @staticmethod
859 def sortSequencesByHeader(inFileName, outFileName = "", verbose = 0):
860 if outFileName == "":
861 outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0]
862 iBioseqDB = BioseqDB(inFileName)
863 f = open(outFileName, "w")
864 lHeaders = sorted(iBioseqDB.getHeaderList())
865 for header in lHeaders:
866 iBioseq = iBioseqDB.fetch(header)
867 iBioseq.write(f)
868 f.close()
869
870
871 ## Return a dictionary which keys are the headers and values the length of the sequences
872 #
873 # @param inFile string name of the input fasta file
874 # @param verbose integer verbosity level
875 #
876 @staticmethod
877 def getLengthPerHeader( inFile, verbose=0 ):
878 dHeader2Length = {}
879
880 inFileHandler = open( inFile, "r" )
881 currentSeqHeader = ""
882 currentSeqLength = 0
883 line = inFileHandler.readline()
884 while line:
885 if line[0] == ">":
886 if currentSeqHeader != "":
887 dHeader2Length[ currentSeqHeader ] = currentSeqLength
888 currentSeqLength = 0
889 currentSeqHeader = line[1:-1]
890 if verbose > 0:
891 print "current header: %s" % ( currentSeqHeader )
892 sys.stdout.flush()
893 else:
894 currentSeqLength += len( line.replace("\n","") )
895 line = inFileHandler.readline()
896 dHeader2Length[ currentSeqHeader ] = currentSeqLength
897 inFileHandler.close()
898
899 return dHeader2Length
900
901
902 ## Convert headers from a fasta file having chunk coordinates
903 #
904 # @param inFile string name of the input fasta file
905 # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes
906 # @param outFile string name of the output file
907 #
908 @staticmethod
909 def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile):
910 inFileHandler = open(inFile, "r")
911 outFileHandler = open(outFile, "w")
912 dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile)
913 iConvCoord = ConvCoord()
914 line = inFileHandler.readline()
915 while line:
916 if line[0] == ">":
917 if "{Fragment}" in line:
918 chkName = line.split(" ")[1]
919 chrName = dChunk2Map[chkName].seqname
920 lCoordPairs = line.split(" ")[3].split(",")
921 lRangesOnChk = []
922 for i in lCoordPairs:
923 iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1]))
924 lRangesOnChk.append(iRange)
925 lRangesOnChr = []
926 for iRange in lRangesOnChk:
927 lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map))
928 newHeader = line[1:-1].split(" ")[0]
929 newHeader += " %s" % chrName
930 newHeader += " {Fragment}"
931 newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end)
932 for iRange in lRangesOnChr[1:]:
933 newHeader += ",%i..%i" % (iRange.start, iRange.end)
934 outFileHandler.write(">%s\n" % newHeader)
935 else:
936 chkName = line.split("_")[1].split(" ")[0]
937 chrName = dChunk2Map[chkName].seqname
938 coords = line[line.find("[")+1 : line.find("]")]
939 start = int(coords.split(",")[0])
940 end = int(coords.split(",")[1])
941 iRangeOnChk = Range(chkName, start, end)
942 iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map)
943 newHeader = line[1:-1].split("_")[0]
944 newHeader += " %s" % chrName
945 newHeader += " %s" % line[line.find("(") : line.find(")")+1]
946 newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd())
947 outFileHandler.write(">%s\n" % newHeader)
948 else:
949 outFileHandler.write(line)
950 line = inFileHandler.readline()
951 inFileHandler.close()
952 outFileHandler.close()
953
954
955 ## Convert a fasta file to a length file
956 #
957 # @param inFile string name of the input fasta file
958 # @param outFile string name of the output file
959 #
960 @staticmethod
961 def convertFastaToLength(inFile, outFile = ""):
962 if outFile == "":
963 outFile = "%s.length" % inFile
964
965 if inFile != "":
966 with open(inFile, "r") as inFH:
967 with open(outFile, "w") as outFH:
968 bioseq = Bioseq()
969 bioseq.read(inFH)
970 while bioseq.sequence != None:
971 seqLen = bioseq.getLength()
972 outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen))
973 bioseq.read(inFH)
974
975
976 ## Convert a fasta file to a seq file
977 #
978 # @param inFile string name of the input fasta file
979 # @param outFile string name of the output file
980 #
981 @staticmethod
982 def convertFastaToSeq(inFile, outFile = ""):
983 if outFile == "":
984 outFile = "%s.seq" % inFile
985
986 if inFile != "":
987 with open(inFile, "r") as inFH:
988 with open(outFile, "w") as outFH:
989 bioseq = Bioseq()
990 bioseq.read(inFH)
991 while bioseq.sequence != None:
992 seqLen = bioseq.getLength()
993 outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \
994 bioseq.sequence, bioseq.header, seqLen))
995 bioseq.read(inFH)
996
997
998 ## Splice an input fasta file using coordinates in a Map file
999 #
1000 # @note the coordinates should be merged beforehand!
1001 #
1002 @staticmethod
1003 def spliceFromCoords( genomeFile, coordFile, obsFile ):
1004 genomeFileHandler = open( genomeFile, "r" )
1005 obsFileHandler = open( obsFile, "w" )
1006 dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile )
1007
1008 bs = Bioseq()
1009 bs.read( genomeFileHandler )
1010 while bs.sequence:
1011 if dChr2Maps.has_key( bs.header ):
1012 lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] )
1013 splicedSeq = ""
1014 currentSite = 0
1015 for iMap in lCoords:
1016 minSplice = iMap.getMin() - 1
1017 if minSplice > currentSite:
1018 splicedSeq += bs.sequence[ currentSite : minSplice ]
1019 currentSite = iMap.getMax()
1020 splicedSeq += bs.sequence[ currentSite : ]
1021 bs.sequence = splicedSeq
1022 bs.write( obsFileHandler )
1023 bs.read( genomeFileHandler )
1024
1025 genomeFileHandler.close()
1026 obsFileHandler.close()
1027
1028
1029 ## Shuffle input sequences (single file or files in a directory)
1030 #
1031 @staticmethod
1032 def dbShuffle( inData, outData, verbose=0 ):
1033 if CheckerUtils.isExecutableInUserPath("esl-shuffle"):
1034 prg = "esl-shuffle"
1035 else : prg = "shuffle"
1036 genericCmd = prg + " -d INPUT > OUTPUT"
1037 if os.path.isfile( inData ):
1038 if verbose > 0:
1039 print "shuffle input file '%s'" % inData
1040 cmd = genericCmd.replace("INPUT",inData).replace("OUTPUT",outData)
1041 print cmd
1042 returnStatus = os.system( cmd )
1043 if returnStatus != 0:
1044 sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus )
1045 sys.exit(1)
1046
1047 elif os.path.isdir( inData ):
1048 if verbose > 0:
1049 print "shuffle files in input directory '%s'" % inData
1050 if os.path.exists( outData ):
1051 shutil.rmtree( outData )
1052 os.mkdir( outData )
1053 lInputFiles = glob.glob( "%s/*.fa" %( inData ) )
1054 nbFastaFiles = 0
1055 for inputFile in lInputFiles:
1056 nbFastaFiles += 1
1057 if verbose > 1:
1058 print "%3i / %3i" % ( nbFastaFiles, len(lInputFiles) )
1059 fastaBaseName = os.path.basename( inputFile )
1060 prefix, extension = os.path.splitext( fastaBaseName )
1061 cmd = genericCmd.replace("INPUT",inputFile).replace("OUTPUT","%s/%s_shuffle.fa"%(outData,prefix))
1062 returnStatus = os.system( cmd )
1063 if returnStatus != 0:
1064 sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus )
1065 sys.exit(1)
1066
1067
1068 ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers
1069 #
1070 # @param inClusterFileName string input cluster file name
1071 # @param inFastaFileName string input fasta file name
1072 # @param outFileName string output file name
1073 # @param verbosity integer verbosity
1074 #
1075 @staticmethod
1076 def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0):
1077 dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity)
1078 iFastaParser = FastaParser(inFastaFileName)
1079 with open(outFileName, "w") as f:
1080 for iSequence in iFastaParser.getIterator():
1081
1082 header = iSequence.getName()
1083 if dHeader2ClusterClusterMember.get(header):
1084 cluster = dHeader2ClusterClusterMember[header][0]
1085 member = dHeader2ClusterClusterMember[header][1]
1086 else:
1087 clusterIdForSingletonCluster += 1
1088 cluster = clusterIdForSingletonCluster
1089 member = 1
1090
1091 newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header)
1092 iSequence.setName(newHeader)
1093 f.write(iSequence.printFasta())
1094
1095 @staticmethod
1096 def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0):
1097 dHeader2ClusterClusterMember = {}
1098 clusterId = 0
1099 with open(inClusterFileName) as f:
1100 line = f.readline()
1101 while line:
1102 lineWithoutLastChar = line.rstrip()
1103 lHeaders = lineWithoutLastChar.split("\t")
1104 clusterId += 1
1105 if verbosity > 0:
1106 print "%i sequences in cluster %i" % (len(lHeaders), clusterId)
1107 memberId = 0
1108 for header in lHeaders:
1109 memberId += 1
1110 dHeader2ClusterClusterMember[header] = (clusterId, memberId)
1111 line = f.readline()
1112 if verbosity > 0:
1113 print "%i clusters" % clusterId
1114 return dHeader2ClusterClusterMember, clusterId
1115
1116 @staticmethod
1117 def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""):
1118 """
1119 Write a map file from fasta output of clustering tool.
1120 Warning: only works if input fasta headers are formated like LTRharvest fasta output.
1121 """
1122 if not outMapFileName:
1123 outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0])
1124
1125 fileDb = open(fastaFileNameFromClustering , "r")
1126 fileMap = open(outMapFileName, "w")
1127 seq = Bioseq()
1128 numseq = 0
1129 while 1:
1130 seq.read(fileDb)
1131 if seq.sequence == None:
1132 break
1133 numseq = numseq + 1
1134 ID = seq.header.split(' ')[0].split('_')[0]
1135 chunk = seq.header.split(' ')[0].split('_')[1]
1136 start = seq.header.split(' ')[-1].split(',')[0][1:]
1137 end = seq.header.split(' ')[-1].split(',')[1][:-1]
1138 line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
1139 fileMap.write(line + "\n")
1140
1141 fileDb.close()
1142 fileMap.close()
1143 print "saved in %s" % outMapFileName
1144
1145 @staticmethod
1146 def writeNstreches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"):
1147 outFormat = outFormat.lower()
1148 if outFormat in ["gff", "gff3"]:
1149 outFormat = "gff3"
1150 else:
1151 outFormat = "map"
1152
1153 lTNstretches = []
1154 if nbN != 0:
1155 iBSDB = BioseqDB(fastaFileName)
1156 for iBS in iBSDB.db:
1157 nbNFound = 0
1158 start = 1
1159 pos = 1
1160 lastPos = 0
1161
1162 while pos <= iBS.getLength():
1163 if nbNFound == 0:
1164 start = pos
1165
1166 while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']:
1167 nbNFound += 1
1168 pos += 1
1169 lastPos = pos
1170
1171 if pos - lastPos >= nbN:
1172 if nbNFound >= nbN:
1173 lTNstretches.append((iBS.getHeader(), start, lastPos - 1))
1174 nbNFound = 0
1175 pos += 1
1176
1177 if nbNFound >= nbN:
1178 lTNstretches.append((iBS.getHeader(), start, lastPos - 1))
1179
1180 lTNstretches.sort(key = itemgetter(0, 1, 2))
1181
1182 if outFileName == "":
1183 outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat)
1184
1185 with open(outFileName, "w") as fH:
1186 if outFormat == "gff3":
1187 fH.write("##gff-version 3\n")
1188 for item in lTNstretches:
1189 seq = item[0]
1190 start = item[1]
1191 end = item[2]
1192 if outFormat == "gff3":
1193 fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end))
1194 else:
1195 fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end))
1196
1197