comparison smart_toolShed/commons/core/seq/FastaUtils.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31
32 import os
33 import sys
34 import string
35 import math
36 import shutil
37 import re
38 import glob
39 from commons.core.seq.BioseqDB import BioseqDB
40 from commons.core.seq.Bioseq import Bioseq
41 from commons.core.coord.MapUtils import MapUtils
42 from commons.core.coord.Range import Range
43 from commons.core.checker.CheckerUtils import CheckerUtils
44 from commons.core.launcher.LauncherUtils import LauncherUtils
45 from commons.core.coord.ConvCoord import ConvCoord
46 from commons.core.parsing.FastaParser import FastaParser
47
48
49 ## Static methods for fasta file manipulation
50 #
51 class FastaUtils( object ):
52
53 ## Count the number of sequences in the input fasta file
54 #
55 # @param inFile name of the input fasta file
56 #
57 # @return integer number of sequences in the input fasta file
58 #
59 @staticmethod
60 def dbSize( inFile ):
61 nbSeq = 0
62 inFileHandler = open( inFile, "r" )
63 line = inFileHandler.readline()
64 while line:
65 if line[0] == ">":
66 nbSeq = nbSeq + 1
67 line = inFileHandler.readline()
68 inFileHandler.close()
69
70 return nbSeq
71
72
73 ## Compute the cumulative sequence length in the input fasta file
74 #
75 # @param inFile handler of the input fasta file
76 #
77 @staticmethod
78 def dbCumLength( inFile ):
79 cumLength = 0
80 line = inFile.readline()
81 while line:
82 if line[0] != ">":
83 cumLength += len(string.rstrip(line))
84 line = inFile.readline()
85
86 return cumLength
87
88
89 ## Return a list with the length of each sequence in the input fasta file
90 #
91 # @param inFile string name of the input fasta file
92 #
93 @staticmethod
94 def dbLengths( inFile ):
95 lLengths = []
96 inFileHandler = open( inFile, "r" )
97 currentLength = 0
98 line = inFileHandler.readline()
99 while line:
100 if line[0] == ">":
101 if currentLength != 0:
102 lLengths.append( currentLength )
103 currentLength = 0
104 else:
105 currentLength += len(line[:-1])
106 line = inFileHandler.readline()
107 lLengths.append( currentLength )
108 inFileHandler.close()
109 return lLengths
110
111
112 ## Retrieve the sequence headers present in the input fasta file
113 #
114 # @param inFile string name of the input fasta file
115 # @param verbose integer level of verbosity
116 #
117 # @return list of sequence headers
118 #
119 @staticmethod
120 def dbHeaders( inFile, verbose=0 ):
121 lHeaders = []
122
123 inFileHandler = open( inFile, "r" )
124 line = inFileHandler.readline()
125 while line:
126 if line[0] == ">":
127 lHeaders.append( string.rstrip(line[1:]) )
128 if verbose > 0:
129 print string.rstrip(line[1:])
130 line = inFileHandler.readline()
131 inFileHandler.close()
132
133 return lHeaders
134
135
136 ## Cut a data bank into chunks according to the input parameters
137 # If a sequence is shorter than the threshold, it is only renamed (not cut)
138 #
139 # @param inFileName string name of the input fasta file
140 # @param chkLgth string chunk length (in bp, default=200000)
141 # @param chkOver string chunk overlap (in bp, default=10000)
142 # @param wordN string N stretch word length (default=11, 0 for no detection)
143 # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
144 # @param clean boolean remove 'cut' and 'Nstretch' files
145 # @param verbose integer (default = 0)
146 #
147 @staticmethod
148 def dbChunks( inFileName, chkLgth="200000", chkOver="10000", wordN="11", outFilePrefix="", clean=False, verbose=0 ):
149 nbSeq = FastaUtils.dbSize( inFileName )
150 if verbose > 0:
151 print "cut the %i input sequences with cutterDB..." % ( nbSeq )
152 sys.stdout.flush()
153
154 prg = "cutterDB"
155 cmd = prg
156 cmd += " -l %s" % ( chkLgth )
157 cmd += " -o %s" %( chkOver )
158 cmd += " -w %s" % ( wordN )
159 cmd += " %s" % ( inFileName )
160 returnStatus = os.system( cmd )
161 if returnStatus != 0:
162 msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus )
163 sys.stderr.write( "%s\n" % ( msg ) )
164 sys.exit(1)
165
166 nbChunks = FastaUtils.dbSize( "%s_cut" % ( inFileName ) )
167 if verbose > 0:
168 print "done (%i chunks)" % ( nbChunks )
169 sys.stdout.flush()
170
171 if verbose > 0:
172 print "rename the headers..."
173 sys.stdout.flush()
174
175 if outFilePrefix == "":
176 outFastaName = inFileName + "_chunks.fa"
177 outMapName = inFileName + "_chunks.map"
178 else:
179 outFastaName = outFilePrefix + ".fa"
180 outMapName = outFilePrefix + ".map"
181
182 inFile = open( "%s_cut" % ( inFileName ), "r" )
183 line = inFile.readline()
184
185 outFasta = open( outFastaName, "w" )
186 outMap = open( outMapName, "w" )
187
188 # read line after line (no need for big RAM) and change the sequence headers
189 while line:
190
191 if line[0] == ">":
192 if verbose > 1:
193 print "rename '%s'" % ( line[:-1] ); sys.stdout.flush()
194 data = line[:-1].split(" ")
195 seqID = data[0].split(">")[1]
196 newHeader = "chunk%s" % ( str(seqID).zfill( len(str(nbChunks)) ) )
197 oldHeader = data[2]
198 seqStart = data[4].split("..")[0]
199 seqEnd = data[4].split("..")[1]
200 outMap.write( "%s\t%s\t%s\t%s\n" % ( newHeader, oldHeader, seqStart, seqEnd ) )
201 outFasta.write( ">%s\n" % ( newHeader ) )
202
203 else:
204 outFasta.write( line.upper() )
205
206 line = inFile.readline()
207
208 inFile.close()
209 outFasta.close()
210 outMap.close()
211
212 if clean == True:
213 os.remove(inFileName + "_cut")
214 os.remove(inFileName + ".Nstretch.map")
215
216
217 ## Split the input fasta file in several output files
218 #
219 # @param inFile string name of the input fasta file
220 # @param nbSeqPerBatch integer number of sequences per output file
221 # @param newDir boolean put the sequences in a new directory called 'batches'
222 # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1')
223 # @param prefix prefix in output file name
224 # @param verbose integer verbosity level (default = 0)
225 #
226 @staticmethod
227 def dbSplit( inFile, nbSeqPerBatch, newDir, useSeqHeader=False, prefix="batch", verbose=0 ):
228 if not os.path.exists( inFile ):
229 msg = "ERROR: file '%s' doesn't exist" % ( inFile )
230 sys.stderr.write( "%s\n" % ( msg ) )
231 sys.exit(1)
232
233 nbSeq = FastaUtils.dbSize( inFile )
234
235 nbBatches = int( math.ceil( nbSeq / float(nbSeqPerBatch) ) )
236 if verbose > 0:
237 print "save the %i input sequences into %i batches" % ( nbSeq, nbBatches )
238 sys.stdout.flush()
239
240 if nbSeqPerBatch > 1 and useSeqHeader:
241 useSeqHeader = False
242
243 if newDir == True:
244 if os.path.exists( "batches" ):
245 shutil.rmtree( "batches" )
246 os.mkdir( "batches" )
247 os.chdir( "batches" )
248 os.system( "ln -s ../%s ." % ( inFile ) )
249
250 inFileHandler = open( inFile, "r" )
251 inFileHandler.seek( 0, 0 )
252 countBatch = 0
253 countSeq = 0
254 line = inFileHandler.readline()
255 while line:
256 if line == "":
257 break
258 if line[0] == ">":
259 countSeq += 1
260 if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1:
261 if "outFile" in locals():
262 outFile.close()
263 countBatch += 1
264 if nbSeqPerBatch == 1 and useSeqHeader:
265 outFileName = "%s.fa" % ( line[1:-1].replace(" ","_") )
266 else:
267 outFileName = "%s_%s.fa" % ( prefix, str(countBatch).zfill( len(str(nbBatches)) ) )
268 outFile = open( outFileName, "w" )
269 if verbose > 1:
270 print "saving seq '%s' in file '%s'..." % ( line[1:40][:-1], outFileName )
271 sys.stdout.flush()
272 outFile.write( line )
273 line = inFileHandler.readline()
274 inFileHandler.close()
275
276 if newDir == True:
277 os.remove( os.path.basename( inFile ) )
278 os.chdir( ".." )
279
280
281 ## Split the input fasta file in several output files
282 #
283 # @param inFileName string name of the input fasta file
284 # @param maxSize integer max cumulative length for each output file
285 #
286 @staticmethod
287 def splitFastaFileInBatches(inFileName, maxSize = 200000):
288 iBioseqDB = BioseqDB(inFileName)
289 lHeadersSizeTuples = []
290 for iBioseq in iBioseqDB.db:
291 lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength()))
292
293 lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize)
294 os.mkdir("batches")
295 os.chdir("batches")
296
297 iterator = 0
298 for lHeader in lHeadersList :
299 iterator += 1
300 with open("batch_%s.fa" % iterator, 'w') as f :
301 for header in lHeader :
302 iBioseq = iBioseqDB.fetch(header)
303 iBioseq.write(f)
304 os.chdir("..")
305
306
307 ## Split the input fasta file in several output files according to their cluster identifier
308 #
309 # @param inFileName string name of the input fasta file
310 # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust)
311 # @param simplifyHeader boolean simplify the headers
312 # @param createDir boolean put the sequences in different directories
313 # @param outPrefix string prefix of the output files (default='seqCluster')
314 # @param verbose integer (default = 0)
315 #
316 @staticmethod
317 def splitSeqPerCluster( inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix="seqCluster", verbose=0 ):
318 if not os.path.exists( inFileName ):
319 print "ERROR: %s doesn't exist" % ( inFileName )
320 sys.exit(1)
321
322 inFile = open( inFileName, "r" )
323
324 line = inFile.readline()
325 if line:
326 name = line.split(" ")[0]
327 if "Cluster" in name:
328 clusterID = name.split("Cluster")[1].split("Mb")[0]
329 seqID = name.split("Mb")[1]
330 else:
331 clusterID = name.split("Cl")[0].split("Gr")[1] # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust
332 if "Q" in name.split("Gr")[0]:
333 seqID = name.split("Gr")[0].split("MbQ")[1]
334 elif "S" in name:
335 seqID = name.split("Gr")[0].split("MbS")[1]
336 sClusterIDs = set( [ clusterID ] )
337 if simplifyHeader == True:
338 header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
339 else:
340 header = line[1:-1]
341 if createDir == True:
342 if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
343 os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
344 os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
345 outFileName = "%s%s.fa" % ( outPrefix, clusterID )
346 outFile = open( outFileName, "w" )
347 outFile.write( ">%s\n" % ( header ) )
348 prevClusterID = clusterID
349
350 line = inFile.readline()
351 while line:
352 if line[0] == ">":
353 name = line.split(" ")[0]
354 if "Cluster" in name:
355 clusterID = name.split("Cluster")[1].split("Mb")[0]
356 seqID = name.split("Mb")[1]
357 else:
358 clusterID = name.split("Cl")[0].split("Gr")[1]
359 if "Q" in name.split("Gr")[0]:
360 seqID = name.split("Gr")[0].split("MbQ")[1]
361 elif "S" in name:
362 seqID = name.split("Gr")[0].split("MbS")[1]
363
364 if clusterID != prevClusterID:
365 outFile.close()
366
367 if simplifyHeader == True:
368 header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
369 else:
370 header = line[1:-1]
371
372 if createDir == True:
373 os.chdir( ".." )
374 if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
375 os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
376 os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
377
378 outFileName = "%s%s.fa" % ( outPrefix, clusterID )
379 if not os.path.exists( outFileName ):
380 outFile = open( outFileName, "w" )
381 else:
382 if clusterID != prevClusterID:
383 outFile.close()
384 outFile = open( outFileName, "a" )
385 outFile.write( ">%s\n" % ( header ) )
386 prevClusterID = clusterID
387 sClusterIDs.add( clusterID )
388
389 else:
390 outFile.write( line )
391
392 line = inFile.readline()
393
394 outFile.close()
395 if verbose > 0:
396 print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush()
397
398 if createDir == True:
399 os.chdir("..")
400 else:
401 print "WARNING: empty input file - no cluster found"; sys.stdout.flush()
402
403
404 ## Filter a fasta file in two fasta files using the length of each sequence as a criteron
405 #
406 # @param len_min integer length sequence criterion to filter
407 # @param inFileName string name of the input fasta file
408 # @param verbose integer (default = 0)
409 #
410 @staticmethod
411 def dbLengthFilter( len_min, inFileName, verbose=0 ):
412 file_db = open( inFileName, "r" )
413 file_dbInf = open( inFileName+".Inf"+str(len_min), "w" )
414 file_dbSup = open( inFileName+".Sup"+str(len_min), "w" )
415 seq = Bioseq()
416 numseq = 0
417 nbsave = 0
418
419 seq.read( file_db )
420 while seq.sequence:
421 l = seq.getLength()
422 numseq = numseq + 1
423 if l >= len_min:
424 seq.write( file_dbSup )
425 if verbose > 0:
426 print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Sup !!'
427 nbsave=nbsave+1
428 else:
429 seq.write( file_dbInf )
430 if verbose > 0:
431 print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Inf !!'
432 nbsave=nbsave+1
433 seq.read( file_db )
434
435 file_db.close()
436 file_dbInf.close()
437 file_dbSup.close()
438 if verbose > 0:
439 print nbsave,'saved sequences in ',inFileName+".Inf"+str(len_min)," and ", inFileName+".Sup"+str(len_min)
440
441
442 ## Extract the longest sequences from a fasta file
443 #
444 # @param num integer maximum number of sequences in the output file
445 # @param inFileName string name of the input fasta file
446 # @param outFileName string name of the output fasta file
447 # @param minThresh integer minimum length threshold (default=0)
448 # @param verbose integer (default = 0)
449 #
450 @staticmethod
451 def dbLongestSequences( num, inFileName, outFileName="", verbose=0, minThresh=0 ):
452 bsDB = BioseqDB( inFileName )
453 if verbose > 0:
454 print "nb of input sequences: %i" % ( bsDB.getSize() )
455
456 if outFileName == "":
457 outFileName = inFileName + ".best" + str(num)
458 outFile = open( outFileName, "w" )
459
460 if bsDB.getSize()==0:
461 return 0
462
463 num = int(num)
464 if verbose > 0:
465 print "keep the %i longest sequences" % ( num )
466 if minThresh > 0:
467 print "with length > %i bp" % ( minThresh )
468 sys.stdout.flush()
469
470 # retrieve the length of each input sequence
471 tmpLSeqLgth = []
472 seqNum = 0
473 for bs in bsDB.db:
474 seqNum += 1
475 tmpLSeqLgth.append( bs.getLength() )
476 if verbose > 1:
477 print "%d seq %s : %d bp" % ( seqNum, bs.header[0:40], bs.getLength() )
478 sys.stdout.flush()
479
480 # sort the lengths
481 tmpLSeqLgth.sort()
482 tmpLSeqLgth.reverse()
483
484 # select the longest
485 lSeqLgth = []
486 for i in xrange( 0, min(num,len(tmpLSeqLgth)) ):
487 if tmpLSeqLgth[i] >= minThresh:
488 lSeqLgth.append( tmpLSeqLgth[i] )
489 if verbose > 0:
490 print "selected max length: %i" % ( max(lSeqLgth) )
491 print "selected min length: %i" % ( min(lSeqLgth) )
492 sys.stdout.flush()
493
494 # save the longest
495 inFile = open( inFileName )
496 seqNum = 0
497 nbSave = 0
498 for bs in bsDB.db:
499 seqNum += 1
500 if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh:
501 bs.write( outFile )
502 if verbose > 1:
503 print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] )
504 sys.stdout.flush()
505 nbSave += 1
506 if nbSave == num:
507 break
508 inFile.close()
509 outFile.close()
510 if verbose > 0:
511 print nbSave, "saved sequences in ", outFileName
512 sys.stdout.flush()
513
514 return 0
515
516
517 ## Extract all the sequence headers from a fasta file and write them in a new fasta file
518 #
519 # @param inFileName string name of the input fasta file
520 # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers')
521 #
522 @staticmethod
523 def dbExtractSeqHeaders( inFileName, outFileName="" ):
524 lHeaders = FastaUtils.dbHeaders( inFileName )
525
526 if outFileName == "":
527 outFileName = inFileName + ".headers"
528
529 outFile = open( outFileName, "w" )
530 for i in lHeaders:
531 outFile.write( i + "\n" )
532 outFile.close()
533
534 return 0
535
536
537 ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file
538 #
539 # @param pattern regular expression to search in headers
540 # @param inFileName string name of the input fasta file
541 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
542 # @param verbose integer verbosity level (default = 0)
543 #
544 @staticmethod
545 def dbExtractByPattern( pattern, inFileName, outFileName="", verbose=0 ):
546 if pattern == "":
547 return
548
549 if outFileName == "":
550 outFileName = inFileName + '.extracted'
551 outFile = open( outFileName, 'w' )
552
553 patternTosearch = re.compile( pattern )
554 bioseq = Bioseq()
555 bioseqNb = 0
556 savedBioseqNb = 0
557 inFile = open( inFileName, "r" )
558 bioseq.read( inFile )
559 while bioseq.sequence:
560 bioseqNb = bioseqNb + 1
561 m = patternTosearch.search( bioseq.header )
562 if m:
563 bioseq.write( outFile )
564 if verbose > 1:
565 print 'sequence num',bioseqNb,'matched on',m.group(),'[',bioseq.header[0:40],'...] saved !!'
566 savedBioseqNb = savedBioseqNb + 1
567 bioseq.read( inFile )
568 inFile.close()
569
570 outFile.close()
571
572 if verbose > 0:
573 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
574
575
576 ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file
577 #
578 # @param patternFileName string file containing regular expression to search in headers
579 # @param inFileName string name of the input fasta file
580 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
581 # @param verbose integer verbosity level (default = 0)
582 #
583 @staticmethod
584 def dbExtractByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
585
586 if patternFileName == "":
587 print "ERROR: no file of pattern"
588 sys.exit(1)
589
590 bioseq = Bioseq()
591 bioseqNb = 0
592 savedBioseqNb = 0
593 lHeaders = []
594
595 inFile = open( inFileName, "r" )
596 bioseq.read( inFile )
597 while bioseq.sequence != None:
598 lHeaders.append( bioseq.header )
599 bioseq.read( inFile )
600 inFile.close()
601
602 lHeadersToKeep = []
603 patternFile = open( patternFileName, "r" )
604 for pattern in patternFile:
605 if verbose > 0:
606 print "pattern: ",pattern[:-1]; sys.stdout.flush()
607
608 patternToSearch = re.compile(pattern[:-1])
609 for h in lHeaders:
610 if patternToSearch.search(h):
611 lHeadersToKeep.append(h)
612 patternFile.close()
613
614 if outFileName == "":
615 outFileName = inFileName + ".extracted"
616 outFile=open( outFileName, "w" )
617
618 inFile = open( inFileName, "r" )
619 bioseq.read(inFile)
620 while bioseq.sequence:
621 bioseqNb += 1
622 if bioseq.header in lHeadersToKeep:
623 bioseq.write(outFile)
624 if verbose > 1:
625 print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
626 savedBioseqNb += 1
627 bioseq.read(inFile)
628 inFile.close()
629
630 outFile.close()
631
632 if verbose > 0:
633 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
634
635
636 ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file
637 #
638 # @param pattern regular expression to search in headers
639 # @param inFileName string name of the input fasta file
640 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
641 # @param verbose integer verbosity level (default = 0)
642 #
643 @staticmethod
644 def dbCleanByPattern( pattern, inFileName, outFileName="", verbose=0 ):
645 if pattern == "":
646 return
647
648 patternToSearch = re.compile(pattern)
649
650 if outFileName == "":
651 outFileName = inFileName + '.cleaned'
652 outFile = open(outFileName,'w')
653
654 bioseq = Bioseq()
655 bioseqNb = 0
656 savedBioseqNb = 0
657 inFile = open(inFileName)
658 bioseq.read(inFile)
659 while bioseq.sequence != None:
660 bioseqNb += 1
661 if not patternToSearch.search(bioseq.header):
662 bioseq.write(outFile)
663 if verbose > 1:
664 print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'
665 savedBioseqNb += 1
666 bioseq.read(inFile)
667 inFile.close()
668
669 outFile.close()
670
671 if verbose > 0:
672 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
673
674
675 ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file
676 #
677 # @param patternFileName string file containing regular expression to search in headers
678 # @param inFileName string name of the input fasta file
679 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
680 # @param verbose integer verbosity level (default = 0)
681 #
682 @staticmethod
683 def dbCleanByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
684 if patternFileName == "":
685 print "ERROR: no file of pattern"
686 sys.exit(1)
687
688 bioseq = Bioseq()
689 bioseqNb = 0
690 savedBioseqNb = 0
691 lHeaders = []
692 inFile = open( inFileName, "r" )
693 bioseq.read( inFile )
694 while bioseq.sequence != None:
695 bioseqNb += 1
696 lHeaders.append( bioseq.header )
697 bioseq.read( inFile )
698 inFile.close()
699
700 patternFile = open( patternFileName, "r")
701 lHeadersToRemove = []
702 for pattern in patternFile:
703 if verbose > 0:
704 print "pattern: ",pattern[:-1]; sys.stdout.flush()
705
706 patternToSearch = re.compile( pattern[:-1] )
707 for h in lHeaders:
708 if patternToSearch.search(h):
709 lHeadersToRemove.append(h)
710 patternFile.close()
711
712 if outFileName == "":
713 outFileName = inFileName + '.cleaned'
714 outFile = open( outFileName, 'w' )
715
716 bioseqNum = 0
717 inFile=open( inFileName )
718 bioseq.read( inFile )
719 while bioseq.sequence != None:
720 bioseqNum += 1
721 if bioseq.header not in lHeadersToRemove:
722 bioseq.write( outFile )
723 if verbose > 1:
724 print 'sequence num',bioseqNum,'/',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
725 savedBioseqNb += 1
726 bioseq.read( inFile )
727 inFile.close()
728
729 outFile.close()
730
731 if verbose > 0:
732 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
733
734
735 ## Find sequence's ORFs from a fasta file and write them in a tab file
736 #
737 # @param inFileName string name of the input fasta file
738 # @param orfMaxNb integer Select orfMaxNb best ORFs
739 # @param orfMinLength integer Keep ORFs with length > orfMinLength
740 # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map')
741 # @param verbose integer verbosity level (default = 0)
742 #
743 @staticmethod
744 def dbORF( inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose=0 ):
745 if outFileName == "":
746 outFileName = inFileName + ".ORF.map"
747 outFile = open( outFileName, "w" )
748
749 bioseq = Bioseq()
750 bioseqNb = 0
751
752 inFile = open( inFileName )
753 bioseq.read( inFile )
754 while bioseq.sequence != None:
755 bioseq.upCase()
756 bioseqNb += 1
757 if verbose > 0:
758 print 'sequence num',bioseqNb,'=',bioseq.getLength(),'[',bioseq.header[0:40],'...]'
759
760 orf = bioseq.findORF()
761 bestOrf = []
762 for i in orf.keys():
763 orfLen = len(orf[i])
764 for j in xrange(1, orfLen):
765 start = orf[i][j-1] + 4
766 end = orf[i][j] + 3
767 if end - start >= orfMinLength:
768 bestOrf.append( ( end-start, i+1, start, end ) )
769
770 bioseq.reverseComplement()
771
772 orf = bioseq.findORF()
773 seqLen = bioseq.getLength()
774 for i in orf.keys():
775 orfLen = len(orf[i])
776 for j in xrange(1, orfLen):
777 start = seqLen - orf[i][j-1] - 3
778 end = seqLen - orf[i][j] - 2
779 if start - end >= orfMinLength:
780 bestOrf.append( ( start-end, (i+1)*-1, start, end ) )
781
782 bestOrf.sort()
783 bestOrf.reverse()
784 bestOrfNb = len(bestOrf)
785 if orfMaxNb != 0 and orfMaxNb < bestOrfNb:
786 bestOrfNb = orfMaxNb
787 for i in xrange(0, bestOrfNb):
788 if verbose > 0:
789 print bestOrf[i]
790 outFile.write("%s\t%s\t%d\t%d\n"%("ORF|"+str(bestOrf[i][1])+\
791 "|"+str(bestOrf[i][0]),bioseq.header,
792 bestOrf[i][2],bestOrf[i][3]))
793 bioseq.read( inFile )
794
795 inFile.close()
796 outFile.close()
797
798 return 0
799
800
801 ## Sort sequences by increasing length (write a new file)
802 #
803 # @param inFileName string name of the input fasta file
804 # @param outFileName string name of the output fasta file
805 # @param verbose integer verbosity level
806 #
807 @staticmethod
808 def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0):
809 if verbose > 0:
810 print "sort sequences by increasing length"
811 sys.stdout.flush()
812 if not os.path.exists( inFileName ):
813 print "ERROR: file '%s' doesn't exist" % ( inFileName )
814 sys.exit(1)
815
816 # read each seq one by one
817 # save them in distinct temporary files
818 # with their length in the name
819 inFileHandler = open( inFileName, "r" )
820 countSeq = 0
821 bs = Bioseq()
822 bs.read( inFileHandler )
823 while bs.header:
824 countSeq += 1
825 tmpFile = "%ibp_%inb" % ( bs.getLength(), countSeq )
826 bs.appendBioseqInFile( tmpFile )
827 if verbose > 1:
828 print "%s (%i bp) saved in '%s'" % ( bs.header, bs.getLength(), tmpFile )
829 bs.header = ""
830 bs.sequence = ""
831 bs.read( inFileHandler )
832 inFileHandler.close()
833
834 # sort temporary file names
835 # concatenate them into the output file
836 if os.path.exists( outFileName ):
837 os.remove( outFileName )
838 lFiles = glob.glob( "*bp_*nb" )
839 lFiles.sort( key=lambda s:int(s.split("bp_")[0]) )
840 for fileName in lFiles:
841 cmd = "cat %s >> %s" % ( fileName, outFileName )
842 returnValue = os.system( cmd )
843 if returnValue != 0:
844 print "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName )
845 sys.exit(1)
846 os.remove( fileName )
847
848 return 0
849
850
851 ## Sort sequences by header
852 #
853 # @param inFileName string name of the input fasta file
854 # @param outFileName string name of the output fasta file
855 # @param verbose integer verbosity level
856 #
857 @staticmethod
858 def sortSequencesByHeader(inFileName, outFileName = "", verbose = 0):
859 if outFileName == "":
860 outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0]
861 iBioseqDB = BioseqDB(inFileName)
862 f = open(outFileName, "w")
863 lHeaders = sorted(iBioseqDB.getHeaderList())
864 for header in lHeaders:
865 iBioseq = iBioseqDB.fetch(header)
866 iBioseq.write(f)
867 f.close()
868
869
870 ## Return a dictionary which keys are the headers and values the length of the sequences
871 #
872 # @param inFile string name of the input fasta file
873 # @param verbose integer verbosity level
874 #
875 @staticmethod
876 def getLengthPerHeader( inFile, verbose=0 ):
877 dHeader2Length = {}
878
879 inFileHandler = open( inFile, "r" )
880 currentSeqHeader = ""
881 currentSeqLength = 0
882 line = inFileHandler.readline()
883 while line:
884 if line[0] == ">":
885 if currentSeqHeader != "":
886 dHeader2Length[ currentSeqHeader ] = currentSeqLength
887 currentSeqLength = 0
888 currentSeqHeader = line[1:-1]
889 if verbose > 0:
890 print "current header: %s" % ( currentSeqHeader )
891 sys.stdout.flush()
892 else:
893 currentSeqLength += len( line.replace("\n","") )
894 line = inFileHandler.readline()
895 dHeader2Length[ currentSeqHeader ] = currentSeqLength
896 inFileHandler.close()
897
898 return dHeader2Length
899
900
901 ## Convert headers from a fasta file having chunk coordinates
902 #
903 # @param inFile string name of the input fasta file
904 # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes
905 # @param outFile string name of the output file
906 #
907 @staticmethod
908 def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile):
909 inFileHandler = open(inFile, "r")
910 outFileHandler = open(outFile, "w")
911 dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile)
912 iConvCoord = ConvCoord()
913 line = inFileHandler.readline()
914 while line:
915 if line[0] == ">":
916 if "{Fragment}" in line:
917 chkName = line.split(" ")[1]
918 chrName = dChunk2Map[chkName].seqname
919 lCoordPairs = line.split(" ")[3].split(",")
920 lRangesOnChk = []
921 for i in lCoordPairs:
922 iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1]))
923 lRangesOnChk.append(iRange)
924 lRangesOnChr = []
925 for iRange in lRangesOnChk:
926 lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map))
927 newHeader = line[1:-1].split(" ")[0]
928 newHeader += " %s" % chrName
929 newHeader += " {Fragment}"
930 newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end)
931 for iRange in lRangesOnChr[1:]:
932 newHeader += ",%i..%i" % (iRange.start, iRange.end)
933 outFileHandler.write(">%s\n" % newHeader)
934 else:
935 chkName = line.split("_")[1].split(" ")[0]
936 chrName = dChunk2Map[chkName].seqname
937 coords = line[line.find("[")+1 : line.find("]")]
938 start = int(coords.split(",")[0])
939 end = int(coords.split(",")[1])
940 iRangeOnChk = Range(chkName, start, end)
941 iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map)
942 newHeader = line[1:-1].split("_")[0]
943 newHeader += " %s" % chrName
944 newHeader += " %s" % line[line.find("(") : line.find(")")+1]
945 newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd())
946 outFileHandler.write(">%s\n" % newHeader)
947 else:
948 outFileHandler.write(line)
949 line = inFileHandler.readline()
950 inFileHandler.close()
951 outFileHandler.close()
952
953
954 ## Convert a fasta file to a length file
955 #
956 # @param inFile string name of the input fasta file
957 # @param outFile string name of the output file
958 #
959 @staticmethod
960 def convertFastaToLength(inFile, outFile = ""):
961 if outFile == "":
962 outFile = "%s.length" % inFile
963
964 if inFile != "":
965 with open(inFile, "r") as inFH:
966 with open(outFile, "w") as outFH:
967 bioseq = Bioseq()
968 bioseq.read(inFH)
969 while bioseq.sequence != None:
970 seqLen = bioseq.getLength()
971 outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen))
972 bioseq.read(inFH)
973
974
975 ## Convert a fasta file to a seq file
976 #
977 # @param inFile string name of the input fasta file
978 # @param outFile string name of the output file
979 #
980 @staticmethod
981 def convertFastaToSeq(inFile, outFile = ""):
982 if outFile == "":
983 outFile = "%s.seq" % inFile
984
985 if inFile != "":
986 with open(inFile, "r") as inFH:
987 with open(outFile, "w") as outFH:
988 bioseq = Bioseq()
989 bioseq.read(inFH)
990 while bioseq.sequence != None:
991 seqLen = bioseq.getLength()
992 outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \
993 bioseq.sequence, bioseq.header, seqLen))
994 bioseq.read(inFH)
995
996
997 ## Splice an input fasta file using coordinates in a Map file
998 #
999 # @note the coordinates should be merged beforehand!
1000 #
1001 @staticmethod
1002 def spliceFromCoords( genomeFile, coordFile, obsFile ):
1003 genomeFileHandler = open( genomeFile, "r" )
1004 obsFileHandler = open( obsFile, "w" )
1005 dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile )
1006
1007 bs = Bioseq()
1008 bs.read( genomeFileHandler )
1009 while bs.sequence:
1010 if dChr2Maps.has_key( bs.header ):
1011 lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] )
1012 splicedSeq = ""
1013 currentSite = 0
1014 for iMap in lCoords:
1015 minSplice = iMap.getMin() - 1
1016 if minSplice > currentSite:
1017 splicedSeq += bs.sequence[ currentSite : minSplice ]
1018 currentSite = iMap.getMax()
1019 splicedSeq += bs.sequence[ currentSite : ]
1020 bs.sequence = splicedSeq
1021 bs.write( obsFileHandler )
1022 bs.read( genomeFileHandler )
1023
1024 genomeFileHandler.close()
1025 obsFileHandler.close()
1026
1027
1028 ## Shuffle input sequences (single file or files in a directory)
1029 #
1030 @staticmethod
1031 def dbShuffle( inData, outData, verbose=0 ):
1032 if CheckerUtils.isExecutableInUserPath("esl-shuffle"):
1033 prg = "esl-shuffle"
1034 else : prg = "shuffle"
1035 genericCmd = prg + " -d INPUT > OUTPUT"
1036 if os.path.isfile( inData ):
1037 if verbose > 0:
1038 print "shuffle input file '%s'" % inData
1039 cmd = genericCmd.replace("INPUT",inData).replace("OUTPUT",outData)
1040 print cmd
1041 returnStatus = os.system( cmd )
1042 if returnStatus != 0:
1043 sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus )
1044 sys.exit(1)
1045
1046 elif os.path.isdir( inData ):
1047 if verbose > 0:
1048 print "shuffle files in input directory '%s'" % inData
1049 if os.path.exists( outData ):
1050 shutil.rmtree( outData )
1051 os.mkdir( outData )
1052 lInputFiles = glob.glob( "%s/*.fa" %( inData ) )
1053 nbFastaFiles = 0
1054 for inputFile in lInputFiles:
1055 nbFastaFiles += 1
1056 if verbose > 1:
1057 print "%3i / %3i" % ( nbFastaFiles, len(lInputFiles) )
1058 fastaBaseName = os.path.basename( inputFile )
1059 prefix, extension = os.path.splitext( fastaBaseName )
1060 cmd = genericCmd.replace("INPUT",inputFile).replace("OUTPUT","%s/%s_shuffle.fa"%(outData,prefix))
1061 returnStatus = os.system( cmd )
1062 if returnStatus != 0:
1063 sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus )
1064 sys.exit(1)
1065
1066
1067 ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers
1068 #
1069 # @param inClusterFileName string input cluster file name
1070 # @param inFastaFileName string input fasta file name
1071 # @param outFileName string output file name
1072 # @param verbosity integer verbosity
1073 #
1074 @staticmethod
1075 def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0):
1076 dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity)
1077 iFastaParser = FastaParser(inFastaFileName)
1078 with open(outFileName, "w") as f:
1079 for iSequence in iFastaParser.getIterator():
1080
1081 header = iSequence.getName()
1082 if dHeader2ClusterClusterMember.get(header):
1083 cluster = dHeader2ClusterClusterMember[header][0]
1084 member = dHeader2ClusterClusterMember[header][1]
1085 else:
1086 clusterIdForSingletonCluster += 1
1087 cluster = clusterIdForSingletonCluster
1088 member = 1
1089
1090 newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header)
1091 iSequence.setName(newHeader)
1092 f.write(iSequence.printFasta())
1093
1094 @staticmethod
1095 def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0):
1096 dHeader2ClusterClusterMember = {}
1097 clusterId = 0
1098 with open(inClusterFileName) as f:
1099 line = f.readline()
1100 while line:
1101 lineWithoutLastChar = line.rstrip()
1102 lHeaders = lineWithoutLastChar.split("\t")
1103 clusterId += 1
1104 if verbosity > 0:
1105 print "%i sequences in cluster %i" % (len(lHeaders), clusterId)
1106 memberId = 0
1107 for header in lHeaders:
1108 memberId += 1
1109 dHeader2ClusterClusterMember[header] = (clusterId, memberId)
1110 line = f.readline()
1111 if verbosity > 0:
1112 print "%i clusters" % clusterId
1113 return dHeader2ClusterClusterMember, clusterId
1114
1115 @staticmethod
1116 def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""):
1117 """
1118 Write a map file from fasta output of clustering tool.
1119 Warning: only works if input fasta headers are formated like LTRharvest fasta output.
1120 """
1121 if not outMapFileName:
1122 outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0])
1123
1124 fileDb = open(fastaFileNameFromClustering , "r")
1125 fileMap = open(outMapFileName, "w")
1126 seq = Bioseq()
1127 numseq = 0
1128 while 1:
1129 seq.read(fileDb)
1130 if seq.sequence == None:
1131 break
1132 numseq = numseq + 1
1133 ID = seq.header.split(' ')[0].split('_')[0]
1134 chunk = seq.header.split(' ')[0].split('_')[1]
1135 start = seq.header.split(' ')[-1].split(',')[0][1:]
1136 end = seq.header.split(' ')[-1].split(',')[1][:-1]
1137 line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
1138 fileMap.write(line + "\n")
1139
1140 fileDb.close()
1141 fileMap.close()
1142 print "saved in %s" % outMapFileName
1143