# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import os
import sys
import string
import math
import shutil
import re
import glob
from operator import itemgetter

from commons.core.seq.BioseqDB import BioseqDB
from commons.core.seq.Bioseq import Bioseq
from commons.core.coord.MapUtils import MapUtils
from commons.core.coord.Range import Range
from commons.core.checker.CheckerUtils import CheckerUtils
from commons.core.launcher.LauncherUtils import LauncherUtils
from commons.core.coord.ConvCoord import ConvCoord
from commons.core.parsing.FastaParser import FastaParser

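# Typical usage (the fasta file names below are hypothetical examples):
#   nbSeq    = FastaUtils.dbSize("genome.fa")
#   lHeaders = FastaUtils.dbHeaders("genome.fa")
#   FastaUtils.dbSplit("genome.fa", nbSeqPerBatch=10, newDir=True)
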
## Static methods for fasta file manipulation
#
class FastaUtils( object ):

    ## Count the number of sequences in the input fasta file
    #
    # @param inFile name of the input fasta file
    #
    # @return integer number of sequences in the input fasta file
    #
    @staticmethod
    def dbSize( inFile ):
        nbSeq = 0
        inFileHandler = open( inFile, "r" )
        line = inFileHandler.readline()
        while line:
            if line[0] == ">":
                nbSeq = nbSeq + 1
            line = inFileHandler.readline()
        inFileHandler.close()

        return nbSeq


    ## Compute the cumulative sequence length in the input fasta file
    #
    # @param inFile handler of the input fasta file
    #
    @staticmethod
    def dbCumLength( inFile ):
        cumLength = 0
        line = inFile.readline()
        while line:
            if line[0] != ">":
                cumLength += len(string.rstrip(line))
            line = inFile.readline()

        return cumLength


    ## Return a list with the length of each sequence in the input fasta file
    #
    # @param inFile string name of the input fasta file
    #
    @staticmethod
    def dbLengths( inFile ):
        lLengths = []
        inFileHandler = open( inFile, "r" )
        currentLength = 0
        line = inFileHandler.readline()
        while line:
            if line[0] == ">":
                if currentLength != 0:
                    lLengths.append( currentLength )
                currentLength = 0
            else:
                currentLength += len(line[:-1])
            line = inFileHandler.readline()
        lLengths.append( currentLength )
        inFileHandler.close()
        return lLengths


    ## Retrieve the sequence headers present in the input fasta file
    #
    # @param inFile string name of the input fasta file
    # @param verbose integer level of verbosity
    #
    # @return list of sequence headers
    #
    @staticmethod
    def dbHeaders( inFile, verbose=0 ):
        lHeaders = []

        inFileHandler = open( inFile, "r" )
        line = inFileHandler.readline()
        while line:
            if line[0] == ">":
                lHeaders.append( string.rstrip(line[1:]) )
                if verbose > 0:
                    print string.rstrip(line[1:])
            line = inFileHandler.readline()
        inFileHandler.close()

        return lHeaders


    ## Cut a data bank into chunks according to the input parameters
    # If a sequence is shorter than the threshold, it is only renamed (not cut)
    #
    # @param inFileName string name of the input fasta file
    # @param chkLgth string chunk length (in bp, default=200000)
    # @param chkOver string chunk overlap (in bp, default=10000)
    # @param wordN string N stretch word length (default=11, 0 for no detection)
    # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
    # @param clean boolean remove 'cut' and 'Nstretch' files
    # @param verbose integer (default = 0)
    #
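    # Example call (hypothetical input file; values are passed as strings because
    # they are forwarded to the cutterDB command line):
    #   FastaUtils.dbChunks("genome.fa", chkLgth="500000", chkOver="5000", wordN="11")
    #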
    @staticmethod
    def dbChunks( inFileName, chkLgth="200000", chkOver="10000", wordN="11", outFilePrefix="", clean=False, verbose=0 ):
        nbSeq = FastaUtils.dbSize( inFileName )
        if verbose > 0:
            print "cut the %i input sequences with cutterDB..." % ( nbSeq )
            sys.stdout.flush()

        prg = "cutterDB"
        cmd = prg
        cmd += " -l %s" % ( chkLgth )
        cmd += " -o %s" %( chkOver )
        cmd += " -w %s" % ( wordN )
        cmd += " %s" % ( inFileName )
        returnStatus = os.system( cmd )
        if returnStatus != 0:
            msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus )
            sys.stderr.write( "%s\n" % ( msg ) )
            sys.exit(1)

        nbChunks = FastaUtils.dbSize( "%s_cut" % ( inFileName ) )
        if verbose > 0:
            print "done (%i chunks)" % ( nbChunks )
            sys.stdout.flush()

        if verbose > 0:
            print "rename the headers..."
            sys.stdout.flush()

        if outFilePrefix == "":
            outFastaName = inFileName + "_chunks.fa"
            outMapName = inFileName + "_chunks.map"
        else:
            outFastaName = outFilePrefix + ".fa"
            outMapName = outFilePrefix + ".map"

        inFile = open( "%s_cut" % ( inFileName ), "r" )
        line = inFile.readline()

        outFasta = open( outFastaName, "w" )
        outMap = open( outMapName, "w" )

        # read line after line (no need for big RAM) and change the sequence headers
        while line:

            if line[0] == ">":
                if verbose > 1:
                    print "rename '%s'" % ( line[:-1] ); sys.stdout.flush()
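                # The fields parsed below assume the header layout produced by
                # cutterDB for this run, i.e. a space-separated header where
                # field 0 is '>chunkNumber', field 2 is the original sequence
                # header and field 4 is the 'start..end' interval of the chunk
                # on that sequence.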
                data = line[:-1].split(" ")
                seqID = data[0].split(">")[1]
                newHeader = "chunk%s" % ( str(seqID).zfill( len(str(nbChunks)) ) )
                oldHeader = data[2]
                seqStart = data[4].split("..")[0]
                seqEnd = data[4].split("..")[1]
                outMap.write( "%s\t%s\t%s\t%s\n" % ( newHeader, oldHeader, seqStart, seqEnd ) )
                outFasta.write( ">%s\n" % ( newHeader ) )

            else:
                outFasta.write( line.upper() )

            line = inFile.readline()

        inFile.close()
        outFasta.close()
        outMap.close()

        if clean == True:
            os.remove(inFileName + "_cut")
            os.remove(inFileName + ".Nstretch.map")


    ## Split the input fasta file into several output files
    #
    # @param inFile string name of the input fasta file
    # @param nbSeqPerBatch integer number of sequences per output file
    # @param newDir boolean put the sequences in a new directory called 'batches'
    # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1')
    # @param prefix string prefix of the output file names (default = 'batch')
    # @param verbose integer verbosity level (default = 0)
    #
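    # Example call (hypothetical input file): split 'genome.fa' into batches of
    # 20 sequences each, written in a new 'batches/' directory:
    #   FastaUtils.dbSplit("genome.fa", nbSeqPerBatch=20, newDir=True)
    #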
    @staticmethod
    def dbSplit( inFile, nbSeqPerBatch, newDir, useSeqHeader=False, prefix="batch", verbose=0 ):
        if not os.path.exists( inFile ):
            msg = "ERROR: file '%s' doesn't exist" % ( inFile )
            sys.stderr.write( "%s\n" % ( msg ) )
            sys.exit(1)

        nbSeq = FastaUtils.dbSize( inFile )

        nbBatches = int( math.ceil( nbSeq / float(nbSeqPerBatch) ) )
        if verbose > 0:
            print "save the %i input sequences into %i batches" % ( nbSeq, nbBatches )
            sys.stdout.flush()

        if nbSeqPerBatch > 1 and useSeqHeader:
            useSeqHeader = False

        if newDir == True:
            if os.path.exists( "batches" ):
                shutil.rmtree( "batches" )
            os.mkdir( "batches" )
            os.chdir( "batches" )
            os.system( "ln -s ../%s ." % ( inFile ) )

        inFileHandler = open( inFile, "r" )
        inFileHandler.seek( 0, 0 )
        countBatch = 0
        countSeq = 0
        line = inFileHandler.readline()
        while line:
            if line == "":
                break
            if line[0] == ">":
                countSeq += 1
                if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1:
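                    # a new batch starts with this sequence: close the previous
                    # batch file if one is already open (hence the locals() check)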
                    if "outFile" in locals():
                        outFile.close()
                    countBatch += 1
                    if nbSeqPerBatch == 1 and useSeqHeader:
                        outFileName = "%s.fa" % ( line[1:-1].replace(" ","_") )
                    else:
                        outFileName = "%s_%s.fa" % ( prefix, str(countBatch).zfill( len(str(nbBatches)) ) )
                    outFile = open( outFileName, "w" )
                    if verbose > 1:
                        print "saving seq '%s' in file '%s'..." % ( line[1:40][:-1], outFileName )
                        sys.stdout.flush()
            outFile.write( line )
            line = inFileHandler.readline()
        inFileHandler.close()

        if newDir == True:
            os.remove( os.path.basename( inFile ) )
            os.chdir( ".." )


    ## Split the input fasta file into several output files
    #
    # @param inFileName string name of the input fasta file
    # @param maxSize integer max cumulative length for each output file
    #
    @staticmethod
    def splitFastaFileInBatches(inFileName, maxSize = 200000):
        iBioseqDB = BioseqDB(inFileName)
        lHeadersSizeTuples = []
        for iBioseq in iBioseqDB.db:
            lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength()))

        lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize)
        os.mkdir("batches")
        os.chdir("batches")

        iterator = 0
        for lHeader in lHeadersList :
            iterator += 1
            with open("batch_%s.fa" % iterator, 'w') as f :
                for header in lHeader :
                    iBioseq = iBioseqDB.fetch(header)
                    iBioseq.write(f)
        os.chdir("..")


    ## Split the input fasta file into several output files according to their cluster identifier
    #
    # @param inFileName string name of the input fasta file
    # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust)
    # @param simplifyHeader boolean simplify the headers
    # @param createDir boolean put the sequences in different directories
    # @param outPrefix string prefix of the output files (default='seqCluster')
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def splitSeqPerCluster( inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix="seqCluster", verbose=0 ):
        if not os.path.exists( inFileName ):
            print "ERROR: %s doesn't exist" % ( inFileName )
            sys.exit(1)

        inFile = open( inFileName, "r" )

        line = inFile.readline()
        if line:
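            # The header parsing below assumes one of two layouts: headers
            # containing 'Cluster<clusterID>Mb<seqID>' (Recon/Piler/Blastclust
            # style), or Grouper-style headers containing 'MbQ<seqID>' or
            # 'MbS<seqID>' followed by 'Gr<clusterID>Cl<...>'.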
            name = line.split(" ")[0]
            if "Cluster" in name:
                clusterID = name.split("Cluster")[1].split("Mb")[0]
                seqID = name.split("Mb")[1]
            else:
                clusterID = name.split("Cl")[0].split("Gr")[1]   # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust
                if "Q" in name.split("Gr")[0]:
                    seqID = name.split("Gr")[0].split("MbQ")[1]
                elif "S" in name:
                    seqID = name.split("Gr")[0].split("MbS")[1]
            sClusterIDs = set( [ clusterID ] )
            if simplifyHeader == True:
                header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
            else:
                header = line[1:-1]
            if createDir == True:
                if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
                    os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
                os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
            outFileName = "%s%s.fa" % ( outPrefix, clusterID )
            outFile = open( outFileName, "w" )
            outFile.write( ">%s\n" % ( header ) )
            prevClusterID = clusterID

            line = inFile.readline()
            while line:
                if line[0] == ">":
                    name = line.split(" ")[0]
                    if "Cluster" in name:
                        clusterID = name.split("Cluster")[1].split("Mb")[0]
                        seqID = name.split("Mb")[1]
                    else:
                        clusterID = name.split("Cl")[0].split("Gr")[1]
                        if "Q" in name.split("Gr")[0]:
                            seqID = name.split("Gr")[0].split("MbQ")[1]
                        elif "S" in name:
                            seqID = name.split("Gr")[0].split("MbS")[1]

                    if clusterID != prevClusterID:
                        outFile.close()

                    if simplifyHeader == True:
                        header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
                    else:
                        header = line[1:-1]

                    if createDir == True:
                        os.chdir( ".." )
                        if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
                            os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
                        os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )

                    outFileName = "%s%s.fa" % ( outPrefix, clusterID )
                    if not os.path.exists( outFileName ):
                        outFile = open( outFileName, "w" )
                    else:
                        if clusterID != prevClusterID:
                            outFile.close()
                            outFile = open( outFileName, "a" )
                    outFile.write( ">%s\n" % ( header ) )
                    prevClusterID = clusterID
                    sClusterIDs.add( clusterID )

                else:
                    outFile.write( line )

                line = inFile.readline()

            outFile.close()
            if verbose > 0:
                print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush()

            if createDir == True:
                os.chdir("..")
        else:
            print "WARNING: empty input file - no cluster found"; sys.stdout.flush()


    ## Filter a fasta file into two fasta files using the length of each sequence as a criterion
    #
    # @param len_min integer length threshold (sequences with length >= len_min go to the '.Sup' file)
    # @param inFileName string name of the input fasta file
    # @param verbose integer (default = 0)
    #
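    # Example call (hypothetical input file): writes 'reads.fa.Inf500' with the
    # sequences shorter than 500 bp and 'reads.fa.Sup500' with the others:
    #   FastaUtils.dbLengthFilter(500, "reads.fa")
    #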
    @staticmethod
    def dbLengthFilter( len_min, inFileName, verbose=0 ):
        file_db = open( inFileName, "r" )
        file_dbInf = open( inFileName+".Inf"+str(len_min), "w" )
        file_dbSup = open( inFileName+".Sup"+str(len_min), "w" )
        seq = Bioseq()
        numseq = 0
        nbsave = 0

        seq.read( file_db )
        while seq.sequence:
            l = seq.getLength()
            numseq = numseq + 1
            if l >= len_min:
                seq.write( file_dbSup )
                if verbose > 0:
                    print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Sup !!'
                nbsave=nbsave+1
            else:
                seq.write( file_dbInf )
                if verbose > 0:
                    print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Inf !!'
                nbsave=nbsave+1
            seq.read( file_db )

        file_db.close()
        file_dbInf.close()
        file_dbSup.close()
        if verbose > 0:
            print nbsave,'saved sequences in ',inFileName+".Inf"+str(len_min)," and ", inFileName+".Sup"+str(len_min)


    ## Extract the longest sequences from a fasta file
    #
    # @param num integer maximum number of sequences in the output file
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file
    # @param minThresh integer minimum length threshold (default=0)
    # @param verbose integer (default = 0)
    #
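    # Example call (hypothetical input file): keep the 20 longest sequences of at
    # least 1000 bp and write them to 'contigs.fa.best20':
    #   FastaUtils.dbLongestSequences(20, "contigs.fa", minThresh=1000)
    #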
    @staticmethod
    def dbLongestSequences( num, inFileName, outFileName="", verbose=0, minThresh=0 ):
        bsDB = BioseqDB( inFileName )
        if verbose > 0:
            print "nb of input sequences: %i" % ( bsDB.getSize() )

        if outFileName == "":
            outFileName = inFileName + ".best" + str(num)
        outFile = open( outFileName, "w" )

        if bsDB.getSize()==0:
            return 0

        num = int(num)
        if verbose > 0:
            print "keep the %i longest sequences" % ( num )
            if minThresh > 0:
                print "with length > %i bp" % ( minThresh )
            sys.stdout.flush()

        # retrieve the length of each input sequence
        tmpLSeqLgth = []
        seqNum = 0
        for bs in bsDB.db:
            seqNum += 1
            tmpLSeqLgth.append( bs.getLength() )
            if verbose > 1:
                print "%d seq %s : %d bp" % ( seqNum, bs.header[0:40], bs.getLength() )
                sys.stdout.flush()

        # sort the lengths
        tmpLSeqLgth.sort()
        tmpLSeqLgth.reverse()

        # select the longest
        lSeqLgth = []
        for i in xrange( 0, min(num,len(tmpLSeqLgth)) ):
            if tmpLSeqLgth[i] >= minThresh:
                lSeqLgth.append( tmpLSeqLgth[i] )
        if verbose > 0:
            print "selected max length: %i" % ( max(lSeqLgth) )
            print "selected min length: %i" % ( min(lSeqLgth) )
            sys.stdout.flush()

        # save the longest
        inFile = open( inFileName )
        seqNum = 0
        nbSave = 0
        for bs in bsDB.db:
            seqNum += 1
            if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh:
                bs.write( outFile )
                if verbose > 1:
                    print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] )
                    sys.stdout.flush()
                nbSave += 1
            if nbSave == num:
                break
        inFile.close()
        outFile.close()
        if verbose > 0:
            print nbSave, "saved sequences in ", outFileName
            sys.stdout.flush()

        return 0


    ## Extract all the sequence headers from a fasta file and write them in a new file
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers')
    #
    @staticmethod
    def dbExtractSeqHeaders( inFileName, outFileName="" ):
        lHeaders = FastaUtils.dbHeaders( inFileName )

        if outFileName == "":
            outFileName = inFileName + ".headers"

        outFile = open( outFileName, "w" )
        for i in lHeaders:
            outFile.write( i + "\n" )
        outFile.close()

        return 0


    ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file
    #
    # @param pattern regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
    # @param verbose integer verbosity level (default = 0)
    #
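    # Example call (hypothetical input file): extract every sequence whose header
    # contains 'chr1' and write them to 'TEs.fa.extracted':
    #   FastaUtils.dbExtractByPattern("chr1", "TEs.fa")
    #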
    @staticmethod
    def dbExtractByPattern( pattern, inFileName, outFileName="", verbose=0 ):
        if pattern == "":
            return

        if outFileName == "":
            outFileName = inFileName + '.extracted'
        outFile = open( outFileName, 'w' )

        patternTosearch = re.compile( pattern )
        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        inFile = open( inFileName, "r" )
        bioseq.read( inFile )
        while bioseq.sequence:
            bioseqNb = bioseqNb + 1
            m = patternTosearch.search( bioseq.header )
            if m:
                bioseq.write( outFile )
                if verbose > 1:
                    print 'sequence num',bioseqNb,'matched on',m.group(),'[',bioseq.header[0:40],'...] saved !!'
                savedBioseqNb = savedBioseqNb + 1
            bioseq.read( inFile )
        inFile.close()

        outFile.close()

        if verbose > 0:
            print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )


    ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file
    #
    # @param patternFileName string file containing regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbExtractByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):

        if patternFileName == "":
            print "ERROR: no file of pattern"
            sys.exit(1)

        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        lHeaders = []

        inFile = open( inFileName, "r" )
        bioseq.read( inFile )
        while bioseq.sequence != None:
            lHeaders.append( bioseq.header )
            bioseq.read( inFile )
        inFile.close()

        lHeadersToKeep = []
        patternFile = open( patternFileName, "r" )
        for pattern in patternFile:
            if verbose > 0:
                print "pattern: ",pattern[:-1]; sys.stdout.flush()

            patternToSearch = re.compile(pattern[:-1])
            for h in lHeaders:
                if patternToSearch.search(h):
                    lHeadersToKeep.append(h)
        patternFile.close()

        if outFileName == "":
            outFileName = inFileName + ".extracted"
        outFile=open( outFileName, "w" )

        inFile = open( inFileName, "r" )
        bioseq.read(inFile)
        while bioseq.sequence:
            bioseqNb += 1
            if bioseq.header in lHeadersToKeep:
                bioseq.write(outFile)
                if verbose > 1:
                    print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
                savedBioseqNb += 1
            bioseq.read(inFile)
        inFile.close()

        outFile.close()

        if verbose > 0:
            print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )

    ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file
    #
    # @param pattern regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the kept bioseq (default = inFileName + '.cleaned')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbCleanByPattern( pattern, inFileName, outFileName="", verbose=0 ):
        if pattern == "":
            return

        patternToSearch = re.compile(pattern)

        if outFileName == "":
            outFileName = inFileName + '.cleaned'
        outFile = open(outFileName,'w')

        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        inFile = open(inFileName)
        bioseq.read(inFile)
        while bioseq.sequence != None:
            bioseqNb += 1
            if not patternToSearch.search(bioseq.header):
                bioseq.write(outFile)
                if verbose > 1:
                    print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'
                savedBioseqNb += 1
            bioseq.read(inFile)
        inFile.close()

        outFile.close()

        if verbose > 0:
            print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )


    ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file
    #
    # @param patternFileName string file containing regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the kept bioseq (default = inFileName + '.cleaned')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbCleanByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
        if patternFileName == "":
            print "ERROR: no file of pattern"
            sys.exit(1)

        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        lHeaders = []
        inFile = open( inFileName, "r" )
        bioseq.read( inFile )
        while bioseq.sequence != None:
            bioseqNb += 1
            lHeaders.append( bioseq.header )
            bioseq.read( inFile )
        inFile.close()

        patternFile = open( patternFileName, "r")
        lHeadersToRemove = []
        for pattern in patternFile:
            if verbose > 0:
                print "pattern: ",pattern[:-1]; sys.stdout.flush()

            patternToSearch = re.compile( pattern[:-1] )
            for h in lHeaders:
                if patternToSearch.search(h):
                    lHeadersToRemove.append(h)
        patternFile.close()

        if outFileName == "":
            outFileName = inFileName + '.cleaned'
        outFile = open( outFileName, 'w' )

        bioseqNum = 0
        inFile=open( inFileName )
        bioseq.read( inFile )
        while bioseq.sequence != None:
            bioseqNum += 1
            if bioseq.header not in lHeadersToRemove:
                bioseq.write( outFile )
                if verbose > 1:
                    print 'sequence num',bioseqNum,'/',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
                savedBioseqNb += 1
            bioseq.read( inFile )
        inFile.close()

        outFile.close()

        if verbose > 0:
            print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )


    ## Find the ORFs of each sequence from a fasta file and write them in a tab file
    #
    # @param inFileName string name of the input fasta file
    # @param orfMaxNb integer select the orfMaxNb best ORFs
    # @param orfMinLength integer keep ORFs with length >= orfMinLength
    # @param outFileName string name of the output map file (default = inFileName + '.ORF.map')
    # @param verbose integer verbosity level (default = 0)
    #
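    # Example call (hypothetical input file): report at most 5 ORFs of at least
    # 300 bp per sequence in 'TEs.fa.ORF.map' (columns: ORF identifier, sequence
    # header, start, end):
    #   FastaUtils.dbORF("TEs.fa", orfMaxNb=5, orfMinLength=300)
    #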
    @staticmethod
    def dbORF( inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose=0 ):
        if outFileName == "":
            outFileName = inFileName + ".ORF.map"
        outFile = open( outFileName, "w" )

        bioseq = Bioseq()
        bioseqNb = 0

        inFile = open( inFileName )
        bioseq.read( inFile )
        while bioseq.sequence != None:
            bioseq.upCase()
            bioseqNb += 1
            if verbose > 0:
                print 'sequence num',bioseqNb,'=',bioseq.getLength(),'[',bioseq.header[0:40],'...]'

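            # The block below assumes Bioseq.findORF() returns, per reading frame,
            # the positions of stop codons: consecutive stops delimit candidate
            # ORFs, whose coordinates are derived for the forward strand and,
            # after reverseComplement(), for the reverse strand (negative frame).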
            orf = bioseq.findORF()
            bestOrf = []
            for i in orf.keys():
                orfLen = len(orf[i])
                for j in xrange(1, orfLen):
                    start = orf[i][j-1] + 4
                    end = orf[i][j] + 3
                    if end - start >= orfMinLength:
                        bestOrf.append( ( end-start, i+1, start, end ) )

            bioseq.reverseComplement()

            orf = bioseq.findORF()
            seqLen = bioseq.getLength()
            for i in orf.keys():
                orfLen = len(orf[i])
                for j in xrange(1, orfLen):
                    start = seqLen - orf[i][j-1] - 3
                    end = seqLen - orf[i][j] - 2
                    if start - end >= orfMinLength:
                        bestOrf.append( ( start-end, (i+1)*-1, start, end ) )

            bestOrf.sort()
            bestOrf.reverse()
            bestOrfNb = len(bestOrf)
            if orfMaxNb != 0 and orfMaxNb < bestOrfNb:
                bestOrfNb = orfMaxNb
            for i in xrange(0, bestOrfNb):
                if verbose > 0:
                    print bestOrf[i]
                outFile.write("%s\t%s\t%d\t%d\n"%("ORF|"+str(bestOrf[i][1])+\
                              "|"+str(bestOrf[i][0]),bioseq.header,
                              bestOrf[i][2],bestOrf[i][3]))
            bioseq.read( inFile )

        inFile.close()
        outFile.close()

        return 0


    ## Sort sequences by increasing length (write a new file)
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file
    # @param verbose integer verbosity level
    #
    @staticmethod
    def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0):
        if verbose > 0:
            print "sort sequences by increasing length"
            sys.stdout.flush()
        if not os.path.exists( inFileName ):
            print "ERROR: file '%s' doesn't exist" % ( inFileName )
            sys.exit(1)

        # read each seq one by one
        # save them in distinct temporary files
        # with their length in the name
        inFileHandler = open( inFileName, "r" )
        countSeq = 0
        bs = Bioseq()
        bs.read( inFileHandler )
        while bs.header:
            countSeq += 1
            tmpFile = "%ibp_%inb" % ( bs.getLength(), countSeq )
            bs.appendBioseqInFile( tmpFile )
            if verbose > 1:
                print "%s (%i bp) saved in '%s'" % ( bs.header, bs.getLength(), tmpFile )
            bs.header = ""
            bs.sequence = ""
            bs.read( inFileHandler )
        inFileHandler.close()

        # sort temporary file names
        # concatenate them into the output file
        if os.path.exists( outFileName ):
            os.remove( outFileName )
        lFiles = glob.glob( "*bp_*nb" )
        lFiles.sort( key=lambda s:int(s.split("bp_")[0]) )
        for fileName in lFiles:
            cmd = "cat %s >> %s" % ( fileName, outFileName )
            returnValue = os.system( cmd )
            if returnValue != 0:
                print "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName )
                sys.exit(1)
            os.remove( fileName )

        return 0


    ## Sort sequences by header
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file
    # @param verbose integer verbosity level
    #
    @staticmethod
    def sortSequencesByHeader(inFileName, outFileName = "", verbose = 0):
        if outFileName == "":
            outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0]
        iBioseqDB = BioseqDB(inFileName)
        f = open(outFileName, "w")
        lHeaders = sorted(iBioseqDB.getHeaderList())
        for header in lHeaders:
            iBioseq = iBioseqDB.fetch(header)
            iBioseq.write(f)
        f.close()


    ## Return a dictionary whose keys are the headers and whose values are the sequence lengths
    #
    # @param inFile string name of the input fasta file
    # @param verbose integer verbosity level
    #
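    # Example call (hypothetical file): FastaUtils.getLengthPerHeader("genome.fa")
    # returns e.g. {"chr1": 1200, "chr2": 950} for a fasta file with those two sequences.
    #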
    @staticmethod
    def getLengthPerHeader( inFile, verbose=0 ):
        dHeader2Length = {}

        inFileHandler = open( inFile, "r" )
        currentSeqHeader = ""
        currentSeqLength = 0
        line = inFileHandler.readline()
        while line:
            if line[0] == ">":
                if currentSeqHeader != "":
                    dHeader2Length[ currentSeqHeader ] = currentSeqLength
                    currentSeqLength = 0
                currentSeqHeader = line[1:-1]
                if verbose > 0:
                    print "current header: %s" % ( currentSeqHeader )
                    sys.stdout.flush()
            else:
                currentSeqLength += len( line.replace("\n","") )
            line = inFileHandler.readline()
        dHeader2Length[ currentSeqHeader ] = currentSeqLength
        inFileHandler.close()

        return dHeader2Length


    ## Convert the headers of a fasta file from chunk coordinates to chromosome coordinates
    #
    # @param inFile string name of the input fasta file
    # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile):
        inFileHandler = open(inFile, "r")
        outFileHandler = open(outFile, "w")
        dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile)
        iConvCoord = ConvCoord()
        line = inFileHandler.readline()
        while line:
            if line[0] == ">":
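                # Two kinds of chunk-based headers are handled here: headers carrying
                # a '{Fragment}' tag followed by 'start..end[,start..end]' pairs on a
                # chunk, and headers of the form '<name>_<chunkName> ... (...) [start,end]';
                # in both cases the coordinates are rewritten on the chromosome.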
                if "{Fragment}" in line:
                    chkName = line.split(" ")[1]
                    chrName = dChunk2Map[chkName].seqname
                    lCoordPairs = line.split(" ")[3].split(",")
                    lRangesOnChk = []
                    for i in lCoordPairs:
                        iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1]))
                        lRangesOnChk.append(iRange)
                    lRangesOnChr = []
                    for iRange in lRangesOnChk:
                        lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map))
                    newHeader = line[1:-1].split(" ")[0]
                    newHeader += " %s" % chrName
                    newHeader += " {Fragment}"
                    newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end)
                    for iRange in lRangesOnChr[1:]:
                        newHeader += ",%i..%i" % (iRange.start, iRange.end)
                    outFileHandler.write(">%s\n" % newHeader)
                else:
                    chkName = line.split("_")[1].split(" ")[0]
                    chrName = dChunk2Map[chkName].seqname
                    coords = line[line.find("[")+1 : line.find("]")]
                    start = int(coords.split(",")[0])
                    end = int(coords.split(",")[1])
                    iRangeOnChk = Range(chkName, start, end)
                    iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map)
                    newHeader = line[1:-1].split("_")[0]
                    newHeader += " %s" % chrName
                    newHeader += " %s" % line[line.find("(") : line.find(")")+1]
                    newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd())
                    outFileHandler.write(">%s\n" % newHeader)
            else:
                outFileHandler.write(line)
            line = inFileHandler.readline()
        inFileHandler.close()
        outFileHandler.close()


    ## Convert a fasta file to a length file
    #
    # @param inFile string name of the input fasta file
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaToLength(inFile, outFile = ""):
        if outFile == "":
            outFile = "%s.length" % inFile

        if inFile != "":
            with open(inFile, "r") as inFH:
                with open(outFile, "w") as outFH:
                    bioseq = Bioseq()
                    bioseq.read(inFH)
                    while bioseq.sequence != None:
                        seqLen = bioseq.getLength()
                        outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen))
                        bioseq.read(inFH)

    ## Convert a fasta file to a seq file
    #
    # @param inFile string name of the input fasta file
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaToSeq(inFile, outFile = ""):
        if outFile == "":
            outFile = "%s.seq" % inFile

        if inFile != "":
            with open(inFile, "r") as inFH:
                with open(outFile, "w") as outFH:
                    bioseq = Bioseq()
                    bioseq.read(inFH)
                    while bioseq.sequence != None:
                        seqLen = bioseq.getLength()
                        outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \
                                                          bioseq.sequence, bioseq.header, seqLen))
                        bioseq.read(inFH)


    ## Splice an input fasta file using coordinates in a Map file
    #
    # @note the coordinates should be merged beforehand!
    #
    @staticmethod
    def spliceFromCoords( genomeFile, coordFile, obsFile ):
        genomeFileHandler = open( genomeFile, "r" )
        obsFileHandler = open( obsFile, "w" )
        dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile )

        bs = Bioseq()
        bs.read( genomeFileHandler )
        while bs.sequence:
            if dChr2Maps.has_key( bs.header ):
                lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] )
                splicedSeq = ""
                currentSite = 0
                for iMap in lCoords:
                    minSplice = iMap.getMin() - 1
                    if minSplice > currentSite:
                        splicedSeq += bs.sequence[ currentSite : minSplice ]
                    currentSite = iMap.getMax()
                splicedSeq += bs.sequence[ currentSite : ]
                bs.sequence = splicedSeq
            bs.write( obsFileHandler )
            bs.read( genomeFileHandler )

        genomeFileHandler.close()
        obsFileHandler.close()

    ## Shuffle input sequences (single file or files in a directory)
    #
    @staticmethod
    def dbShuffle( inData, outData, verbose=0 ):
        if CheckerUtils.isExecutableInUserPath("esl-shuffle"):
            prg = "esl-shuffle"
        else : prg = "shuffle"
        genericCmd = prg + " -d INPUT > OUTPUT"
        if os.path.isfile( inData ):
            if verbose > 0:
                print "shuffle input file '%s'" % inData
            cmd = genericCmd.replace("INPUT",inData).replace("OUTPUT",outData)
            print cmd
            returnStatus = os.system( cmd )
            if returnStatus != 0:
                sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus )
                sys.exit(1)

        elif os.path.isdir( inData ):
            if verbose > 0:
                print "shuffle files in input directory '%s'" % inData
            if os.path.exists( outData ):
                shutil.rmtree( outData )
            os.mkdir( outData )
            lInputFiles = glob.glob( "%s/*.fa" %( inData ) )
            nbFastaFiles = 0
            for inputFile in lInputFiles:
                nbFastaFiles += 1
                if verbose > 1:
                    print "%3i / %3i" % ( nbFastaFiles, len(lInputFiles) )
                fastaBaseName = os.path.basename( inputFile )
                prefix, extension = os.path.splitext( fastaBaseName )
                cmd = genericCmd.replace("INPUT",inputFile).replace("OUTPUT","%s/%s_shuffle.fa"%(outData,prefix))
                returnStatus = os.system( cmd )
                if returnStatus != 0:
                    sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus )
                    sys.exit(1)

    ## Convert a cluster file (one line = one cluster = one list of headers) into a fasta file with cluster info in headers
    #
    # @param inClusterFileName string input cluster file name
    # @param inFastaFileName string input fasta file name
    # @param outFileName string output file name
    # @param clusteringTool string name of the clustering tool, prefixed to the new headers (default = '')
    # @param verbosity integer verbosity
    #
    @staticmethod
    def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0):
        dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity)
        iFastaParser = FastaParser(inFastaFileName)
        with open(outFileName, "w") as f:
            for iSequence in iFastaParser.getIterator():

                header = iSequence.getName()
                if dHeader2ClusterClusterMember.get(header):
                    cluster = dHeader2ClusterClusterMember[header][0]
                    member = dHeader2ClusterClusterMember[header][1]
                else:
                    clusterIdForSingletonCluster += 1
                    cluster = clusterIdForSingletonCluster
                    member = 1

                newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header)
                iSequence.setName(newHeader)
                f.write(iSequence.printFasta())

    @staticmethod
    def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0):
        dHeader2ClusterClusterMember = {}
        clusterId = 0
        with open(inClusterFileName) as f:
            line = f.readline()
            while line:
                lineWithoutLastChar = line.rstrip()
                lHeaders = lineWithoutLastChar.split("\t")
                clusterId += 1
                if verbosity > 0:
                    print "%i sequences in cluster %i" % (len(lHeaders), clusterId)
                memberId = 0
                for header in lHeaders:
                    memberId += 1
                    dHeader2ClusterClusterMember[header] = (clusterId, memberId)
                line = f.readline()
        if verbosity > 0:
            print "%i clusters" % clusterId
        return dHeader2ClusterClusterMember, clusterId

    @staticmethod
    def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""):
        """
        Write a map file from the fasta output of a clustering tool.
        Warning: only works if the input fasta headers are formatted like LTRharvest fasta output.
        """
        if not outMapFileName:
            outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0])

        fileDb = open(fastaFileNameFromClustering , "r")
        fileMap = open(outMapFileName, "w")
        seq = Bioseq()
        numseq = 0
        while 1:
            seq.read(fileDb)
            if seq.sequence == None:
                break
            numseq = numseq + 1
            ID = seq.header.split(' ')[0].split('_')[0]
            chunk = seq.header.split(' ')[0].split('_')[1]
            start = seq.header.split(' ')[-1].split(',')[0][1:]
            end = seq.header.split(' ')[-1].split(',')[1][:-1]
            line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
            fileMap.write(line + "\n")

        fileDb.close()
        fileMap.close()
        print "saved in %s" % outMapFileName

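    ## Detect stretches of N (or X) in each sequence and write their coordinates
    #
    # @param fastaFileName string name of the input fasta file
    # @param nbN integer minimum number of consecutive N/X to report a stretch (0 to disable detection, default = 2)
    # @param outFileName string name of the output file (default = '<input basename>_Nstretches.map' or '.gff3')
    # @param outFormat string output format, 'map' (default) or 'gff'/'gff3'
    #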
    @staticmethod
    def writeNstreches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"):
        outFormat = outFormat.lower()
        if outFormat in ["gff", "gff3"]:
            outFormat = "gff3"
        else:
            outFormat = "map"

        lTNstretches = []
        if nbN != 0:
            iBSDB = BioseqDB(fastaFileName)
            for iBS in iBSDB.db:
                nbNFound = 0
                start = 1
                pos = 1
                lastPos = 0

                while pos <= iBS.getLength():
                    if nbNFound == 0:
                        start = pos

                    while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']:
                        nbNFound += 1
                        pos += 1
                        lastPos = pos

                    if pos - lastPos >= nbN:
                        if nbNFound >= nbN:
                            lTNstretches.append((iBS.getHeader(), start, lastPos - 1))
                        nbNFound = 0
                    pos += 1

                if nbNFound >= nbN:
                    lTNstretches.append((iBS.getHeader(), start, lastPos - 1))

        lTNstretches.sort(key = itemgetter(0, 1, 2))

        if outFileName == "":
            outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat)

        with open(outFileName, "w") as fH:
            if outFormat == "gff3":
                fH.write("##gff-version 3\n")
            for item in lTNstretches:
                seq = item[0]
                start = item[1]
                end = item[2]
                if outFormat == "gff3":
                    fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end))
                else:
                    fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end))