# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


import os
import sys
import string
import math
import shutil
import re
import glob
from commons.core.seq.BioseqDB import BioseqDB
from commons.core.seq.Bioseq import Bioseq
from commons.core.coord.MapUtils import MapUtils
from commons.core.coord.Range import Range
from commons.core.checker.CheckerUtils import CheckerUtils
from commons.core.launcher.LauncherUtils import LauncherUtils
from commons.core.coord.ConvCoord import ConvCoord
from commons.core.parsing.FastaParser import FastaParser


## Static methods for fasta file manipulation
#
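# Usage sketch (illustrative only; the file name is hypothetical and the module
# path is assumed from the package imports above):
#   from commons.core.seq.FastaUtils import FastaUtils
#   print FastaUtils.dbSize("genome.fa")
#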
class FastaUtils( object ):

    ## Count the number of sequences in the input fasta file
    #
    # @param inFile name of the input fasta file
    #
    # @return integer number of sequences in the input fasta file
    #
    @staticmethod
    def dbSize( inFile ):
        nbSeq = 0
        inFileHandler = open( inFile, "r" )
        line = inFileHandler.readline()
        while line:
            if line[0] == ">":
                nbSeq = nbSeq + 1
            line = inFileHandler.readline()
        inFileHandler.close()

        return nbSeq


    ## Compute the cumulative sequence length in the input fasta file
    #
    # @param inFile handler of the input fasta file
    #
    @staticmethod
    def dbCumLength( inFile ):
        cumLength = 0
        line = inFile.readline()
        while line:
            if line[0] != ">":
                cumLength += len(string.rstrip(line))
            line = inFile.readline()

        return cumLength


    ## Return a list with the length of each sequence in the input fasta file
    #
    # @param inFile string name of the input fasta file
    #
    @staticmethod
    def dbLengths( inFile ):
        lLengths = []
        inFileHandler = open( inFile, "r" )
        currentLength = 0
        line = inFileHandler.readline()
        while line:
            if line[0] == ">":
                if currentLength != 0:
                    lLengths.append( currentLength )
                currentLength = 0
            else:
                currentLength += len(line[:-1])
            line = inFileHandler.readline()
        lLengths.append( currentLength )
        inFileHandler.close()
        return lLengths


    ## Retrieve the sequence headers present in the input fasta file
    #
    # @param inFile string name of the input fasta file
    # @param verbose integer level of verbosity
    #
    # @return list of sequence headers
    #
    @staticmethod
    def dbHeaders( inFile, verbose=0 ):
        lHeaders = []

        inFileHandler = open( inFile, "r" )
        line = inFileHandler.readline()
        while line:
            if line[0] == ">":
                lHeaders.append( string.rstrip(line[1:]) )
                if verbose > 0:
                    print string.rstrip(line[1:])
            line = inFileHandler.readline()
        inFileHandler.close()

        return lHeaders


    ## Cut a data bank into chunks according to the input parameters
    # If a sequence is shorter than the threshold, it is only renamed (not cut)
    #
    # @param inFileName string name of the input fasta file
    # @param chkLgth string chunk length (in bp, default=200000)
    # @param chkOver string chunk overlap (in bp, default=10000)
    # @param wordN string N stretch word length (default=11, 0 for no detection)
    # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
    # @param clean boolean remove 'cut' and 'Nstretch' files
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def dbChunks( inFileName, chkLgth="200000", chkOver="10000", wordN="11", outFilePrefix="", clean=False, verbose=0 ):
        nbSeq = FastaUtils.dbSize( inFileName )
        if verbose > 0:
            print "cut the %i input sequences with cutterDB..." % ( nbSeq )
            sys.stdout.flush()

        prg = "cutterDB"
        cmd = prg
        cmd += " -l %s" % ( chkLgth )
        cmd += " -o %s" %( chkOver )
        cmd += " -w %s" % ( wordN )
        cmd += " %s" % ( inFileName )
        returnStatus = os.system( cmd )
        if returnStatus != 0:
            msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus )
            sys.stderr.write( "%s\n" % ( msg ) )
            sys.exit(1)

        nbChunks = FastaUtils.dbSize( "%s_cut" % ( inFileName ) )
        if verbose > 0:
            print "done (%i chunks)" % ( nbChunks )
            sys.stdout.flush()

        if verbose > 0:
            print "rename the headers..."
            sys.stdout.flush()

        if outFilePrefix == "":
            outFastaName = inFileName + "_chunks.fa"
            outMapName = inFileName + "_chunks.map"
        else:
            outFastaName = outFilePrefix + ".fa"
            outMapName = outFilePrefix + ".map"

        inFile = open( "%s_cut" % ( inFileName ), "r" )
        line = inFile.readline()

        outFasta = open( outFastaName, "w" )
        outMap = open( outMapName, "w" )

        # read line after line (no need for big RAM) and change the sequence headers
        while line:

            if line[0] == ">":
                if verbose > 1:
                    print "rename '%s'" % ( line[:-1] ); sys.stdout.flush()
                # the header written by cutterDB is assumed to contain the chunk
                # number, the original header and the coordinates as space-separated
                # fields 1, 3 and 5 (e.g. '>1 ... chr1 ... 1..200000')
                data = line[:-1].split(" ")
                seqID = data[0].split(">")[1]
                newHeader = "chunk%s" % ( str(seqID).zfill( len(str(nbChunks)) ) )
                oldHeader = data[2]
                seqStart = data[4].split("..")[0]
                seqEnd = data[4].split("..")[1]
                outMap.write( "%s\t%s\t%s\t%s\n" % ( newHeader, oldHeader, seqStart, seqEnd ) )
                outFasta.write( ">%s\n" % ( newHeader ) )

            else:
                outFasta.write( line.upper() )

            line = inFile.readline()

        inFile.close()
        outFasta.close()
        outMap.close()

        if clean == True:
            os.remove(inFileName + "_cut")
            os.remove(inFileName + ".Nstretch.map")
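    # Usage sketch (illustrative; requires the external 'cutterDB' program and a
    # hypothetical input file 'genome.fa'):
    #   FastaUtils.dbChunks("genome.fa", chkLgth="200000", chkOver="10000")
    #   # expected outputs: 'genome.fa_chunks.fa' and 'genome.fa_chunks.map'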


    ## Split the input fasta file into several output files
    #
    # @param inFile string name of the input fasta file
    # @param nbSeqPerBatch integer number of sequences per output file
    # @param newDir boolean put the sequences in a new directory called 'batches'
    # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1')
    # @param prefix string prefix of the output file names
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbSplit( inFile, nbSeqPerBatch, newDir, useSeqHeader=False, prefix="batch", verbose=0 ):
        if not os.path.exists( inFile ):
            msg = "ERROR: file '%s' doesn't exist" % ( inFile )
            sys.stderr.write( "%s\n" % ( msg ) )
            sys.exit(1)

        nbSeq = FastaUtils.dbSize( inFile )

        nbBatches = int( math.ceil( nbSeq / float(nbSeqPerBatch) ) )
        if verbose > 0:
            print "save the %i input sequences into %i batches" % ( nbSeq, nbBatches )
            sys.stdout.flush()

        if nbSeqPerBatch > 1 and useSeqHeader:
            useSeqHeader = False

        if newDir == True:
            if os.path.exists( "batches" ):
                shutil.rmtree( "batches" )
            os.mkdir( "batches" )
            os.chdir( "batches" )
            os.system( "ln -s ../%s ." % ( inFile ) )

        inFileHandler = open( inFile, "r" )
        inFileHandler.seek( 0, 0 )
        countBatch = 0
        countSeq = 0
        line = inFileHandler.readline()
        while line:
            if line == "":
                break
            if line[0] == ">":
                countSeq += 1
                if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1:
                    if "outFile" in locals():
                        outFile.close()
                    countBatch += 1
                    if nbSeqPerBatch == 1 and useSeqHeader:
                        outFileName = "%s.fa" % ( line[1:-1].replace(" ","_") )
                    else:
                        outFileName = "%s_%s.fa" % ( prefix, str(countBatch).zfill( len(str(nbBatches)) ) )
                    outFile = open( outFileName, "w" )
                    if verbose > 1:
                        print "saving seq '%s' in file '%s'..." % ( line[1:40][:-1], outFileName )
                        sys.stdout.flush()
            outFile.write( line )
            line = inFileHandler.readline()
        inFileHandler.close()
        # make sure the last batch file is closed before leaving the directory
        if "outFile" in locals():
            outFile.close()

        if newDir == True:
            os.remove( os.path.basename( inFile ) )
            os.chdir( ".." )
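    # Usage sketch (illustrative; 'genome.fa' is hypothetical): write one fasta
    # file per sequence into a new 'batches/' directory.
    #   FastaUtils.dbSplit("genome.fa", 1, True)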


    ## Split the input fasta file into several output files
    #
    # @param inFileName string name of the input fasta file
    # @param maxSize integer max cumulative length for each output file
    #
    @staticmethod
    def splitFastaFileInBatches(inFileName, maxSize = 200000):
        iBioseqDB = BioseqDB(inFileName)
        lHeadersSizeTuples = []
        for iBioseq in iBioseqDB.db:
            lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength()))

        lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize)
        os.mkdir("batches")
        os.chdir("batches")

        iterator = 0
        for lHeader in lHeadersList :
            iterator += 1
            with open("batch_%s.fa" % iterator, 'w') as f :
                for header in lHeader :
                    iBioseq = iBioseqDB.fetch(header)
                    iBioseq.write(f)
        os.chdir("..")
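    # Usage sketch (illustrative; 'genome.fa' is hypothetical): group sequences
    # into 'batches/batch_*.fa' files of roughly 200 kb each.
    #   FastaUtils.splitFastaFileInBatches("genome.fa", maxSize=200000)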


    ## Split the input fasta file into several output files according to their cluster identifier
    #
    # @param inFileName string name of the input fasta file
    # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust)
    # @param simplifyHeader boolean simplify the headers
    # @param createDir boolean put the sequences in different directories
    # @param outPrefix string prefix of the output files (default='seqCluster')
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def splitSeqPerCluster( inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix="seqCluster", verbose=0 ):
        if not os.path.exists( inFileName ):
            print "ERROR: %s doesn't exist" % ( inFileName )
            sys.exit(1)

        inFile = open( inFileName, "r" )

        line = inFile.readline()
        if line:
            name = line.split(" ")[0]
            if "Cluster" in name:
                clusterID = name.split("Cluster")[1].split("Mb")[0]
                seqID = name.split("Mb")[1]
            else:
                clusterID = name.split("Cl")[0].split("Gr")[1]   # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust
                if "Q" in name.split("Gr")[0]:
                    seqID = name.split("Gr")[0].split("MbQ")[1]
                elif "S" in name:
                    seqID = name.split("Gr")[0].split("MbS")[1]
            sClusterIDs = set( [ clusterID ] )
            if simplifyHeader == True:
                header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
            else:
                header = line[1:-1]
            if createDir == True:
                if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
                    os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
                os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
            outFileName = "%s%s.fa" % ( outPrefix, clusterID )
            outFile = open( outFileName, "w" )
            outFile.write( ">%s\n" % ( header ) )
            prevClusterID = clusterID

            line = inFile.readline()
            while line:
                if line[0] == ">":
                    name = line.split(" ")[0]
                    if "Cluster" in name:
                        clusterID = name.split("Cluster")[1].split("Mb")[0]
                        seqID = name.split("Mb")[1]
                    else:
                        clusterID = name.split("Cl")[0].split("Gr")[1]
                        if "Q" in name.split("Gr")[0]:
                            seqID = name.split("Gr")[0].split("MbQ")[1]
                        elif "S" in name:
                            seqID = name.split("Gr")[0].split("MbS")[1]

                    if clusterID != prevClusterID:
                        outFile.close()

                    if simplifyHeader == True:
                        header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
                    else:
                        header = line[1:-1]

                    if createDir == True:
                        os.chdir( ".." )
                        if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
                            os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
                        os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )

                    outFileName = "%s%s.fa" % ( outPrefix, clusterID )
                    if not os.path.exists( outFileName ):
                        outFile = open( outFileName, "w" )
                    else:
                        if clusterID != prevClusterID:
                            outFile.close()
                        outFile = open( outFileName, "a" )
                    outFile.write( ">%s\n" % ( header ) )
                    prevClusterID = clusterID
                    sClusterIDs.add( clusterID )

                else:
                    outFile.write( line )

                line = inFile.readline()

            outFile.close()
            if verbose > 0:
                print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush()

            if createDir == True:
                os.chdir("..")
        else:
            print "WARNING: empty input file - no cluster found"; sys.stdout.flush()


    ## Filter a fasta file into two fasta files using the length of each sequence as a criterion
    #
    # @param len_min integer length sequence criterion to filter
    # @param inFileName string name of the input fasta file
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def dbLengthFilter( len_min, inFileName, verbose=0 ):
        file_db = open( inFileName, "r" )
        file_dbInf = open( inFileName+".Inf"+str(len_min), "w" )
        file_dbSup = open( inFileName+".Sup"+str(len_min), "w" )
        seq = Bioseq()
        numseq = 0
        nbsave = 0

        seq.read( file_db )
        while seq.sequence:
            l = seq.getLength()
            numseq = numseq + 1
            if l >= len_min:
                seq.write( file_dbSup )
                if verbose > 0:
                    print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Sup !!'
                nbsave=nbsave+1
            else:
                seq.write( file_dbInf )
                if verbose > 0:
                    print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Inf !!'
                nbsave=nbsave+1
            seq.read( file_db )

        file_db.close()
        file_dbInf.close()
        file_dbSup.close()
        if verbose > 0:
            print nbsave,'saved sequences in ',inFileName+".Inf"+str(len_min)," and ", inFileName+".Sup"+str(len_min)
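    # Usage sketch (illustrative; 'seqs.fa' is hypothetical): sequences of at
    # least 500 bp go to 'seqs.fa.Sup500', the others to 'seqs.fa.Inf500'.
    #   FastaUtils.dbLengthFilter(500, "seqs.fa")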


    ## Extract the longest sequences from a fasta file
    #
    # @param num integer maximum number of sequences in the output file
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file
    # @param minThresh integer minimum length threshold (default=0)
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def dbLongestSequences( num, inFileName, outFileName="", verbose=0, minThresh=0 ):
        bsDB = BioseqDB( inFileName )
        if verbose > 0:
            print "nb of input sequences: %i" % ( bsDB.getSize() )

        if outFileName == "":
            outFileName = inFileName + ".best" + str(num)
        outFile = open( outFileName, "w" )

        if bsDB.getSize()==0:
            return 0

        num = int(num)
        if verbose > 0:
            print "keep the %i longest sequences" % ( num )
            if minThresh > 0:
                print "with length > %i bp" % ( minThresh )
            sys.stdout.flush()

        # retrieve the length of each input sequence
        tmpLSeqLgth = []
        seqNum = 0
        for bs in bsDB.db:
            seqNum += 1
            tmpLSeqLgth.append( bs.getLength() )
            if verbose > 1:
                print "%d seq %s : %d bp" % ( seqNum, bs.header[0:40], bs.getLength() )
                sys.stdout.flush()

        # sort the lengths
        tmpLSeqLgth.sort()
        tmpLSeqLgth.reverse()

        # select the longest
        lSeqLgth = []
        for i in xrange( 0, min(num,len(tmpLSeqLgth)) ):
            if tmpLSeqLgth[i] >= minThresh:
                lSeqLgth.append( tmpLSeqLgth[i] )
        if verbose > 0:
            print "selected max length: %i" % ( max(lSeqLgth) )
            print "selected min length: %i" % ( min(lSeqLgth) )
            sys.stdout.flush()

        # save the longest
        inFile = open( inFileName )
        seqNum = 0
        nbSave = 0
        for bs in bsDB.db:
            seqNum += 1
            if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh:
                bs.write( outFile )
                if verbose > 1:
                    print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] )
                    sys.stdout.flush()
                nbSave += 1
            if nbSave == num:
                break
        inFile.close()
        outFile.close()
        if verbose > 0:
            print nbSave, "saved sequences in ", outFileName
            sys.stdout.flush()

        return 0
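    # Usage sketch (illustrative; 'seqs.fa' is hypothetical): keep the 20 longest
    # sequences in 'seqs.fa.best20'.
    #   FastaUtils.dbLongestSequences(20, "seqs.fa")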


    ## Extract all the sequence headers from a fasta file and write them in an output file
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers')
    #
    @staticmethod
    def dbExtractSeqHeaders( inFileName, outFileName="" ):
        lHeaders = FastaUtils.dbHeaders( inFileName )

        if outFileName == "":
            outFileName = inFileName + ".headers"

        outFile = open( outFileName, "w" )
        for i in lHeaders:
            outFile.write( i + "\n" )
        outFile.close()

        return 0


    ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file
    #
    # @param pattern regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbExtractByPattern( pattern, inFileName, outFileName="", verbose=0 ):
        if pattern == "":
            return

        if outFileName == "":
            outFileName = inFileName + '.extracted'
        outFile = open( outFileName, 'w' )

        patternTosearch = re.compile( pattern )
        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        inFile = open( inFileName, "r" )
        bioseq.read( inFile )
        while bioseq.sequence:
            bioseqNb = bioseqNb + 1
            m = patternTosearch.search( bioseq.header )
            if m:
                bioseq.write( outFile )
                if verbose > 1:
                    print 'sequence num',bioseqNb,'matched on',m.group(),'[',bioseq.header[0:40],'...] saved !!'
                savedBioseqNb = savedBioseqNb + 1
            bioseq.read( inFile )
        inFile.close()

        outFile.close()

        if verbose > 0:
            print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
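    # Usage sketch (illustrative; 'seqs.fa' is hypothetical): copy the sequences
    # whose header matches a regular expression into 'seqs.fa.extracted'.
    #   FastaUtils.dbExtractByPattern("chr1", "seqs.fa")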


    ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file
    #
    # @param patternFileName string file containing regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbExtractByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):

        if patternFileName == "":
            print "ERROR: no file of pattern"
            sys.exit(1)

        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        lHeaders = []

        inFile = open( inFileName, "r" )
        bioseq.read( inFile )
        while bioseq.sequence != None:
            lHeaders.append( bioseq.header )
            bioseq.read( inFile )
        inFile.close()

        lHeadersToKeep = []
        patternFile = open( patternFileName, "r" )
        for pattern in patternFile:
            if verbose > 0:
                print "pattern: ",pattern[:-1]; sys.stdout.flush()

            patternToSearch = re.compile(pattern[:-1])
            for h in lHeaders:
                if patternToSearch.search(h):
                    lHeadersToKeep.append(h)
        patternFile.close()

        if outFileName == "":
            outFileName = inFileName + ".extracted"
        outFile=open( outFileName, "w" )

        inFile = open( inFileName, "r" )
        bioseq.read(inFile)
        while bioseq.sequence:
            bioseqNb += 1
            if bioseq.header in lHeadersToKeep:
                bioseq.write(outFile)
                if verbose > 1:
                    print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
                savedBioseqNb += 1
            bioseq.read(inFile)
        inFile.close()

        outFile.close()

        if verbose > 0:
            print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )


    ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file
    #
    # @param pattern regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.cleaned')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbCleanByPattern( pattern, inFileName, outFileName="", verbose=0 ):
        if pattern == "":
            return

        patternToSearch = re.compile(pattern)

        if outFileName == "":
            outFileName = inFileName + '.cleaned'
        outFile = open(outFileName,'w')

        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        inFile = open(inFileName)
        bioseq.read(inFile)
        while bioseq.sequence != None:
            bioseqNb += 1
            if not patternToSearch.search(bioseq.header):
                bioseq.write(outFile)
                if verbose > 1:
                    print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'
                savedBioseqNb += 1
            bioseq.read(inFile)
        inFile.close()

        outFile.close()

        if verbose > 0:
            print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )


    ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file
    #
    # @param patternFileName string file containing regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.cleaned')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbCleanByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
        if patternFileName == "":
            print "ERROR: no file of pattern"
            sys.exit(1)

        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        lHeaders = []
        inFile = open( inFileName, "r" )
        bioseq.read( inFile )
        while bioseq.sequence != None:
            bioseqNb += 1
            lHeaders.append( bioseq.header )
            bioseq.read( inFile )
        inFile.close()

        patternFile = open( patternFileName, "r")
        lHeadersToRemove = []
        for pattern in patternFile:
            if verbose > 0:
                print "pattern: ",pattern[:-1]; sys.stdout.flush()

            patternToSearch = re.compile( pattern[:-1] )
            for h in lHeaders:
                if patternToSearch.search(h):
                    lHeadersToRemove.append(h)
        patternFile.close()

        if outFileName == "":
            outFileName = inFileName + '.cleaned'
        outFile = open( outFileName, 'w' )

        bioseqNum = 0
        inFile=open( inFileName )
        bioseq.read( inFile )
        while bioseq.sequence != None:
            bioseqNum += 1
            if bioseq.header not in lHeadersToRemove:
                bioseq.write( outFile )
                if verbose > 1:
                    print 'sequence num',bioseqNum,'/',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
                savedBioseqNb += 1
            bioseq.read( inFile )
        inFile.close()

        outFile.close()

        if verbose > 0:
            print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )


    ## Find each sequence's ORFs from a fasta file and write them in a tab file
    #
    # @param inFileName string name of the input fasta file
    # @param orfMaxNb integer Select orfMaxNb best ORFs
    # @param orfMinLength integer Keep ORFs with length >= orfMinLength
    # @param outFileName string name of the output map file (default = inFileName + '.ORF.map')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbORF( inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose=0 ):
        if outFileName == "":
            outFileName = inFileName + ".ORF.map"
        outFile = open( outFileName, "w" )

        bioseq = Bioseq()
        bioseqNb = 0

        inFile = open( inFileName )
        bioseq.read( inFile )
        while bioseq.sequence != None:
            bioseq.upCase()
            bioseqNb += 1
            if verbose > 0:
                print 'sequence num',bioseqNb,'=',bioseq.getLength(),'[',bioseq.header[0:40],'...]'

            # bestOrf records (length, frame, start, end) tuples; frames on the
            # reverse strand are stored as negative numbers
            orf = bioseq.findORF()
            bestOrf = []
            for i in orf.keys():
                orfLen = len(orf[i])
                for j in xrange(1, orfLen):
                    start = orf[i][j-1] + 4
                    end = orf[i][j] + 3
                    if end - start >= orfMinLength:
                        bestOrf.append( ( end-start, i+1, start, end ) )

            bioseq.reverseComplement()

            orf = bioseq.findORF()
            seqLen = bioseq.getLength()
            for i in orf.keys():
                orfLen = len(orf[i])
                for j in xrange(1, orfLen):
                    start = seqLen - orf[i][j-1] - 3
                    end = seqLen - orf[i][j] - 2
                    if start - end >= orfMinLength:
                        bestOrf.append( ( start-end, (i+1)*-1, start, end ) )

            bestOrf.sort()
            bestOrf.reverse()
            bestOrfNb = len(bestOrf)
            if orfMaxNb != 0 and orfMaxNb < bestOrfNb:
                bestOrfNb = orfMaxNb
            for i in xrange(0, bestOrfNb):
                if verbose > 0:
                    print bestOrf[i]
                outFile.write("%s\t%s\t%d\t%d\n"%("ORF|"+str(bestOrf[i][1])+\
                              "|"+str(bestOrf[i][0]),bioseq.header,
                              bestOrf[i][2],bestOrf[i][3]))
            bioseq.read( inFile )

        inFile.close()
        outFile.close()

        return 0
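    # Usage sketch (illustrative; 'seqs.fa' is hypothetical): report at most 5
    # ORFs of at least 300 bp per sequence, written to 'seqs.fa.ORF.map'.
    #   FastaUtils.dbORF("seqs.fa", orfMaxNb=5, orfMinLength=300)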


    ## Sort sequences by increasing length (write a new file)
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file
    # @param verbose integer verbosity level
    #
    @staticmethod
    def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0):
        if verbose > 0:
            print "sort sequences by increasing length"
            sys.stdout.flush()
        if not os.path.exists( inFileName ):
            print "ERROR: file '%s' doesn't exist" % ( inFileName )
            sys.exit(1)

        # read each seq one by one
        # save them in distinct temporary files
        # with their length in the name
        inFileHandler = open( inFileName, "r" )
        countSeq = 0
        bs = Bioseq()
        bs.read( inFileHandler )
        while bs.header:
            countSeq += 1
            tmpFile = "%ibp_%inb" % ( bs.getLength(), countSeq )
            bs.appendBioseqInFile( tmpFile )
            if verbose > 1:
                print "%s (%i bp) saved in '%s'" % ( bs.header, bs.getLength(), tmpFile )
            bs.header = ""
            bs.sequence = ""
            bs.read( inFileHandler )
        inFileHandler.close()

        # sort temporary file names
        # concatenate them into the output file
        if os.path.exists( outFileName ):
            os.remove( outFileName )
        lFiles = glob.glob( "*bp_*nb" )
        lFiles.sort( key=lambda s:int(s.split("bp_")[0]) )
        for fileName in lFiles:
            cmd = "cat %s >> %s" % ( fileName, outFileName )
            returnValue = os.system( cmd )
            if returnValue != 0:
                print "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName )
                sys.exit(1)
            os.remove( fileName )

        return 0


    ## Sort sequences by header
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file
    # @param verbose integer verbosity level
    #
    @staticmethod
    def sortSequencesByHeader(inFileName, outFileName = "", verbose = 0):
        if outFileName == "":
            outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0]
        iBioseqDB = BioseqDB(inFileName)
        f = open(outFileName, "w")
        lHeaders = sorted(iBioseqDB.getHeaderList())
        for header in lHeaders:
            iBioseq = iBioseqDB.fetch(header)
            iBioseq.write(f)
        f.close()


    ## Return a dictionary whose keys are the headers and whose values are the lengths of the sequences
    #
    # @param inFile string name of the input fasta file
    # @param verbose integer verbosity level
    #
    @staticmethod
    def getLengthPerHeader( inFile, verbose=0 ):
        dHeader2Length = {}

        inFileHandler = open( inFile, "r" )
        currentSeqHeader = ""
        currentSeqLength = 0
        line = inFileHandler.readline()
        while line:
            if line[0] == ">":
                if currentSeqHeader != "":
                    dHeader2Length[ currentSeqHeader ] = currentSeqLength
                    currentSeqLength = 0
                currentSeqHeader = line[1:-1]
                if verbose > 0:
                    print "current header: %s" % ( currentSeqHeader )
                    sys.stdout.flush()
            else:
                currentSeqLength += len( line.replace("\n","") )
            line = inFileHandler.readline()
        dHeader2Length[ currentSeqHeader ] = currentSeqLength
        inFileHandler.close()

        return dHeader2Length
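    # Usage sketch (illustrative; the file name and values are hypothetical):
    #   dLengths = FastaUtils.getLengthPerHeader("seqs.fa")
    #   # e.g. {"chr1": 1500, "chr2": 2300}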


    ## Convert the headers of a fasta file from chunk coordinates to chromosome coordinates
    #
    # @param inFile string name of the input fasta file
    # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile):
        inFileHandler = open(inFile, "r")
        outFileHandler = open(outFile, "w")
        dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile)
        iConvCoord = ConvCoord()
        line = inFileHandler.readline()
        while line:
            if line[0] == ">":
                if "{Fragment}" in line:
                    chkName = line.split(" ")[1]
                    chrName = dChunk2Map[chkName].seqname
                    lCoordPairs = line.split(" ")[3].split(",")
                    lRangesOnChk = []
                    for i in lCoordPairs:
                        iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1]))
                        lRangesOnChk.append(iRange)
                    lRangesOnChr = []
                    for iRange in lRangesOnChk:
                        lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map))
                    newHeader = line[1:-1].split(" ")[0]
                    newHeader += " %s" % chrName
                    newHeader += " {Fragment}"
                    newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end)
                    for iRange in lRangesOnChr[1:]:
                        newHeader += ",%i..%i" % (iRange.start, iRange.end)
                    outFileHandler.write(">%s\n" % newHeader)
                else:
                    chkName = line.split("_")[1].split(" ")[0]
                    chrName = dChunk2Map[chkName].seqname
                    coords = line[line.find("[")+1 : line.find("]")]
                    start = int(coords.split(",")[0])
                    end = int(coords.split(",")[1])
                    iRangeOnChk = Range(chkName, start, end)
                    iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map)
                    newHeader = line[1:-1].split("_")[0]
                    newHeader += " %s" % chrName
                    newHeader += " %s" % line[line.find("(") : line.find(")")+1]
                    newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd())
                    outFileHandler.write(">%s\n" % newHeader)
            else:
                outFileHandler.write(line)
            line = inFileHandler.readline()
        inFileHandler.close()
        outFileHandler.close()
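    # Note (added for clarity, inferred from the parsing above): headers containing
    # '{Fragment}' are expected to look like '>name chunkX {Fragment} start..end[,start..end,...]',
    # while the other branch expects a 'name_chunkX' first field, a parenthesized tag
    # and '[start,end]' coordinates.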


    ## Convert a fasta file to a length file
    #
    # @param inFile string name of the input fasta file
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaToLength(inFile, outFile = ""):
        if outFile == "":
            outFile = "%s.length" % inFile

        if inFile != "":
            with open(inFile, "r") as inFH:
                with open(outFile, "w") as outFH:
                    bioseq = Bioseq()
                    bioseq.read(inFH)
                    while bioseq.sequence != None:
                        seqLen = bioseq.getLength()
                        outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen))
                        bioseq.read(inFH)
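    # Usage sketch (illustrative; 'seqs.fa' is hypothetical): write 'seqs.fa.length'
    # with one 'header<TAB>length' line per sequence.
    #   FastaUtils.convertFastaToLength("seqs.fa")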


    ## Convert a fasta file to a seq file
    #
    # @param inFile string name of the input fasta file
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaToSeq(inFile, outFile = ""):
        if outFile == "":
            outFile = "%s.seq" % inFile

        if inFile != "":
            with open(inFile, "r") as inFH:
                with open(outFile, "w") as outFH:
                    bioseq = Bioseq()
                    bioseq.read(inFH)
                    while bioseq.sequence != None:
                        seqLen = bioseq.getLength()
                        outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \
                                                          bioseq.sequence, bioseq.header, seqLen))
                        bioseq.read(inFH)


    ## Splice an input fasta file using coordinates in a Map file
    #
    # @note the coordinates should be merged beforehand!
    #
    @staticmethod
    def spliceFromCoords( genomeFile, coordFile, obsFile ):
        genomeFileHandler = open( genomeFile, "r" )
        obsFileHandler = open( obsFile, "w" )
        dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile )

        bs = Bioseq()
        bs.read( genomeFileHandler )
        while bs.sequence:
            if dChr2Maps.has_key( bs.header ):
                lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] )
                splicedSeq = ""
                currentSite = 0
                for iMap in lCoords:
                    minSplice = iMap.getMin() - 1
                    if minSplice > currentSite:
                        splicedSeq += bs.sequence[ currentSite : minSplice ]
                    currentSite = iMap.getMax()
                splicedSeq += bs.sequence[ currentSite : ]
                bs.sequence = splicedSeq
            bs.write( obsFileHandler )
            bs.read( genomeFileHandler )

        genomeFileHandler.close()
        obsFileHandler.close()
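    # Usage sketch (illustrative, hypothetical file names): remove the regions
    # listed in 'regions.map' from 'genome.fa' and write the result to 'genome_obs.fa'.
    #   FastaUtils.spliceFromCoords("genome.fa", "regions.map", "genome_obs.fa")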


    ## Shuffle input sequences (single file or files in a directory)
    #
    @staticmethod
    def dbShuffle( inData, outData, verbose=0 ):
        if CheckerUtils.isExecutableInUserPath("esl-shuffle"):
            prg = "esl-shuffle"
        else:
            prg = "shuffle"
        genericCmd = prg + " -d INPUT > OUTPUT"
        if os.path.isfile( inData ):
            if verbose > 0:
                print "shuffle input file '%s'" % inData
            cmd = genericCmd.replace("INPUT",inData).replace("OUTPUT",outData)
            print cmd
            returnStatus = os.system( cmd )
            if returnStatus != 0:
                sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus )
                sys.exit(1)

        elif os.path.isdir( inData ):
            if verbose > 0:
                print "shuffle files in input directory '%s'" % inData
            if os.path.exists( outData ):
                shutil.rmtree( outData )
            os.mkdir( outData )
            lInputFiles = glob.glob( "%s/*.fa" %( inData ) )
            nbFastaFiles = 0
            for inputFile in lInputFiles:
                nbFastaFiles += 1
                if verbose > 1:
                    print "%3i / %3i" % ( nbFastaFiles, len(lInputFiles) )
                fastaBaseName = os.path.basename( inputFile )
                prefix, extension = os.path.splitext( fastaBaseName )
                cmd = genericCmd.replace("INPUT",inputFile).replace("OUTPUT","%s/%s_shuffle.fa"%(outData,prefix))
                returnStatus = os.system( cmd )
                if returnStatus != 0:
                    sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus )
                    sys.exit(1)


    ## Convert a cluster file (one line = one cluster = one list of headers) into a fasta file with cluster info in headers
    #
    # @param inClusterFileName string input cluster file name
    # @param inFastaFileName string input fasta file name
    # @param outFileName string output file name
    # @param clusteringTool string name of the clustering tool (used as a prefix in the new headers)
    # @param verbosity integer verbosity
    #
    @staticmethod
    def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0):
        dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity)
        iFastaParser = FastaParser(inFastaFileName)
        with open(outFileName, "w") as f:
            for iSequence in iFastaParser.getIterator():

                header = iSequence.getName()
                if dHeader2ClusterClusterMember.get(header):
                    cluster = dHeader2ClusterClusterMember[header][0]
                    member = dHeader2ClusterClusterMember[header][1]
                else:
                    clusterIdForSingletonCluster += 1
                    cluster = clusterIdForSingletonCluster
                    member = 1

                newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header)
                iSequence.setName(newHeader)
                f.write(iSequence.printFasta())

    @staticmethod
    def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0):
        dHeader2ClusterClusterMember = {}
        clusterId = 0
        with open(inClusterFileName) as f:
            line = f.readline()
            while line:
                lineWithoutLastChar = line.rstrip()
                lHeaders = lineWithoutLastChar.split("\t")
                clusterId += 1
                if verbosity > 0:
                    print "%i sequences in cluster %i" % (len(lHeaders), clusterId)
                memberId = 0
                for header in lHeaders:
                    memberId += 1
                    dHeader2ClusterClusterMember[header] = (clusterId, memberId)
                line = f.readline()
        if verbosity > 0:
            print "%i clusters" % clusterId
        return dHeader2ClusterClusterMember, clusterId

    @staticmethod
    def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""):
        """
        Write a map file from the fasta output of a clustering tool.
        Warning: only works if the input fasta headers are formatted like LTRharvest fasta output.
        """
        if not outMapFileName:
            outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0])

        fileDb = open(fastaFileNameFromClustering, "r")
        fileMap = open(outMapFileName, "w")
        seq = Bioseq()
        numseq = 0
        while 1:
            seq.read(fileDb)
            if seq.sequence == None:
                break
            numseq = numseq + 1
            # headers are assumed to look like 'ID_chunk ... [start,end]':
            # ID and chunk come from the first field, coordinates from the last one
            ID = seq.header.split(' ')[0].split('_')[0]
            chunk = seq.header.split(' ')[0].split('_')[1]
            start = seq.header.split(' ')[-1].split(',')[0][1:]
            end = seq.header.split(' ')[-1].split(',')[1][:-1]
            line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
            fileMap.write(line + "\n")

        fileDb.close()
        fileMap.close()
        print "saved in %s" % outMapFileName