18
|
1 from commons.core.utils.FileUtils import FileUtils
|
|
2 from commons.core.coord.Align import Align
|
|
3 import shutil
|
|
4 import os
|
|
5 import sys
|
|
6
|
|
class MergeMatchsFiles(object):
    """
    Merge the per-job result files of a given type into a single output file.

    All files matching '*.<fileType>' are concatenated into a temporary file
    '<outFileBaseName>.<fileType>_tmp', which is then turned into the final
    file '<outFileBaseName>.<fileType>':
      - 'align' files are either filtered for redundant matches (all-by-all
        mode) or simply renamed;
      - any other type is converted by the external program
        '$REPET_PATH/bin/<fileType>num2id'.
    """

    # Number of input lines read between two forced flushes of the filtered
    # output file (see _filterRedundantMatches).
    _FLUSH_EVERY = 100000

    def __init__(self, fileType, outFileBaseName, allByAll = False, clean = True, verbose = 0):
        """
        @param fileType: extension of the files to merge (e.g. 'align')
        @param outFileBaseName: name of the output file, without its extension
        @param allByAll: if True, 'align' results are filtered for redundant matches
        @param clean: if True, the merged input files and the temporary file are removed
        @param verbose: verbosity level
        """
        self._fileType = fileType
        self._outFileBaseName = outFileBaseName
        self._allByAll = allByAll
        self._verbose = verbose
        self._clean = clean

    @staticmethod
    def _chunkNumber( seqName ):
        """
        Return, as an integer, what follows the first occurrence of 'chunk'
        in a sequence name (e.g. 'chunk12' -> 12).

        Raises IndexError if 'chunk' is absent and ValueError if the text
        between the first and the second 'chunk' (or the end) is not an integer.
        """
        return int( seqName.split("chunk")[1] )

    @staticmethod
    def _isKept( queryName, queryMin, subjectName, subjectMin ):
        """
        Tell whether a match has to be kept when filtering redundant matches.

        A match between two different chunks is kept when the query chunk
        number is the lowest. A match within a single chunk is kept when the
        query starts strictly before the subject.
        """
        queryChunk = MergeMatchsFiles._chunkNumber( queryName )
        subjectChunk = MergeMatchsFiles._chunkNumber( subjectName )
        if queryChunk != subjectChunk:
            return queryChunk < subjectChunk
        return queryMin < subjectMin

    def _filterRedundantMatches( self, inFile, outFile ):
        """
        When a pairwise alignment is launched ~ all-by-all (i.e. one batch against all chunks),
        the redundant matches are filtered out. For instance 'chunk3-1-100-chunk7-11-110-...'
        is kept and 'chunk7-11-110-chunk3-1-100-...' is discarded.
        Also 'chunk5-1-100-chunk5-11-110-...' is kept and
        'chunk5-11-110-chunk5-1-100-...' is discarded.
        For this of course the results need to be sorted by query, on plus strand,
        and in ascending coordinates (always the case with Blaster).

        @param inFile: name of the input file, one match per line
        @param outFile: name of the output file, overwritten if it exists

        The program exits with status 1 if a query or subject name does not
        contain 'chunk'.
        """
        iAlign = Align()
        countMatches = 0
        # 'with' guarantees that both files are closed, even when the parsing
        # of a line fails or when sys.exit() is called below
        with open( inFile, "r" ) as inFileHandler:
            with open( outFile, "w" ) as outFileHandler:
                for line in inFileHandler:
                    countMatches += 1
                    iAlign.setFromString( line )
                    queryName = iAlign.range_query.seqname
                    subjectName = iAlign.range_subject.seqname
                    if "chunk" not in queryName or "chunk" not in subjectName:
                        print( "ERROR: 'chunk' not in seqname" )
                        sys.exit(1)
                    if self._isKept( queryName, iAlign.range_query.getMin(),
                                     subjectName, iAlign.range_subject.getMin() ):
                        iAlign.write( outFileHandler )
                    if countMatches % self._FLUSH_EVERY == 0: # need to free buffer frequently as file can be big
                        outFileHandler.flush()
                        os.fsync( outFileHandler.fileno() )

    def run(self):
        """
        Concatenate the files matching '*.<fileType>' and build the final file.

        The program exits with status 1 if the external conversion program
        returns a non-zero status. A KeyError is raised if the environment
        variable REPET_PATH is not set and the file type is not 'align'.
        """
        if self._verbose > 1:
            print( "concatenate the results of each job" )
            sys.stdout.flush()

        tmpFileName = "%s.%s_tmp" % (self._outFileBaseName, self._fileType)
        outFileName = "%s.%s" % (self._outFileBaseName, self._fileType)
        pattern = "*.%s" % self._fileType

        # start from a fresh temporary file
        if os.path.exists(tmpFileName):
            os.remove(tmpFileName)

        # NOTE(review): an output file left by a previous run also matches the
        # pattern; presumably it is then merged (and removed) with the others
        # -- confirm against FileUtils.catFilesByPattern
        FileUtils.catFilesByPattern(pattern, tmpFileName)
        if self._clean:
            FileUtils.removeFilesByPattern(pattern)

        if self._fileType == "align":
            if self._allByAll:
                self._filterRedundantMatches(tmpFileName, outFileName)
            else:
                shutil.move(tmpFileName, outFileName)
        else:
            prg = "%s/bin/%snum2id" % (os.environ["REPET_PATH"], self._fileType)
            # NOTE(review): the file names are given unquoted to the shell
            cmd = prg
            cmd += " -i %s" % tmpFileName
            cmd += " -o %s" % outFileName
            cmd += " -v %i" % (self._verbose - 1)
            log = os.system(cmd)
            if log != 0:
                print( "*** Error: %s returned %i" % (prg, log) )
                sys.exit(1)

        # the temporary file no longer exists when it has been moved above,
        # hence the check before removing it
        if self._clean and FileUtils.isRessourceExists(tmpFileName):
            os.remove(tmpFileName)