comparison commons/tools/MergeMatchsFiles.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
comparison
equal deleted inserted replaced
17:b0e8584489e6 18:94ab73e8a190
1 from commons.core.utils.FileUtils import FileUtils
2 from commons.core.coord.Align import Align
3 import shutil
4 import os
5 import sys
6
class MergeMatchsFiles(object):
    """
    Concatenate the per-job result files of a given type found in the current
    directory into a single output file, then post-process that file:
    'align' files are optionally filtered for redundant matches, every other
    file type is passed through the external '<fileType>num2id' program.
    """

    def __init__(self, fileType, outFileBaseName, allByAll = False, clean = True, verbose = 0):
        """
        @param fileType: extension of the files to merge (e.g. 'align'); also
                         selects the post-processing applied in run()
        @param outFileBaseName: output file name without its extension
        @param allByAll: if True, 'align' results are filtered for redundant matches
        @param clean: if True, the merged input files and the temporary file are removed
        @param verbose: verbosity level (messages are printed when > 1)
        """
        self._fileType = fileType
        self._outFileBaseName = outFileBaseName
        self._allByAll = allByAll
        self._verbose = verbose
        self._clean = clean

    @staticmethod
    def _getChunkNumber( seqName ):
        """
        Return, as an integer, what follows the first occurrence of 'chunk'
        in a sequence name (up to the next 'chunk', if any).
        Raises IndexError if 'chunk' is absent and ValueError if the suffix
        is not an integer.
        """
        return int( seqName.split("chunk")[1] )

    def _filterRedundantMatches( self, inFile, outFile ):
        """
        When a pairwise alignment is launched ~ all-by-all (ie one batch against all chunks),
        one filters the redundant matches. For instance we keep 'chunk3-1-100-chunk7-11-110-...'
        and we discard 'chunk7-11-110-chunk3-1-100-...'.
        Also we keep 'chunk5-1-100-chunk5-11-110-...' and we discard
        'chunk5-11-110-chunk5-1-100-...'.
        For this of course the results need to be sorted by query, on plus strand,
        and in ascending coordinates (always the case with Blaster).

        Exits with status 1 if a query or subject name does not contain 'chunk'.
        """
        iAlign = Align()
        countMatches = 0
        tick = 100000
        # 'with' guarantees both files are closed, including on the
        # sys.exit() error path and on any parsing exception.
        with open( inFile, "r" ) as inFileHandler:
            with open( outFile, "w" ) as outFileHandler:
                for line in inFileHandler:
                    countMatches += 1
                    iAlign.setFromString( line )
                    queryName = iAlign.range_query.seqname
                    subjectName = iAlign.range_subject.seqname
                    if "chunk" not in queryName or "chunk" not in subjectName:
                        # single-argument parenthesised print behaves the same
                        # on Python 2 and Python 3
                        print("ERROR: 'chunk' not in seqname")
                        sys.exit(1)
                    # parse each chunk number only once per match
                    queryChunk = self._getChunkNumber( queryName )
                    subjectChunk = self._getChunkNumber( subjectName )
                    if queryChunk < subjectChunk:
                        iAlign.write( outFileHandler )
                    elif queryChunk == subjectChunk:
                        # same chunk: keep only the match whose query starts first
                        if iAlign.range_query.getMin() < iAlign.range_subject.getMin():
                            iAlign.write( outFileHandler )
                    if countMatches % tick == 0:   # need to free buffer frequently as file can be big
                        outFileHandler.flush()
                        os.fsync( outFileHandler.fileno() )

    def run(self):
        """
        Concatenate all files matching '*.<fileType>' into a temporary file and
        produce '<outFileBaseName>.<fileType>' from it.

        For 'align' files the temporary file is either filtered (all-by-all
        mode) or simply renamed. For any other type, the program
        '$REPET_PATH/bin/<fileType>num2id' is run on it; the process exits with
        status 1 if that program reports a non-zero status.
        Raises KeyError if REPET_PATH is not set and the program is needed.
        """
        if self._verbose > 1:
            print("concatenate the results of each job")
            sys.stdout.flush()

        tmpFileName = "%s.%s_tmp" % (self._outFileBaseName, self._fileType)
        outFileName = "%s.%s" % (self._outFileBaseName, self._fileType)
        pattern = "*.%s" % self._fileType

        # start from a fresh temporary file
        if os.path.exists(tmpFileName):
            os.remove(tmpFileName)

        # NOTE(review): the pattern also matches outFileName if it is left over
        # from a previous run -- confirm callers start from a clean directory.
        FileUtils.catFilesByPattern(pattern, tmpFileName)
        if self._clean:
            FileUtils.removeFilesByPattern(pattern)

        if self._fileType == "align":
            if self._allByAll:
                self._filterRedundantMatches(tmpFileName, outFileName)
            else:
                shutil.move(tmpFileName, outFileName)
        else:
            prg = "%s/bin/%snum2id" % (os.environ["REPET_PATH"], self._fileType)
            cmd = prg
            cmd += " -i %s" % tmpFileName
            cmd += " -o %s" % outFileName
            cmd += " -v %i" % (self._verbose - 1)
            # 'log' is the raw status returned by os.system, not a bare exit code
            log = os.system(cmd)
            if log != 0:
                print("*** Error: %s returned %i" % (prg, log))
                sys.exit(1)

        # the temporary file no longer exists if it was moved above
        if self._clean and FileUtils.isRessourceExists(tmpFileName):
            os.remove(tmpFileName)