Mercurial > repos > yufei-luo > s_mart
comparison commons/tools/MergeMatchsFiles.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
17:b0e8584489e6 | 18:94ab73e8a190 |
---|---|
1 from commons.core.utils.FileUtils import FileUtils | |
2 from commons.core.coord.Align import Align | |
3 import shutil | |
4 import os | |
5 import sys | |
6 | |
class MergeMatchsFiles(object):
    """
    Merge the per-job result files of one type into a single output file.

    All files matching '*.<fileType>' in the current directory are
    concatenated into '<outFileBaseName>.<fileType>_tmp', which is then
    post-processed into '<outFileBaseName>.<fileType>' (see run()).
    """

    def __init__(self, fileType, outFileBaseName, allByAll = False, clean = True, verbose = 0):
        """
        @param fileType: extension of the files to merge (e.g. "align")
        @param outFileBaseName: output file name, without its extension
        @param allByAll: if True, 'align' results are filtered for redundant matches
        @param clean: if True, remove the input files and the temporary file
        @param verbose: verbosity level
        """
        self._fileType = fileType
        self._outFileBaseName = outFileBaseName
        self._allByAll = allByAll
        self._verbose = verbose
        self._clean = clean

    @staticmethod
    def _getChunkNumber(seqName):
        """
        Return, as an integer, the text that follows the first occurrence
        of 'chunk' in the given sequence name (e.g. 'chunk7' -> 7).

        Raises IndexError if 'chunk' is absent, and ValueError if what
        follows it is not an integer.
        """
        return int(seqName.split("chunk")[1])

    def _filterRedundantMatches( self, inFile, outFile ):
        """
        When a pairwise alignment is launched ~ all-by-all (ie one batch against all chunks),
        one filters the redundant matches. For instance we keep 'chunk3-1-100-chunk7-11-110-...'
        and we discard 'chunk7-11-110-chunk3-1-100-...'.
        Also we keep 'chunk5-1-100-chunk5-11-110-...' and we discard
        'chunk5-11-110-chunk5-1-100-...'.
        For this of course the results need to be sorted by query, on plus strand,
        and in ascending coordinates (always the case with Blaster).

        The process exits with status 1 as soon as a query or subject name
        does not contain 'chunk'.

        @param inFile: name of the file to read the matches from
        @param outFile: name of the file to write the kept matches to
        """
        iAlign = Align()
        countMatches = 0
        tick = 100000
        # The 'with' blocks guarantee both handles are closed, including when
        # sys.exit() or a parsing error interrupts the loop.
        with open( inFile, "r" ) as inFileHandler:
            with open( outFile, "w" ) as outFileHandler:
                for line in inFileHandler:
                    countMatches += 1
                    iAlign.setFromString( line )
                    queryName = iAlign.range_query.seqname
                    subjectName = iAlign.range_subject.seqname
                    if "chunk" not in queryName or "chunk" not in subjectName:
                        print("ERROR: 'chunk' not in seqname")
                        sys.exit(1)
                    queryChunk = self._getChunkNumber( queryName )
                    subjectChunk = self._getChunkNumber( subjectName )
                    if queryChunk < subjectChunk:
                        iAlign.write( outFileHandler )
                    elif queryChunk == subjectChunk:
                        # same chunk on both sides: keep only the match whose
                        # query starts before its subject
                        if iAlign.range_query.getMin() < iAlign.range_subject.getMin():
                            iAlign.write( outFileHandler )
                    if countMatches % tick == 0:   # need to free buffer frequently as file can be big
                        outFileHandler.flush()
                        os.fsync( outFileHandler.fileno() )

    def run(self):
        """
        Concatenate every '*.<fileType>' file and build the final output file.

        - 'align' files: the concatenation is either filtered for redundant
          matches (all-by-all mode) or simply renamed to the output file.
        - other types: the external program
          '$REPET_PATH/bin/<fileType>num2id' converts the concatenation into
          the output file; the process exits with status 1 if it fails.

        Raises KeyError if a non-'align' type is requested while the
        REPET_PATH environment variable is not set.
        """
        if self._verbose > 1:
            print("concatenate the results of each job")
            sys.stdout.flush()

        tmpFileName = "%s.%s_tmp" % (self._outFileBaseName, self._fileType)
        outFileName = "%s.%s" % (self._outFileBaseName, self._fileType)
        pattern = "*.%s" % self._fileType

        # start from a fresh temporary file
        if os.path.exists(tmpFileName):
            os.remove(tmpFileName)

        # NOTE(review): an output file left over from a previous run also
        # matches the pattern, so it would presumably be merged (and removed)
        # as well -- confirm against FileUtils.
        FileUtils.catFilesByPattern(pattern, tmpFileName)
        if self._clean:
            FileUtils.removeFilesByPattern(pattern)

        if self._fileType == "align":
            if self._allByAll:
                self._filterRedundantMatches(tmpFileName, outFileName)
            else:
                shutil.move(tmpFileName, outFileName)
        else:
            prg = "%s/bin/%snum2id" % (os.environ["REPET_PATH"], self._fileType)
            # NOTE: the file names are interpolated unquoted into a shell
            # command, so they must not contain spaces or shell metacharacters.
            cmd = "%s -i %s -o %s -v %i" % (prg, tmpFileName, outFileName, self._verbose - 1)
            log = os.system(cmd)
            if log != 0:
                print("*** Error: %s returned %i" % (prg, log))
                sys.exit(1)

        # the temporary file no longer exists if it was moved to the output
        if self._clean and FileUtils.isRessourceExists(tmpFileName):
            os.remove(tmpFileName)