annotate commons/tools/MergeMatchsFiles.py @ 19:9bcfa7936eec

Deleted selected files
author m-zytnicki
date Mon, 29 Apr 2013 03:23:29 -0400
parents 94ab73e8a190
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
1 from commons.core.utils.FileUtils import FileUtils
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
2 from commons.core.coord.Align import Align
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
3 import shutil
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
4 import os
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
5 import sys
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
6
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
class MergeMatchsFiles(object):
    """Merge the per-job result files of one type into a single output file.

    ``run()`` gathers every file matching ``*.<fileType>`` into the temporary
    file ``<outFileBaseName>.<fileType>_tmp`` and then produces
    ``<outFileBaseName>.<fileType>`` from it:

    * for ``fileType == "align"``, the temporary file is either filtered for
      redundant matches (``allByAll`` is true) or simply renamed;
    * for any other file type, the external program
      ``$REPET_PATH/bin/<fileType>num2id`` is run on it.
    """

    # Number of input lines between two forced flushes of the output file.
    _FLUSH_EVERY = 100000

    def __init__(self, fileType, outFileBaseName, allByAll = False, clean = True, verbose = 0):
        """Store the merge settings.

        @param fileType: extension of the files to merge ("align" gets a
            dedicated treatment in run())
        @param outFileBaseName: output file name without its extension
        @param allByAll: if true, redundant matches are filtered out of
            "align" files
        @param clean: if true, the merged input files and the temporary file
            are removed
        @param verbose: verbosity level; messages are printed above 1
        """
        self._fileType = fileType
        self._outFileBaseName = outFileBaseName
        self._allByAll = allByAll
        self._verbose = verbose
        self._clean = clean

    @staticmethod
    def _chunkIndex(seqname):
        """Return the integer following the first 'chunk' in a sequence name.

        For instance 'chunk12' gives 12. Raises ValueError when the text after
        'chunk' is not an integer, and IndexError when 'chunk' is absent.
        """
        return int(seqname.split("chunk")[1])

    def _filterRedundantMatches( self, inFile, outFile ):
        """
        When a pairwise alignment is launched ~ all-by-all (ie one batch against all chunks),
        one filters the redundant matches. For instance we keep 'chunk3-1-100-chunk7-11-110-...'
        and we discard 'chunk7-11-110-chunk3-1-100-...'.
        Also we keep 'chunk5-1-100-chunk5-11-110-...' and we discard
        'chunk5-11-110-chunk5-1-100-...'.
        For this of course the results need to be sorted by query, on plus strand,
        and in ascending coordinates (always the case with Blaster).

        @param inFile: name of the file holding all the matches, one per line
        @param outFile: name of the file receiving the matches that are kept

        Prints an error and exits with status 1 as soon as a query or subject
        name does not contain 'chunk'.
        """
        iAlign = Align()
        # 'with' guarantees both handlers are closed, even when a line cannot
        # be parsed or when sys.exit() is called below.
        with open( inFile, "r" ) as inFileHandler:
            with open( outFile, "w" ) as outFileHandler:
                for countMatches, line in enumerate( inFileHandler, 1 ):
                    iAlign.setFromString( line )
                    query = iAlign.range_query
                    subject = iAlign.range_subject
                    if "chunk" not in query.seqname \
                           or "chunk" not in subject.seqname:
                        print("ERROR: 'chunk' not in seqname")
                        sys.exit(1)

                    # Chunk numbers are compared as integers, not as text,
                    # so that 'chunk3' comes before 'chunk10'.
                    queryChunk = self._chunkIndex( query.seqname )
                    subjectChunk = self._chunkIndex( subject.seqname )
                    if queryChunk < subjectChunk:
                        iAlign.write( outFileHandler )
                    elif queryChunk == subjectChunk:
                        # Same chunk on both sides: keep the match whose query
                        # starts before its subject.
                        if query.getMin() < subject.getMin():
                            iAlign.write( outFileHandler )

                    # need to free buffer frequently as file can be big
                    if countMatches % self._FLUSH_EVERY == 0:
                        outFileHandler.flush()
                        os.fsync( outFileHandler.fileno() )

    def run(self):
        """Concatenate the job result files and build the final output file.

        Exits with status 1 when the external '<fileType>num2id' program
        returns a non-zero status. Raises KeyError if the REPET_PATH
        environment variable is needed but not set.
        """
        if self._verbose > 1:
            print("concatenate the results of each job")
            sys.stdout.flush()

        tmpFileName = "%s.%s_tmp" % (self._outFileBaseName, self._fileType)
        outFileName = "%s.%s" % (self._outFileBaseName, self._fileType)
        # NOTE(review): an already existing outFileName also matches this
        # pattern, so it would presumably be merged (and removed when cleaning)
        # like any job result -- confirm this is intended.
        pattern = "*.%s" % self._fileType

        # Start from a fresh temporary file.
        if os.path.exists(tmpFileName):
            os.remove(tmpFileName)

        FileUtils.catFilesByPattern(pattern, tmpFileName)
        if self._clean:
            FileUtils.removeFilesByPattern(pattern)

        if self._fileType == "align":
            if self._allByAll:
                self._filterRedundantMatches(tmpFileName, outFileName)
            else:
                shutil.move(tmpFileName, outFileName)
        else:
            prg = "%s/bin/%snum2id" % (os.environ["REPET_PATH"], self._fileType)
            # NOTE(review): file names are inserted unquoted into a shell
            # command line; names containing spaces or shell metacharacters
            # would break it.
            cmd = prg
            cmd += " -i %s" % tmpFileName
            cmd += " -o %s" % outFileName
            cmd += " -v %i" % (self._verbose - 1)
            log = os.system(cmd)
            if log != 0:
                print("*** Error: %s returned %i" % (prg, log))
                sys.exit(1)

        # The temporary file is already gone when it was renamed above.
        if self._clean and FileUtils.isRessourceExists(tmpFileName):
            os.remove(tmpFileName)