Mercurial > repos > yufei-luo > s_mart
comparison commons/tools/MergeMatchsFiles.py @ 18:94ab73e8a190
Uploaded
| author | m-zytnicki |
|---|---|
| date | Mon, 29 Apr 2013 03:20:15 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 17:b0e8584489e6 | 18:94ab73e8a190 |
|---|---|
| 1 from commons.core.utils.FileUtils import FileUtils | |
| 2 from commons.core.coord.Align import Align | |
| 3 import shutil | |
| 4 import os | |
| 5 import sys | |
| 6 | |
class MergeMatchsFiles(object):
    """
    Merge the per-job result files of one type found in the current directory
    into a single '<outFileBaseName>.<fileType>' file.

    For the "align" type the merged file is either moved as is or, in
    all-by-all mode, filtered to drop redundant matches. For any other type
    the merged file is passed to the external '<fileType>num2id' program.
    """

    def __init__(self, fileType, outFileBaseName, allByAll = False, clean = True, verbose = 0):
        """
        @param fileType: extension of the files to merge (e.g. "align"); also
                         selects the post-processing applied in run()
        @param outFileBaseName: output file name without its extension
        @param allByAll: if True, "align" results are filtered for redundancy
        @param clean: if True, input files and the temporary file are removed
        @param verbose: verbosity level (messages are printed when > 1)
        """
        self._fileType = fileType
        self._outFileBaseName = outFileBaseName
        self._allByAll = allByAll
        self._verbose = verbose
        self._clean = clean

    def _filterRedundantMatches(self, inFile, outFile):
        """
        When a pairwise alignment is launched ~ all-by-all (ie one batch against all chunks),
        one filters the redundant matches. For instance we keep 'chunk3-1-100-chunk7-11-110-...'
        and we discard 'chunk7-11-110-chunk3-1-100-...'.
        Also we keep 'chunk5-1-100-chunk5-11-110-...' and we discard
        'chunk5-11-110-chunk5-1-100-...'.
        For this of course the results need to be sorted by query, on plus strand,
        and in ascending coordinates (always the case with Blaster).

        @param inFile: path of the file holding the concatenated matches
        @param outFile: path of the file receiving the non-redundant matches

        Exits with status 1 if a query or subject name does not contain 'chunk'.
        """
        iAlign = Align()
        tick = 100000
        # 'with' guarantees that both handles are closed (and the output
        # flushed) even when leaving through sys.exit() or an exception.
        with open(inFile, "r") as inFileHandler:
            with open(outFile, "w") as outFileHandler:
                for countMatches, line in enumerate(inFileHandler, 1):
                    iAlign.setFromString(line)
                    queryName = iAlign.range_query.seqname
                    subjectName = iAlign.range_subject.seqname
                    if "chunk" not in queryName or "chunk" not in subjectName:
                        print("ERROR: 'chunk' not in seqname")
                        sys.exit(1)
                    # The chunk index is whatever follows the first 'chunk'.
                    queryChunk = int(queryName.split("chunk")[1])
                    subjectChunk = int(subjectName.split("chunk")[1])
                    if queryChunk < subjectChunk:
                        iAlign.write(outFileHandler)
                    elif queryChunk == subjectChunk:
                        # Same chunk: keep only the match whose query starts first.
                        if iAlign.range_query.getMin() < iAlign.range_subject.getMin():
                            iAlign.write(outFileHandler)
                    if countMatches % tick == 0:   # need to free buffer frequently as file can be big
                        outFileHandler.flush()
                        os.fsync(outFileHandler.fileno())

    def run(self):
        """
        Concatenate every '*.<fileType>' file into a temporary file, then
        produce '<outFileBaseName>.<fileType>' from it.

        Requires the REPET_PATH environment variable when fileType is not
        "align". Exits with status 1 if the external program returns non-zero.
        """
        if self._verbose > 1:
            print("concatenate the results of each job")
            sys.stdout.flush()

        tmpFileName = "%s.%s_tmp" % (self._outFileBaseName, self._fileType)
        outFileName = "%s.%s" % (self._outFileBaseName, self._fileType)
        # NOTE(review): outFileName itself matches this pattern, so an output
        # left over from a previous run would presumably be picked up too -
        # confirm against FileUtils.catFilesByPattern.
        pattern = "*.%s" % self._fileType

        # Start from a fresh temporary file.
        if os.path.exists(tmpFileName):
            os.remove(tmpFileName)

        FileUtils.catFilesByPattern(pattern, tmpFileName)
        if self._clean:
            FileUtils.removeFilesByPattern(pattern)

        if self._fileType == "align":
            if self._allByAll:
                self._filterRedundantMatches(tmpFileName, outFileName)
            else:
                shutil.move(tmpFileName, outFileName)
        else:
            prg = "%s/bin/%snum2id" % (os.environ["REPET_PATH"], self._fileType)
            cmd = prg
            cmd += " -i %s" % tmpFileName
            cmd += " -o %s" % outFileName
            cmd += " -v %i" % (self._verbose - 1)
            # NOTE(review): the command goes through the shell unquoted, so
            # file names containing spaces or shell metacharacters will break.
            log = os.system(cmd)
            if log != 0:
                print("*** Error: %s returned %i" % (prg, log))
                sys.exit(1)
        # The temporary file no longer exists if it was moved above.
        if self._clean and FileUtils.isRessourceExists(tmpFileName):
            os.remove(tmpFileName)
