Mercurial > repos > yufei-luo > s_mart
diff commons/tools/MergeMatchsFiles.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line diff
from commons.core.utils.FileUtils import FileUtils
from commons.core.coord.Align import Align
import shutil
import os
import sys


class MergeMatchsFiles(object):
    """
    Concatenate every file matching '*.<fileType>' into the single file
    '<outFileBaseName>.<fileType>'.

    For the "align" file type the concatenated file is either moved as-is to
    its final name or, in all-by-all mode, filtered to drop redundant matches.
    For any other file type the concatenated file is post-processed by the
    external program '$REPET_PATH/bin/<fileType>num2id'.
    """

    def __init__(self, fileType, outFileBaseName, allByAll = False, clean = True, verbose = 0):
        """
        @param fileType: extension of the files to merge (e.g. "align")
        @param outFileBaseName: output file name, without the extension
        @param allByAll: if True and fileType is "align", filter redundant matches
        @param clean: if True, remove the input files and the temporary file
        @param verbose: verbosity level (messages are printed when > 1)
        """
        self._fileType = fileType
        self._outFileBaseName = outFileBaseName
        self._allByAll = allByAll
        self._verbose = verbose
        self._clean = clean

    def _filterRedundantMatches(self, inFile, outFile):
        """
        When a pairwise alignment is launched ~ all-by-all (ie one batch against all chunks),
        one filters the redundant matches. For instance we keep 'chunk3-1-100-chunk7-11-110-...'
        and we discard 'chunk7-11-110-chunk3-1-100-...'.
        Also we keep 'chunk5-1-100-chunk5-11-110-...' and we discard
        'chunk5-11-110-chunk5-1-100-...'.
        For this of course the results need to be sorted by query, on plus strand,
        and in ascending coordinates (always the case with Blaster).

        The process exits with status 1 if a query or subject name does not
        contain 'chunk'.

        @param inFile: name of the input file, one match per line
        @param outFile: name of the output file receiving the kept matches
        """
        tick = 100000
        iAlign = Align()
        # 'with' guarantees both handlers are closed, even when an exception
        # (including the SystemExit raised by sys.exit) leaves the loop early.
        with open(inFile, "r") as inFileHandler:
            with open(outFile, "w") as outFileHandler:
                countMatches = 0
                for line in inFileHandler:
                    countMatches += 1
                    iAlign.setFromString(line)
                    queryName = iAlign.range_query.seqname
                    subjectName = iAlign.range_subject.seqname
                    if "chunk" not in queryName or "chunk" not in subjectName:
                        print("ERROR: 'chunk' not in seqname")
                        sys.exit(1)
                    # parse the chunk numbers once per line
                    queryChunk = int(queryName.split("chunk")[1])
                    subjectChunk = int(subjectName.split("chunk")[1])
                    if queryChunk < subjectChunk:
                        iAlign.write(outFileHandler)
                    elif queryChunk == subjectChunk:
                        # same chunk: keep only the match whose query starts first
                        if iAlign.range_query.getMin() < iAlign.range_subject.getMin():
                            iAlign.write(outFileHandler)
                    if countMatches % tick == 0:  # need to free buffer frequently as file can be big
                        outFileHandler.flush()
                        os.fsync(outFileHandler.fileno())

    def run(self):
        """
        Concatenate the result files, post-process them according to the file
        type, and optionally clean up.

        The process exits with status 1 if the external 'num2id' program
        returns a non-zero status. The environment variable REPET_PATH must be
        set when the file type is not "align".
        """
        if self._verbose > 1:
            print("concatenate the results of each job")
            sys.stdout.flush()

        tmpFileName = "%s.%s_tmp" % (self._outFileBaseName, self._fileType)
        outFileName = "%s.%s" % (self._outFileBaseName, self._fileType)
        pattern = "*.%s" % self._fileType

        # start from a fresh temporary file
        if os.path.exists(tmpFileName):
            os.remove(tmpFileName)

        # NOTE(review): an output file left by a previous run would also match
        # the pattern if it is in the directory searched -- confirm with callers.
        FileUtils.catFilesByPattern(pattern, tmpFileName)
        if self._clean:
            FileUtils.removeFilesByPattern(pattern)

        if self._fileType == "align":
            if self._allByAll:
                self._filterRedundantMatches(tmpFileName, outFileName)
            else:
                shutil.move(tmpFileName, outFileName)
        else:
            prg = "%s/bin/%snum2id" % (os.environ["REPET_PATH"], self._fileType)
            # NOTE(review): the command goes through the shell unquoted, so file
            # names containing spaces or shell metacharacters would break it.
            cmd = prg
            cmd += " -i %s" % tmpFileName
            cmd += " -o %s" % outFileName
            cmd += " -v %i" % (self._verbose - 1)
            log = os.system(cmd)
            if log != 0:
                print("*** Error: %s returned %i" % (prg, log))
                sys.exit(1)

        # the temporary file no longer exists if it was moved to its final name
        if self._clean and FileUtils.isRessourceExists(tmpFileName):
            os.remove(tmpFileName)