Mercurial > repos > yufei-luo > s_mart
view commons/tools/MergeMatchsFiles.py @ 19:9bcfa7936eec
Deleted selected files
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:23:29 -0400 |
parents | 94ab73e8a190 |
children |
line wrap: on
line source
from commons.core.utils.FileUtils import FileUtils
from commons.core.coord.Align import Align
import shutil
import os
import sys


class MergeMatchsFiles(object):
    """
    Merge the result files produced by individual jobs into a single file.

    All files matching '*.<fileType>' are concatenated into a temporary file
    '<outFileBaseName>.<fileType>_tmp', which is then turned into the final
    file '<outFileBaseName>.<fileType>':
      - for the 'align' type, either by filtering redundant matches
        (all-by-all mode) or by a plain rename;
      - for any other type, by running the external program
        '$REPET_PATH/bin/<fileType>num2id' on it.
    """

    def __init__(self, fileType, outFileBaseName, allByAll=False, clean=True, verbose=0):
        """
        @param fileType: extension of the files to merge (e.g. 'align'); also
                         used to build the name of the '<fileType>num2id' program
        @param outFileBaseName: output file name without its extension
        @param allByAll: if True, 'align' results are filtered for redundant matches
        @param clean: if True, remove the input files and the temporary file
        @param verbose: verbosity level
        """
        self._fileType = fileType
        self._outFileBaseName = outFileBaseName
        self._allByAll = allByAll
        self._verbose = verbose
        self._clean = clean

    @staticmethod
    def _getChunkNumber(seqname):
        """
        Return, as an integer, what follows the first 'chunk' in a sequence name.

        Raises IndexError if 'chunk' is absent and ValueError if what follows
        it is not an integer.
        """
        return int(seqname.split("chunk")[1])

    def _filterRedundantMatches(self, inFile, outFile):
        """
        When a pairwise alignment is launched ~ all-by-all (ie one batch against all chunks),
        one filters the redundant matches. For instance we keep 'chunk3-1-100-chunk7-11-110-...'
        and we discard 'chunk7-11-110-chunk3-1-100-...'.
        Also we keep 'chunk5-1-100-chunk5-11-110-...' and we discard
        'chunk5-11-110-chunk5-1-100-...'.
        For this of course the results need to be sorted by query, on plus strand,
        and in ascending coordinates (always the case with Blaster).

        A match within the same chunk whose query and subject start at the same
        minimum coordinate is discarded as well (the comparison is strict).

        @param inFile: name of the file with the matches to filter (one per line)
        @param outFile: name of the file in which the kept matches are written

        Exits with status 1 if a sequence name does not contain 'chunk'.
        """
        iAlign = Align()
        countMatches = 0
        tick = 100000

        # Nested 'with' blocks guarantee both handlers are closed even when
        # we leave through sys.exit() or an exception.
        with open(inFile, "r") as inFileHandler:
            with open(outFile, "w") as outFileHandler:
                for line in inFileHandler:
                    countMatches += 1
                    iAlign.setFromString(line)

                    queryName = iAlign.range_query.seqname
                    subjectName = iAlign.range_subject.seqname
                    if "chunk" not in queryName or "chunk" not in subjectName:
                        print("ERROR: 'chunk' not in seqname")
                        sys.exit(1)

                    # Parse each chunk number only once per match.
                    queryChunk = self._getChunkNumber(queryName)
                    subjectChunk = self._getChunkNumber(subjectName)

                    if queryChunk < subjectChunk:
                        iAlign.write(outFileHandler)
                    elif queryChunk == subjectChunk:
                        if iAlign.range_query.getMin() < iAlign.range_subject.getMin():
                            iAlign.write(outFileHandler)

                    if countMatches % tick == 0:
                        # need to free buffer frequently as file can be big
                        outFileHandler.flush()
                        os.fsync(outFileHandler.fileno())

    def run(self):
        """
        Concatenate the job result files and build the final output file.

        Exits with status 1 if the external '<fileType>num2id' program
        returns a non-zero status.
        """
        if self._verbose > 1:
            print("concatenate the results of each job")
            sys.stdout.flush()

        tmpFileName = "%s.%s_tmp" % (self._outFileBaseName, self._fileType)
        outFileName = "%s.%s" % (self._outFileBaseName, self._fileType)
        pattern = "*.%s" % self._fileType

        # Start from a fresh temporary file.
        if os.path.exists(tmpFileName):
            os.remove(tmpFileName)

        # NOTE(review): an output file left by a previous run has a name that
        # also ends with '.<fileType>'; depending on how the FileUtils helpers
        # resolve the pattern it may be picked up here -- to be confirmed.
        FileUtils.catFilesByPattern(pattern, tmpFileName)
        if self._clean:
            FileUtils.removeFilesByPattern(pattern)

        if self._fileType == "align":
            if self._allByAll:
                self._filterRedundantMatches(tmpFileName, outFileName)
            else:
                shutil.move(tmpFileName, outFileName)
        else:
            prg = "%s/bin/%snum2id" % (os.environ["REPET_PATH"], self._fileType)
            cmd = prg
            cmd += " -i %s" % tmpFileName
            cmd += " -o %s" % outFileName
            cmd += " -v %i" % (self._verbose - 1)
            # NOTE(review): the command goes through the shell unquoted, so
            # file names containing spaces or shell metacharacters would break it.
            exitStatus = os.system(cmd)
            if exitStatus != 0:
                print("*** Error: %s returned %i" % (prg, exitStatus))
                sys.exit(1)

        # The temporary file is already gone when it was moved to the output.
        if self._clean and FileUtils.isRessourceExists(tmpFileName):
            os.remove(tmpFileName)