s_mart: commons/tools/MergeMatchsFiles.py comparison

comparison commons/tools/MergeMatchsFiles.py @ 18:94ab73e8a190

Uploaded

author	m-zytnicki
date	Mon, 29 Apr 2013 03:20:15 -0400
parents
children

comparison

equal deleted inserted replaced

-:b0e8584489e6
+:94ab73e8a190
+from commons.core.utils.FileUtils import FileUtils
+from commons.core.coord.Align import Align
+import shutil
+import os
+import sys
class MergeMatchsFiles(object):
    """
    Concatenate the per-job result files of a given type into a single file.

    The files selected by the pattern '*.<fileType>' are concatenated into
    '<outFileBaseName>.<fileType>_tmp', which is then turned into
    '<outFileBaseName>.<fileType>':
      - for 'align' files, either by filtering redundant matches (all-by-all
        mode) or by a plain move;
      - for any other file type, by running the external program
        '$REPET_PATH/bin/<fileType>num2id' on the temporary file.
    """

    def __init__(self, fileType, outFileBaseName, allByAll = False, clean = True, verbose = 0):
        """
        @param fileType: extension of the files to merge (e.g. 'align')
        @param outFileBaseName: output file name without its extension
        @param allByAll: if True, 'align' results are filtered for redundant matches
        @param clean: if True, remove the merged input files and the temporary file
        @param verbose: verbosity level (messages are printed when > 1)
        """
        self._fileType = fileType
        self._outFileBaseName = outFileBaseName
        self._allByAll = allByAll
        self._verbose = verbose
        self._clean = clean

    @staticmethod
    def _chunkNumber( seqName ):
        """
        Return, as an integer, the text that follows the first occurrence
        of 'chunk' in seqName (e.g. 'chunk7' -> 7).
        """
        return int( seqName.split("chunk")[1] )

    def _filterRedundantMatches( self, inFile, outFile ):
        """
        When a pairwise alignment is launched ~ all-by-all (ie one batch against all chunks),
        one filters the redundant matches. For instance we keep 'chunk3-1-100-chunk7-11-110-...'
        and we discard 'chunk7-11-110-chunk3-1-100-...'.
        Also we keep 'chunk5-1-100-chunk5-11-110-...' and we discard
        'chunk5-11-110-chunk5-1-100-...'.
        For this of course the results need to be sorted by query, on plus strand,
        and in ascending coordinates (always the case with Blaster).

        @param inFile: name of the input 'align' file
        @param outFile: name of the file in which the kept matches are written

        Prints an error and exits with status 1 if a query or subject name
        does not contain 'chunk'.
        """
        iAlign = Align()
        tick = 100000
        # 'with' guarantees both files are closed (and the output flushed)
        # even when leaving through sys.exit() or an exception
        with open( inFile, "r" ) as inFileHandler:
            with open( outFile, "w" ) as outFileHandler:
                for countMatches, line in enumerate( inFileHandler, 1 ):
                    iAlign.setFromString( line )
                    queryName = iAlign.range_query.seqname
                    subjectName = iAlign.range_subject.seqname
                    if "chunk" not in queryName or "chunk" not in subjectName:
                        print("ERROR: 'chunk' not in seqname")
                        sys.exit(1)
                    # parse each chunk number only once per match
                    queryChunk = self._chunkNumber( queryName )
                    subjectChunk = self._chunkNumber( subjectName )
                    if queryChunk < subjectChunk:
                        iAlign.write( outFileHandler )
                    elif queryChunk == subjectChunk:
                        # same chunk: keep only the match whose query starts first
                        if iAlign.range_query.getMin() < iAlign.range_subject.getMin():
                            iAlign.write( outFileHandler )
                    if countMatches % tick == 0:   # need to free buffer frequently as file can be big
                        outFileHandler.flush()
                        os.fsync( outFileHandler.fileno() )

    def run(self):
        """
        Merge the result files and produce '<outFileBaseName>.<fileType>'.

        Exits with status 1 if the external 'num2id' program returns a
        non-zero status. Requires the REPET_PATH environment variable when
        the file type is not 'align'.
        """
        if self._verbose > 1:
            print("concatenate the results of each job")
            sys.stdout.flush()

        tmpFileName = "%s.%s_tmp" % (self._outFileBaseName, self._fileType)
        outFileName = "%s.%s" % (self._outFileBaseName, self._fileType)
        # NOTE(review): an output file left by a previous run also ends with
        # '.<fileType>' and so presumably matches this pattern too -- confirm
        pattern = "*.%s" % self._fileType

        # start from a fresh temporary file
        if os.path.exists(tmpFileName):
            os.remove(tmpFileName)

        FileUtils.catFilesByPattern(pattern, tmpFileName)
        if self._clean:
            FileUtils.removeFilesByPattern(pattern)

        if self._fileType == "align":
            if self._allByAll:
                self._filterRedundantMatches(tmpFileName, outFileName)
            else:
                shutil.move(tmpFileName, outFileName)
        else:
            prg = "%s/bin/%snum2id" % (os.environ["REPET_PATH"], self._fileType)
            # NOTE(review): the command goes through the shell as one string,
            # so file names with whitespace or shell metacharacters would be
            # misinterpreted
            cmd = prg
            cmd += " -i %s" % tmpFileName
            cmd += " -o %s" % outFileName
            cmd += " -v %i" % (self._verbose - 1)
            log = os.system(cmd)
            if log != 0:
                print("*** Error: %s returned %i" % (prg, log))
                sys.exit(1)

        # the temporary file no longer exists after a move, hence the check
        if self._clean and FileUtils.isRessourceExists(tmpFileName):
            os.remove(tmpFileName)

Mercurial > repos > yufei-luo > s_mart

comparison commons/tools/MergeMatchsFiles.py @ 18:94ab73e8a190