diff commons/tools/MergeMatchsFiles.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/tools/MergeMatchsFiles.py	Mon Apr 29 03:20:15 2013 -0400
@@ -0,0 +1,84 @@
+from commons.core.utils.FileUtils import FileUtils
+from commons.core.coord.Align import Align
+import shutil
+import os
+import sys
+
+class MergeMatchsFiles(object):
+
+    def __init__(self, fileType, outFileBaseName, allByAll = False, clean = True, verbose = 0):
+        self._fileType = fileType
+        self._outFileBaseName = outFileBaseName
+        self._allByAll = allByAll
+        self._verbose = verbose
+        self._clean = clean
+
+    def _filterRedundantMatches( self, inFile, outFile ):
+        """
+        When a pairwise alignment is launched ~ all-by-all (ie one batch against all chunks),
+        one filters the redundant matches. For instance we keep 'chunk3-1-100-chunk7-11-110-...'
+        and we discards 'chunk7-11-110-chunk3-1-100-...'.
+        Also we keep 'chunk5-1-100-chunk5-11-110-...' and we discards
+        'chunk5-11-110-chunk5-1-100-...'.
+        For this of course the results need to be sorted by query, on plus strand,
+        and in ascending coordinates (always the case with Blaster).
+        """
+        inFileHandler = open( inFile, "r" )
+        outFileHandler = open( outFile, "w" )
+        iAlign = Align()
+        countMatches = 0
+        tick = 100000
+        while True:
+            line = inFileHandler.readline()
+            if line == "":
+                break
+            countMatches += 1
+            iAlign.setFromString( line )
+            if "chunk" not in iAlign.range_query.seqname \
+                   or "chunk" not in iAlign.range_subject.seqname:
+                print "ERROR: 'chunk' not in seqname"
+                sys.exit(1)
+            if int(iAlign.range_query.seqname.split("chunk")[1]) < int(iAlign.range_subject.seqname.split("chunk")[1]):
+                iAlign.write( outFileHandler )
+            elif int(iAlign.range_query.seqname.split("chunk")[1]) == int(iAlign.range_subject.seqname.split("chunk")[1]):
+                if iAlign.range_query.getMin() < iAlign.range_subject.getMin():
+                    iAlign.write( outFileHandler )
+            if countMatches % tick == 0:   # need to free buffer frequently as file can be big
+                outFileHandler.flush()
+                os.fsync( outFileHandler.fileno() )
+        inFileHandler.close()
+        outFileHandler.close()
+
+    def run(self):
+        if self._verbose > 1:
+            print "concatenate the results of each job"
+            sys.stdout.flush()
+            
+        tmpFileName = "%s.%s_tmp" % (self._outFileBaseName, self._fileType)
+        outFileName = "%s.%s" % (self._outFileBaseName, self._fileType)
+        pattern = "*.%s" % self._fileType
+    
+        if os.path.exists(tmpFileName):
+            os.remove(tmpFileName)
+    
+        FileUtils.catFilesByPattern(pattern, tmpFileName)
+        if self._clean:
+            FileUtils.removeFilesByPattern(pattern)
+    
+        if self._fileType == "align":
+            if self._allByAll:
+                self._filterRedundantMatches(tmpFileName, outFileName)
+            else:
+                shutil.move(tmpFileName, outFileName)
+        else:
+            prg = "%s/bin/%snum2id" % (os.environ["REPET_PATH"], self._fileType)
+            cmd = prg
+            cmd += " -i %s" % tmpFileName
+            cmd += " -o %s" % outFileName
+            cmd += " -v %i" % (self._verbose - 1)
+            log = os.system(cmd)
+            if log != 0:
+                print "*** Error: %s returned %i" % (prg, log)
+                sys.exit(1)
+        if self._clean and FileUtils.isRessourceExists(tmpFileName):
+            os.remove(tmpFileName)
\ No newline at end of file