diff commons/launcher/LaunchMCL.py @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/launcher/LaunchMCL.py	Tue Apr 30 14:33:21 2013 -0400
@@ -0,0 +1,239 @@
+#!/usr/bin/env python
+
+# Copyright INRA (Institut National de la Recherche Agronomique)
+# http://www.inra.fr
+# http://urgi.versailles.inra.fr
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software.  You can  use, 
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info". 
+#
+# As a counterpart to the access to the source code and  rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty  and the software's author,  the holder of the
+# economic rights,  and the successive licensors  have only  limited
+# liability. 
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading,  using,  modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean  that it is complicated to manipulate,  and  that  also
+# therefore means  that it is reserved for developers  and  experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or 
+# data to be ensured and,  more generally, to use and operate it in the 
+# same conditions as regards security. 
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+
+from commons.core.LoggerFactory import LoggerFactory
+from commons.core.utils.RepetOptionParser import RepetOptionParser
+from commons.core.seq.FastaUtils import FastaUtils
+from commons.core.coord.MatchUtils import MatchUtils
+import subprocess
+import os
+import time
+import shutil
+from commons.tools.ChangeSequenceHeaders import ChangeSequenceHeaders
+
+LOG_DEPTH = "repet.base"
+
+##Launch MCL
+#
+class LaunchMCL(object):
+    
+    def __init__(self, fastaFileName = "", outFilePrefix = "", inflate = 1.5, covThres = 0.0, isJoined = False, isCluster2Map = False, isClusterConsensusHeaders = False, doClean = False, verbosity = 0):
+        self._fastaFileName = fastaFileName
+        self.setOutFilePrefix(outFilePrefix)
+        self._inflate = inflate
+        self._coverageThreshold = covThres
+        self._isJoined = isJoined
+        self._isCluster2Map = isCluster2Map
+        self._isClusterConsensusHeaders = isClusterConsensusHeaders
+        self._doClean = doClean
+        self._verbosity = verbosity
+        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
+        
+    def setAttributesFromCmdLine(self):
+        description = "Launch MCL clustering program."
+        epilog = "\nExample: launch without verbosity and keep temporary files.\n"
+        epilog += "\t$ python LaunchMCL.py -i file.fa -v 0\n"
+        parser = RepetOptionParser(description = description, epilog = epilog)
+        parser.add_option("-i", "--fasta",      dest = "fastaFileName", action = "store",       type = "string", help = "input fasta file name [compulsory] [format: fasta]", default = "")
+        parser.add_option("-o", "--out",        dest = "outFilePrefix", action = "store",       type = "string", help = "prefix of the output files [default=input fasta file name]",    default = "")
+        parser.add_option("-I", "--inflate",    dest = "inflate",       action = "store",       type = "float",  help = "inflate parameter of MCL [optional] [default: 1.5]", default = 1.5)
+        parser.add_option("-T", "--coverage",   dest = "coverageThreshold", action = "store",   type = "float",  help = "length coverage threshold (default=0.0, 0.0 <= value <= 1.0)",              default = 0.0)
+        parser.add_option("-j", "--join",       dest = "isJoined",          action = "store_true",               help = "join hits after alignement [optional] [default: False]" , default = False)
+        parser.add_option("-m", "--map",        dest = "isCluster2Map", action = "store_true",                   help = "generate an additional output file in map format (Warning: only works if MCL's fasta input headers are formated like LTRharvest fasta output)",   default = False)
+        parser.add_option("", "--isClusterConsensusHeaders", dest = "isClusterConsensusHeaders", action="store_true", help = "format headers for Cluster Consensus tool", default = False)
+        parser.add_option("-c", "--clean",      dest = "doClean",       action = "store_true",                   help = "clean temporary files [optional] [default: False]", default = False)
+        parser.add_option("-v", "--verbosity",  dest = "verbosity",     action = "store",       type = "int",    help = "verbosity [optional] [default: 1]", default = 1)
+        options = parser.parse_args()[0]
+        self._setAttributesFromOptions(options)
+        
+    def _setAttributesFromOptions(self, options):
+        self.setFastaFileName(options.fastaFileName)
+        self.setOutFilePrefix(options.outFilePrefix)
+        self.setInflate(options.inflate)
+        self.setCoverageThreshold(options.coverageThreshold)
+        self.setIsJoined(options.isJoined)
+        self.setIsCluster2Map(options.isCluster2Map)
+        self.setIsClusterConsensusHeaders(options.isClusterConsensusHeaders)
+        self.setDoClean(options.doClean)
+        self.setVerbosity(options.verbosity)
+
+    def setFastaFileName(self, fastaFileName):
+        self._fastaFileName = fastaFileName
+        
+    def setOutFilePrefix(self, outFilePrefix):
+        if outFilePrefix == "":
+            self._outFilePrefix = os.path.splitext(self._fastaFileName)[0]
+        else:
+            self._outFilePrefix = outFilePrefix
+            
+    def setInflate(self, inflate):
+        self._inflate = inflate
+        
+    def setCoverageThreshold(self, covThres):
+        self._coverageThreshold = float(covThres)
+    
+    def setIsJoined(self, isJoined):
+        self._isJoined = isJoined       
+        
+    def setDoClean(self, doClean):
+        self._doClean = doClean
+        
+    def setIsCluster2Map(self, isCluster2Map):
+        self._isCluster2Map = isCluster2Map
+        
+    def setIsClusterConsensusHeaders(self, isClusterConsensusHeaders):
+        self._isClusterConsensusHeaders = isClusterConsensusHeaders
+        
+    def setVerbosity(self, verbosity):
+        self._verbosity = verbosity
+        
+    def _checkOptions(self):
+        if self._fastaFileName == "":
+            self._logAndRaise("ERROR: Missing input fasta file name")
+        if self._isCluster2Map and self._isClusterConsensusHeaders:
+            self._logAndRaise("ERROR: You can't use both '--isClusterConsensusHeaders' and '-m' options")
+        if self._coverageThreshold > 1 or self._coverageThreshold < 0:
+            self._logAndRaise("ERROR: Coverage Threshold must be in [0.0 , 1.0]")
+            
+    def _logAndRaise(self, errorMsg):
+        self._log.error(errorMsg)
+        raise Exception(errorMsg)
+                    
+    def run(self):
+        LoggerFactory.setLevel(self._log, self._verbosity)
+        self._checkOptions()
+        self._log.info("START Launch MCL")
+        self._log.debug("With parameters: -i %s -o %s -I %.2f -T %.2f -j %r -m %r -clusterHeaders %r " % (self._fastaFileName, self._outFilePrefix , self._inflate , self._coverageThreshold, self._isJoined, self._isCluster2Map, self._isClusterConsensusHeaders))
+        #self._log.debug("With parameters: -i %s -o %s -I %.2f -T %.2f" % (self._fastaFileName, self._outFilePrefix , self._inflate , self._coverageThreshold))
+        self._log.debug("Fasta file name: %s" % self._fastaFileName)
+        workingDir = "MCLtmpDirectory"
+        if os.path.exists(workingDir):
+            self._logAndRaise("ERROR: %s already exists." % workingDir)
+        os.mkdir(workingDir)
+        os.chdir(workingDir)
+        linkToFastaFile = "%s2.fa" % os.path.splitext(self._fastaFileName)[0]
+        os.symlink("../%s" % self._fastaFileName, self._fastaFileName)
+        fastaFileNameShorten = "%s.shortH" % self._fastaFileName
+        iChangeSequenceHeaders = ChangeSequenceHeaders(inFile=self._fastaFileName, format="fasta", step=1, outFile=fastaFileNameShorten, verbosity=self._verbosity - 1)
+        iChangeSequenceHeaders.run()
+        os.symlink(fastaFileNameShorten, linkToFastaFile)     
+        
+        self._log.info("START Blaster-Matcher (%s)" % time.strftime("%Y-%m-%d %H:%M:%S"))
+        cmd = "LaunchBlaster.py"
+        cmd += " -q %s" % fastaFileNameShorten
+        cmd += " -s %s" % linkToFastaFile
+        cmd += " -a"
+        cmd += " 1>&2 >> blasterMatcher.log"
+        process = subprocess.Popen(cmd, shell = True)
+        self._log.debug("Running : %s" % cmd)
+        process.communicate()
+        if process.returncode != 0:
+            self._logAndRaise("ERROR when launching '%s'" % cmd)
+        outBlasterFileName = "%s.align" % fastaFileNameShorten
+        
+        cmd = "matcher"
+        cmd += " -m %s" % outBlasterFileName
+        cmd += " -q %s" % fastaFileNameShorten
+        cmd += " -s %s" % linkToFastaFile
+        cmd += " -a"
+        if self._isJoined:
+            cmd += " -j"
+        cmd += " 1>&2 >> blasterMatcher.log"
+        process = subprocess.Popen(cmd, shell=True)
+        self._log.debug("Running : %s" % cmd)
+        process.communicate()
+        if process.returncode != 0:
+            self._logAndRaise("ERROR when launching '%s'" % cmd)
+        self._log.info("END Blaster-Matcher (%s)" % time.strftime("%Y-%m-%d %H:%M:%S"))
+        
+        outMatcherFileName = "%s.match.tab" % outBlasterFileName
+        inputABCFileName = "%s.shortH.abc" % os.path.splitext(fastaFileNameShorten)[0]
+        MatchUtils.convertMatchFileIntoABCFileOnQueryCoverage(outMatcherFileName, inputABCFileName, coverage = self._coverageThreshold)
+        outMCLPreprocessFileName = "MCLPreprocess.out"
+        
+        self._log.info("START MCL (%s)" % time.strftime("%Y-%m-%d %H:%M:%S"))
+        cmd = "mcxload"
+        cmd += " -abc %s" % inputABCFileName
+        cmd += " --stream-mirror"
+        cmd += " --stream-neg-log10"
+        cmd += " -stream-tf 'ceil(200)'"
+        cmd += " -o %s" % outMCLPreprocessFileName
+        cmd += " -write-tab %s.tab" % outMCLPreprocessFileName
+        cmd += " 1>&2 > MCLpreprocess.log"
+        process = subprocess.Popen(cmd, shell = True)
+        self._log.debug("Running : %s" % cmd)
+        process.communicate()
+        if process.returncode != 0:
+            self._logAndRaise("ERROR when launching '%s'" % cmd)
+        
+        outMCLFileName = "out.shortH.mcl"
+        cmd = "mcl"
+        cmd += " %s" % outMCLPreprocessFileName
+        cmd += " -I %s" % self._inflate
+        cmd += " -use-tab %s.tab" % outMCLPreprocessFileName
+        cmd += " -o %s" % outMCLFileName
+        cmd += " 1>&2 > MCL.log"
+        process = subprocess.Popen(cmd, shell = True)
+        self._log.debug("Running : %s" % cmd)
+        process.communicate()
+        if process.returncode != 0:
+            self._logAndRaise("ERROR when launching '%s'" % cmd)
+        self._log.info("END MCL (%s)" % time.strftime("%Y-%m-%d %H:%M:%S"))
+
+        outFastaFileNameShorten = "%s.fa" % os.path.splitext(outMCLFileName)[0]
+
+        FastaUtils.convertClusterFileToFastaFile(outMCLFileName, fastaFileNameShorten, outFastaFileNameShorten, "MCL", verbosity = self._verbosity - 1)
+        
+        outFastaFileName = "%s_MCL.fa" % self._outFilePrefix
+        linkFileName = "%s.newHlink" % self._fastaFileName
+        headerStyle = "A"
+        if self._isClusterConsensusHeaders:
+            headerStyle = "B"
+        iChangeSequenceHeaders = ChangeSequenceHeaders(inFile=outFastaFileNameShorten, format="fasta", step=2, outFile=outFastaFileName, linkFile=linkFileName, whichCluster = headerStyle, verbosity=self._verbosity - 1)
+        iChangeSequenceHeaders.run()
+        
+        if self._isCluster2Map:
+            outMapFileName = "%s_MCL.map" % self._outFilePrefix
+            FastaUtils.convertClusteredFastaFileToMapFile(outFastaFileName, outMapFileName)
+            shutil.move(outMapFileName, "..")
+
+        shutil.move(outFastaFileName, "..")
+        os.chdir("..")
+        if self._doClean:
+            self._log.warning("Working directory will be cleaned")
+            shutil.rmtree(workingDir)
+        self._log.info("END Launch MCL")
+
+if __name__ == "__main__":
+    iLaunch = LaunchMCL()
+    iLaunch.setAttributesFromCmdLine()
+    iLaunch.run()