diff commons/pyRepetUnit/blastnForClassifierStep1/RepbaseBLRnForClassifierStep1.py @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/pyRepetUnit/blastnForClassifierStep1/RepbaseBLRnForClassifierStep1.py	Tue Apr 30 14:33:21 2013 -0400
@@ -0,0 +1,186 @@
+"""
+Launch Blaster and then Matcher to compare the input sequences with known TEs via blastn and record the results into a MySQL table.
+"""
+
+import os
+import ConfigParser
+from commons.core.utils.FileUtils import FileUtils
+from commons.core.LoggerFactory import LoggerFactory
+
+LOG_DEPTH = "repet.tools"
+
+class RepbaseBLRnForClassifierStep1( object ):
+    
+    """
+    Launch Blaster and then Matcher to compare the input sequences with known TEs via blastn and record the results into a MySQL table.
+    
+    @param inFileName: name of the input fasta file
+    @type inFileName: string
+    
+    @param launch_1: generic command at the beginning of a specific command
+    @type launch_1: string
+    
+    @param launch_2: generic command at the end of a specific command
+    @type launch_2: string
+
+    @return: all the commands to run the job
+    @rtype: string
+    
+    @param cDir: current directory (where to retrieve the result files)
+    @ype cDir: string
+
+    @param tmpDir: temporary directory (where the job will run)
+    @type tmpDir: string
+    
+    @param configFileName: configuration file name
+    @type configFileName: string
+    
+    @param logger: a logger Instance
+    @type logger: logger
+    
+    @param verbose: verbose(0/1/2)
+    @type verbose: int
+    
+    @param pL: program launcher
+    @type pL: programLauncher Instance
+    
+    @param project: project name
+    @type project: string
+    
+    """
+
+    def __init__(self, inFileName, launch_1, launch_2, cDir, tmpDir, configFileName, verbose, pL, project):
+        """
+        Constructor
+        """
+        self._inFileName = inFileName
+        self._launch_1 = launch_1
+        self._launch_2 = launch_2
+        self._cDir = cDir
+        self._tmpDir = tmpDir
+        self._verbose = verbose
+        self._pL = pL
+        self._project = project
+        self._fileUtils = FileUtils()
+        self._config = ConfigParser.ConfigParser()
+        self._configFileName = configFileName
+        self._config.readfp( open(self._configFileName) )
+        self._bank = self._config.get("detect_features","TE_nucl_bank")
+        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbose)
+
+    def formatRepbase_ntIfNecessary( self ):
+        """
+        Format Repbase (make 'cut' files).
+        """
+        if not os.path.exists( "%s_cut" % ( self._bank ) ):
+            self._log.debug("prepare bank '%s'..." % ( self._bank ))
+            prg = os.environ["REPET_PATH"] + "/bin/blaster"
+            cmd = prg
+            cmd += " -s %s" % ( self._bank )
+            cmd += " -n blastn"
+            if self._config.get("detect_features","wublast") == "yes":
+                cmd += " -W"
+            cmd += " -r"
+            cmd += " -P"
+            self._pL.launch( prg, cmd )
+            os.system( "rm -f %s-blastn-*.param" % ( self._bank ) )
+        
+    def createCmdToLaunch( self ):
+        cmd = self._launch_1 + os.environ["REPET_PATH"] + "/bin/blaster"
+        cmd += " -q %s" % ( self._inFileName )
+        cmd += " -s %s/%s" % ( self._cDir, self._bank )
+        cmd += " -B %s_BLRn_%s" % ( self._inFileName, self._bank )
+        cmd += " -n blastn"
+        if self._config.get("detect_features","wublast") == "yes":
+            cmd += " -W"
+        cmd += " -r"
+        cmd += " -v 1"
+        cmd += self._launch_2
+    
+        cmd += "if not os.path.exists( \"%s/%s_BLRn_%s.param\" ):\n" % ( self._cDir, self._inFileName, self._bank )
+        cmd += "\tos.system( \"mv %s_BLRn_%s.param %s\" )\n" % ( self._inFileName, self._bank, self._cDir )
+        cmd += "if os.path.exists( \"%s_cut\" ):\n" % ( self._inFileName )
+        cmd += "\tos.system( \"rm -f %s_cut*\" )\n" % ( self._inFileName )
+        cmd += "if os.path.exists( \"%s.Nstretch.map\" ):\n" % ( self._inFileName )
+        cmd += "\tos.remove( \"%s.Nstretch.map\" )\n" % ( self._inFileName )
+        cmd += "if os.path.exists( \"%s_BLRn_%s.raw\" ):\n" % ( self._inFileName, self._bank )
+        cmd += "\tos.remove( \"%s_BLRn_%s.raw\" )\n" % ( self._inFileName, self._bank )
+        cmd += "if os.path.exists( \"%s_BLRn_%s.seq_treated\" ):\n" % ( self._inFileName, self._bank )
+        cmd += "\tos.remove( \"%s_BLRn_%s.seq_treated\" )\n" % ( self._inFileName, self._bank )
+    
+        cmd += self._launch_1
+        cmd += os.environ["REPET_PATH"] + "/bin/matcher"
+        cmd += " -m %s_BLRn_%s.align" % ( self._inFileName, self._bank )
+        cmd += " -q %s" % ( self._inFileName )
+        cmd += " -s %s/%s" % ( self._cDir, self._bank )
+        cmd += " -j"
+        cmd += " -v 1"
+        cmd += self._launch_2
+    
+        cmd += "if not os.path.exists( \"%s/%s_BLRn_%s.align.clean_match.path\" ):\n" % ( self._cDir, self._inFileName, self._bank )
+        cmd += "\tos.system( \"mv %s_BLRn_%s.align.clean_match.path %s\" )\n" % ( self._inFileName, self._bank, self._cDir )
+        cmd += "if not os.path.exists( \"%s/%s_BLRn_%s.align.clean_match.param\" ):\n" % ( self._cDir, self._inFileName, self._bank )
+        cmd += "\tos.system( \"mv %s_BLRn_%s.align.clean_match.param %s\" )\n" % ( self._inFileName, self._bank, self._cDir )
+        cmd += "if os.path.exists( \"%s_BLRn_%s.align\" ):\n" % ( self._inFileName, self._bank )
+        cmd += "\tos.remove( \"%s_BLRn_%s.align\" )\n" % ( self._inFileName, self._bank )
+        cmd += "if os.path.exists( \"%s_BLRn_%s.align.clean_match.fa\" ):\n" % ( self._inFileName, self._bank )
+        cmd += "\tos.remove( \"%s_BLRn_%s.align.clean_match.fa\" )\n" % ( self._inFileName, self._bank )
+        cmd += "if os.path.exists( \"%s_BLRn_%s.align.clean_match.map\" ):\n" % ( self._inFileName, self._bank )
+        cmd += "\tos.remove( \"%s_BLRn_%s.align.clean_match.map\" )\n" % ( self._inFileName, self._bank )
+        cmd += "if os.path.exists( \"%s_BLRn_%s.align.clean_match.tab\" ):\n" % ( self._inFileName, self._bank )
+        cmd += "\tos.remove( \"%s_BLRn_%s.align.clean_match.tab\" )\n" % ( self._inFileName, self._bank )
+    
+        if self._tmpDir != self._cDir:
+            cmd += "if os.path.exists( \"%s\" ):\n" % ( self._bank )
+            cmd += "\tos.remove( \"%s\" )\n" % ( self._bank )
+            
+        return cmd
+    
+    def collectRepbaseBLRn( self ):
+        """
+        Concatenate the outputs of blastn, adapt the ID and load the results into a table.
+        """
+        bankFull = self._bank
+        bankPath, bank = os.path.split( bankFull )
+        self._concatPathFile(bank)
+        self._adaptIDInPathFile(bank)
+        self._loadPathFileInTable(bank)    
+        self._findAndRemoveUselessFiles(bank)
+        
+    def _concatPathFile(self, bank):
+        FileUtils.catFilesByPattern("../batch_*.fa_BLRn_%s.align.clean_match.path" % bank,
+                                        "%s_BLRn_%s.align.clean_match.path.tmp" % (self._project, bank))
+
+    def _adaptIDInPathFile(self, bank):
+        if os.path.exists(os.environ["REPET_PATH"] + "/bin/pathnum2id"):
+            prg = os.environ["REPET_PATH"] + "/bin/pathnum2id"
+            cmd = prg
+            cmd += " -i %s_BLRn_%s.align.clean_match.path.tmp" % (self._project, bank)
+            cmd += " -o %s_BLRn_%s.align.clean_match.path" % (self._project, bank)
+            cmd += " -v %i" % (self._verbose - 1)
+            self._pL.launch(prg, cmd)
+        else:
+            prg = os.environ["REPET_PATH"] + "/bin/pathnum2id.py"
+            cmd = prg
+            cmd += " -i %s_BLRn_%s.align.clean_match.path.tmp" % (self._project, bank)
+            cmd += " -o %s_BLRn_%s.align.clean_match.path" % (self._project, bank)
+            self._pL.launch(prg, cmd)
+
+    def _loadPathFileInTable(self, bank):
+        prg = os.environ["REPET_PATH"] + "/bin/srptCreateTable.py"
+        cmd = prg
+        cmd += " -f %s_BLRn_%s.align.clean_match.path" % (self._project, bank)
+        cmd += " -n %s_TE_BLRn_path" % (self._project)
+        cmd += " -t path"
+        cmd += " -c ../%s" % (self._configFileName)
+        self._pL.launch(prg, cmd)
+
+    def _findAndRemoveUselessFiles(self, bank):
+        prg = "find"
+        cmd = prg
+        cmd += " .. -name \"batch_*.fa_BLRn_%s.*\" -exec rm {} \;" % (bank)
+        self._pL.launch(prg, cmd)
+        prg = "rm"
+        cmd = prg
+        cmd += " %s_BLRn_%s.align.clean_match.path.tmp" % (self._project, bank)
+        self._pL.launch(prg, cmd)