diff commons/launcher/LaunchBlastclust.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/launcher/LaunchBlastclust.py	Mon Apr 29 03:20:15 2013 -0400
@@ -0,0 +1,372 @@
+#!/usr/bin/env python
+
+"""
+Launch Blastclust on nucleotide sequences and return a fasta file.
+"""
+
+# Copyright INRA (Institut National de la Recherche Agronomique)
+# http://www.inra.fr
+# http://urgi.versailles.inra.fr
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software.  You can  use, 
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info". 
+#
+# As a counterpart to the access to the source code and  rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty  and the software's author,  the holder of the
+# economic rights,  and the successive licensors  have only  limited
+# liability. 
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading,  using,  modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean  that it is complicated to manipulate,  and  that  also
+# therefore means  that it is reserved for developers  and  experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or 
+# data to be ensured and,  more generally, to use and operate it in the 
+# same conditions as regards security. 
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+
+import os
+import sys
+import subprocess
+from commons.core.seq.BioseqDB import BioseqDB
+from commons.core.seq.Bioseq import Bioseq
+from commons.core.utils.RepetOptionParser import RepetOptionParser
+from commons.tools.ChangeSequenceHeaders import ChangeSequenceHeaders
+
+class LaunchBlastclust(object):
+    """
+    Launch Blastclust on nucleotide sequences and return a fasta file.
+    """
+    
+    def __init__(self, input = "", outFilePrefix = "", clean = False, verbose = 0):
+        """
+        Constructor.
+        """
+        self._inFileName = input
+        self._identityThreshold = 95
+        self._coverageThreshold = 0.9
+        self._bothSeq = "T"
+        self._filterUnclusteredSeq = False
+        self._outFilePrefix = outFilePrefix
+        self._isBlastToMap = False
+        self._isHeaderForTEdenovo = False
+        self._nbCPUs = 1
+        self._clean = clean
+        self._verbose = verbose
+        self._tmpFileName = ""
+        
+    def setAttributesFromCmdLine(self):
+        """
+        Set the attributes from the command-line.
+        """
+        
+        description = "Launch Blastclust on nucleotide sequences and return a fasta file."
+        usage = "LaunchBlastclust.py -i inputFileName [options]"
+        
+        examples = "\nExample 1: launch Blastclust with default options, highest verbose and clean temporary files.\n"
+        examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -v 2 -c"
+        examples += "\n\t"
+        examples += "\t\nExample 2: launch Blastclust with an identity threshold of 90%, rename output files and generate a map file corresponding to the fasta output.\n"
+        examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -S 90 -o SpecialOutputName -m"
+        examples += "\n\tWARNING: Please refer to -m option limitations in the description above.\n"
+        
+        #TODO: check if the optionParser can handle '\' into strings for a better code readability in -m option
+        
+        parser = RepetOptionParser(description = description, usage = usage, version = "v1.0", epilog = examples)
+        parser.add_option("-i", "--input",          dest = "inFileName",            type = "string",    help = "name of the input fasta file (nucleotides)",                    default = "")
+        parser.add_option("-L", "--length",         dest = "coverageThreshold",     type = "float",     help = "length coverage threshold (default=0.9)",                       default = 0.9)
+        parser.add_option("-S", "--ident",          dest = "identityThreshold",     type = "int",       help = "identity threshold (default=95)",                               default = 95)
+        parser.add_option("-b", "--both",           dest = "bothSeq",               type = "string",    help = "require coverage on both neighbours (default=T/F)",             default = "T")
+        parser.add_option("-f", "--filter",         dest = "filterUnclusteredSeq",                      help = "filter unclustered sequences",                                  default = False,    action="store_true")
+        parser.add_option("-o", "--out",            dest = "outFilePrefix",         type = "string",    help = "prefix of the output files (default=input fasta file name)",    default = "")
+        parser.add_option("-m", "--map",            dest = "isBlast2Map",                               help = "generate an additional output file in map format (Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output)",   default = False,    action="store_true")
+        parser.add_option("", "--TEdenovoHeader",   dest = "isHeaderForTEdenovo",                       help = "format headers for TEdenovo pipeline",                          default = False,    action="store_true")
+        parser.add_option("-n", "--num",            dest = "nbCPUs",                type = "int",       help = "number of CPU's to use (default=1)",                            default = 1)
+        parser.add_option("-c", "--clean",          dest = "clean",                                     help = "clean temporary files",                                         default = False,    action="store_true")
+        parser.add_option("-v", "--verbose",        dest = "verbose",               type = "int",       help = "verbosity level (default=0/1/2)",                               default = 0)
+
+        options = parser.parse_args()[0]
+        self._setAttributesFromOptions(options)
+        
+    def _setAttributesFromOptions(self, options):
+        self.setInputFileName(options.inFileName)
+        self.setCoverageThreshold(options.coverageThreshold)
+        self.setIdentityThreshold(options.identityThreshold)
+        self.setBothSequences(options.bothSeq)
+        self.setNbCPUs(options.nbCPUs)
+        self.setIsHeaderForTEdenovo(options.isHeaderForTEdenovo)
+        if options.filterUnclusteredSeq:
+            self.setFilterUnclusteredSequences()
+        if options.outFilePrefix != "":
+            self.setOutputFilePrefix(options.outFilePrefix)
+        else:
+            self._outFilePrefix = self._inFileName
+        if options.isBlast2Map:
+            self.setIsBlastToMap()
+        if options.clean:
+            self.setClean()
+        self.setVerbosityLevel(options.verbose)
+                
+    def setInputFileName(self , inFileName):
+        self._inFileName = inFileName
+        
+    def setCoverageThreshold(self, lengthThresh):
+        self._coverageThreshold = float(lengthThresh)
+                
+    def setIdentityThreshold(self, identityThresh):
+        self._identityThreshold = int(identityThresh)
+        
+    def setBothSequences(self, bothSeq):
+        self._bothSeq = bothSeq
+        
+    def setNbCPUs(self, nbCPUs):
+        self._nbCPUs = int(nbCPUs)
+        
+    def setFilterUnclusteredSequences(self):
+        self._filterUnclusteredSeq = True
+        
+    def setOutputFilePrefix(self, outFilePrefix):
+        self._outFilePrefix = outFilePrefix
+        
+    def setIsBlastToMap(self):
+        self._isBlastToMap = True
+        
+    def setIsHeaderForTEdenovo(self, isHeaderForTEdenovo):
+        self._isHeaderForTEdenovo = isHeaderForTEdenovo
+        
+    def setClean(self):
+        self._clean = True
+        
+    def setVerbosityLevel(self, verbose):
+        self._verbose = int(verbose)
+    
+    def setTmpFileName(self, tmpFileName):    
+        self._tmpFileName = tmpFileName
+        
+        
+    def checkAttributes(self):
+        """
+        Check the attributes are valid before running the algorithm.
+        """
+        if self._inFileName == "":
+            print "ERROR: missing input file name (-i)"
+            sys.exit(1)
+        if self._outFilePrefix == "":
+            self._outFilePrefix = self._inFileName
+        self._tmpFileName = "%s_blastclust.txt" % (self._outFilePrefix)
+        
+        
+    def launchBlastclust(self, inFile):
+        """
+        Launch Blastclust in command-line.
+        """
+        if os.path.exists(os.path.basename(inFile)):
+            inFile = os.path.basename(inFile)
+        prg = "blastclust"
+        cmd = prg
+        cmd += " -i %s" % (inFile)
+        cmd += " -o %s" % (self._tmpFileName)
+        cmd += " -S %i" % (self._identityThreshold)
+        cmd += " -L %f" % (self._coverageThreshold)
+        cmd += " -b %s" % (self._bothSeq)
+        cmd += " -p F"
+        cmd += " -a %i" % (self._nbCPUs)
+        if self._verbose == 0:
+            cmd += " -v blastclust.log"
+        if self._verbose > 0:
+            print cmd
+            sys.stdout.flush()
+        process = subprocess.Popen(cmd, shell = True)
+        process.communicate()
+        if process.returncode != 0:
+            raise Exception("ERROR when launching '%s'" % cmd)
+        if self._clean and os.path.exists("error.log"):
+            os.remove("error.log")
+        if self._clean and os.path.exists("blastclust.log"):
+            os.remove("blastclust.log")
+            
+            
+    def getClustersFromTxtFile(self):
+        """
+        Return a dictionary with cluster IDs as keys and sequence headers as values.
+        """
+        dClusterId2SeqHeaders = {}
+        inF = open(self._tmpFileName, "r")
+        line = inF.readline()
+        clusterId = 1
+        while True:
+            if line == "":
+                break
+            tokens = line[:-1].split(" ")
+            dClusterId2SeqHeaders[clusterId] = []
+            for seqHeader in tokens:
+                if seqHeader != "":
+                    dClusterId2SeqHeaders[clusterId].append(seqHeader)
+            line = inF.readline()
+            clusterId += 1
+        inF.close()
+        if self._verbose > 0:
+            print "nb of clusters: %i" % (len(dClusterId2SeqHeaders.keys()))
+            sys.stdout.flush()
+        return dClusterId2SeqHeaders
+    
+    
+    def filterUnclusteredSequences(self, dClusterId2SeqHeaders):
+        """
+        Filter clusters having only one sequence.
+        """
+        for clusterId in dClusterId2SeqHeaders.keys():
+            if len(dClusterId2SeqHeaders[clusterId]) == 1:
+                del dClusterId2SeqHeaders[clusterId]
+        if self._verbose > 0:
+            print "nb of clusters (>1seq): %i" % (len(dClusterId2SeqHeaders.keys()))
+            sys.stdout.flush()
+        return dClusterId2SeqHeaders
+    
+    
+    def getClusteringResultsInFasta(self, inFile):
+        """
+        Write a fasta file whose sequence headers contain the clustering IDs.
+        """
+        dClusterId2SeqHeaders = self.getClustersFromTxtFile()
+        if self._filterUnclusteredSeq:
+            dClusterId2SeqHeaders = self.filterUnclusteredSequences(dClusterId2SeqHeaders)
+        inDB = BioseqDB(inFile)
+        outFileName = "%s_Blastclust.fa" % (inFile)
+        outF = open(outFileName, "w")
+        for clusterId in dClusterId2SeqHeaders.keys():
+            memberId = 1
+            for seqHeader in dClusterId2SeqHeaders[clusterId]:
+                bs = inDB.fetch(seqHeader)
+                bs.header = "BlastclustCluster%iMb%i_%s" % (clusterId, memberId, seqHeader)
+                bs.write(outF)
+                memberId += 1
+        outF.close()
+        
+        
+    def getLinkInitNewHeaders(self):
+        dNew2Init = {}
+        linkFileName = "%s.shortHlink" % (self._inFileName)
+        linkFile = open(linkFileName,"r")
+        line = linkFile.readline()
+        while True:
+            if line == "":
+                break
+            data = line.split("\t")
+            dNew2Init[data[0]] = data[1]
+            line = linkFile.readline()
+        linkFile.close()
+        return dNew2Init
+    
+    
+    def retrieveInitHeaders(self, dNewH2InitH):
+        tmpFaFile = "%s.shortH_Blastclust.fa" % (self._inFileName)
+        tmpFaFileHandler = open(tmpFaFile, "r")
+        outFaFile = "%s_Blastclust.fa" % (self._outFilePrefix)
+        outFaFileHandler = open(outFaFile, "w")
+        while True:
+            line = tmpFaFileHandler.readline()
+            if line == "":
+                break
+            if line[0] == ">":
+                tokens = line[1:-1].split("_")
+                initHeader = dNewH2InitH[tokens[1]]
+                if self._isHeaderForTEdenovo:
+                    classif = initHeader.split("_")[0]
+                    consensusName = "_".join(initHeader.split("_")[1:])
+                    clusterId = tokens[0].split("Cluster")[1].split("Mb")[0]
+                    newHeader = "%s_Blc%s_%s" % (classif, clusterId, consensusName)
+                else:
+                    newHeader = "%s_%s" % (tokens[0], initHeader)
+                outFaFileHandler.write(">%s\n" % (newHeader))
+            else:
+                outFaFileHandler.write(line)
+        tmpFaFileHandler.close()
+        outFaFileHandler.close()
+        if self._clean:
+            os.remove(tmpFaFile)
+
+
+    def blastclustToMap(self, blastclustFastaOut):
+        """
+        Write a map file from blastclust fasta output.
+        Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output.
+        """
+        fileDb = open(blastclustFastaOut , "r")
+        mapFilename = "%s.map" % (os.path.splitext(blastclustFastaOut)[0])
+        fileMap = open(mapFilename, "w")
+        seq = Bioseq()
+        numseq = 0
+        while 1:
+            seq.read(fileDb)
+            if seq.sequence == None:
+                break
+            numseq = numseq + 1
+            ID = seq.header.split(' ')[0].split('_')[0]
+            chunk = seq.header.split(' ')[0].split('_')[1]
+            start = seq.header.split(' ')[-1].split(',')[0][1:]
+            end = seq.header.split(' ')[-1].split(',')[1][:-1]
+            line= '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
+            fileMap.write(line + "\n")
+    
+        fileDb.close()
+        fileMap.close()
+        print "saved in %s" % mapFilename 
+        
+        
+    def start(self):
+        """
+        Useful commands before running the program.
+        """
+        self.checkAttributes()
+        if self._verbose > 0:
+            print "START %s" % (type(self).__name__)
+            
+            
+    def end(self):
+        """
+        Useful commands before ending the program.
+        """
+        if self._verbose > 0:
+            print "END %s" % (type(self).__name__)
+            
+            
+    def run(self):
+        """
+        Run the program.
+        """
+        self.start()
+        
+        iCSH = ChangeSequenceHeaders(inFile = self._inFileName, format = "fasta", step = 1, outFile = "%s.shortH" % self._inFileName, linkFile = "%s.shortHlink" % self._inFileName)
+        iCSH.run()
+        
+        self.launchBlastclust("%s.shortH" % (self._inFileName))
+        
+        self.getClusteringResultsInFasta("%s.shortH" % (self._inFileName))
+        
+        dNewH2InitH = self.getLinkInitNewHeaders()
+        self.retrieveInitHeaders(dNewH2InitH)
+        
+        if self._isBlastToMap:
+            blastclustFileName = "%s_Blastclust.fa" % (self._outFilePrefix)
+            self.blastclustToMap(blastclustFileName)
+        
+        if self._clean:
+            os.remove("%s.shortH" % (self._inFileName))
+            os.remove("%s.shortHlink" % (self._inFileName))
+        
+        self.end()
+        
+if __name__ == "__main__":
+    i = LaunchBlastclust()
+    i.setAttributesFromCmdLine()
+    i.run()