Mercurial > repos > yufei-luo > s_mart
diff commons/launcher/LaunchBlastclust.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/launcher/LaunchBlastclust.py Mon Apr 29 03:20:15 2013 -0400 @@ -0,0 +1,372 @@ +#!/usr/bin/env python + +""" +Launch Blastclust on nucleotide sequences and return a fasta file. +""" + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + +import os +import sys +import subprocess +from commons.core.seq.BioseqDB import BioseqDB +from commons.core.seq.Bioseq import Bioseq +from commons.core.utils.RepetOptionParser import RepetOptionParser +from commons.tools.ChangeSequenceHeaders import ChangeSequenceHeaders + +class LaunchBlastclust(object): + """ + Launch Blastclust on nucleotide sequences and return a fasta file. + """ + + def __init__(self, input = "", outFilePrefix = "", clean = False, verbose = 0): + """ + Constructor. + """ + self._inFileName = input + self._identityThreshold = 95 + self._coverageThreshold = 0.9 + self._bothSeq = "T" + self._filterUnclusteredSeq = False + self._outFilePrefix = outFilePrefix + self._isBlastToMap = False + self._isHeaderForTEdenovo = False + self._nbCPUs = 1 + self._clean = clean + self._verbose = verbose + self._tmpFileName = "" + + def setAttributesFromCmdLine(self): + """ + Set the attributes from the command-line. + """ + + description = "Launch Blastclust on nucleotide sequences and return a fasta file." + usage = "LaunchBlastclust.py -i inputFileName [options]" + + examples = "\nExample 1: launch Blastclust with default options, highest verbose and clean temporary files.\n" + examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -v 2 -c" + examples += "\n\t" + examples += "\t\nExample 2: launch Blastclust with an identity threshold of 90%, rename output files and generate a map file corresponding to the fasta output.\n" + examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -S 90 -o SpecialOutputName -m" + examples += "\n\tWARNING: Please refer to -m option limitations in the description above.\n" + + #TODO: check if the optionParser can handle '\' into strings for a better code readability in -m option + + parser = RepetOptionParser(description = description, usage = usage, version = "v1.0", epilog = examples) + parser.add_option("-i", "--input", dest = "inFileName", type = "string", help = "name of the input fasta file (nucleotides)", default = "") + parser.add_option("-L", "--length", dest = "coverageThreshold", type = "float", help = "length coverage threshold (default=0.9)", default = 0.9) + parser.add_option("-S", "--ident", dest = "identityThreshold", type = "int", help = "identity threshold (default=95)", default = 95) + parser.add_option("-b", "--both", dest = "bothSeq", type = "string", help = "require coverage on both neighbours (default=T/F)", default = "T") + parser.add_option("-f", "--filter", dest = "filterUnclusteredSeq", help = "filter unclustered sequences", default = False, action="store_true") + parser.add_option("-o", "--out", dest = "outFilePrefix", type = "string", help = "prefix of the output files (default=input fasta file name)", default = "") + parser.add_option("-m", "--map", dest = "isBlast2Map", help = "generate an additional output file in map format (Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output)", default = False, action="store_true") + parser.add_option("", "--TEdenovoHeader", dest = "isHeaderForTEdenovo", help = "format headers for TEdenovo pipeline", default = False, action="store_true") + parser.add_option("-n", "--num", dest = "nbCPUs", type = "int", help = "number of CPU's to use (default=1)", default = 1) + parser.add_option("-c", "--clean", dest = "clean", help = "clean temporary files", default = False, action="store_true") + parser.add_option("-v", "--verbose", dest = "verbose", type = "int", help = "verbosity level (default=0/1/2)", default = 0) + + options = parser.parse_args()[0] + self._setAttributesFromOptions(options) + + def _setAttributesFromOptions(self, options): + self.setInputFileName(options.inFileName) + self.setCoverageThreshold(options.coverageThreshold) + self.setIdentityThreshold(options.identityThreshold) + self.setBothSequences(options.bothSeq) + self.setNbCPUs(options.nbCPUs) + self.setIsHeaderForTEdenovo(options.isHeaderForTEdenovo) + if options.filterUnclusteredSeq: + self.setFilterUnclusteredSequences() + if options.outFilePrefix != "": + self.setOutputFilePrefix(options.outFilePrefix) + else: + self._outFilePrefix = self._inFileName + if options.isBlast2Map: + self.setIsBlastToMap() + if options.clean: + self.setClean() + self.setVerbosityLevel(options.verbose) + + def setInputFileName(self , inFileName): + self._inFileName = inFileName + + def setCoverageThreshold(self, lengthThresh): + self._coverageThreshold = float(lengthThresh) + + def setIdentityThreshold(self, identityThresh): + self._identityThreshold = int(identityThresh) + + def setBothSequences(self, bothSeq): + self._bothSeq = bothSeq + + def setNbCPUs(self, nbCPUs): + self._nbCPUs = int(nbCPUs) + + def setFilterUnclusteredSequences(self): + self._filterUnclusteredSeq = True + + def setOutputFilePrefix(self, outFilePrefix): + self._outFilePrefix = outFilePrefix + + def setIsBlastToMap(self): + self._isBlastToMap = True + + def setIsHeaderForTEdenovo(self, isHeaderForTEdenovo): + self._isHeaderForTEdenovo = isHeaderForTEdenovo + + def setClean(self): + self._clean = True + + def setVerbosityLevel(self, verbose): + self._verbose = int(verbose) + + def setTmpFileName(self, tmpFileName): + self._tmpFileName = tmpFileName + + + def checkAttributes(self): + """ + Check the attributes are valid before running the algorithm. + """ + if self._inFileName == "": + print "ERROR: missing input file name (-i)" + sys.exit(1) + if self._outFilePrefix == "": + self._outFilePrefix = self._inFileName + self._tmpFileName = "%s_blastclust.txt" % (self._outFilePrefix) + + + def launchBlastclust(self, inFile): + """ + Launch Blastclust in command-line. + """ + if os.path.exists(os.path.basename(inFile)): + inFile = os.path.basename(inFile) + prg = "blastclust" + cmd = prg + cmd += " -i %s" % (inFile) + cmd += " -o %s" % (self._tmpFileName) + cmd += " -S %i" % (self._identityThreshold) + cmd += " -L %f" % (self._coverageThreshold) + cmd += " -b %s" % (self._bothSeq) + cmd += " -p F" + cmd += " -a %i" % (self._nbCPUs) + if self._verbose == 0: + cmd += " -v blastclust.log" + if self._verbose > 0: + print cmd + sys.stdout.flush() + process = subprocess.Popen(cmd, shell = True) + process.communicate() + if process.returncode != 0: + raise Exception("ERROR when launching '%s'" % cmd) + if self._clean and os.path.exists("error.log"): + os.remove("error.log") + if self._clean and os.path.exists("blastclust.log"): + os.remove("blastclust.log") + + + def getClustersFromTxtFile(self): + """ + Return a dictionary with cluster IDs as keys and sequence headers as values. + """ + dClusterId2SeqHeaders = {} + inF = open(self._tmpFileName, "r") + line = inF.readline() + clusterId = 1 + while True: + if line == "": + break + tokens = line[:-1].split(" ") + dClusterId2SeqHeaders[clusterId] = [] + for seqHeader in tokens: + if seqHeader != "": + dClusterId2SeqHeaders[clusterId].append(seqHeader) + line = inF.readline() + clusterId += 1 + inF.close() + if self._verbose > 0: + print "nb of clusters: %i" % (len(dClusterId2SeqHeaders.keys())) + sys.stdout.flush() + return dClusterId2SeqHeaders + + + def filterUnclusteredSequences(self, dClusterId2SeqHeaders): + """ + Filter clusters having only one sequence. + """ + for clusterId in dClusterId2SeqHeaders.keys(): + if len(dClusterId2SeqHeaders[clusterId]) == 1: + del dClusterId2SeqHeaders[clusterId] + if self._verbose > 0: + print "nb of clusters (>1seq): %i" % (len(dClusterId2SeqHeaders.keys())) + sys.stdout.flush() + return dClusterId2SeqHeaders + + + def getClusteringResultsInFasta(self, inFile): + """ + Write a fasta file whose sequence headers contain the clustering IDs. + """ + dClusterId2SeqHeaders = self.getClustersFromTxtFile() + if self._filterUnclusteredSeq: + dClusterId2SeqHeaders = self.filterUnclusteredSequences(dClusterId2SeqHeaders) + inDB = BioseqDB(inFile) + outFileName = "%s_Blastclust.fa" % (inFile) + outF = open(outFileName, "w") + for clusterId in dClusterId2SeqHeaders.keys(): + memberId = 1 + for seqHeader in dClusterId2SeqHeaders[clusterId]: + bs = inDB.fetch(seqHeader) + bs.header = "BlastclustCluster%iMb%i_%s" % (clusterId, memberId, seqHeader) + bs.write(outF) + memberId += 1 + outF.close() + + + def getLinkInitNewHeaders(self): + dNew2Init = {} + linkFileName = "%s.shortHlink" % (self._inFileName) + linkFile = open(linkFileName,"r") + line = linkFile.readline() + while True: + if line == "": + break + data = line.split("\t") + dNew2Init[data[0]] = data[1] + line = linkFile.readline() + linkFile.close() + return dNew2Init + + + def retrieveInitHeaders(self, dNewH2InitH): + tmpFaFile = "%s.shortH_Blastclust.fa" % (self._inFileName) + tmpFaFileHandler = open(tmpFaFile, "r") + outFaFile = "%s_Blastclust.fa" % (self._outFilePrefix) + outFaFileHandler = open(outFaFile, "w") + while True: + line = tmpFaFileHandler.readline() + if line == "": + break + if line[0] == ">": + tokens = line[1:-1].split("_") + initHeader = dNewH2InitH[tokens[1]] + if self._isHeaderForTEdenovo: + classif = initHeader.split("_")[0] + consensusName = "_".join(initHeader.split("_")[1:]) + clusterId = tokens[0].split("Cluster")[1].split("Mb")[0] + newHeader = "%s_Blc%s_%s" % (classif, clusterId, consensusName) + else: + newHeader = "%s_%s" % (tokens[0], initHeader) + outFaFileHandler.write(">%s\n" % (newHeader)) + else: + outFaFileHandler.write(line) + tmpFaFileHandler.close() + outFaFileHandler.close() + if self._clean: + os.remove(tmpFaFile) + + + def blastclustToMap(self, blastclustFastaOut): + """ + Write a map file from blastclust fasta output. + Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output. + """ + fileDb = open(blastclustFastaOut , "r") + mapFilename = "%s.map" % (os.path.splitext(blastclustFastaOut)[0]) + fileMap = open(mapFilename, "w") + seq = Bioseq() + numseq = 0 + while 1: + seq.read(fileDb) + if seq.sequence == None: + break + numseq = numseq + 1 + ID = seq.header.split(' ')[0].split('_')[0] + chunk = seq.header.split(' ')[0].split('_')[1] + start = seq.header.split(' ')[-1].split(',')[0][1:] + end = seq.header.split(' ')[-1].split(',')[1][:-1] + line= '%s\t%s\t%s\t%s' % (ID, chunk, start, end) + fileMap.write(line + "\n") + + fileDb.close() + fileMap.close() + print "saved in %s" % mapFilename + + + def start(self): + """ + Useful commands before running the program. + """ + self.checkAttributes() + if self._verbose > 0: + print "START %s" % (type(self).__name__) + + + def end(self): + """ + Useful commands before ending the program. + """ + if self._verbose > 0: + print "END %s" % (type(self).__name__) + + + def run(self): + """ + Run the program. + """ + self.start() + + iCSH = ChangeSequenceHeaders(inFile = self._inFileName, format = "fasta", step = 1, outFile = "%s.shortH" % self._inFileName, linkFile = "%s.shortHlink" % self._inFileName) + iCSH.run() + + self.launchBlastclust("%s.shortH" % (self._inFileName)) + + self.getClusteringResultsInFasta("%s.shortH" % (self._inFileName)) + + dNewH2InitH = self.getLinkInitNewHeaders() + self.retrieveInitHeaders(dNewH2InitH) + + if self._isBlastToMap: + blastclustFileName = "%s_Blastclust.fa" % (self._outFilePrefix) + self.blastclustToMap(blastclustFileName) + + if self._clean: + os.remove("%s.shortH" % (self._inFileName)) + os.remove("%s.shortHlink" % (self._inFileName)) + + self.end() + +if __name__ == "__main__": + i = LaunchBlastclust() + i.setAttributesFromCmdLine() + i.run()