Mercurial > repos > yufei-luo > s_mart
view commons/launcher/LaunchBlastclust.py @ 19:9bcfa7936eec
Deleted selected files
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:23:29 -0400 |
parents | 94ab73e8a190 |
children |
line wrap: on
line source
#!/usr/bin/env python """ Launch Blastclust on nucleotide sequences and return a fasta file. """ # Copyright INRA (Institut National de la Recherche Agronomique) # http://www.inra.fr # http://urgi.versailles.inra.fr # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. import os import sys import subprocess from commons.core.seq.BioseqDB import BioseqDB from commons.core.seq.Bioseq import Bioseq from commons.core.utils.RepetOptionParser import RepetOptionParser from commons.tools.ChangeSequenceHeaders import ChangeSequenceHeaders class LaunchBlastclust(object): """ Launch Blastclust on nucleotide sequences and return a fasta file. """ def __init__(self, input = "", outFilePrefix = "", clean = False, verbose = 0): """ Constructor. """ self._inFileName = input self._identityThreshold = 95 self._coverageThreshold = 0.9 self._bothSeq = "T" self._filterUnclusteredSeq = False self._outFilePrefix = outFilePrefix self._isBlastToMap = False self._isHeaderForTEdenovo = False self._nbCPUs = 1 self._clean = clean self._verbose = verbose self._tmpFileName = "" def setAttributesFromCmdLine(self): """ Set the attributes from the command-line. """ description = "Launch Blastclust on nucleotide sequences and return a fasta file." usage = "LaunchBlastclust.py -i inputFileName [options]" examples = "\nExample 1: launch Blastclust with default options, highest verbose and clean temporary files.\n" examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -v 2 -c" examples += "\n\t" examples += "\t\nExample 2: launch Blastclust with an identity threshold of 90%, rename output files and generate a map file corresponding to the fasta output.\n" examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -S 90 -o SpecialOutputName -m" examples += "\n\tWARNING: Please refer to -m option limitations in the description above.\n" #TODO: check if the optionParser can handle '\' into strings for a better code readability in -m option parser = RepetOptionParser(description = description, usage = usage, version = "v1.0", epilog = examples) parser.add_option("-i", "--input", dest = "inFileName", type = "string", help = "name of the input fasta file (nucleotides)", default = "") parser.add_option("-L", "--length", dest = "coverageThreshold", type = "float", help = "length coverage threshold (default=0.9)", default = 0.9) parser.add_option("-S", "--ident", dest = "identityThreshold", type = "int", help = "identity threshold (default=95)", default = 95) parser.add_option("-b", "--both", dest = "bothSeq", type = "string", help = "require coverage on both neighbours (default=T/F)", default = "T") parser.add_option("-f", "--filter", dest = "filterUnclusteredSeq", help = "filter unclustered sequences", default = False, action="store_true") parser.add_option("-o", "--out", dest = "outFilePrefix", type = "string", help = "prefix of the output files (default=input fasta file name)", default = "") parser.add_option("-m", "--map", dest = "isBlast2Map", help = "generate an additional output file in map format (Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output)", default = False, action="store_true") parser.add_option("", "--TEdenovoHeader", dest = "isHeaderForTEdenovo", help = "format headers for TEdenovo pipeline", default = False, action="store_true") parser.add_option("-n", "--num", dest = "nbCPUs", type = "int", help = "number of CPU's to use (default=1)", default = 1) parser.add_option("-c", "--clean", dest = "clean", help = "clean temporary files", default = False, action="store_true") parser.add_option("-v", "--verbose", dest = "verbose", type = "int", help = "verbosity level (default=0/1/2)", default = 0) options = parser.parse_args()[0] self._setAttributesFromOptions(options) def _setAttributesFromOptions(self, options): self.setInputFileName(options.inFileName) self.setCoverageThreshold(options.coverageThreshold) self.setIdentityThreshold(options.identityThreshold) self.setBothSequences(options.bothSeq) self.setNbCPUs(options.nbCPUs) self.setIsHeaderForTEdenovo(options.isHeaderForTEdenovo) if options.filterUnclusteredSeq: self.setFilterUnclusteredSequences() if options.outFilePrefix != "": self.setOutputFilePrefix(options.outFilePrefix) else: self._outFilePrefix = self._inFileName if options.isBlast2Map: self.setIsBlastToMap() if options.clean: self.setClean() self.setVerbosityLevel(options.verbose) def setInputFileName(self , inFileName): self._inFileName = inFileName def setCoverageThreshold(self, lengthThresh): self._coverageThreshold = float(lengthThresh) def setIdentityThreshold(self, identityThresh): self._identityThreshold = int(identityThresh) def setBothSequences(self, bothSeq): self._bothSeq = bothSeq def setNbCPUs(self, nbCPUs): self._nbCPUs = int(nbCPUs) def setFilterUnclusteredSequences(self): self._filterUnclusteredSeq = True def setOutputFilePrefix(self, outFilePrefix): self._outFilePrefix = outFilePrefix def setIsBlastToMap(self): self._isBlastToMap = True def setIsHeaderForTEdenovo(self, isHeaderForTEdenovo): self._isHeaderForTEdenovo = isHeaderForTEdenovo def setClean(self): self._clean = True def setVerbosityLevel(self, verbose): self._verbose = int(verbose) def setTmpFileName(self, tmpFileName): self._tmpFileName = tmpFileName def checkAttributes(self): """ Check the attributes are valid before running the algorithm. """ if self._inFileName == "": print "ERROR: missing input file name (-i)" sys.exit(1) if self._outFilePrefix == "": self._outFilePrefix = self._inFileName self._tmpFileName = "%s_blastclust.txt" % (self._outFilePrefix) def launchBlastclust(self, inFile): """ Launch Blastclust in command-line. """ if os.path.exists(os.path.basename(inFile)): inFile = os.path.basename(inFile) prg = "blastclust" cmd = prg cmd += " -i %s" % (inFile) cmd += " -o %s" % (self._tmpFileName) cmd += " -S %i" % (self._identityThreshold) cmd += " -L %f" % (self._coverageThreshold) cmd += " -b %s" % (self._bothSeq) cmd += " -p F" cmd += " -a %i" % (self._nbCPUs) if self._verbose == 0: cmd += " -v blastclust.log" if self._verbose > 0: print cmd sys.stdout.flush() process = subprocess.Popen(cmd, shell = True) process.communicate() if process.returncode != 0: raise Exception("ERROR when launching '%s'" % cmd) if self._clean and os.path.exists("error.log"): os.remove("error.log") if self._clean and os.path.exists("blastclust.log"): os.remove("blastclust.log") def getClustersFromTxtFile(self): """ Return a dictionary with cluster IDs as keys and sequence headers as values. """ dClusterId2SeqHeaders = {} inF = open(self._tmpFileName, "r") line = inF.readline() clusterId = 1 while True: if line == "": break tokens = line[:-1].split(" ") dClusterId2SeqHeaders[clusterId] = [] for seqHeader in tokens: if seqHeader != "": dClusterId2SeqHeaders[clusterId].append(seqHeader) line = inF.readline() clusterId += 1 inF.close() if self._verbose > 0: print "nb of clusters: %i" % (len(dClusterId2SeqHeaders.keys())) sys.stdout.flush() return dClusterId2SeqHeaders def filterUnclusteredSequences(self, dClusterId2SeqHeaders): """ Filter clusters having only one sequence. """ for clusterId in dClusterId2SeqHeaders.keys(): if len(dClusterId2SeqHeaders[clusterId]) == 1: del dClusterId2SeqHeaders[clusterId] if self._verbose > 0: print "nb of clusters (>1seq): %i" % (len(dClusterId2SeqHeaders.keys())) sys.stdout.flush() return dClusterId2SeqHeaders def getClusteringResultsInFasta(self, inFile): """ Write a fasta file whose sequence headers contain the clustering IDs. """ dClusterId2SeqHeaders = self.getClustersFromTxtFile() if self._filterUnclusteredSeq: dClusterId2SeqHeaders = self.filterUnclusteredSequences(dClusterId2SeqHeaders) inDB = BioseqDB(inFile) outFileName = "%s_Blastclust.fa" % (inFile) outF = open(outFileName, "w") for clusterId in dClusterId2SeqHeaders.keys(): memberId = 1 for seqHeader in dClusterId2SeqHeaders[clusterId]: bs = inDB.fetch(seqHeader) bs.header = "BlastclustCluster%iMb%i_%s" % (clusterId, memberId, seqHeader) bs.write(outF) memberId += 1 outF.close() def getLinkInitNewHeaders(self): dNew2Init = {} linkFileName = "%s.shortHlink" % (self._inFileName) linkFile = open(linkFileName,"r") line = linkFile.readline() while True: if line == "": break data = line.split("\t") dNew2Init[data[0]] = data[1] line = linkFile.readline() linkFile.close() return dNew2Init def retrieveInitHeaders(self, dNewH2InitH): tmpFaFile = "%s.shortH_Blastclust.fa" % (self._inFileName) tmpFaFileHandler = open(tmpFaFile, "r") outFaFile = "%s_Blastclust.fa" % (self._outFilePrefix) outFaFileHandler = open(outFaFile, "w") while True: line = tmpFaFileHandler.readline() if line == "": break if line[0] == ">": tokens = line[1:-1].split("_") initHeader = dNewH2InitH[tokens[1]] if self._isHeaderForTEdenovo: classif = initHeader.split("_")[0] consensusName = "_".join(initHeader.split("_")[1:]) clusterId = tokens[0].split("Cluster")[1].split("Mb")[0] newHeader = "%s_Blc%s_%s" % (classif, clusterId, consensusName) else: newHeader = "%s_%s" % (tokens[0], initHeader) outFaFileHandler.write(">%s\n" % (newHeader)) else: outFaFileHandler.write(line) tmpFaFileHandler.close() outFaFileHandler.close() if self._clean: os.remove(tmpFaFile) def blastclustToMap(self, blastclustFastaOut): """ Write a map file from blastclust fasta output. Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output. """ fileDb = open(blastclustFastaOut , "r") mapFilename = "%s.map" % (os.path.splitext(blastclustFastaOut)[0]) fileMap = open(mapFilename, "w") seq = Bioseq() numseq = 0 while 1: seq.read(fileDb) if seq.sequence == None: break numseq = numseq + 1 ID = seq.header.split(' ')[0].split('_')[0] chunk = seq.header.split(' ')[0].split('_')[1] start = seq.header.split(' ')[-1].split(',')[0][1:] end = seq.header.split(' ')[-1].split(',')[1][:-1] line= '%s\t%s\t%s\t%s' % (ID, chunk, start, end) fileMap.write(line + "\n") fileDb.close() fileMap.close() print "saved in %s" % mapFilename def start(self): """ Useful commands before running the program. """ self.checkAttributes() if self._verbose > 0: print "START %s" % (type(self).__name__) def end(self): """ Useful commands before ending the program. """ if self._verbose > 0: print "END %s" % (type(self).__name__) def run(self): """ Run the program. """ self.start() iCSH = ChangeSequenceHeaders(inFile = self._inFileName, format = "fasta", step = 1, outFile = "%s.shortH" % self._inFileName, linkFile = "%s.shortHlink" % self._inFileName) iCSH.run() self.launchBlastclust("%s.shortH" % (self._inFileName)) self.getClusteringResultsInFasta("%s.shortH" % (self._inFileName)) dNewH2InitH = self.getLinkInitNewHeaders() self.retrieveInitHeaders(dNewH2InitH) if self._isBlastToMap: blastclustFileName = "%s_Blastclust.fa" % (self._outFilePrefix) self.blastclustToMap(blastclustFileName) if self._clean: os.remove("%s.shortH" % (self._inFileName)) os.remove("%s.shortHlink" % (self._inFileName)) self.end() if __name__ == "__main__": i = LaunchBlastclust() i.setAttributesFromCmdLine() i.run()