Mercurial > repos > yufei-luo > s_mart
diff commons/launcher/LaunchTallymer.py @ 31:0ab839023fe4
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 14:33:21 -0400 |
parents | 94ab73e8a190 |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/launcher/LaunchTallymer.py Tue Apr 30 14:33:21 2013 -0400 @@ -0,0 +1,328 @@ +#!/usr/bin/env python + +""" +Launch Tallymer's sub programs, generate map file, and convert output to gff and wig, as well as visual (RPlot) data +""" + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + +import os +import shutil +import subprocess +import time +from commons.core.utils.RepetOptionParser import RepetOptionParser +from commons.core.LoggerFactory import LoggerFactory +from SMART.Java.Python.convertTranscriptFile import ConvertTranscriptFile +from commons.core.seq.BioseqUtils import BioseqUtils +from commons.core.seq.BioseqDB import BioseqDB +from Tallymer_pipe.PlotBenchMarkGFFFiles import PlotBenchMarkGFFFiles + +LOG_DEPTH = "repet.tools" + + +class LaunchTallymer(object): + """ + Launch Tallymer's sub programs, generate map file, and convert output to + gff and wig, as well as visual (RPlot) data + """ + + _lValidFormats = ["gff", "gff3", "wig", "bed", "map"] + + def __init__(self, inputFasta="", indexationFasta=None, merSize=20, minOccs=4, outputFormats="gff", nLargestScaffoldsToPlot=0, clean=False, verbosity=0): + self.inputFasta = inputFasta + self.indexationFasta = indexationFasta if indexationFasta != None else inputFasta + self.merSize = merSize + self.minOccs = minOccs + self.outputFormats = outputFormats + self.nLargestScaffoldsToPlot = nLargestScaffoldsToPlot + self.doClean = clean + self.verbosity = verbosity + + self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity) + self._workdir = os.path.join(os.getcwd(), "launchTallymer_%s" % time.strftime("%Y%m%d%H%M%S")) + self._tmpSearchFileName = None + self._tmpMapFileName = None + self._tmpStatFileName = None + self._tmpPngFileName = None + self._plot_data = {} + self._plot_data2 = {} + + def setAttributesFromCmdLine(self): + description = "Generates stats from the results of the tallymer search ." + parser = RepetOptionParser(description=description) + parser.add_option("-i", "--inputFasta", dest="inputFasta", default = "", action="store", type="string", help="input fasta file [compulsory] [format: fasta]") + parser.add_option("-u", "--indexationFasta", dest="indexationFasta", default = "", action="store", type="string", help="input indexation fasta file used to generate kmer index (defaults to input fasta)[optional] [format: fasta]") + parser.add_option("-s", "--merSize", dest="merSize", default = 20, action="store", type="int", help="input merSize [optional][default:20]") + parser.add_option("-m", "--minOccs", dest="minOccs", default = 4, action="store", type="int", help="input minimal kmer occurence count [default:4]") + parser.add_option("-f", "--outputFormats", dest="outputFormats", default = "gff", action="store", type="string", help="comma separated list of output file formats (can be %s) [optional] [default:gff]" % ", ".join(self._lValidFormats)) + parser.add_option("-n", "--nLargestScaffoldsToPlot",default = 0, type="int", action="store", dest = "nLargestScaffoldsToPlot", help = "generate graph of Kmer repartition along the input sequence for the n biggest scaffolds") + parser.add_option("-c", "--clean", dest = "clean", help = "clean temporary files", default = False, action="store_true") + parser.add_option("-v", "--verbosity", dest="verbosity", default = 1, action="store", type="int", help="verbosity [optional] ") + options, args = parser.parse_args() + self._setAttributesFromOptions(options) + + def _setAttributesFromOptions(self, options): + self.inputFasta = options.inputFasta + self.indexationFasta = options.indexationFasta if options.indexationFasta != "" else options.inputFasta + self.merSize = options.merSize + self.minOccs = options.minOccs + self.outputFormats = options.outputFormats + self.nLargestScaffoldsToPlot = options.nLargestScaffoldsToPlot + self.doClean = options.clean + self.verbosity = options.verbosity + + def _checkOptions(self): + if self.inputFasta == "": + self._logAndRaise("Error: missing input fasta file") + if self.merSize < 1: + self._logAndRaise("Error: invalid kmer size '%i'; must be a positive integer" % self.merSize) + + self.outputFormats = self.outputFormats.lower().split(',') + sOutFormats = set(self.outputFormats) + sValidFormats = set(self._lValidFormats) + lInvalidFormats = list(sOutFormats - sValidFormats) + self.outputFormats = list(sValidFormats.intersection(sOutFormats)) + if lInvalidFormats: + self._log.warning("Warning: ignoring invalid formats: <%s>" % " ".join(lInvalidFormats)) + if not self.outputFormats: + self._logAndRaise("Error: no valid output formats specified") + + def _logAndRaise(self, errorMsg): + self._log.error(errorMsg) + raise Exception(errorMsg) + + def clean(self): + try: + shutil.rmtree(self._workdir) + except Exception as inst: + self._log.error(inst) + + def run(self): + LoggerFactory.setLevel(self._log, self.verbosity) + self._checkOptions() + self._log.debug("Input fasta file: %s; K-mer size: %s; Output formats: %s; Cleaning: %s" % (self.inputFasta, self.merSize, str(self.outputFormats), self.doClean)) + try: + os.makedirs(self._workdir) + except:pass + + srcPath = os.path.abspath(self.inputFasta) + dstPath = os.path.join(self._workdir,os.path.basename(self.inputFasta)) + os.symlink(srcPath, dstPath) + + if (self.indexationFasta != self.inputFasta): + srcPath = os.path.abspath(self.indexationFasta) + dstPath = os.path.join(self._workdir,os.path.basename(self.indexationFasta)) + try: + os.symlink(srcPath, dstPath) + except OSError as inst: + pass + + os.chdir(self._workdir) + + if (self.indexationFasta != self.inputFasta): + self.indexationFasta = os.path.basename(self.indexationFasta) + else: + self.indexationFasta = os.path.basename(self.inputFasta) + self.inputFasta = os.path.basename(self.inputFasta) + + self._tmpSearchFileName = "%s.tallymer" % os.path.splitext(os.path.basename(self.inputFasta))[0] + self._tmpMapFileName = "%s_tmp.map" % self._tmpSearchFileName + self._tmpStatFileName = "%s.stat" % self._tmpSearchFileName + self._tmpPngFileName = "%s.png" % self._tmpSearchFileName + + + + + + self._runTallymerSearch() + self._convertTallymerToMap() + self._writeOutputFiles() + + if self.nLargestScaffoldsToPlot > 0: + self._doPlot() + shutil.copy2(self._tmpPngFileName, "../.") + + shutil.copy2(self._tmpStatFileName, "../.") + os.chdir("..") + + if self.doClean: + self.clean() + + def _runTallymerSearch(self): + self._log.info("Starting to run tallymer search of sequence %s " % (self.inputFasta)) + self._indexInputFastaFile() + self._countAndIndexKmersForGivenK() + self._searchKmersListInTallymerIndex() + self._log.info("Finished running tallymer scan of sequence %s " % (self.inputFasta)) + + def _convertTallymerToMap(self): + self._log.info("Starting to run tallymer search to map conversion") + totalNbOcc, dKmer2Occ, self._plot_data, self._plot_data2 = ConvertUtils.convertTallymerFormatIntoMapFormatAndGenerateData(self.inputFasta, self._tmpSearchFileName, self._tmpMapFileName) + self._log.debug("totalNbOcc=%i" % totalNbOcc) + self._writeOccurencesNbAndFrequencyOfKmers(totalNbOcc, dKmer2Occ) + self._log.info("Finished tallymer search to map conversion") + + def _doPlot(self): + iBioseqDB = BioseqDB() + iBioseqDB.load(self.inputFasta) + largest_seqsDb = iBioseqDB.bestLength(self.nLargestScaffoldsToPlot) + + for seq in largest_seqsDb.db: + headerCleaned = seq.header.replace(" ", "_") + shortFastaName = "%s_%s" % (os.path.basename(self.inputFasta),headerCleaned) + data = self._plot_data2[seq.header] + gffPlotter = PlotBenchMarkGFFFiles(yLabel="Number of Kmer Occurences", maxLength=1, title="Kmer repartition along the input sequence: %s; MerSize: %i" % (shortFastaName, self.merSize) ) + gffPlotter.setOutFileName("%s_%s.png" % (self._tmpSearchFileName, headerCleaned)) + gffPlotter._title = "Mers along the input sequence: %s MerSize: %i" % (shortFastaName, self.merSize) + gffPlotter._xLabel = "Coordinates along the input sequence (%s)" % shortFastaName + gffPlotter._rawData = data + gffPlotter.run() + + def _indexInputFastaFile(self): + self._log.debug("index the input fasta file: get an enhanced suffix array.") + cmd = "gt suffixerator -dna -pl -tis -suf -lcp -v -db %s -indexname %s.reads" % (self.indexationFasta, self.indexationFasta) + process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE) + self._log.debug("Running suffixerator with following params %s" % cmd) + output = process.communicate() + self._log.debug("Suffixerator Output:\n%s" % output[0]) + if process.returncode != 0: + self._logAndRaise("Error in generating enhanced suffix array in %s" % self.indexationFasta) + + def _countAndIndexKmersForGivenK(self): + self._log.debug("Counting and indexing k-mers for k = %i " % self.merSize) + cmd = "gt tallymer mkindex -mersize %i -minocc %i -indexname %s.tyr-reads -counts -pl -esa %s.reads" % (self.merSize, self.minOccs, self.indexationFasta, self.indexationFasta) + process = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + self._log.debug("Running tallymer mkindex with following params %s" % cmd) + output = process.communicate() + self._log.debug("Tallymer mkindex Output:\n%s" % output[0]) + if process.returncode != 0: + self._logAndRaise("Error in indexing kmers in %s.reads" % self.indexationFasta) + + def _searchKmersListInTallymerIndex(self): + self._log.debug("Searching list of kmers in tallymer-index ") + cmd = "gt tallymer search -output qseqnum qpos counts sequence -tyr %s.tyr-reads -q %s" % (self.indexationFasta, self.inputFasta) + process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE) + self._log.debug("Running tallymer search with following params %s" % cmd) + output = process.communicate() + self._log.debug("Tallymer search Output:\n%s" % output[0]) + if process.returncode != 0: + self._logAndRaise("Error in searching for kmers in %s.tyr-reads" % self.indexationFasta) + tmpOutputFile = open(self._tmpSearchFileName,'w') + tmpOutputFile.write(output[0]) + tmpOutputFile.close() + + def _writeOccurencesNbAndFrequencyOfKmers(self, totalNbOcc, dKmer2Occ): + with open(self._tmpStatFileName, "w") as statFile: + statFile.write("kmer\tocc\tfreq\n") + for kmer in dKmer2Occ.keys(): + statFile.write("%s\t%i\t%.10f\n" % (kmer, dKmer2Occ[kmer], dKmer2Occ[kmer] / float(totalNbOcc))) + + def _writeOutputFiles(self): + for format in self.outputFormats: + self._log.info("Generating %s file" % format) + outputFileName = "%s.tallymer.%s" % (os.path.splitext(self.inputFasta)[0], format) + try: + iConvertTranscriptFile = ConvertTranscriptFile(inputFileName=self._tmpMapFileName, name="Tallymer",\ + inputFormat="map", outputFileName=outputFileName, outputFormat=format,feature= "Match", featurePart="Match-part", verbosity=0) #self.verbosity + iConvertTranscriptFile.run() + except Exception as inst: + self._log.error("Error: %s - Failed to generate %s format ouput, skipping" % (inst, format)) + shutil.copy2(outputFileName, "../.") + + +class ConvertUtils(object): + + def convertTallymerFormatIntoMapFormatAndGenerateData(fastaFileName, searchFileName, mapFileName): + dIndex2NameLengthList = ConvertUtils._createDictOfNameLengthAccordingToSeqOrder(fastaFileName) + plotData = {} + plotData2 = {} + with open(searchFileName, "r") as talFile: + with open(mapFileName, "w") as mapFile: + totalNbOcc = 0 + dKmer2Occ = {} + line = talFile.readline() + while line: + data = line[:-1].split("\t") + name = "%s_%s" % (data[3], data[2]) + nbOccs = int(data[2]) + chrname = dIndex2NameLengthList[int(data[0])][0] + if data[1][0] == "+": + start = int(data[1][1:]) + 1 + end = start + len(data[3]) + elif data[1][0] == "-": + start_revcomp = int(data[1][1:]) + start = dIndex2NameLengthList[int(data[0])][1] - start_revcomp - 1 + end = end - len(data[3]) + 1 + mapLine = "%s\t%s\t%s\t%s\t%i\n" % (name, chrname, start, end, nbOccs) + mapFile.write(mapLine) + + if plotData2.get(chrname,None) is None: + plotData2[chrname] = {} + if plotData2[chrname].get(start, None) is None: + plotData2[chrname][start]=0 + plotData2[chrname][start] += nbOccs + + totalNbOcc += 1 + if dKmer2Occ.has_key(data[3]): + dKmer2Occ[data[3]] += 1 + else: + dKmer2Occ[data[3]] = 1 + plotData[start] = nbOccs + line = talFile.readline() + return totalNbOcc, dKmer2Occ, plotData, plotData2 + + convertTallymerFormatIntoMapFormatAndGenerateData = staticmethod(convertTallymerFormatIntoMapFormatAndGenerateData) + + def _createDictOfNameLengthAccordingToSeqOrder(fastaFileName): + with open(fastaFileName) as fastaFile: + line = fastaFile.readline() + i = 0 + length = 0 + dIndex2Name = {} + while line: + if line[0] == ">": + dIndex2Name[i] = [line[1:-1]] + if i > 0: + dIndex2Name[i - 1].append(length) + length = 0 + i += 1 + else: + length += len(line[:-1]) + line = fastaFile.readline() + dIndex2Name[i - 1].append(length) + return dIndex2Name + + _createDictOfNameLengthAccordingToSeqOrder = staticmethod(_createDictOfNameLengthAccordingToSeqOrder) + +if __name__ == "__main__": + iLaunchTallymer = LaunchTallymer() + iLaunchTallymer.setAttributesFromCmdLine() + iLaunchTallymer.run() \ No newline at end of file