Mercurial > repos > yufei-luo > s_mart
view commons/launcher/LaunchTallymer.py @ 31:0ab839023fe4
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 14:33:21 -0400 |
parents | 94ab73e8a190 |
children |
line wrap: on
line source
#!/usr/bin/env python """ Launch Tallymer's sub programs, generate map file, and convert output to gff and wig, as well as visual (RPlot) data """ # Copyright INRA (Institut National de la Recherche Agronomique) # http://www.inra.fr # http://urgi.versailles.inra.fr # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. import os import shutil import subprocess import time from commons.core.utils.RepetOptionParser import RepetOptionParser from commons.core.LoggerFactory import LoggerFactory from SMART.Java.Python.convertTranscriptFile import ConvertTranscriptFile from commons.core.seq.BioseqUtils import BioseqUtils from commons.core.seq.BioseqDB import BioseqDB from Tallymer_pipe.PlotBenchMarkGFFFiles import PlotBenchMarkGFFFiles LOG_DEPTH = "repet.tools" class LaunchTallymer(object): """ Launch Tallymer's sub programs, generate map file, and convert output to gff and wig, as well as visual (RPlot) data """ _lValidFormats = ["gff", "gff3", "wig", "bed", "map"] def __init__(self, inputFasta="", indexationFasta=None, merSize=20, minOccs=4, outputFormats="gff", nLargestScaffoldsToPlot=0, clean=False, verbosity=0): self.inputFasta = inputFasta self.indexationFasta = indexationFasta if indexationFasta != None else inputFasta self.merSize = merSize self.minOccs = minOccs self.outputFormats = outputFormats self.nLargestScaffoldsToPlot = nLargestScaffoldsToPlot self.doClean = clean self.verbosity = verbosity self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity) self._workdir = os.path.join(os.getcwd(), "launchTallymer_%s" % time.strftime("%Y%m%d%H%M%S")) self._tmpSearchFileName = None self._tmpMapFileName = None self._tmpStatFileName = None self._tmpPngFileName = None self._plot_data = {} self._plot_data2 = {} def setAttributesFromCmdLine(self): description = "Generates stats from the results of the tallymer search ." parser = RepetOptionParser(description=description) parser.add_option("-i", "--inputFasta", dest="inputFasta", default = "", action="store", type="string", help="input fasta file [compulsory] [format: fasta]") parser.add_option("-u", "--indexationFasta", dest="indexationFasta", default = "", action="store", type="string", help="input indexation fasta file used to generate kmer index (defaults to input fasta)[optional] [format: fasta]") parser.add_option("-s", "--merSize", dest="merSize", default = 20, action="store", type="int", help="input merSize [optional][default:20]") parser.add_option("-m", "--minOccs", dest="minOccs", default = 4, action="store", type="int", help="input minimal kmer occurence count [default:4]") parser.add_option("-f", "--outputFormats", dest="outputFormats", default = "gff", action="store", type="string", help="comma separated list of output file formats (can be %s) [optional] [default:gff]" % ", ".join(self._lValidFormats)) parser.add_option("-n", "--nLargestScaffoldsToPlot",default = 0, type="int", action="store", dest = "nLargestScaffoldsToPlot", help = "generate graph of Kmer repartition along the input sequence for the n biggest scaffolds") parser.add_option("-c", "--clean", dest = "clean", help = "clean temporary files", default = False, action="store_true") parser.add_option("-v", "--verbosity", dest="verbosity", default = 1, action="store", type="int", help="verbosity [optional] ") options, args = parser.parse_args() self._setAttributesFromOptions(options) def _setAttributesFromOptions(self, options): self.inputFasta = options.inputFasta self.indexationFasta = options.indexationFasta if options.indexationFasta != "" else options.inputFasta self.merSize = options.merSize self.minOccs = options.minOccs self.outputFormats = options.outputFormats self.nLargestScaffoldsToPlot = options.nLargestScaffoldsToPlot self.doClean = options.clean self.verbosity = options.verbosity def _checkOptions(self): if self.inputFasta == "": self._logAndRaise("Error: missing input fasta file") if self.merSize < 1: self._logAndRaise("Error: invalid kmer size '%i'; must be a positive integer" % self.merSize) self.outputFormats = self.outputFormats.lower().split(',') sOutFormats = set(self.outputFormats) sValidFormats = set(self._lValidFormats) lInvalidFormats = list(sOutFormats - sValidFormats) self.outputFormats = list(sValidFormats.intersection(sOutFormats)) if lInvalidFormats: self._log.warning("Warning: ignoring invalid formats: <%s>" % " ".join(lInvalidFormats)) if not self.outputFormats: self._logAndRaise("Error: no valid output formats specified") def _logAndRaise(self, errorMsg): self._log.error(errorMsg) raise Exception(errorMsg) def clean(self): try: shutil.rmtree(self._workdir) except Exception as inst: self._log.error(inst) def run(self): LoggerFactory.setLevel(self._log, self.verbosity) self._checkOptions() self._log.debug("Input fasta file: %s; K-mer size: %s; Output formats: %s; Cleaning: %s" % (self.inputFasta, self.merSize, str(self.outputFormats), self.doClean)) try: os.makedirs(self._workdir) except:pass srcPath = os.path.abspath(self.inputFasta) dstPath = os.path.join(self._workdir,os.path.basename(self.inputFasta)) os.symlink(srcPath, dstPath) if (self.indexationFasta != self.inputFasta): srcPath = os.path.abspath(self.indexationFasta) dstPath = os.path.join(self._workdir,os.path.basename(self.indexationFasta)) try: os.symlink(srcPath, dstPath) except OSError as inst: pass os.chdir(self._workdir) if (self.indexationFasta != self.inputFasta): self.indexationFasta = os.path.basename(self.indexationFasta) else: self.indexationFasta = os.path.basename(self.inputFasta) self.inputFasta = os.path.basename(self.inputFasta) self._tmpSearchFileName = "%s.tallymer" % os.path.splitext(os.path.basename(self.inputFasta))[0] self._tmpMapFileName = "%s_tmp.map" % self._tmpSearchFileName self._tmpStatFileName = "%s.stat" % self._tmpSearchFileName self._tmpPngFileName = "%s.png" % self._tmpSearchFileName self._runTallymerSearch() self._convertTallymerToMap() self._writeOutputFiles() if self.nLargestScaffoldsToPlot > 0: self._doPlot() shutil.copy2(self._tmpPngFileName, "../.") shutil.copy2(self._tmpStatFileName, "../.") os.chdir("..") if self.doClean: self.clean() def _runTallymerSearch(self): self._log.info("Starting to run tallymer search of sequence %s " % (self.inputFasta)) self._indexInputFastaFile() self._countAndIndexKmersForGivenK() self._searchKmersListInTallymerIndex() self._log.info("Finished running tallymer scan of sequence %s " % (self.inputFasta)) def _convertTallymerToMap(self): self._log.info("Starting to run tallymer search to map conversion") totalNbOcc, dKmer2Occ, self._plot_data, self._plot_data2 = ConvertUtils.convertTallymerFormatIntoMapFormatAndGenerateData(self.inputFasta, self._tmpSearchFileName, self._tmpMapFileName) self._log.debug("totalNbOcc=%i" % totalNbOcc) self._writeOccurencesNbAndFrequencyOfKmers(totalNbOcc, dKmer2Occ) self._log.info("Finished tallymer search to map conversion") def _doPlot(self): iBioseqDB = BioseqDB() iBioseqDB.load(self.inputFasta) largest_seqsDb = iBioseqDB.bestLength(self.nLargestScaffoldsToPlot) for seq in largest_seqsDb.db: headerCleaned = seq.header.replace(" ", "_") shortFastaName = "%s_%s" % (os.path.basename(self.inputFasta),headerCleaned) data = self._plot_data2[seq.header] gffPlotter = PlotBenchMarkGFFFiles(yLabel="Number of Kmer Occurences", maxLength=1, title="Kmer repartition along the input sequence: %s; MerSize: %i" % (shortFastaName, self.merSize) ) gffPlotter.setOutFileName("%s_%s.png" % (self._tmpSearchFileName, headerCleaned)) gffPlotter._title = "Mers along the input sequence: %s MerSize: %i" % (shortFastaName, self.merSize) gffPlotter._xLabel = "Coordinates along the input sequence (%s)" % shortFastaName gffPlotter._rawData = data gffPlotter.run() def _indexInputFastaFile(self): self._log.debug("index the input fasta file: get an enhanced suffix array.") cmd = "gt suffixerator -dna -pl -tis -suf -lcp -v -db %s -indexname %s.reads" % (self.indexationFasta, self.indexationFasta) process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE) self._log.debug("Running suffixerator with following params %s" % cmd) output = process.communicate() self._log.debug("Suffixerator Output:\n%s" % output[0]) if process.returncode != 0: self._logAndRaise("Error in generating enhanced suffix array in %s" % self.indexationFasta) def _countAndIndexKmersForGivenK(self): self._log.debug("Counting and indexing k-mers for k = %i " % self.merSize) cmd = "gt tallymer mkindex -mersize %i -minocc %i -indexname %s.tyr-reads -counts -pl -esa %s.reads" % (self.merSize, self.minOccs, self.indexationFasta, self.indexationFasta) process = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE) self._log.debug("Running tallymer mkindex with following params %s" % cmd) output = process.communicate() self._log.debug("Tallymer mkindex Output:\n%s" % output[0]) if process.returncode != 0: self._logAndRaise("Error in indexing kmers in %s.reads" % self.indexationFasta) def _searchKmersListInTallymerIndex(self): self._log.debug("Searching list of kmers in tallymer-index ") cmd = "gt tallymer search -output qseqnum qpos counts sequence -tyr %s.tyr-reads -q %s" % (self.indexationFasta, self.inputFasta) process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE) self._log.debug("Running tallymer search with following params %s" % cmd) output = process.communicate() self._log.debug("Tallymer search Output:\n%s" % output[0]) if process.returncode != 0: self._logAndRaise("Error in searching for kmers in %s.tyr-reads" % self.indexationFasta) tmpOutputFile = open(self._tmpSearchFileName,'w') tmpOutputFile.write(output[0]) tmpOutputFile.close() def _writeOccurencesNbAndFrequencyOfKmers(self, totalNbOcc, dKmer2Occ): with open(self._tmpStatFileName, "w") as statFile: statFile.write("kmer\tocc\tfreq\n") for kmer in dKmer2Occ.keys(): statFile.write("%s\t%i\t%.10f\n" % (kmer, dKmer2Occ[kmer], dKmer2Occ[kmer] / float(totalNbOcc))) def _writeOutputFiles(self): for format in self.outputFormats: self._log.info("Generating %s file" % format) outputFileName = "%s.tallymer.%s" % (os.path.splitext(self.inputFasta)[0], format) try: iConvertTranscriptFile = ConvertTranscriptFile(inputFileName=self._tmpMapFileName, name="Tallymer",\ inputFormat="map", outputFileName=outputFileName, outputFormat=format,feature= "Match", featurePart="Match-part", verbosity=0) #self.verbosity iConvertTranscriptFile.run() except Exception as inst: self._log.error("Error: %s - Failed to generate %s format ouput, skipping" % (inst, format)) shutil.copy2(outputFileName, "../.") class ConvertUtils(object): def convertTallymerFormatIntoMapFormatAndGenerateData(fastaFileName, searchFileName, mapFileName): dIndex2NameLengthList = ConvertUtils._createDictOfNameLengthAccordingToSeqOrder(fastaFileName) plotData = {} plotData2 = {} with open(searchFileName, "r") as talFile: with open(mapFileName, "w") as mapFile: totalNbOcc = 0 dKmer2Occ = {} line = talFile.readline() while line: data = line[:-1].split("\t") name = "%s_%s" % (data[3], data[2]) nbOccs = int(data[2]) chrname = dIndex2NameLengthList[int(data[0])][0] if data[1][0] == "+": start = int(data[1][1:]) + 1 end = start + len(data[3]) elif data[1][0] == "-": start_revcomp = int(data[1][1:]) start = dIndex2NameLengthList[int(data[0])][1] - start_revcomp - 1 end = end - len(data[3]) + 1 mapLine = "%s\t%s\t%s\t%s\t%i\n" % (name, chrname, start, end, nbOccs) mapFile.write(mapLine) if plotData2.get(chrname,None) is None: plotData2[chrname] = {} if plotData2[chrname].get(start, None) is None: plotData2[chrname][start]=0 plotData2[chrname][start] += nbOccs totalNbOcc += 1 if dKmer2Occ.has_key(data[3]): dKmer2Occ[data[3]] += 1 else: dKmer2Occ[data[3]] = 1 plotData[start] = nbOccs line = talFile.readline() return totalNbOcc, dKmer2Occ, plotData, plotData2 convertTallymerFormatIntoMapFormatAndGenerateData = staticmethod(convertTallymerFormatIntoMapFormatAndGenerateData) def _createDictOfNameLengthAccordingToSeqOrder(fastaFileName): with open(fastaFileName) as fastaFile: line = fastaFile.readline() i = 0 length = 0 dIndex2Name = {} while line: if line[0] == ">": dIndex2Name[i] = [line[1:-1]] if i > 0: dIndex2Name[i - 1].append(length) length = 0 i += 1 else: length += len(line[:-1]) line = fastaFile.readline() dIndex2Name[i - 1].append(length) return dIndex2Name _createDictOfNameLengthAccordingToSeqOrder = staticmethod(_createDictOfNameLengthAccordingToSeqOrder) if __name__ == "__main__": iLaunchTallymer = LaunchTallymer() iLaunchTallymer.setAttributesFromCmdLine() iLaunchTallymer.run()