Mercurial > repos > yufei-luo > s_mart
view smart_toolShed/SMART/Java/Python/GetReadDistribution.py @ 0:e0f8dcca02ed
Uploaded S-MART tool. A toolbox that manages RNA-Seq and ChIP-Seq data.
author | yufei-luo |
---|---|
date | Thu, 17 Jan 2013 10:52:14 -0500 |
parents | |
children |
line wrap: on
line source
#! /usr/bin/env python # # Copyright INRA-URGI 2009-2010 # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. 
#
import random
import os
import glob
import subprocess
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.parsing.GffParser import GffParser
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc import Utils
from commons.core.LoggerFactory import LoggerFactory
from commons.core.utils.RepetOptionParser import RepetOptionParser

LOG_DEPTH      = "smart"
# Region key used when no regions file is given: every read goes to one plot.
DEFAULT_REGION = "_all_"
# X-axis label suffix for each position divisor returned by _findMultiple().
MULTIPLE_STR   = {1: "", 1000: " (in kbp)", 1000000: " (in Mbp)"}


class GetReadDistribution(object):
    """Count reads per fixed-size bin and plot the counts with R/ggplot2.

    Typical use (see the script section at the bottom of the file): call the
    setters, then run().  Counts are stored in self.distribution as
    distribution[region][sampleName][chromosome][binIndex] -> count.
    """

    def __init__(self, verbosity = 0):
        self.xLab           = ""
        self.yLab           = "# reads"
        self.verbosity      = verbosity
        # Random suffix for the temporary .dat/.R file names.
        self.number         = random.randint(0, 100000)
        self.log            = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
        self.parsers        = {}
        self.distribution   = {}
        self.factors        = {}
        self.regions        = None
        self.tmpDatName     = None
        self.tmpRName       = None
        self.quorum         = 1
        self.width          = 800
        self.height         = 300
        # The attributes below used to exist only after their setter had been
        # called; give them defaults so that a skipped optional setter does
        # not end in an AttributeError.  The bin size default mirrors the
        # command-line default.
        self.names          = []
        self.colors         = None
        self.multiple       = False
        self.binSize        = 10000
        self.outputFileName = None

    def setNames(self, names):
        """Set the sample names; must be called before setInputFiles() and setFactors()."""
        self.names = names

    def setInputFiles(self, fileNames, format):
        """Create one parser per input file, keyed by the sample name at the same index."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        for cpt, fileName in enumerate(fileNames):
            self.parsers[self.names[cpt]] = chooser.getParser(fileName)

    def setOutputFileName(self, fileName):
        self.outputFileName = fileName

    def setLabs(self, xLab, yLab):
        self.xLab = xLab
        self.yLab = yLab

    def setBinSize(self, binSize):
        self.binSize = binSize

    def setColors(self, colors):
        """Set the bar colors (list of strings), or None to use the "Set1" brewer palette."""
        self.colors = colors

    def setFactors(self, factors):
        """Set the per-sample normalization factors (same order as the names).

        None means "no normalization": _parse() then uses a factor of 1.
        (Passing None used to raise a TypeError in zip().)
        """
        self.factors = {} if factors is None else dict(zip(self.names, factors))

    def setMultiple(self, boolean):
        self.multiple = boolean

    def setImageSize(self, width, height):
        """Set the image size in pixels; a None value keeps the current one."""
        if width is not None:
            self.width = width
        if height is not None:
            self.height = height

    def setQuorum(self, quorum):
        self.quorum = quorum

    def setRegionsFile(self, fileName):
        """Load the regions to plot from a GFF file; None keeps the single default region."""
        if fileName is not None:
            self._loadRegions(fileName)

    def _checkOptions(self):
        if not self.parsers:
            # Was self.logAndRaise, which does not exist (AttributeError).
            self._logAndRaise("ERROR: Missing input file names")

    def _logAndRaise(self, errorMsg):
        self.log.error(errorMsg)
        raise Exception(errorMsg)

    def _loadRegions(self, fileName):
        """Index the regions as regions[chromosome][start][end] -> list of names."""
        self.regions = {}
        parser = GffParser(fileName, self.verbosity)
        for transcript in parser.getIterator():
            byStart = self.regions.setdefault(transcript.getChromosome(), {})
            byEnd   = byStart.setdefault(transcript.getStart(), {})
            byEnd.setdefault(transcript.getEnd(), []).append(transcript.getName())

    def _getRegions(self, transcript):
        """Return the names of the loaded regions that overlap the transcript.

        Without a regions file, every transcript belongs to DEFAULT_REGION.
        """
        if self.regions is None:
            return [DEFAULT_REGION]
        chromosome = transcript.getChromosome()
        start      = transcript.getStart()
        end        = transcript.getEnd()
        if chromosome not in self.regions:
            return []
        names = []
        for loadedStart in sorted(self.regions[chromosome].keys()):
            # Starts are visited in increasing order: nothing further can overlap.
            if loadedStart > end:
                return names
            # Ends are visited in decreasing order: stop at the first one that
            # lies before the transcript.
            for loadedEnd in sorted(self.regions[chromosome][loadedStart].keys(), reverse = True):
                if loadedEnd < start:
                    break
                names.extend(self.regions[chromosome][loadedStart][loadedEnd])
        return names

    def _parse(self, name):
        """Read every transcript of sample `name` and add it to self.distribution.

        A transcript adds its weight (the "nbElements" tag if present, else 1,
        times the sample factor) once to each bin covered by its exons, for
        every region it overlaps.
        """
        progress = UnlimitedProgress(10000, "Reading file '%s'" % (name), self.verbosity)
        for transcript in self.parsers[name].getIterator():
            if transcript.__class__.__name__ == "Mapping":
                transcript = transcript.getTranscript()
            regions = self._getRegions(transcript)
            if regions:
                chromosome = transcript.getChromosome()
                nbElements = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1
                nbElements *= self.factors.get(name, 1)
                for region in regions:
                    counts      = self.distribution.setdefault(region, {}).setdefault(name, {}).setdefault(chromosome, {})
                    previousBin = None
                    for exon in transcript.getExons():
                        exonStart = exon.getStart()
                        exonEnd   = exon.getEnd()
                        if exonEnd < exonStart:
                            continue
                        # Walk over the covered bins directly rather than over
                        # every base; "//" keeps the bin index an integer on
                        # Python 3 as well.
                        for binIndex in range(exonStart // self.binSize, exonEnd // self.binSize + 1):
                            # A bin shared by two consecutive exons counts once.
                            if binIndex != previousBin:
                                counts[binIndex] = counts.get(binIndex, 0) + nbElements
                                previousBin = binIndex
            progress.inc()
        progress.done()

    def _checkQuorum(self, region):
        """Tell whether at least one bin of the region reaches the quorum.

        Returns True when no quorum is set, and False (instead of raising a
        ValueError) when the region holds no bin at all.
        """
        if self.quorum is None:
            return True
        for chromosomes in self.distribution[region].values():
            for counts in chromosomes.values():
                if counts and max(counts.values()) >= self.quorum:
                    return True
        return False

    def _writeData(self, region):
        """Write the counts of the region to a temporary tab-separated file for R."""
        self.tmpDatName = "tmpFile%d.dat" % (self.number)
        with open(self.tmpDatName, "w") as handle:
            handle.write("Chr\tPos\tCount\tSample\n")
            for name in self.distribution[region]:
                chromosomes = self.distribution[region][name]
                for chromosome in sorted(chromosomes):
                    counts = chromosomes[chromosome]
                    for binIndex in sorted(counts):
                        # NOTE(review): "%d" truncates the fractional counts
                        # that normalization factors produce -- confirm that
                        # this is intended.
                        handle.write("%s\t%d\t%d\t\"%s\"\n" % (chromosome, binIndex * self.binSize, counts[binIndex], name))

    def _findMultiple(self, region):
        """Return the divisor (1, 1000 or 1000000) to apply to the plotted positions.

        The previous code compared a list of dictionary keys with an integer,
        which is always true on Python 2 (so 1000000 was always returned) and
        a TypeError on Python 3.  It also ignored that the keys are bin
        indexes: the largest plotted position is the largest bin index times
        the bin size.
        """
        if not self.multiple:
            return 1
        bins = [binIndex for chromosomes in self.distribution[region].values() for counts in chromosomes.values() for binIndex in counts]
        if not bins:
            return 1
        maxPosition = max(bins) * self.binSize
        if maxPosition > 2000000:
            return 1000000
        if maxPosition > 2000:
            return 1000
        return 1

    def _writeScript(self, region):
        """Write the temporary R script that plots the data file of the region.

        The default region is plotted to the output file itself; a named
        region to "<output without extension>_<region>.png".
        """
        self.tmpRName = "tmpFile%d.R" % (self.number)
        isDefault = (region == DEFAULT_REGION)
        fileName  = self.outputFileName if isDefault else "%s_%s.png" % (os.path.splitext(self.outputFileName)[0], region)
        if self.colors is None:
            colors = "scale_fill_brewer(palette=\"Set1\") + scale_color_brewer(palette=\"Set1\")"
        else:
            colorList = ", ".join(["\"%s\"" % (color) for color in self.colors])
            colors    = "scale_fill_manual(values = c(%s)) + scale_color_manual(values = c(%s))" % (colorList, colorList)
        title    = "" if isDefault else " of %s" % (region)
        facet    = "Sample ~ Chr" if isDefault else "Sample ~ ."
        multiple = self._findMultiple(region)
        # The handle must be closed (hence flushed) before _runR() starts R.
        with open(self.tmpRName, "w") as handle:
            handle.write("library(ggplot2)\n")
            handle.write("data <- read.table(\"%s\", header = T)\n" % (self.tmpDatName))
            handle.write("data$Sample <- factor(data$Sample, levels=c(%s))\n" % (", ".join(["\"%s\"" % (name) for name in self.names])))
            handle.write("png(\"%s\", width = %d, height = %d)\n" % (fileName, self.width, self.height))
            # NOTE(review): opts() and theme_blank() belong to the old ggplot2
            # API -- confirm that the installed ggplot2 still provides them.
            handle.write("ggplot(data, aes(x = Pos/%d, y = Count, fill = Sample, color = Sample)) + opts(title = \"Distribution%s\") + geom_bar(stat = \"identity\") + facet_grid(%s, space=\"free\") + xlab(\"%s%s\") + ylab(\"%s\") + %s + opts(legend.position = \"none\", panel.grid.major = theme_blank(), panel.grid.minor = theme_blank(), panel.background = theme_blank())\n" % (multiple, title, facet, self.xLab, MULTIPLE_STR[multiple], self.yLab, colors))
            handle.write("dev.off()\n")

    def _runR(self):
        """Run the temporary script with "R CMD BATCH"; raise if R exits with a non-zero status.

        The R executable is taken from the SMARTRPATH environment variable
        when it is set, and is "R" otherwise.
        """
        rCommand = os.environ.get("SMARTRPATH", "R")
        command  = "\"%s\" CMD BATCH %s" % (rCommand, self.tmpRName)
        status   = subprocess.call(command, shell=True)
        if status != 0:
            raise Exception("Problem with the execution of script file %s, status is: %s" % (self.tmpRName, status))

    def _plot(self):
        """Plot every region that passes the quorum test."""
        progress = Progress(len(self.distribution), "Plotting data", self.verbosity)
        for region in self.distribution:
            if not self._checkQuorum(region):
                self.log.info("Not displaying '%s' for it contains insufficient data." % (region))
            else:
                self._writeData(region)
                self._writeScript(region)
                self._runR()
            progress.inc()
        progress.done()

    def _cleanFiles(self):
        """Remove the temporary data and script files, and the files sharing their prefix."""
        for fileName in (self.tmpDatName, self.tmpRName):
            if fileName is not None and os.path.exists(fileName):
                os.remove(fileName)
                # Also catches by-products such as the ".Rout" file written
                # by "R CMD BATCH" next to the script.
                for otherFileName in glob.glob("%s*" % (fileName)):
                    os.remove(otherFileName)

    def run(self):
        """Parse every sample, plot every region and remove the temporary files."""
        LoggerFactory.setLevel(self.log, self.verbosity)
        self._checkOptions()
        self.log.info("START Get Read Distribution")
        for name in self.names:
            self._parse(name)
        self._plot()
        self._cleanFiles()
        self.log.info("END Get Read Distribution")


if __name__ == "__main__":
    description = "Usage: GetReadDistribution.py [options]\n\nGet Read Distribution v1.0.1: Get the distribution of a set of reads. [Category: Personal]\n"
    epilog = ""
    parser = RepetOptionParser(description = description, epilog = epilog)
    parser.add_option("-i", "--input",     dest="inputFileNames",  action="store",      default=None,      type="string", help="input files, separated by commas [compulsory] [format: string]")
    parser.add_option("-f", "--format",    dest="format",          action="store",      default=None,      type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
    parser.add_option("-n", "--names",     dest="names",           action="store",      default=None,      type="string", help="name of the input data, separated by commas [compulsory] [format: string]")
    parser.add_option("-o", "--output",    dest="outputFileName",  action="store",      default=None,      type="string", help="output file [format: output file in PNG format]")
    parser.add_option("-s", "--binSize",   dest="binSize",         action="store",      default=10000,     type="int",    help="bin size [format: int] [default: 10000]")
    parser.add_option("-l", "--xLabel",    dest="xLab",            action="store",      default="",        type="string", help="x-axis label name [format: string]")
    parser.add_option("-L", "--yLabel",    dest="yLab",            action="store",      default="# reads", type="string", help="y-axis label name [format: string] [default: Reads]")
    parser.add_option("-c", "--colors",    dest="colors",          action="store",      default=None,      type="string", help="colors of the bars, separated by commas [format: string]")
    parser.add_option("-a", "--factors",   dest="factors",         action="store",      default=None,      type="string", help="normalization factors, separated by commas [format: string]")
    parser.add_option("-r", "--regions",   dest="regionsFileName", action="store",      default=None,      type="string", help="regions to plot [format: transcript file in GFF format]")
    parser.add_option("-m", "--multiple",  dest="multiple",        action="store_true", default=False,                    help="print position using multiples (k, M) [format: boolean] [default: False]")
    parser.add_option("-q", "--quorum",    dest="quorum",          action="store",      default=1,         type="int",    help="minimum number of intervals to plot a region [format: int] [default: 1]")
    parser.add_option("-z", "--width",     dest="width",           action="store",      default=800,       type="int",    help="width of the image [format: int] [default: 800]")
    parser.add_option("-Z", "--height",    dest="height",          action="store",      default=300,       type="int",    help="height of the image [format: int] [default: 300]")
    parser.add_option("-v", "--verbosity", dest="verbosity",       action="store",      default=1,         type="int",    help="trace level [format: int]")
    options = parser.parse_args()[0]
    iGetReadDistribution = GetReadDistribution(options.verbosity)
    iGetReadDistribution.setNames(options.names.split(","))
    iGetReadDistribution.setInputFiles(options.inputFileNames.split(","), options.format)
    iGetReadDistribution.setOutputFileName(options.outputFileName)
    iGetReadDistribution.setLabs(options.xLab, options.yLab)
    iGetReadDistribution.setBinSize(options.binSize)
    iGetReadDistribution.setColors(None if options.colors is None else options.colors.split(","))
    # A real list (not map()) so that the factors stay reusable on Python 3.
    iGetReadDistribution.setFactors(None if options.factors is None else [float(factor) for factor in options.factors.split(",")])
    iGetReadDistribution.setRegionsFile(options.regionsFileName)
    iGetReadDistribution.setMultiple(options.multiple)
    iGetReadDistribution.setQuorum(options.quorum)
    iGetReadDistribution.setImageSize(options.width, options.height)
    iGetReadDistribution.run()