Mercurial > repos > yufei-luo > s_mart
view SMART/Java/Python/GetReadSizes.py @ 44:5f796c5c579f
Uploaded
author | m-zytnicki |
---|---|
date | Wed, 18 Sep 2013 08:32:38 -0400 |
parents | 2c0c0a89fad7 |
children | 169d364ddd91 |
line wrap: on
line source
#! /usr/bin/env python # # Copyright INRA-URGI 2009-2010 # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. 
#
import glob
import os
import random
import subprocess

from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.parsing.GffParser import GffParser
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc import Utils
from commons.core.LoggerFactory import LoggerFactory
from commons.core.utils.RepetOptionParser import RepetOptionParser

LOG_DEPTH = "smart"
# Pseudo-region used as the single bucket when no regions file is given.
DEFAULT_REGION = "_all_"


class GetReadSizes(object):
    """Count reads per size for one or more input files and plot them with R.

    Typical use: call the ``set*`` methods, then ``run()``.  ``run()`` parses
    every named input, accumulates (optionally weighted) counts per
    region / sample / size, writes a temporary data file and a temporary R
    script per region, and executes the script to produce a PNG.
    """

    def __init__(self, verbosity=0):
        """Set default plot options and create the logger.

        verbosity -- trace level forwarded to the logger, parsers and
                     progress reporters.
        """
        self.xLab = "Size"
        self.yLab = "# reads"
        self.verbosity = verbosity
        # Random suffix used to name the temporary data and R script files.
        self.number = random.randint(0, 100000)
        self.log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
        self.parsers = {}
        # sizes[region][sample name][size] -> accumulated count
        self.sizes = {}
        self.factors = {}
        self.regions = None
        self.tmpDatName = None
        self.tmpRName = None
        self.width = 800
        self.height = 300
        self.arial = False
        # Attributes below are normally filled in by the setters; they get
        # defaults here so that skipping an optional setter does not end in
        # an AttributeError later on.
        self.names = []
        self.outputFileName = None
        self.minSize = None
        self.maxSize = None
        self.colors = None

    def setNames(self, names):
        """Set the sample names, in the same order as the input files."""
        self.names = names

    def setInputFiles(self, fileNames, format):
        """Create one parser per input file, keyed by the matching name.

        Names must have been set with setNames() beforehand.  Raises an
        Exception when there are fewer names than files.
        """
        if len(self.names) < len(fileNames):
            self._logAndRaise("ERROR: %d input files given, but only %d names" % (len(fileNames), len(self.names)))
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        for cpt, fileName in enumerate(fileNames):
            self.parsers[self.names[cpt]] = chooser.getParser(fileName)

    def setOutputFileName(self, fileName):
        """Set the name of the PNG file to produce."""
        self.outputFileName = fileName

    def setLabs(self, xLab, yLab):
        """Set the x-axis and y-axis labels of the plot."""
        self.xLab = xLab
        self.yLab = yLab

    def setSizes(self, minSize, maxSize):
        """Set the inclusive size bounds; None means 'derive from the data'."""
        self.minSize = minSize
        self.maxSize = maxSize

    def setColors(self, colors):
        """Set the bar colors (list of strings), or None for the Set1 palette."""
        self.colors = colors

    def setFactors(self, factors):
        """Associate one normalization factor with each sample name.

        factors -- iterable of numbers in the same order as the names, or
                   None when no normalization is wanted (every sample then
                   gets the implicit factor 1).
        """
        if factors is None:
            self.factors = {}
        else:
            self.factors = dict(zip(self.names, factors))

    def setRegionsFile(self, fileName):
        """Load the regions to plot from a GFF file; None keeps a single plot."""
        if fileName is not None:
            self._loadRegions(fileName)

    def setImageSize(self, width, height):
        """Override the image width and/or height; None keeps the current value."""
        if width is not None:
            self.width = width
        if height is not None:
            self.height = height

    def setArial(self, arial):
        """Choose whether the plot text uses the Arial font."""
        self.arial = arial

    def _checkOptions(self):
        """Raise an Exception if no input file has been registered."""
        if not self.parsers:
            self._logAndRaise("ERROR: Missing input file names")

    def _logAndRaise(self, errorMsg):
        """Log errorMsg at error level, then raise it as an Exception."""
        self.log.error(errorMsg)
        raise Exception(errorMsg)

    def _loadRegions(self, fileName):
        """Index the regions of a GFF file as regions[chromosome][start][end] -> [names]."""
        self.regions = {}
        parser = GffParser(fileName, self.verbosity)
        for transcript in parser.getIterator():
            chromosome = transcript.getChromosome()
            start = transcript.getStart()
            end = transcript.getEnd()
            name = transcript.getName()
            byStart = self.regions.setdefault(chromosome, {})
            byEnd = byStart.setdefault(start, {})
            byEnd.setdefault(end, []).append(name)

    def _getRegions(self, transcript):
        """Return the names of the loaded regions that overlap transcript.

        Returns [DEFAULT_REGION] when no regions file was loaded, and an
        empty list when the transcript's chromosome has no region.
        """
        if self.regions is None:
            return [DEFAULT_REGION]
        chromosome = transcript.getChromosome()
        start = transcript.getStart()
        end = transcript.getEnd()
        if chromosome not in self.regions:
            return []
        names = []
        byStart = self.regions[chromosome]
        for loadedStart in sorted(byStart):
            # Starts are visited in increasing order: once a region starts
            # after the transcript ends, no later region can overlap.
            if loadedStart > end:
                break
            byEnd = byStart[loadedStart]
            for loadedEnd in sorted(byEnd, reverse=True):
                # Ends are visited in decreasing order: once a region ends
                # before the transcript starts, the remaining ones do too.
                if loadedEnd < start:
                    break
                names.extend(byEnd[loadedEnd])
        return names

    def _parse(self, name):
        """Accumulate the size counts of the input registered under name.

        Each read adds its "nbElements" tag value (1 if the tag is absent),
        multiplied by the sample's normalization factor, to
        sizes[region][name][size] for every region it overlaps.  Reads
        outside the user-given [minSize, maxSize] bounds are skipped.
        """
        progress = UnlimitedProgress(10000, "Reading file '%s'" % (name), self.verbosity)
        factor = self.factors.get(name, 1)
        for transcript in self.parsers[name].getIterator():
            if transcript.__class__.__name__ == "Mapping":
                transcript = transcript.getTranscript()
            for region in self._getRegions(transcript):
                # The sample entry is created even when the read is filtered
                # out, so every overlapped region knows about this sample.
                counts = self.sizes.setdefault(region, {}).setdefault(name, {})
                size = transcript.getSize()
                if self.minSize is not None and size < self.minSize:
                    continue
                if self.maxSize is not None and size > self.maxSize:
                    continue
                if "nbElements" in transcript.getTagNames():
                    nbElements = float(transcript.getTagValue("nbElements"))
                else:
                    nbElements = 1
                counts[size] = counts.get(size, 0) + nbElements * factor
            progress.inc()
        progress.done()

    def _computeSizeRange(self):
        """Fill in minSize / maxSize from the parsed data when left unset.

        Must be called after every input has been parsed, so that the range
        covers all samples and no file is filtered by bounds derived from
        another one.  Bounds stay None when no read was recorded.
        """
        if self.minSize is not None and self.maxSize is not None:
            return
        observed = set()
        for samples in self.sizes.values():
            for counts in samples.values():
                observed.update(counts)
        if not observed:
            return
        if self.minSize is None:
            self.minSize = min(observed)
        if self.maxSize is None:
            self.maxSize = max(observed)

    def _checkQuorum(self, region):
        """Return True if at least one sample has a positive total count in region."""
        totals = [sum(counts.values()) for counts in self.sizes[region].values()]
        return max(totals) > 0

    def _writeData(self, region):
        """Write the counts of region to the temporary tab-separated data file."""
        self.tmpDatName = "tmpFile%d.dat" % (self.number)
        with open(self.tmpDatName, "w") as handle:
            handle.write("Size\tCount\tSample\n")
            for name in self.sizes[region]:
                counts = self.sizes[region][name]
                for size in sorted(counts):
                    # NOTE(review): "%d" truncates fractional counts, which
                    # can occur with normalization factors -- confirm this
                    # is the intended output.
                    handle.write("%d\t%d\t\"%s\"\n" % (size, counts[size], name))

    def _writeScript(self, region):
        """Write the temporary R (ggplot2) script that plots region.

        The image goes to the output file name for the default region, and
        to "<output base>_<region>.png" otherwise.
        """
        self.tmpRName = "tmpFile%d.R" % (self.number)
        if region == DEFAULT_REGION:
            fileName = self.outputFileName
            title = ""
        else:
            fileName = "%s_%s.png" % (os.path.splitext(self.outputFileName)[0], region)
            title = " + labs(title = \"Sizes of %s\")" % (region)
        if self.colors is None:
            colors = "scale_fill_brewer(palette=\"Set1\")"
        else:
            colors = "scale_fill_manual(values = c(%s))" % (", ".join(["\"%s\"" % (color) for color in self.colors]))
        arial = ", text = element_text(family=\"Arial\", size=20)" if self.arial else ""
        sampleLevels = ", ".join(["\"%s\"" % (name) for name in self.names])
        sizeLevels = ", ".join(["%d" % (size) for size in range(self.minSize, self.maxSize+1)])
        # The file must be closed (and thus flushed) before R reads it.
        with open(self.tmpRName, "w") as handle:
            if self.arial:
                handle.write("library(extrafont)\nloadfonts()\n")
            handle.write("library(ggplot2)\n")
            handle.write("data <- read.table(\"%s\", header = T)\n" % (self.tmpDatName))
            handle.write("data$Sample <- factor(data$Sample, levels=c(%s))\n" % (sampleLevels))
            handle.write("data$Size <- factor(data$Size, levels=c(%s))\n" % (sizeLevels))
            handle.write("png(\"%s\", width = %d, height = %d)\n" % (fileName, self.width, self.height))
            handle.write("ggplot(data, aes(x = Size, y = Count, fill = Size)) %s + geom_bar(stat = \"identity\") + facet_grid(. ~ Sample, space=\"free_x\") + xlab(\"%s\") + ylab(\"%s\") + %s + theme(legend.position = \"none\", panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank()%s)\n" % (title, self.xLab, self.yLab, colors, arial))
            handle.write("dev.off()\n")

    def _runR(self):
        """Run the temporary R script in batch mode; raise if R fails.

        The R executable is taken from the SMARTRPATH environment variable
        when it is set, and is plain "R" otherwise.
        """
        rCommand = os.environ.get("SMARTRPATH", "R")
        # The command only contains the configured R path and an internally
        # generated file name, no user-supplied text.
        command = "\"%s\" CMD BATCH %s" % (rCommand, self.tmpRName)
        status = subprocess.call(command, shell=True)
        if status != 0:
            raise Exception("Problem with the execution of script file %s, status is: %s" % (self.tmpRName, status))

    def _plot(self):
        """Produce one plot per region that contains data."""
        progress = Progress(len(self.sizes), "Plotting data", self.verbosity)
        for region in self.sizes:
            if not self._checkQuorum(region):
                self.log.info("Not displaying '%s' for it contains no data." % (region))
            else:
                self._writeData(region)
                self._writeScript(region)
                self._runR()
            progress.inc()
        progress.done()

    def _cleanFiles(self):
        """Remove the temporary data / script files and their by-products."""
        for fileName in (self.tmpDatName, self.tmpRName):
            if fileName is not None and os.path.exists(fileName):
                os.remove(fileName)
                # Also remove files sharing the prefix (e.g. R batch output).
                for otherFileName in glob.glob("%s*" % (fileName)):
                    os.remove(otherFileName)

    def run(self):
        """Parse every input, plot the size distributions, then clean up.

        Temporary files are deliberately left in place when a step raises,
        since cleanup only happens after a successful plot.
        """
        LoggerFactory.setLevel(self.log, self.verbosity)
        self._checkOptions()
        self.log.info("START Get Read Sizes")
        for name in self.names:
            self._parse(name)
        # Only now is the full set of observed sizes known.
        self._computeSizeRange()
        self._plot()
        self._cleanFiles()
        self.log.info("END Get Read Sizes")


if __name__ == "__main__":
    description = "Usage: GetReadSizes.py [options]\n\nGet Read Sizes v1.0.1: Get the sizes of a set of reads. [Category: Personal]\n"
    epilog = ""
    parser = RepetOptionParser(description = description, epilog = epilog)
    parser.add_option("-i", "--input", dest="inputFileNames", action="store", default=None, type="string", help="input files, separated by commas [compulsory] [format: string]")
    parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
    parser.add_option("-n", "--names", dest="names", action="store", default=None, type="string", help="name of the input data, separated by commas [compulsory] [format: string]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [format: output file in PNG format]")
    parser.add_option("-s", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size [format: int]")
    parser.add_option("-S", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size [format: int]")
    parser.add_option("-l", "--xLabel", dest="xLab", action="store", default="Size", type="string", help="x-axis label name [format: string] [default: Size]")
    parser.add_option("-L", "--yLabel", dest="yLab", action="store", default="# reads", type="string", help="y-axis label name [format: string] [default: # reads]")
    parser.add_option("-c", "--colors", dest="colors", action="store", default=None, type="string", help="colors of the bars, separated by commas [format: string]")
    parser.add_option("-a", "--factors", dest="factors", action="store", default=None, type="string", help="normalization factors, separated by commas [format: string]")
    parser.add_option("-r", "--regions", dest="regionsFileName", action="store", default=None, type="string", help="regions to plot [format: transcript file in GFF format]")
    parser.add_option("-z", "--width", dest="width", action="store", default=800, type="int", help="width of the image [format: int] [default: 800]")
    parser.add_option("-Z", "--height", dest="height", action="store", default=300, type="int", help="height of the image [format: int] [default: 300]")
    parser.add_option("-A", "--arial", dest="arial", action="store_true", default=False, help="use Arial font [format: boolean] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    options = parser.parse_args()[0]
    iGetReadSizes = GetReadSizes(options.verbosity)
    iGetReadSizes.setNames(options.names.split(","))
    iGetReadSizes.setInputFiles(options.inputFileNames.split(","), options.format)
    iGetReadSizes.setOutputFileName(options.outputFileName)
    iGetReadSizes.setLabs(options.xLab, options.yLab)
    iGetReadSizes.setSizes(options.minSize, options.maxSize)
    iGetReadSizes.setColors(None if options.colors is None else options.colors.split(","))
    iGetReadSizes.setFactors(None if options.factors is None else [float(factor) for factor in options.factors.split(",")])
    iGetReadSizes.setRegionsFile(options.regionsFileName)
    iGetReadSizes.setImageSize(options.width, options.height)
    iGetReadSizes.setArial(options.arial)
    iGetReadSizes.run()