Mercurial > repos > yufei-luo > s_mart
diff SMART/Java/Python/GetDistribution.py @ 46:169d364ddd91
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 30 Sep 2013 03:19:26 -0400 |
parents | 44d5973c188c |
children |
line wrap: on
line diff
--- a/SMART/Java/Python/GetDistribution.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/GetDistribution.py Mon Sep 30 03:19:26 2013 -0400 @@ -45,28 +45,33 @@ class GetDistribution(object): def __init__(self, verbosity): - self.verbosity = verbosity - self.sizes = None - self.twoStrands = False - self.start = 1 - self.names = ["nbElements"] - self.average = False - self.nbValues = {} - self.height = 300 - self.width = 600 - self.colors = None - self.gffFileName = None - self.csvFileName = None - self.yMin = None - self.yMax = None - self.chromosome = None - self.merge = False - self.nbTranscripts = None + self.verbosity = verbosity + self.sizes = None + self.nbBins = None + self.sliceSize = None + self.twoStrands = False + self.start = 1 + self.names = ["nbElements"] + self.average = False + self.nbValues = {} + self.height = 300 + self.width = 600 + self.dots = False + self.colors = None + self.gffFileName = None + self.csvFileName = None + self.yMin = None + self.yMax = None + self.chromosome = None + self.merge = False + self.nbTranscripts = None + self.factors = None + self.thicknessCurve = 1 + self.sizePoliceLegend = 1.5 - def setInputFile(self, fileName, format): - chooser = ParserChooser(self.verbosity) - chooser.findFormat(format) - self.parser = chooser.getParser(fileName) + def setInputFiles(self, fileNames, format): + self.fileNames = fileNames + self.format = format def setReferenceFile(self, fileName): if fileName == None: @@ -77,7 +82,7 @@ self.maxSize = max(self.sizes.values()) def setRegion(self, chromosome, start, end): - if chromosome == None: + if chromosome == None or start == None or end == None: return self.maxSize = options.end self.sizes = {chromosome: end} @@ -90,13 +95,20 @@ self.outputFileName = fileName def setNbBins(self, nbBins): - self.nbBins = nbBins + if nbBins != None: + self.nbBins = int(nbBins) + + def setBinSize(self, binSize): + if binSize != None: + self.sliceSize = int(binSize) def set2Strands(self, twoStrands): self.twoStrands = twoStrands def setNames(self, names): self.names = names + if len(self.names) == 1 and len(self.fileNames) > 1: + self.names = ["file %d" % (i+1) for i in range(len(self.fileNames))] def setAverage(self, average): self.average = average @@ -104,10 +116,16 @@ def setNormalization(self, normalization): self.normalization = normalization + def setNormalizationFactors(self, factors): + self.factors = dict([name, 1.0] for name in self.names) if factors == None else dict(zip(self.names, factors)) + def setImageSize(self, height, width): self.height = height self.width = width + def setDots(self, dots): + self.dots = dots + def setYLimits(self, yMin, yMax): self.yMin = yMin self.yMax = yMax @@ -124,15 +142,29 @@ def mergePlots(self, merge): self.merge = merge + def setThicknessCurve(self, thickness) : + self.thickness = thickness + + def setSizePoliceLegend(self, sizePoliceLegend): + self.sizePoliceLegend = sizePoliceLegend + def _estimateSizes(self): - progress = UnlimitedProgress(10000, "Reading input for chromosome size estimate", self.verbosity) - self.sizes = {} - for self.nbTranscripts, transcript in enumerate(self.parser.getIterator()): - chromosome = transcript.getChromosome() - start = transcript.getStart() - self.sizes[chromosome] = max(start, self.sizes.get(chromosome, 0)) - progress.inc() - progress.done() + self.sizes = {} + self.nbTranscripts = {} + for fileName in self.fileNames: + progress = UnlimitedProgress(10000, "Reading %s for chromosome size estimate" % (fileName), self.verbosity) + parserChooser = ParserChooser(self.verbosity) + parserChooser.findFormat(self.format) + parser = parserChooser.getParser(fileName) + for nbTranscripts, transcript in enumerate(parser.getIterator()): + if transcript.__class__.__name__ == "Mapping": + transcript = transcript.getTranscript() + chromosome = transcript.getChromosome() + start = transcript.getStart() + self.sizes[chromosome] = max(start, self.sizes.get(chromosome, 0)) + progress.inc() + progress.done() + self.nbTranscripts[fileName] = nbTranscripts def _computeSliceSize(self): if self.nbBins == 0: @@ -156,37 +188,50 @@ self.bins[chromosome][name][strand] = dict([(i * self.sliceSize + 1, 0.0) for i in range(self.start / self.sliceSize, self.sizes[chromosome] / self.sliceSize + 1)]) def _populateBins(self): - if self.nbTranscripts == None: - progress = UnlimitedProgress(10000, "Counting data", self.verbosity) - else: - progress = Progress(self.nbTranscripts, "Counting data", self.verbosity) - for transcript in self.parser.getIterator(): - if transcript.__class__.__name__ == "Mapping": - transcript = transcript.getTranscript() - progress.inc() - chromosome = transcript.getChromosome() - start = transcript.getStart() - if self.chromosome and (chromosome != self.chromosome or start < self.start or start > self.end): - continue - strand = transcript.getDirection() if self.twoStrands else 0 - if self.nbBins != 0: - bin = (start / self.sliceSize) * self.sliceSize + 1 + for id, fileName in enumerate(self.fileNames): + if self.nbTranscripts == None: + progress = UnlimitedProgress(10000, "Counting data", self.verbosity) else: - bin = start - for name in self.names: - value = float(transcript.tags.get(name, 1)) - self.bins[chromosome][name][strand][bin] = self.bins[chromosome][name][strand].get(bin, 0) + value - self.nbValues[name] = self.nbValues.get(name, 0) + value - progress.done() + progress = Progress(self.nbTranscripts[fileName], "Counting data", self.verbosity) + parserChooser = ParserChooser(self.verbosity) + parserChooser.findFormat(self.format) + parser = parserChooser.getParser(fileName) + for transcript in parser.getIterator(): + if transcript.__class__.__name__ == "Mapping": + transcript = transcript.getTranscript() + progress.inc() + chromosome = transcript.getChromosome() + start = transcript.getStart() + if self.chromosome and (chromosome != self.chromosome or start < self.start or start > self.end): + continue + strand = transcript.getDirection() if self.twoStrands else 0 + if self.nbBins != 0: + bin = (start / self.sliceSize) * self.sliceSize + 1 + else: + bin = start + if len(self.fileNames) > 1: + nbElements = transcript.getTagValue("nbElements") if "nbElements" in transcript.getTagNames() else 1 + name = self.names[id] + self.bins[chromosome][name][strand][bin] = self.bins[chromosome][name][strand].get(bin, 0) + nbElements + self.nbValues[name] = self.nbValues.get(name, 0) + nbElements + else: + for name in self.names: + value = float(transcript.tags.get(name, 1)) + self.bins[chromosome][name][strand][bin] = self.bins[chromosome][name][strand].get(bin, 0) + value + self.nbValues[name] = self.nbValues.get(name, 0) + value + progress.done() - def _normalize(self): - average = float(sum(self.nbValues)) / len(self.nbValues.keys()) - factors = dict([name, float(average) / self.nbValues[name]] for name in self.nbValues) + def _normalizeFactors(self): for chromosome in self.bins: for name in self.bins[chromosome]: for strand in self.bins[chromosome][name]: for bin in self.bins[chromosome][name][strand]: - self.bins[chromosome][name][strand][bin] *= factors[name] + self.bins[chromosome][name][strand][bin] *= self.factors[name] + + def _normalize(self): + average = float(sum(self.nbValues.values())) / len(self.nbValues.keys()) + self.factors = dict([name, float(average) / self.nbValues[name]] for name in self.nbValues) + self._normalizeFactors() def _computeAverage(self): for chromosome in self.bins: @@ -198,6 +243,10 @@ def _getPlotter(self, chromosome): plot = RPlotter("%s_%s.png" % (os.path.splitext(self.outputFileName)[0], chromosome), self.verbosity) plot.setImageSize(self.width, self.height) + plot.setLineWidth(self.thickness) + plot.setSizePoliceLegend(self.sizePoliceLegend) + if self.dots: + plot.setPoints(True) if self.sizes[chromosome] <= 1000: unit = "nt." ratio = 1.0 @@ -212,10 +261,12 @@ if self.yMax != None: plot.setMaximumY(self.yMax) plot.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit)) - plot.setLegend(True) + if len(self.names) > 1: + plot.setLegend(True, True) for i, name in enumerate(self.bins[chromosome]): for strand in self.bins[chromosome][name]: - fullName = "%s %s" % (name.replace("_", " ")[:6], STRANDTOSTR[strand]) + #fullName = "%s %s" % (name.replace("_", " ")[:6], STRANDTOSTR[strand]) + fullName = name.replace("_", " ")[:6] factor = 1 if strand == 0 else strand correctedLine = dict([(key / ratio, value * factor) for key, value in self.bins[chromosome][name][strand].iteritems()]) plot.addLine(correctedLine, fullName, self.colors[i] if self.colors else None) @@ -299,11 +350,14 @@ def run(self): if self.sizes == None: self._estimateSizes() - self._computeSliceSize() + if self.sliceSize == None: + self._computeSliceSize() self._initBins() self._populateBins() if self.normalization: self._normalize() + if self.factors != None: + self._normalizeFactors() if self.average: self._computeAverage() self._plot() @@ -318,34 +372,40 @@ description = "Get Distribution v1.0.2: Get the distribution of the genomic coordinates on a genome. [Category: Visualization]" parser = OptionParser(description = description) - parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") - parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]") - parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") - parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [format: file in FASTA format]") - parser.add_option("-b", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]") - parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]") - parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]") - parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]") - parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]") - parser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]") - parser.add_option("-Y", "--yMax", dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]") - parser.add_option("-x", "--csv", dest="csv", action="store", default=None, help="write a .csv file [format: output file in CSV format] [default: None]") - parser.add_option("-g", "--gff", dest="gff", action="store", default=None, help="also write GFF3 file [format: output file in GFF format] [default: None]") - parser.add_option("-H", "--height", dest="height", action="store", default=300, type="int", help="height of the graphics [format: int] [default: 300]") - parser.add_option("-W", "--width", dest="width", action="store", default=600, type="int", help="width of the graphics [format: int] [default: 1000]") - parser.add_option("-a", "--average", dest="average", action="store_true", default=False, help="plot average (instead of sum) [default: false] [format: boolean]") - parser.add_option("-n", "--names", dest="names", action="store", default="nbElements", type="string", help="name for the tags (separated by commas and no space) [default: None] [format: string]") - parser.add_option("-l", "--color", dest="colors", action="store", default=None, type="string", help="color of the lines (separated by commas and no space) [format: string]") - parser.add_option("-z", "--normalize", dest="normalize", action="store_true", default=False, help="normalize data (when panels are different) [format: bool] [default: false]") - parser.add_option("-m", "--merge", dest="mergePlots", action="store_true", default=False, help="merge all plots in one figure [format: bool] [default: false]") - parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]") + parser.add_option("-i", "--input", dest="inputFileNames", action="store", type="string", help="input files separated by commas [compulsory] [format: string]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") + parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [format: file in FASTA format]") + parser.add_option("-b", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]") + parser.add_option("-B", "--binSize", dest="binSize", action="store", default=None, type="int", help="bin size [default: None] [format: int]") + parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]") + parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]") + parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]") + parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]") + parser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]") + parser.add_option("-Y", "--yMax", dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]") + parser.add_option("-x", "--csv", dest="csv", action="store", default=None, help="write a .csv file [format: output file in CSV format] [default: None]") + parser.add_option("-g", "--gff", dest="gff", action="store", default=None, help="also write GFF3 file [format: output file in GFF format] [default: None]") + parser.add_option("-H", "--height", dest="height", action="store", default=500, type="int", help="height of the graphics [format: int] [default: 300]") + parser.add_option("-W", "--width", dest="width", action="store", default=800, type="int", help="width of the graphics [format: int] [default: 1000]") + parser.add_option("-t", "--thickness", dest="lineThickness", action="store", default=1, type="int", help="thickness of the lines [format : int] [default : 1]") + parser.add_option("-d", "--policeLegend", dest="sizePoliceLegend", action="store", default=1.5, type="float", help="size of the police of the legend [format : float] [default : 1.5]") + parser.add_option("-D", "--dots", dest="dots", action="store_true", default=False, help="plot dots instead of lines [format : bool] [default : false]") + parser.add_option("-a", "--average", dest="average", action="store_true", default=False, help="plot average (instead of sum) [default: false] [format: boolean]") + parser.add_option("-n", "--names", dest="names", action="store", default="nbElements", type="string", help="name for the tags (separated by commas and no space) [default: None] [format: string]") + parser.add_option("-l", "--color", dest="colors", action="store", default=None, type="string", help="color of the lines (separated by commas and no space) [format: string]") + parser.add_option("-z", "--normalize", dest="normalize", action="store_true", default=False, help="normalize data (when panels are different) [format: bool] [default: false]") + parser.add_option("-Z", "--normalizeFac", dest="normalizeFactors", action="store", default=None, help="normalize data with given factors (when panels are different) [format: string]") + parser.add_option("-m", "--merge", dest="mergePlots", action="store_true", default=False, help="merge all plots in one figure [format: bool] [default: false]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]") (options, args) = parser.parse_args() gt = GetDistribution(options.verbosity) - gt.setInputFile(options.inputFileName, options.format) + gt.setInputFiles(options.inputFileNames.split(","), options.format) gt.setOutputFile(options.outputFileName) gt.setReferenceFile(options.referenceFileName) - gt.setNbBins(int(options.nbBins)) + gt.setNbBins(options.nbBins) + gt.setBinSize(options.binSize) gt.set2Strands(options.bothStrands) gt.setRegion(options.chromosome, options.start, options.end) gt.setNormalization(options.normalize) @@ -355,8 +415,12 @@ gt.writeGff(options.gff) gt.setImageSize(options.height, options.width) gt.setNames(options.names.split(",")) + gt.setThicknessCurve(options.lineThickness) + gt.setSizePoliceLegend(options.sizePoliceLegend) gt.setColors(None if options.colors == None else options.colors.split(",")) + gt.setDots(options.dots) gt.setNormalization(options.normalize) + gt.setNormalizationFactors(None if options.normalizeFactors == None else [float(factor) for factor in options.normalizeFactors.split(",")]) gt.mergePlots(options.mergePlots) gt.run()