Mercurial > repos > yufei-luo > s_mart
diff SMART/Java/Python/GetDistribution.py @ 36:44d5973c188c
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 15:02:29 -0400 |
parents | |
children | 169d364ddd91 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/GetDistribution.py Tue Apr 30 15:02:29 2013 -0400 @@ -0,0 +1,362 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2012 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import os +from optparse import OptionParser +from commons.core.parsing.ParserChooser import ParserChooser +from commons.core.parsing.FastaParser import FastaParser +from SMART.Java.Python.structure.Transcript import Transcript +from commons.core.writer.Gff3Writer import Gff3Writer +from SMART.Java.Python.misc.RPlotter import RPlotter +from SMART.Java.Python.misc.MultipleRPlotter import MultipleRPlotter +from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress +from SMART.Java.Python.misc.Progress import Progress + +TWOSTRANDS = {True: [1, -1], False: [0]} +STRANDTOSTR = {1: "(+)", -1: "(-)", 0: ""} + +class GetDistribution(object): + + def __init__(self, verbosity): + self.verbosity = verbosity + self.sizes = None + self.twoStrands = False + self.start = 1 + self.names = ["nbElements"] + self.average = False + self.nbValues = {} + self.height = 300 + self.width = 600 + self.colors = None + self.gffFileName = None + self.csvFileName = None + self.yMin = None + self.yMax = None + self.chromosome = None + self.merge = False + self.nbTranscripts = None + + def setInputFile(self, fileName, format): + chooser = ParserChooser(self.verbosity) + chooser.findFormat(format) + self.parser = chooser.getParser(fileName) + + def setReferenceFile(self, fileName): + if fileName == None: + return + fastaParser = FastaParser(fileName, self.verbosity) + self.chromosomes = fastaParser.getRegions() + self.sizes = dict([region, fastaParser.getSizeOfRegion(region)] for region in self.chromosomes) + self.maxSize = max(self.sizes.values()) + + def setRegion(self, chromosome, start, end): + if chromosome == None: + return + self.maxSize = options.end + self.sizes = {chromosome: end} + self.chromosomes = [chromosome] + self.chromosome = chromosome + self.start = start + self.end = end + + def setOutputFile(self, fileName): + self.outputFileName = fileName + + def setNbBins(self, nbBins): + self.nbBins = nbBins + + def set2Strands(self, twoStrands): + self.twoStrands = twoStrands + + def setNames(self, names): + self.names = names + + def setAverage(self, average): + self.average = average + + def setNormalization(self, normalization): + self.normalization = normalization + + def setImageSize(self, height, width): + self.height = height + self.width = width + + def setYLimits(self, yMin, yMax): + self.yMin = yMin + self.yMax = yMax + + def setColors(self, colors): + self.colors = colors + + def writeGff(self, fileName): + self.gffFileName = fileName + + def writeCsv(self, fileName): + self.csvFileName = fileName + + def mergePlots(self, merge): + self.merge = merge + + def _estimateSizes(self): + progress = UnlimitedProgress(10000, "Reading input for chromosome size estimate", self.verbosity) + self.sizes = {} + for self.nbTranscripts, transcript in enumerate(self.parser.getIterator()): + chromosome = transcript.getChromosome() + start = transcript.getStart() + self.sizes[chromosome] = max(start, self.sizes.get(chromosome, 0)) + progress.inc() + progress.done() + + def _computeSliceSize(self): + if self.nbBins == 0: + return + tmp1 = int(max(self.sizes.values()) / float(self.nbBins)) + tmp2 = 10 ** (len("%d" % (tmp1))-2) + self.sliceSize = max(1, int((tmp1 / tmp2) * tmp2)) + if self.verbosity > 0: + print "choosing bin size of %d" % (self.sliceSize) + + def _initBins(self): + self.bins = {} + for chromosome in self.sizes: + self.bins[chromosome] = {} + for name in self.names: + self.bins[chromosome][name] = {} + for strand in TWOSTRANDS[self.twoStrands]: + if self.nbBins == 0: + self.bins[chromosome][name][strand] = {} + else: + self.bins[chromosome][name][strand] = dict([(i * self.sliceSize + 1, 0.0) for i in range(self.start / self.sliceSize, self.sizes[chromosome] / self.sliceSize + 1)]) + + def _populateBins(self): + if self.nbTranscripts == None: + progress = UnlimitedProgress(10000, "Counting data", self.verbosity) + else: + progress = Progress(self.nbTranscripts, "Counting data", self.verbosity) + for transcript in self.parser.getIterator(): + if transcript.__class__.__name__ == "Mapping": + transcript = transcript.getTranscript() + progress.inc() + chromosome = transcript.getChromosome() + start = transcript.getStart() + if self.chromosome and (chromosome != self.chromosome or start < self.start or start > self.end): + continue + strand = transcript.getDirection() if self.twoStrands else 0 + if self.nbBins != 0: + bin = (start / self.sliceSize) * self.sliceSize + 1 + else: + bin = start + for name in self.names: + value = float(transcript.tags.get(name, 1)) + self.bins[chromosome][name][strand][bin] = self.bins[chromosome][name][strand].get(bin, 0) + value + self.nbValues[name] = self.nbValues.get(name, 0) + value + progress.done() + + def _normalize(self): + average = float(sum(self.nbValues)) / len(self.nbValues.keys()) + factors = dict([name, float(average) / self.nbValues[name]] for name in self.nbValues) + for chromosome in self.bins: + for name in self.bins[chromosome]: + for strand in self.bins[chromosome][name]: + for bin in self.bins[chromosome][name][strand]: + self.bins[chromosome][name][strand][bin] *= factors[name] + + def _computeAverage(self): + for chromosome in self.bins: + for name in self.bins[chromosome]: + for strand in self.bins[chromosome][name]: + for bin in self.bins[chromosome][name][strand]: + self.bins[chromosome][name][strand][bin] = float(self.bins[chromosome][name][strand][bin]) / self.sliceSize + + def _getPlotter(self, chromosome): + plot = RPlotter("%s_%s.png" % (os.path.splitext(self.outputFileName)[0], chromosome), self.verbosity) + plot.setImageSize(self.width, self.height) + if self.sizes[chromosome] <= 1000: + unit = "nt." + ratio = 1.0 + elif self.sizes[chromosome] <= 1000000: + unit = "kb" + ratio = 1000.0 + else: + unit = "Mb" + ratio = 1000000.0 + if self.yMin != None: + plot.setMinimumY(self.yMin) + if self.yMax != None: + plot.setMaximumY(self.yMax) + plot.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit)) + plot.setLegend(True) + for i, name in enumerate(self.bins[chromosome]): + for strand in self.bins[chromosome][name]: + fullName = "%s %s" % (name.replace("_", " ")[:6], STRANDTOSTR[strand]) + factor = 1 if strand == 0 else strand + correctedLine = dict([(key / ratio, value * factor) for key, value in self.bins[chromosome][name][strand].iteritems()]) + plot.addLine(correctedLine, fullName, self.colors[i] if self.colors else None) + return plot + + def _plot(self): + if self.merge: + multiplePlot = MultipleRPlotter(self.outputFileName, self.verbosity) + multiplePlot.setImageSize(self.width, self.height * len(self.bins.keys())) + progress = Progress(len(self.bins.keys()), "Plotting", options.verbosity) + for chromosome in sorted(self.bins.keys()): + plot = self._getPlotter(chromosome) + if self.merge: + multiplePlot.addPlot(plot) + else: + plot.plot() + progress.inc() + if self.merge: + multiplePlot.plot() + progress.done() + + def _writeCsv(self): + if self.verbosity > 1: + print "Writing CSV file..." + csvHandle = open(self.csvFileName, "w") + csvHandle.write("chromosome;tag;strand") + if self.nbBins != 0: + xValues = range(self.start / self.sliceSize, max(self.sizes.values()) / self.sliceSize + 1) + for value in xValues: + csvHandle.write(";%d-%d" % (value * self.sliceSize + 1, (value+1) * self.sliceSize)) + csvHandle.write("\n") + else: + xValues = [] + for chromosome in self.bins: + for name in self.bins[chromosome]: + for strand in self.bins[chromosome][name]: + for bin in self.bins[chromosome][name][strand]: + xValues.extend(self.bins[chromosome][name][strand].keys()) + xValues = sorted(list(set(xValues))) + for value in xValues: + csvHandle.write(";%d" % (value)) + csvHandle.write("\n") + for chromosome in self.bins: + csvHandle.write("%s" % (chromosome)) + for name in self.bins[chromosome]: + csvHandle.write(";%s" % (name)) + for strand in self.bins[chromosome][name]: + csvHandle.write(";%s" % (STRANDTOSTR[strand])) + for bin in xValues: + csvHandle.write(";%.2f" % (self.bins[chromosome][name][strand].get(bin, 0))) + csvHandle.write("\n") + csvHandle.write(";") + csvHandle.write(";") + csvHandle.close() + if self.verbosity > 1: + print "...done" + + def _writeGff(self): + if self.verbosity > 1: + print "Writing GFF file..." + writer = Gff3Writer(self.gffFileName, self.verbosity) + cpt = 1 + for chromosome in self.bins: + for name in self.bins[chromosome]: + for strand in self.bins[chromosome][name]: + for bin in self.bins[chromosome][name][strand]: + transcript = Transcript() + transcript.setChromosome(chromosome) + transcript.setStart(bin) + if self.nbBins > 0: + transcript.setEnd(bin + self.sliceSize) + else: + transcript.setEnd(self.start) + transcript.setDirection(1 if strand == 0 else strand) + transcript.setTagValue("ID", "region%d" % (cpt)) + cpt += 1 + writer.write() + if self.verbosity > 1: + print "...done" + + def run(self): + if self.sizes == None: + self._estimateSizes() + self._computeSliceSize() + self._initBins() + self._populateBins() + if self.normalization: + self._normalize() + if self.average: + self._computeAverage() + self._plot() + if self.csvFileName != None: + self._writeCsv() + if self.gffFileName != None: + self._writeGff() + + +if __name__ == "__main__": + + description = "Get Distribution v1.0.2: Get the distribution of the genomic coordinates on a genome. [Category: Visualization]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") + parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [format: file in FASTA format]") + parser.add_option("-b", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]") + parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]") + parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]") + parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]") + parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]") + parser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]") + parser.add_option("-Y", "--yMax", dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]") + parser.add_option("-x", "--csv", dest="csv", action="store", default=None, help="write a .csv file [format: output file in CSV format] [default: None]") + parser.add_option("-g", "--gff", dest="gff", action="store", default=None, help="also write GFF3 file [format: output file in GFF format] [default: None]") + parser.add_option("-H", "--height", dest="height", action="store", default=300, type="int", help="height of the graphics [format: int] [default: 300]") + parser.add_option("-W", "--width", dest="width", action="store", default=600, type="int", help="width of the graphics [format: int] [default: 1000]") + parser.add_option("-a", "--average", dest="average", action="store_true", default=False, help="plot average (instead of sum) [default: false] [format: boolean]") + parser.add_option("-n", "--names", dest="names", action="store", default="nbElements", type="string", help="name for the tags (separated by commas and no space) [default: None] [format: string]") + parser.add_option("-l", "--color", dest="colors", action="store", default=None, type="string", help="color of the lines (separated by commas and no space) [format: string]") + parser.add_option("-z", "--normalize", dest="normalize", action="store_true", default=False, help="normalize data (when panels are different) [format: bool] [default: false]") + parser.add_option("-m", "--merge", dest="mergePlots", action="store_true", default=False, help="merge all plots in one figure [format: bool] [default: false]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]") + (options, args) = parser.parse_args() + + gt = GetDistribution(options.verbosity) + gt.setInputFile(options.inputFileName, options.format) + gt.setOutputFile(options.outputFileName) + gt.setReferenceFile(options.referenceFileName) + gt.setNbBins(int(options.nbBins)) + gt.set2Strands(options.bothStrands) + gt.setRegion(options.chromosome, options.start, options.end) + gt.setNormalization(options.normalize) + gt.setAverage(options.average) + gt.setYLimits(options.yMin, options.yMax) + gt.writeCsv(options.csv) + gt.writeGff(options.gff) + gt.setImageSize(options.height, options.width) + gt.setNames(options.names.split(",")) + gt.setColors(None if options.colors == None else options.colors.split(",")) + gt.setNormalization(options.normalize) + gt.mergePlots(options.mergePlots) + gt.run() +