Mercurial > repos > yufei-luo > s_mart
diff SMART/Java/Python/getSizes.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | 769e306b7933 |
children |
line wrap: on
line diff
--- a/SMART/Java/Python/getSizes.py Mon Apr 22 11:11:10 2013 -0400 +++ b/SMART/Java/Python/getSizes.py Mon Apr 29 03:20:15 2013 -0400 @@ -44,195 +44,175 @@ LOG_DEPTH = "smart" class GetSizes(object): - - def __init__(self, inFileName = None, inFormat=None, outFileName = None, query=None,xMax=None, xMin=None, csv=False, verbosity = 0): - self.inFileName = inFileName - self.inFormat= inFormat - self.outFileName = outFileName - self.query = query - self.xMax = xMax - self.xMin = xMin - self.xLab = "Size" - self.yLab = "# reads" - self.barplot = False - self.csv = csv - self._verbosity = verbosity - self.parser = None - self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) - - def setAttributesFromCmdLine(self): - description = "Usage: getSizes.py [options]\n\nGet Sizes v1.0.2: Get the sizes of a set of genomic coordinates. [Category: Visualization]\n" - epilog = "" - parser = RepetOptionParser(description = description, epilog = epilog) - parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]") - parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the input [compulsory] [format: transcript or sequence file format]") - parser.add_option("-q", "--query", dest="query", action="store", default=None, type="string", help="type to mesure [default: size] [format: choice (size, intron size, exon size, 1st exon size)]") - parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [format: output file in PNG format]") - parser.add_option("-x", "--xMax", dest="xMax", action="store", default=None, type="int", help="maximum value on the x-axis to plot [format: int]") - parser.add_option("-X", "--xMin", dest="xMin", action="store", default=None, type="int", help="minimum value on the x-axis to plot [format: int]") - parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") - parser.add_option("-c", "--csv", dest="csv", action="store", type="string", help="write a .csv file [format: bool] [default: false]") - parser.add_option("-a", "--xLabel", dest="xLab", action="store", default="Size", type="string", help="x absis label name [format: string] [default: Size]") - parser.add_option("-b", "--yLabel", dest="yLab", action="store", default="# reads", type="string", help="y absis label name [format: string] [default: Reads]") - parser.add_option("-B", "--barplot", dest="barplot", action="store_true", default=False, help="use barplot representation [format: bool] [default: false]") - options = parser.parse_args()[0] - self._setAttributesFromOptions(options) - - def _setAttributesFromOptions(self, options): - self.setInFileName(options.inputFileName) - self.setInFormat(options.format) - self.setQuery(options.query) - self.setOutFileName(options.outputFileName) - self.setXMax(options.xMax) - self.setXMin(options.xMin) - self.setxLab(options.xLab) - self.setyLab(options.yLab) - self.setBarplot(options.barplot) - self.setVerbosity(options.verbosity) - - def setInFileName(self, inputFileName): - self.inFileName = inputFileName - - def setInFormat(self, inFormat): - self.inFormat = inFormat - - def setQuery(self, query): - self.query = query - - def setOutFileName(self, outFileName): - self.outFileName = outFileName - - def setXMax(self, xMax): - self.xMax = xMax - - def setXMin(self, xMin): - self.xMin = xMin - - def setxLab(self, xLab): - self.xLab = xLab - - def setyLab(self, yLab): - self.yLab = yLab - - def setBarplot(self, barplot): - self.barplot = barplot - - def setCsv(self, csv): - self.csv = csv - - def setVerbosity(self, verbosity): - self._verbosity = verbosity - - def _checkOptions(self): - if self.inFileName == None: - self._logAndRaise("ERROR: Missing input file name") - if self.inFormat == "fasta": - self.parser = FastaParser(self.inFileName, self._verbosity) - elif self.inFormat == "fastq": - self.parser = FastqParser(self.inFileName, self._verbosity) - else: - self.parser = TranscriptContainer(self.inFileName, self.inFormat, self._verbosity) - - def _logAndRaise(self, errorMsg): - self._log.error(errorMsg) - raise Exception(errorMsg) - - def run(self): - LoggerFactory.setLevel(self._log, self._verbosity) - self._checkOptions() - self._log.info("START getsizes") - self._log.debug("Input file name: %s" % self.inFileName) + + def __init__(self, inFileName = None, inFormat=None, outFileName = None, query=None,xMax=None, xMin=None, verbosity = 0): + self.inFileName = inFileName + self.inFormat= inFormat + self.outFileName = outFileName + self.query = query + self.xMax = xMax + self.xMin = xMin + self.xLab = "Size" + self.yLab = "# reads" + self.barplot = False + self._verbosity = verbosity + self.parser = None + self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) + + def setAttributesFromCmdLine(self): + description = "Usage: getSizes.py [options]\n\nGet Sizes v1.0.2: Get the sizes of a set of genomic coordinates. [Category: Visualization]\n" + epilog = "" + parser = RepetOptionParser(description = description, epilog = epilog) + parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the input [compulsory] [format: transcript or sequence file format]") + parser.add_option("-q", "--query", dest="query", action="store", default=None, type="string", help="type to mesure [default: size] [format: choice (size, intron size, exon size, 1st exon size)]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [format: output file in PNG format]") + parser.add_option("-x", "--xMax", dest="xMax", action="store", default=None, type="int", help="maximum value on the x-axis to plot [format: int]") + parser.add_option("-X", "--xMin", dest="xMin", action="store", default=None, type="int", help="minimum value on the x-axis to plot [format: int]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + parser.add_option("-a", "--xLabel", dest="xLab", action="store", default="Size", type="string", help="x absis label name [format: string] [default: Size]") + parser.add_option("-b", "--yLabel", dest="yLab", action="store", default="# reads", type="string", help="y absis label name [format: string] [default: Reads]") + parser.add_option("-B", "--barplot", dest="barplot", action="store_true", default=False, help="use barplot representation [format: bool] [default: false]") + options = parser.parse_args()[0] + self._setAttributesFromOptions(options) + + def _setAttributesFromOptions(self, options): + self.setInFileName(options.inputFileName) + self.setInFormat(options.format) + self.setQuery(options.query) + self.setOutFileName(options.outputFileName) + self.setXMax(options.xMax) + self.setXMin(options.xMin) + self.setxLab(options.xLab) + self.setyLab(options.yLab) + self.setBarplot(options.barplot) + self.setVerbosity(options.verbosity) + + def setInFileName(self, inputFileName): + self.inFileName = inputFileName + + def setInFormat(self, inFormat): + self.inFormat = inFormat + + def setQuery(self, query): + self.query = query + + def setOutFileName(self, outFileName): + self.outFileName = outFileName + + def setXMax(self, xMax): + self.xMax = xMax + + def setXMin(self, xMin): + self.xMin = xMin + + def setxLab(self, xLab): + self.xLab = xLab + + def setyLab(self, yLab): + self.yLab = yLab + + def setBarplot(self, barplot): + self.barplot = barplot + + def setVerbosity(self, verbosity): + self._verbosity = verbosity + + def _checkOptions(self): + if self.inFileName == None: + self._logAndRaise("ERROR: Missing input file name") + if self.inFormat == "fasta": + self.parser = FastaParser(self.inFileName, self._verbosity) + elif self.inFormat == "fastq": + self.parser = FastqParser(self.inFileName, self._verbosity) + else: + self.parser = TranscriptContainer(self.inFileName, self.inFormat, self._verbosity) + + def _logAndRaise(self, errorMsg): + self._log.error(errorMsg) + raise Exception(errorMsg) - nbItems = self.parser.getNbItems() - self._log.info( "%i items found" % (nbItems)) - - # treat items - progress = Progress(nbItems, "Analyzing sequences of %s" % (self.inFileName), self._verbosity) - sizes = {} - names = {} - minimum = 1000000000000 - maximum = 0 - sum = 0 - number = 0 - nbSubItems = 0 - for item in self.parser.getIterator(): - items = [] - if self.query == "exon": - items = item.getExons() - elif self.query == "exon1": - if len(item.getExons()) > 1: - item.sortExons() - items = [item.getExons()[0]] - elif self.query == "intron": - items = item.getIntrons() - else: - items = [item, ] - - for thisItem in items: - try: - nbElements = int(float(thisItem.getTagValue("nbElements"))) - if nbElements == None: - nbElements = 1 - except: - nbElements = 1 - size = thisItem.getSize() - minimum = min(minimum, size) - maximum = max(maximum, size) - name = thisItem.name.split()[0] - - if size not in sizes: - sizes[size] = nbElements - if self.csv: - names[size] = [name, ] - else: - sizes[size] += nbElements - if self.csv: - names[size].append(name) - sum += size - nbSubItems += nbElements - number += 1 - progress.inc() - progress.done() + def run(self): + LoggerFactory.setLevel(self._log, self._verbosity) + self._checkOptions() + self._log.info("START getsizes") + self._log.debug("Input file name: %s" % self.inFileName) - if self.outFileName != None: - plotter = RPlotter(self.outFileName, self._verbosity) - plotter.setFill(0) - plotter.setMinimumX(self.xMin) - plotter.setMaximumX(self.xMax) - plotter.setXLabel(self.xLab) - plotter.setYLabel(self.yLab) - plotter.setBarplot(self.barplot) - plotter.addLine(sizes) - plotter.plot() - - if nbSubItems == 0: - self._logAndRaise("No item found") - - if self.csv: - csvHandle = open(self.csv, "w") - for size in range(min(sizes.keys()), max(sizes.keys())+1): - if size not in sizes: - csvHandle.write("%d,0,\n" % (size)) - else: - csvHandle.write("%d,%d,%s\n" % (size, sizes[size], ";".join(names[size]))) - csvHandle.close() - - self.items = number - self.subItems = nbSubItems - self.nucleotides = sum - self.minAvgMedMax = Utils.getMinAvgMedMax(sizes) - - print "%d items" % (number) - print "%d sub-items" % (nbSubItems) - print "%d nucleotides" % (sum) - print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(sizes) + nbItems = self.parser.getNbItems() + self._log.info( "%i items found" % (nbItems)) + + # treat items + progress = Progress(nbItems, "Analyzing sequences of %s" % (self.inFileName), self._verbosity) + sizes = {} + minimum = 1000000000000 + maximum = 0 + sum = 0 + number = 0 + nbSubItems = 0 + for item in self.parser.getIterator(): + items = [] + if self.query == "exon": + items = item.getExons() + elif self.query == "exon1": + if len(item.getExons()) > 1: + item.sortExons() + items = [item.getExons()[0]] + elif self.query == "intron": + items = item.getIntrons() + else: + items = [item, ] + + for thisItem in items: + try: + nbElements = int(float(thisItem.getTagValue("nbElements"))) + if nbElements == None: + nbElements = 1 + except: + nbElements = 1 + size = thisItem.getSize() + minimum = min(minimum, size) + maximum = max(maximum, size) + + if size not in sizes: + sizes[size] = nbElements + else: + sizes[size] += nbElements + sum += size + nbSubItems += nbElements + number += 1 + progress.inc() + progress.done() - self._log.info("END getsizes") + if self.outFileName != None: + plotter = RPlotter(self.outFileName, self._verbosity) + plotter.setFill(0) + plotter.setMinimumX(self.xMin) + plotter.setMaximumX(self.xMax) + plotter.setXLabel(self.xLab) + plotter.setYLabel(self.yLab) + plotter.setBarplot(self.barplot) + plotter.addLine(sizes) + plotter.plot() + + if nbSubItems == 0: + self._logAndRaise("No item found") + + self.items = number + self.subItems = nbSubItems + self.nucleotides = sum + self.minAvgMedMax = Utils.getMinAvgMedMax(sizes) + + print "%d items" % (number) + print "%d sub-items" % (nbSubItems) + print "%d nucleotides" % (sum) + print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(sizes) + + self._log.info("END getsizes") if __name__ == "__main__": - iGetSizes = GetSizes() - iGetSizes.setAttributesFromCmdLine() - iGetSizes.run() - + iGetSizes = GetSizes() + iGetSizes.setAttributesFromCmdLine() + iGetSizes.run() + #TODO: add two more options!!!!!!