Mercurial > repos > yufei-luo > s_mart
diff SMART/Java/Python/getSizes.py @ 36:44d5973c188c
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 15:02:29 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getSizes.py Tue Apr 30 15:02:29 2013 -0400 @@ -0,0 +1,218 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import os, sys +from optparse import OptionParser +from commons.core.parsing.FastaParser import FastaParser +from commons.core.parsing.FastqParser import FastqParser +from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer +from commons.core.parsing.GffParser import GffParser +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc.RPlotter import RPlotter +from SMART.Java.Python.misc import Utils + +from commons.core.LoggerFactory import LoggerFactory +from commons.core.utils.RepetOptionParser import RepetOptionParser + +LOG_DEPTH = "smart" + +class GetSizes(object): + + def __init__(self, inFileName = None, inFormat=None, outFileName = None, query=None,xMax=None, xMin=None, verbosity = 0): + self.inFileName = inFileName + self.inFormat= inFormat + self.outFileName = outFileName + self.query = query + self.xMax = xMax + self.xMin = xMin + self.xLab = "Size" + self.yLab = "# reads" + self.barplot = False + self._verbosity = verbosity + self.parser = None + self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) + + def setAttributesFromCmdLine(self): + description = "Usage: getSizes.py [options]\n\nGet Sizes v1.0.2: Get the sizes of a set of genomic coordinates. [Category: Visualization]\n" + epilog = "" + parser = RepetOptionParser(description = description, epilog = epilog) + parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the input [compulsory] [format: transcript or sequence file format]") + parser.add_option("-q", "--query", dest="query", action="store", default=None, type="string", help="type to mesure [default: size] [format: choice (size, intron size, exon size, 1st exon size)]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [format: output file in PNG format]") + parser.add_option("-x", "--xMax", dest="xMax", action="store", default=None, type="int", help="maximum value on the x-axis to plot [format: int]") + parser.add_option("-X", "--xMin", dest="xMin", action="store", default=None, type="int", help="minimum value on the x-axis to plot [format: int]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + parser.add_option("-a", "--xLabel", dest="xLab", action="store", default="Size", type="string", help="x absis label name [format: string] [default: Size]") + parser.add_option("-b", "--yLabel", dest="yLab", action="store", default="# reads", type="string", help="y absis label name [format: string] [default: Reads]") + parser.add_option("-B", "--barplot", dest="barplot", action="store_true", default=False, help="use barplot representation [format: bool] [default: false]") + options = parser.parse_args()[0] + self._setAttributesFromOptions(options) + + def _setAttributesFromOptions(self, options): + self.setInFileName(options.inputFileName) + self.setInFormat(options.format) + self.setQuery(options.query) + self.setOutFileName(options.outputFileName) + self.setXMax(options.xMax) + self.setXMin(options.xMin) + self.setxLab(options.xLab) + self.setyLab(options.yLab) + self.setBarplot(options.barplot) + self.setVerbosity(options.verbosity) + + def setInFileName(self, inputFileName): + self.inFileName = inputFileName + + def setInFormat(self, inFormat): + self.inFormat = inFormat + + def setQuery(self, query): + self.query = query + + def setOutFileName(self, outFileName): + self.outFileName = outFileName + + def setXMax(self, xMax): + self.xMax = xMax + + def setXMin(self, xMin): + self.xMin = xMin + + def setxLab(self, xLab): + self.xLab = xLab + + def setyLab(self, yLab): + self.yLab = yLab + + def setBarplot(self, barplot): + self.barplot = barplot + + def setVerbosity(self, verbosity): + self._verbosity = verbosity + + def _checkOptions(self): + if self.inFileName == None: + self._logAndRaise("ERROR: Missing input file name") + if self.inFormat == "fasta": + self.parser = FastaParser(self.inFileName, self._verbosity) + elif self.inFormat == "fastq": + self.parser = FastqParser(self.inFileName, self._verbosity) + else: + self.parser = TranscriptContainer(self.inFileName, self.inFormat, self._verbosity) + + def _logAndRaise(self, errorMsg): + self._log.error(errorMsg) + raise Exception(errorMsg) + + def run(self): + LoggerFactory.setLevel(self._log, self._verbosity) + self._checkOptions() + self._log.info("START getsizes") + self._log.debug("Input file name: %s" % self.inFileName) + + nbItems = self.parser.getNbItems() + self._log.info( "%i items found" % (nbItems)) + + # treat items + progress = Progress(nbItems, "Analyzing sequences of %s" % (self.inFileName), self._verbosity) + sizes = {} + minimum = 1000000000000 + maximum = 0 + sum = 0 + number = 0 + nbSubItems = 0 + for item in self.parser.getIterator(): + items = [] + if self.query == "exon": + items = item.getExons() + elif self.query == "exon1": + if len(item.getExons()) > 1: + item.sortExons() + items = [item.getExons()[0]] + elif self.query == "intron": + items = item.getIntrons() + else: + items = [item, ] + + for thisItem in items: + try: + nbElements = int(float(thisItem.getTagValue("nbElements"))) + if nbElements == None: + nbElements = 1 + except: + nbElements = 1 + size = thisItem.getSize() + minimum = min(minimum, size) + maximum = max(maximum, size) + + if size not in sizes: + sizes[size] = nbElements + else: + sizes[size] += nbElements + sum += size + nbSubItems += nbElements + number += 1 + progress.inc() + progress.done() + + if self.outFileName != None: + plotter = RPlotter(self.outFileName, self._verbosity) + plotter.setFill(0) + plotter.setMinimumX(self.xMin) + plotter.setMaximumX(self.xMax) + plotter.setXLabel(self.xLab) + plotter.setYLabel(self.yLab) + plotter.setBarplot(self.barplot) + plotter.addLine(sizes) + plotter.plot() + + if nbSubItems == 0: + self._logAndRaise("No item found") + + self.items = number + self.subItems = nbSubItems + self.nucleotides = sum + self.minAvgMedMax = Utils.getMinAvgMedMax(sizes) + + print "%d items" % (number) + print "%d sub-items" % (nbSubItems) + print "%d nucleotides" % (sum) + print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(sizes) + + self._log.info("END getsizes") + + +if __name__ == "__main__": + iGetSizes = GetSizes() + iGetSizes.setAttributesFromCmdLine() + iGetSizes.run() + +#TODO: add two more options!!!!!!