| 6 | 1 #! /usr/bin/env python | 
|  | 2 # | 
|  | 3 # Copyright INRA-URGI 2009-2010 | 
|  | 4 # | 
|  | 5 # This software is governed by the CeCILL license under French law and | 
|  | 6 # abiding by the rules of distribution of free software. You can use, | 
|  | 7 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 8 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 9 # "http://www.cecill.info". | 
|  | 10 # | 
|  | 11 # As a counterpart to the access to the source code and rights to copy, | 
|  | 12 # modify and redistribute granted by the license, users are provided only | 
|  | 13 # with a limited warranty and the software's author, the holder of the | 
|  | 14 # economic rights, and the successive licensors have only limited | 
|  | 15 # liability. | 
|  | 16 # | 
|  | 17 # In this respect, the user's attention is drawn to the risks associated | 
|  | 18 # with loading, using, modifying and/or developing or reproducing the | 
|  | 19 # software by the user in light of its specific status of free software, | 
|  | 20 # that may mean that it is complicated to manipulate, and that also | 
|  | 21 # therefore means that it is reserved for developers and experienced | 
|  | 22 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 23 # encouraged to load and test the software's suitability as regards their | 
|  | 24 # requirements in conditions enabling the security of their systems and/or | 
|  | 25 # data to be ensured and, more generally, to use and operate it in the | 
|  | 26 # same conditions as regards security. | 
|  | 27 # | 
|  | 28 # The fact that you are presently reading this means that you have had | 
|  | 29 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 30 # | 
|  | 31 import os, sys | 
|  | 32 from optparse import OptionParser | 
|  | 33 from commons.core.parsing.FastaParser import FastaParser | 
|  | 34 from commons.core.parsing.FastqParser import FastqParser | 
|  | 35 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer | 
|  | 36 from commons.core.parsing.GffParser import GffParser | 
|  | 37 from SMART.Java.Python.misc.Progress import Progress | 
|  | 38 from SMART.Java.Python.misc.RPlotter import RPlotter | 
|  | 39 from SMART.Java.Python.misc import Utils | 
|  | 40 | 
|  | 41 from commons.core.LoggerFactory import LoggerFactory | 
|  | 42 from commons.core.utils.RepetOptionParser import RepetOptionParser | 
|  | 43 | 
|  | 44 LOG_DEPTH = "smart" | 
|  | 45 | 
|  | 46 class GetSizes(object): | 
|  | 47 | 
|  | 48     def __init__(self, inFileName = None, inFormat=None, outFileName = None, query=None,xMax=None, xMin=None, csv=False, verbosity = 0): | 
|  | 49         self.inFileName = inFileName | 
|  | 50         self.inFormat= inFormat | 
|  | 51         self.outFileName = outFileName | 
|  | 52         self.query = query | 
|  | 53         self.xMax = xMax | 
|  | 54         self.xMin = xMin | 
|  | 55         self.xLab = "Size" | 
|  | 56         self.yLab = "# reads" | 
|  | 57         self.barplot = False | 
|  | 58         self.csv = csv | 
|  | 59         self._verbosity = verbosity | 
|  | 60         self.parser = None | 
|  | 61         self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) | 
|  | 62 | 
|  | 63     def setAttributesFromCmdLine(self): | 
|  | 64         description = "Usage: getSizes.py [options]\n\nGet Sizes v1.0.2: Get the sizes of a set of genomic coordinates. [Category: Visualization]\n" | 
|  | 65         epilog = "" | 
|  | 66         parser = RepetOptionParser(description = description, epilog = epilog) | 
|  | 67         parser.add_option("-i", "--input",     dest="inputFileName",  action="store",      default=None,      type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]") | 
|  | 68         parser.add_option("-f", "--format",    dest="format",         action="store",      default=None,      type="string", help="format of the input [compulsory] [format: transcript or sequence file format]") | 
|  | 69         parser.add_option("-q", "--query",     dest="query",          action="store",      default=None,      type="string", help="type to mesure [default: size] [format: choice (size, intron size, exon size, 1st exon size)]") | 
|  | 70         parser.add_option("-o", "--output",    dest="outputFileName", action="store",      default=None,      type="string", help="output file [format: output file in PNG format]") | 
|  | 71         parser.add_option("-x", "--xMax",      dest="xMax",           action="store",      default=None,      type="int",    help="maximum value on the x-axis to plot [format: int]") | 
|  | 72         parser.add_option("-X", "--xMin",      dest="xMin",           action="store",      default=None,      type="int",    help="minimum value on the x-axis to plot [format: int]") | 
|  | 73         parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,         type="int",    help="trace level [format: int]") | 
|  | 74         parser.add_option("-c", "--csv",       dest="csv",            action="store",                         type="string", help="write a .csv file [format: bool] [default: false]") | 
|  | 75         parser.add_option("-a", "--xLabel",    dest="xLab",           action="store",      default="Size",    type="string", help="x absis label name [format: string] [default: Size]") | 
|  | 76         parser.add_option("-b", "--yLabel",    dest="yLab",           action="store",      default="# reads", type="string", help="y absis label name [format: string] [default: Reads]") | 
|  | 77         parser.add_option("-B", "--barplot",   dest="barplot",        action="store_true", default=False,                    help="use barplot representation [format: bool] [default: false]") | 
|  | 78         options = parser.parse_args()[0] | 
|  | 79         self._setAttributesFromOptions(options) | 
|  | 80 | 
|  | 81     def _setAttributesFromOptions(self, options): | 
|  | 82         self.setInFileName(options.inputFileName) | 
|  | 83         self.setInFormat(options.format) | 
|  | 84         self.setQuery(options.query) | 
|  | 85         self.setOutFileName(options.outputFileName) | 
|  | 86         self.setXMax(options.xMax) | 
|  | 87         self.setXMin(options.xMin) | 
|  | 88         self.setxLab(options.xLab) | 
|  | 89         self.setyLab(options.yLab) | 
|  | 90         self.setBarplot(options.barplot) | 
|  | 91         self.setVerbosity(options.verbosity) | 
|  | 92 | 
|  | 93     def setInFileName(self, inputFileName): | 
|  | 94         self.inFileName = inputFileName | 
|  | 95 | 
|  | 96     def setInFormat(self, inFormat): | 
|  | 97         self.inFormat = inFormat | 
|  | 98 | 
|  | 99     def setQuery(self, query): | 
|  | 100         self.query = query | 
|  | 101 | 
|  | 102     def setOutFileName(self, outFileName): | 
|  | 103         self.outFileName = outFileName | 
|  | 104 | 
|  | 105     def setXMax(self, xMax): | 
|  | 106         self.xMax = xMax | 
|  | 107 | 
|  | 108     def setXMin(self, xMin): | 
|  | 109         self.xMin = xMin | 
|  | 110 | 
|  | 111     def setxLab(self, xLab): | 
|  | 112         self.xLab = xLab | 
|  | 113 | 
|  | 114     def setyLab(self, yLab): | 
|  | 115         self.yLab = yLab | 
|  | 116 | 
|  | 117     def setBarplot(self, barplot): | 
|  | 118         self.barplot = barplot | 
|  | 119 | 
|  | 120     def setCsv(self, csv): | 
|  | 121         self.csv = csv | 
|  | 122 | 
|  | 123     def setVerbosity(self, verbosity): | 
|  | 124         self._verbosity = verbosity | 
|  | 125 | 
|  | 126     def _checkOptions(self): | 
|  | 127         if self.inFileName == None: | 
|  | 128             self._logAndRaise("ERROR: Missing input file name") | 
|  | 129         if self.inFormat == "fasta": | 
|  | 130             self.parser = FastaParser(self.inFileName, self._verbosity) | 
|  | 131         elif self.inFormat == "fastq": | 
|  | 132             self.parser = FastqParser(self.inFileName, self._verbosity) | 
|  | 133         else: | 
|  | 134             self.parser = TranscriptContainer(self.inFileName, self.inFormat, self._verbosity) | 
|  | 135 | 
|  | 136     def _logAndRaise(self, errorMsg): | 
|  | 137         self._log.error(errorMsg) | 
|  | 138         raise Exception(errorMsg) | 
|  | 139 | 
|  | 140     def run(self): | 
|  | 141         LoggerFactory.setLevel(self._log, self._verbosity) | 
|  | 142         self._checkOptions() | 
|  | 143         self._log.info("START getsizes") | 
|  | 144         self._log.debug("Input file name: %s" % self.inFileName) | 
|  | 145 | 
|  | 146         nbItems = self.parser.getNbItems() | 
|  | 147         self._log.info( "%i items found" % (nbItems)) | 
|  | 148 | 
|  | 149         # treat items | 
|  | 150         progress   = Progress(nbItems, "Analyzing sequences of %s" % (self.inFileName), self._verbosity) | 
|  | 151         sizes      = {} | 
|  | 152         names      = {} | 
|  | 153         minimum    = 1000000000000 | 
|  | 154         maximum    = 0 | 
|  | 155         sum        = 0 | 
|  | 156         number     = 0 | 
|  | 157         nbSubItems = 0 | 
|  | 158         for item in self.parser.getIterator(): | 
|  | 159             items = [] | 
|  | 160             if self.query == "exon": | 
|  | 161                 items = item.getExons() | 
|  | 162             elif self.query == "exon1": | 
|  | 163                 if len(item.getExons()) > 1: | 
|  | 164                     item.sortExons() | 
|  | 165                     items = [item.getExons()[0]] | 
|  | 166             elif self.query == "intron": | 
|  | 167                 items = item.getIntrons() | 
|  | 168             else: | 
|  | 169                 items = [item, ] | 
|  | 170 | 
|  | 171             for thisItem in items: | 
|  | 172                 try: | 
|  | 173                     nbElements = int(float(thisItem.getTagValue("nbElements"))) | 
|  | 174                     if nbElements == None: | 
|  | 175                         nbElements = 1 | 
|  | 176                 except: | 
|  | 177                     nbElements = 1 | 
|  | 178                 size    = thisItem.getSize() | 
|  | 179                 minimum = min(minimum, size) | 
|  | 180                 maximum = max(maximum, size) | 
|  | 181                 name    = thisItem.name.split()[0] | 
|  | 182 | 
|  | 183                 if size not in sizes: | 
|  | 184                     sizes[size] = nbElements | 
|  | 185                     if self.csv: | 
|  | 186                         names[size] = [name, ] | 
|  | 187                 else: | 
|  | 188                     sizes[size] += nbElements | 
|  | 189                     if self.csv: | 
|  | 190                         names[size].append(name) | 
|  | 191                 sum        += size | 
|  | 192                 nbSubItems += nbElements | 
|  | 193             number += 1 | 
|  | 194             progress.inc() | 
|  | 195         progress.done() | 
|  | 196 | 
|  | 197         if self.outFileName != None: | 
|  | 198             plotter = RPlotter(self.outFileName, self._verbosity) | 
|  | 199             plotter.setFill(0) | 
|  | 200             plotter.setMinimumX(self.xMin) | 
|  | 201             plotter.setMaximumX(self.xMax) | 
|  | 202             plotter.setXLabel(self.xLab) | 
|  | 203             plotter.setYLabel(self.yLab) | 
|  | 204             plotter.setBarplot(self.barplot) | 
|  | 205             plotter.addLine(sizes) | 
|  | 206             plotter.plot() | 
|  | 207 | 
|  | 208         if nbSubItems == 0: | 
|  | 209             self._logAndRaise("No item found") | 
|  | 210 | 
|  | 211         if self.csv: | 
|  | 212             csvHandle = open(self.csv, "w") | 
|  | 213             for size in range(min(sizes.keys()), max(sizes.keys())+1): | 
|  | 214                 if size not in sizes: | 
|  | 215                     csvHandle.write("%d,0,\n" % (size)) | 
|  | 216                 else: | 
|  | 217                     csvHandle.write("%d,%d,%s\n" % (size, sizes[size], ";".join(names[size]))) | 
|  | 218             csvHandle.close() | 
|  | 219 | 
|  | 220         self.items = number | 
|  | 221         self.subItems = nbSubItems | 
|  | 222         self.nucleotides = sum | 
|  | 223         self.minAvgMedMax = Utils.getMinAvgMedMax(sizes) | 
|  | 224 | 
|  | 225         print "%d items" % (number) | 
|  | 226         print "%d sub-items" % (nbSubItems) | 
|  | 227         print "%d nucleotides" % (sum) | 
|  | 228         print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(sizes) | 
|  | 229 | 
|  | 230         self._log.info("END getsizes") | 
|  | 231 | 
|  | 232 | 
|  | 233 if __name__ == "__main__": | 
|  | 234     iGetSizes = GetSizes() | 
|  | 235     iGetSizes.setAttributesFromCmdLine() | 
|  | 236     iGetSizes.run() | 
|  | 237 | 
|  | 238 #TODO: add two more options!!!!!! |