comparison SMART/Java/Python/getSizes.py @ 36:44d5973c188c

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 15:02:29 -0400
parents
children
comparison
equal deleted inserted replaced
35:d94018ca4ada 36:44d5973c188c
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 import os, sys
32 from optparse import OptionParser
33 from commons.core.parsing.FastaParser import FastaParser
34 from commons.core.parsing.FastqParser import FastqParser
35 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
36 from commons.core.parsing.GffParser import GffParser
37 from SMART.Java.Python.misc.Progress import Progress
38 from SMART.Java.Python.misc.RPlotter import RPlotter
39 from SMART.Java.Python.misc import Utils
40
41 from commons.core.LoggerFactory import LoggerFactory
42 from commons.core.utils.RepetOptionParser import RepetOptionParser
43
44 LOG_DEPTH = "smart"
45
46 class GetSizes(object):
47
48 def __init__(self, inFileName = None, inFormat=None, outFileName = None, query=None,xMax=None, xMin=None, verbosity = 0):
49 self.inFileName = inFileName
50 self.inFormat= inFormat
51 self.outFileName = outFileName
52 self.query = query
53 self.xMax = xMax
54 self.xMin = xMin
55 self.xLab = "Size"
56 self.yLab = "# reads"
57 self.barplot = False
58 self._verbosity = verbosity
59 self.parser = None
60 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
61
62 def setAttributesFromCmdLine(self):
63 description = "Usage: getSizes.py [options]\n\nGet Sizes v1.0.2: Get the sizes of a set of genomic coordinates. [Category: Visualization]\n"
64 epilog = ""
65 parser = RepetOptionParser(description = description, epilog = epilog)
66 parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]")
67 parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
68 parser.add_option("-q", "--query", dest="query", action="store", default=None, type="string", help="type to mesure [default: size] [format: choice (size, intron size, exon size, 1st exon size)]")
69 parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [format: output file in PNG format]")
70 parser.add_option("-x", "--xMax", dest="xMax", action="store", default=None, type="int", help="maximum value on the x-axis to plot [format: int]")
71 parser.add_option("-X", "--xMin", dest="xMin", action="store", default=None, type="int", help="minimum value on the x-axis to plot [format: int]")
72 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
73 parser.add_option("-a", "--xLabel", dest="xLab", action="store", default="Size", type="string", help="x absis label name [format: string] [default: Size]")
74 parser.add_option("-b", "--yLabel", dest="yLab", action="store", default="# reads", type="string", help="y absis label name [format: string] [default: Reads]")
75 parser.add_option("-B", "--barplot", dest="barplot", action="store_true", default=False, help="use barplot representation [format: bool] [default: false]")
76 options = parser.parse_args()[0]
77 self._setAttributesFromOptions(options)
78
79 def _setAttributesFromOptions(self, options):
80 self.setInFileName(options.inputFileName)
81 self.setInFormat(options.format)
82 self.setQuery(options.query)
83 self.setOutFileName(options.outputFileName)
84 self.setXMax(options.xMax)
85 self.setXMin(options.xMin)
86 self.setxLab(options.xLab)
87 self.setyLab(options.yLab)
88 self.setBarplot(options.barplot)
89 self.setVerbosity(options.verbosity)
90
91 def setInFileName(self, inputFileName):
92 self.inFileName = inputFileName
93
94 def setInFormat(self, inFormat):
95 self.inFormat = inFormat
96
97 def setQuery(self, query):
98 self.query = query
99
100 def setOutFileName(self, outFileName):
101 self.outFileName = outFileName
102
103 def setXMax(self, xMax):
104 self.xMax = xMax
105
106 def setXMin(self, xMin):
107 self.xMin = xMin
108
109 def setxLab(self, xLab):
110 self.xLab = xLab
111
112 def setyLab(self, yLab):
113 self.yLab = yLab
114
115 def setBarplot(self, barplot):
116 self.barplot = barplot
117
118 def setVerbosity(self, verbosity):
119 self._verbosity = verbosity
120
121 def _checkOptions(self):
122 if self.inFileName == None:
123 self._logAndRaise("ERROR: Missing input file name")
124 if self.inFormat == "fasta":
125 self.parser = FastaParser(self.inFileName, self._verbosity)
126 elif self.inFormat == "fastq":
127 self.parser = FastqParser(self.inFileName, self._verbosity)
128 else:
129 self.parser = TranscriptContainer(self.inFileName, self.inFormat, self._verbosity)
130
131 def _logAndRaise(self, errorMsg):
132 self._log.error(errorMsg)
133 raise Exception(errorMsg)
134
135 def run(self):
136 LoggerFactory.setLevel(self._log, self._verbosity)
137 self._checkOptions()
138 self._log.info("START getsizes")
139 self._log.debug("Input file name: %s" % self.inFileName)
140
141 nbItems = self.parser.getNbItems()
142 self._log.info( "%i items found" % (nbItems))
143
144 # treat items
145 progress = Progress(nbItems, "Analyzing sequences of %s" % (self.inFileName), self._verbosity)
146 sizes = {}
147 minimum = 1000000000000
148 maximum = 0
149 sum = 0
150 number = 0
151 nbSubItems = 0
152 for item in self.parser.getIterator():
153 items = []
154 if self.query == "exon":
155 items = item.getExons()
156 elif self.query == "exon1":
157 if len(item.getExons()) > 1:
158 item.sortExons()
159 items = [item.getExons()[0]]
160 elif self.query == "intron":
161 items = item.getIntrons()
162 else:
163 items = [item, ]
164
165 for thisItem in items:
166 try:
167 nbElements = int(float(thisItem.getTagValue("nbElements")))
168 if nbElements == None:
169 nbElements = 1
170 except:
171 nbElements = 1
172 size = thisItem.getSize()
173 minimum = min(minimum, size)
174 maximum = max(maximum, size)
175
176 if size not in sizes:
177 sizes[size] = nbElements
178 else:
179 sizes[size] += nbElements
180 sum += size
181 nbSubItems += nbElements
182 number += 1
183 progress.inc()
184 progress.done()
185
186 if self.outFileName != None:
187 plotter = RPlotter(self.outFileName, self._verbosity)
188 plotter.setFill(0)
189 plotter.setMinimumX(self.xMin)
190 plotter.setMaximumX(self.xMax)
191 plotter.setXLabel(self.xLab)
192 plotter.setYLabel(self.yLab)
193 plotter.setBarplot(self.barplot)
194 plotter.addLine(sizes)
195 plotter.plot()
196
197 if nbSubItems == 0:
198 self._logAndRaise("No item found")
199
200 self.items = number
201 self.subItems = nbSubItems
202 self.nucleotides = sum
203 self.minAvgMedMax = Utils.getMinAvgMedMax(sizes)
204
205 print "%d items" % (number)
206 print "%d sub-items" % (nbSubItems)
207 print "%d nucleotides" % (sum)
208 print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(sizes)
209
210 self._log.info("END getsizes")
211
212
213 if __name__ == "__main__":
214 iGetSizes = GetSizes()
215 iGetSizes.setAttributesFromCmdLine()
216 iGetSizes.run()
217
218 #TODO: add two more options!!!!!!