comparison smart_toolShed/SMART/Java/Python/getSizes.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 import os, sys
32 from optparse import OptionParser
33 from commons.core.parsing.FastaParser import FastaParser
34 from commons.core.parsing.FastqParser import FastqParser
35 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
36 from commons.core.parsing.GffParser import GffParser
37 from SMART.Java.Python.misc.Progress import Progress
38 from SMART.Java.Python.misc.RPlotter import RPlotter
39 from SMART.Java.Python.misc import Utils
40
41 from commons.core.LoggerFactory import LoggerFactory
42 from commons.core.utils.RepetOptionParser import RepetOptionParser
43
44 LOG_DEPTH = "smart"
45
46 class GetSizes(object):
47
48 def __init__(self, inFileName = None, inFormat=None, outFileName = None, query=None,xMax=None, xMin=None, csv=False, verbosity = 0):
49 self.inFileName = inFileName
50 self.inFormat= inFormat
51 self.outFileName = outFileName
52 self.query = query
53 self.xMax = xMax
54 self.xMin = xMin
55 self.xLab = "Size"
56 self.yLab = "# reads"
57 self.barplot = False
58 self.csv = csv
59 self._verbosity = verbosity
60 self.parser = None
61 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
62
63 def setAttributesFromCmdLine(self):
64 description = "Usage: getSizes.py [options]\n\nGet Sizes v1.0.2: Get the sizes of a set of genomic coordinates. [Category: Visualization]\n"
65 epilog = ""
66 parser = RepetOptionParser(description = description, epilog = epilog)
67 parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]")
68 parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
69 parser.add_option("-q", "--query", dest="query", action="store", default=None, type="string", help="type to mesure [default: size] [format: choice (size, intron size, exon size, 1st exon size)]")
70 parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [format: output file in PNG format]")
71 parser.add_option("-x", "--xMax", dest="xMax", action="store", default=None, type="int", help="maximum value on the x-axis to plot [format: int]")
72 parser.add_option("-X", "--xMin", dest="xMin", action="store", default=None, type="int", help="minimum value on the x-axis to plot [format: int]")
73 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
74 parser.add_option("-c", "--csv", dest="csv", action="store", type="string", help="write a .csv file [format: bool] [default: false]")
75 parser.add_option("-a", "--xLabel", dest="xLab", action="store", default="Size", type="string", help="x absis label name [format: string] [default: Size]")
76 parser.add_option("-b", "--yLabel", dest="yLab", action="store", default="# reads", type="string", help="y absis label name [format: string] [default: Reads]")
77 parser.add_option("-B", "--barplot", dest="barplot", action="store_true", default=False, help="use barplot representation [format: bool] [default: false]")
78 options = parser.parse_args()[0]
79 self._setAttributesFromOptions(options)
80
81 def _setAttributesFromOptions(self, options):
82 self.setInFileName(options.inputFileName)
83 self.setInFormat(options.format)
84 self.setQuery(options.query)
85 self.setOutFileName(options.outputFileName)
86 self.setXMax(options.xMax)
87 self.setXMin(options.xMin)
88 self.setxLab(options.xLab)
89 self.setyLab(options.yLab)
90 self.setBarplot(options.barplot)
91 self.setVerbosity(options.verbosity)
92
93 def setInFileName(self, inputFileName):
94 self.inFileName = inputFileName
95
96 def setInFormat(self, inFormat):
97 self.inFormat = inFormat
98
99 def setQuery(self, query):
100 self.query = query
101
102 def setOutFileName(self, outFileName):
103 self.outFileName = outFileName
104
105 def setXMax(self, xMax):
106 self.xMax = xMax
107
108 def setXMin(self, xMin):
109 self.xMin = xMin
110
111 def setxLab(self, xLab):
112 self.xLab = xLab
113
114 def setyLab(self, yLab):
115 self.yLab = yLab
116
117 def setBarplot(self, barplot):
118 self.barplot = barplot
119
120 def setCsv(self, csv):
121 self.csv = csv
122
123 def setVerbosity(self, verbosity):
124 self._verbosity = verbosity
125
126 def _checkOptions(self):
127 if self.inFileName == None:
128 self._logAndRaise("ERROR: Missing input file name")
129 if self.inFormat == "fasta":
130 self.parser = FastaParser(self.inFileName, self._verbosity)
131 elif self.inFormat == "fastq":
132 self.parser = FastqParser(self.inFileName, self._verbosity)
133 else:
134 self.parser = TranscriptContainer(self.inFileName, self.inFormat, self._verbosity)
135
136 def _logAndRaise(self, errorMsg):
137 self._log.error(errorMsg)
138 raise Exception(errorMsg)
139
140 def run(self):
141 LoggerFactory.setLevel(self._log, self._verbosity)
142 self._checkOptions()
143 self._log.info("START getsizes")
144 self._log.debug("Input file name: %s" % self.inFileName)
145
146 nbItems = self.parser.getNbItems()
147 self._log.info( "%i items found" % (nbItems))
148
149 # treat items
150 progress = Progress(nbItems, "Analyzing sequences of %s" % (self.inFileName), self._verbosity)
151 sizes = {}
152 names = {}
153 minimum = 1000000000000
154 maximum = 0
155 sum = 0
156 number = 0
157 nbSubItems = 0
158 for item in self.parser.getIterator():
159 items = []
160 if self.query == "exon":
161 items = item.getExons()
162 elif self.query == "exon1":
163 if len(item.getExons()) > 1:
164 item.sortExons()
165 items = [item.getExons()[0]]
166 elif self.query == "intron":
167 items = item.getIntrons()
168 else:
169 items = [item, ]
170
171 for thisItem in items:
172 try:
173 nbElements = int(float(thisItem.getTagValue("nbElements")))
174 if nbElements == None:
175 nbElements = 1
176 except:
177 nbElements = 1
178 size = thisItem.getSize()
179 minimum = min(minimum, size)
180 maximum = max(maximum, size)
181 name = thisItem.name.split()[0]
182
183 if size not in sizes:
184 sizes[size] = nbElements
185 if self.csv:
186 names[size] = [name, ]
187 else:
188 sizes[size] += nbElements
189 if self.csv:
190 names[size].append(name)
191 sum += size
192 nbSubItems += nbElements
193 number += 1
194 progress.inc()
195 progress.done()
196
197 if self.outFileName != None:
198 plotter = RPlotter(self.outFileName, self._verbosity)
199 plotter.setFill(0)
200 plotter.setMinimumX(self.xMin)
201 plotter.setMaximumX(self.xMax)
202 plotter.setXLabel(self.xLab)
203 plotter.setYLabel(self.yLab)
204 plotter.setBarplot(self.barplot)
205 plotter.addLine(sizes)
206 plotter.plot()
207
208 if nbSubItems == 0:
209 self._logAndRaise("No item found")
210
211 if self.csv:
212 csvHandle = open(self.csv, "w")
213 for size in range(min(sizes.keys()), max(sizes.keys())+1):
214 if size not in sizes:
215 csvHandle.write("%d,0,\n" % (size))
216 else:
217 csvHandle.write("%d,%d,%s\n" % (size, sizes[size], ";".join(names[size])))
218 csvHandle.close()
219
220 self.items = number
221 self.subItems = nbSubItems
222 self.nucleotides = sum
223 self.minAvgMedMax = Utils.getMinAvgMedMax(sizes)
224
225 print "%d items" % (number)
226 print "%d sub-items" % (nbSubItems)
227 print "%d nucleotides" % (sum)
228 print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(sizes)
229
230 self._log.info("END getsizes")
231
232
233 if __name__ == "__main__":
234 iGetSizes = GetSizes()
235 iGetSizes.setAttributesFromCmdLine()
236 iGetSizes.run()
237
238 #TODO: add two more options!!!!!!