comparison smart_toolShed/SMART/Java/Python/clusterizeBySlidingWindows.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 import re
32 from commons.core.writer.WriterChooser import WriterChooser
33 """
34 Cluster the data into regions (defined by size and overlap with next region) and keep only highest peaks.
35 """
36
37 import os, os.path
38 from optparse import OptionParser
39 from SMART.Java.Python.structure.Transcript import Transcript
40 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
41 from SMART.Java.Python.misc.RPlotter import RPlotter
42 from SMART.Java.Python.misc.Progress import Progress
43 from commons.core.writer.Gff3Writer import Gff3Writer
44
45 class ClusterizeBySlidingWindows(object):
46
47 def __init__(self, verbosity = 0):
48 self.verbosity = verbosity
49 self.strands = (0, )
50 self.normalize = False
51 self.plot = None
52 self.excel = None
53 self.outputFileName = ''
54 self.defaultValue = None
55
56 def __del__(self):
57 pass
58
59 def setInputFile(self, fileName, format):
60 self.parser = TranscriptContainer(fileName, format, self.verbosity)
61
62 def setOutputFileName(self, fileName, format="gff", title="S-MART", feature="transcript", featurePart="exon"):
63 writerChooser = WriterChooser(self.verbosity)
64 writerChooser.findFormat(format)
65 self.writer = writerChooser.getWriter(fileName)
66 self.writer.setTitle(title)
67 self.writer.setFeature(feature)
68 self.writer.setFeaturePart(featurePart)
69 # self.outputFileName = fileName
70 # self.outputFormat = format
71
72 def setWindowSize(self, size):
73 self.size = size
74
75 def setWindowOverlap(self, overlap):
76 self.overlap = overlap
77
78 def setTag(self, tag):
79 self.tag = tag
80
81 def setOperation(self, operation):
82 self.operation = operation
83
84 def setBothStrands(self, bothStrands):
85 if bothStrands:
86 self.strands = (-1, 1)
87
88 def setNormalize(self, normalize):
89 self.normalize = normalize
90
91 def setPlot(self, plot):
92 self.plot = plot
93
94 def setExcel(self, excel):
95 self.excel = excel
96
97 def setOutputTag(self, tag):
98 self.outputTagName = tag
99
100 def setDefaultValue(self, defaultValue):
101 self.defaultValue = defaultValue
102
103 def checkOptions(self):
104 # if self.operation != None:
105 # raise Exception("Trying to combine the values without specifying tag! Aborting...")
106 if self.operation != None and self.operation not in ("sum", "avg", "med", "min", "max"):
107 raise Exception("Do not understand tag '%s'! Aborting..." % (self.operation))
108
109 def getChromosomeSizes(self):
110 self.sizes = {}
111 progress = Progress(self.parser.getNbTranscripts(), "Getting sizes in genome", self.verbosity)
112 for transcript in self.parser.getIterator():
113 self.sizes[transcript.getChromosome()] = max(transcript.getStart(), self.sizes.get(transcript.getChromosome(), 0))
114 progress.inc()
115 progress.done()
116
117 def getBinsFromPos(self, pos):
118 bin = (pos - 1) / (self.size - self.overlap)
119 if bin >= 1 and pos <= bin * (self.size - self.overlap) + self.overlap:
120 return (bin - 1, bin)
121 return (bin, )
122
123 def getPosFromBin(self, bin):
124 return (bin * (self.size - self.overlap) + 1, bin * (self.size - self.overlap) + self.size)
125
126 def initializeBins(self):
127 self.binsPerStrand = {}
128 self.sumsPerStrand = {}
129 self.valuesPerStrand = {}
130 self.toBePlottedPerStrand = {}
131 for strand in self.strands:
132 self.binsPerStrand[strand] = {}
133 self.sumsPerStrand[strand] = {}
134 self.valuesPerStrand[strand] = {}
135 self.toBePlottedPerStrand[strand] = {}
136 for chromosome in self.sizes:
137 binRange = range(self.getBinsFromPos(self.sizes[chromosome])[-1] + 1)
138 self.binsPerStrand[strand][chromosome] = dict([[i, 0] for i in binRange])
139 self.sumsPerStrand[strand][chromosome] = dict([[i, 0.0] for i in binRange])
140 self.valuesPerStrand[strand][chromosome] = dict([[i, []] for i in binRange])
141 self.toBePlottedPerStrand[strand][chromosome] = dict([[i, 0] for i in binRange])
142
143 def getNbElements(self, transcript):
144 nbOccurrences = 1 if "nbOccurrences" not in transcript.getTagNames() else transcript.getTagValue("nbOccurrences")
145 nbElements = 1 if "nbElements" not in transcript.getTagNames() else transcript.getTagValue("nbElements")
146 nbOccurrences = float(nbOccurrences)
147 nbElements = float(nbElements)
148 nbElements /= float(nbOccurrences)
149 return nbElements
150
151 def setBins(self):
152 progress = Progress(self.parser.getNbTranscripts(), "Setting bins", self.verbosity)
153 for transcript in self.parser.getIterator():
154 nbElements = self.getNbElements(transcript)
155 strand = transcript.getDirection() if len(self.strands) == 2 else 0
156 for bin in self.getBinsFromPos(transcript.getStart()):
157 self.binsPerStrand[strand][transcript.getChromosome()][bin] += nbElements
158 if self.tag != None:
159 if self.tag not in transcript.getTagNames():
160 if self.defaultValue is None:
161 raise Exception("Tag %s undefined in transcript %s" % (self.tag, transcript))
162 value = self.defaultValue
163 else:
164 value = float(transcript.getTagValue(self.tag))
165 self.sumsPerStrand[strand][transcript.getChromosome()][bin] += value
166 self.valuesPerStrand[strand][transcript.getChromosome()][bin].append(value)
167 progress.inc()
168 progress.done()
169
170 def aggregateData(self):
171 if self.operation == "sum":
172 self.computeSumData()
173 elif self.operation == "avg":
174 self.computeAvgData()
175 elif self.operation == "med":
176 self.computeMedData()
177 elif self.operation == "min":
178 self.computeMinData()
179 elif self.operation == "max":
180 self.computeMaxData()
181 elif self.operation == "GCpercent":
182 self.computeGCPercent()
183 else:
184 self.toBePlottedPerStrand = self.binsPerStrand
185
186 def computeSumData(self):
187 self.toBePlottedPerStrand = self.sumsPerStrand
188
189 def computeAvgData(self):
190 for strand in self.strands:
191 for chromosome in self.binsPerStrand[strand]:
192 for bin in self.binsPerStrand[strand][chromosome]:
193 if self.binsPerStrand[strand][chromosome][bin] != 0:
194 self.toBePlottedPerStrand[strand][chromosome][bin] = float(self.sumsPerStrand[strand][chromosome][bin]) / self.binsPerStrand[strand][chromosome][bin]
195
196 def computeMedData(self):
197 for strand in self.strands:
198 for chromosome in self.binsPerStrand[strand]:
199 for bin in self.binsPerStrand[strand][chromosome]:
200 if self.valuesPerStrand[strand][chromosome][bin]:
201 self.valuesPerStrand[strand][chromosome][bin].sort()
202 size = len(self.valuesPerStrand[strand][chromosome][bin])
203 if size % 2 == 1:
204 self.toBePlottedPerStrand[strand][chromosome][bin] = self.valuesPerStrand[strand][chromosome][bin][(size - 1) / 2]
205 else:
206 self.toBePlottedPerStrand[strand][chromosome][bin] = (self.valuesPerStrand[strand][chromosome][bin][size / 2 - 1] + self.valuesPerStrand[strand][chromosome][bin][size / 2]) / 2.0
207
208 def computeMinData(self):
209 for strand in self.strands:
210 for chromosome in self.binsPerStrand[strand]:
211 for bin in self.binsPerStrand[strand][chromosome]:
212 if self.valuesPerStrand[strand][chromosome][bin]:
213 self.toBePlottedPerStrand[strand][chromosome][bin] = min(self.valuesPerStrand[strand][chromosome][bin])
214
215 def computeMaxData(self):
216 for strand in self.strands:
217 for chromosome in self.binsPerStrand[strand]:
218 for bin in self.binsPerStrand[strand][chromosome]:
219 if self.valuesPerStrand[strand][chromosome][bin]:
220 self.toBePlottedPerStrand[strand][chromosome][bin] = max(self.valuesPerStrand[strand][chromosome][bin])
221
222 def computeGCPercent(self):
223 for strand in self.strands:
224 for chromosome in self.binsPerStrand[strand]:
225 for bin in self.binsPerStrand[strand][chromosome]:
226 if self.valuesPerStrand[strand][chromosome][bin]:
227 subSequence = self.valuesPerStrand[strand][chromosome][bin]
228 NPercent = 100 * (subSequence.countNt("N") / float(subSequence.getSize()))
229 if NPercent >= 50:
230 currentGCpercent = "NA"
231 else:
232 currentGCpercent = subSequence.getGCpercentageInSequenceWithoutCountNInLength()
233
234 self.toBePlottedPerStrand[strand][chromosome][bin] = currentGCpercent
235 #TODO: see if a map method could be used for the various "compute" methods
236 #return currentGCpercent, NPercent
237
238 def plotData(self):
239 if self.plot != None:
240 for strand in self.strands:
241 adjunct = ""
242 if strand != 0:
243 adjunct = "Strand%d" % (strand)
244 for chromosome in self.toBePlottedPerStrand[strand]:
245 if len(self.toBePlottedPerStrand[strand][chromosome].keys()) > 0:
246 plotter = RPlotter(self.plot, self.verbosity)
247 plotter.setFill(0)
248 plotter.addLine(self.toBePlottedPerStrand[strand][chromosome], chromosome)
249 plotter.plot()
250
251 def writeExcel(self):
252 if self.excel != None:
253 excelFile = open(self.excel, "w")
254 for strand in self.strands:
255 maxBin = max([max(self.toBePlottedPerStrand[strand][chromosome].keys()) for chromosome in self.binsPerStrand[strand]])
256 for bin in range(0, maxBin + 1):
257 excelFile.write(",%d-%d" % self.getPosFromBin(bin))
258 excelFile.write("\n")
259 for chromosome in self.toBePlottedPerStrand[strand]:
260 excelFile.write("%s" % (chromosome))
261 for bin in self.toBePlottedPerStrand[strand][chromosome]:
262 excelFile.write(",%f" % (self.toBePlottedPerStrand[strand][chromosome][bin]))
263 excelFile.write("\n")
264 excelFile.close()
265
266 def printRegions(self):
267 cpt = 1
268 tagOp = "nb"
269 tagName = "Elements"
270 outputTagName = "nbElements"
271 if self.operation != None:
272 tagOp = self.operation.lower()
273 if self.tag != None:
274 tagName = self.tag.title()
275 if self.outputTagName != None:
276 outputTagName = self.outputTagName
277
278
279 #writer = Gff3Writer(self.outputFileName, self.verbosity)
280
281 for strand in self.strands:
282 for chromosome in self.toBePlottedPerStrand[strand]:
283 for bin in self.toBePlottedPerStrand[strand][chromosome]:
284 transcript = Transcript()
285 transcript.setName("region%d" % cpt)
286 transcript.setChromosome(chromosome)
287 transcript.setStart(self.getPosFromBin(bin)[0])
288 transcript.setEnd(self.getPosFromBin(bin)[1])
289 transcript.setDirection(1 if strand == 0 else strand)
290 transcript.setTagValue(outputTagName, self.binsPerStrand[strand][chromosome][bin])
291 transcript.setTagValue("%s%s" % (tagOp, tagName), str(self.toBePlottedPerStrand[strand][chromosome][bin]))
292 self.writer.addTranscript(transcript)
293 cpt += 1
294 self.writer.close()
295
296 def run(self):
297 self.checkOptions()
298 self.getChromosomeSizes()
299 self.initializeBins()
300 self.setBins()
301 self.aggregateData()
302 if self.excel:
303 self.writeExcel()
304 if self.plot:
305 self.plotData()
306 self.printRegions()
307
308
309 if __name__ == "__main__":
310
311 # parse command line
312 description = "Clusterize by Sliding Windows v1.0.1: Produces a GFF3 file that clusters a list of transcripts using a sliding window. [Category: Sliding Windows]"
313
314 parser = OptionParser(description = description)
315 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
316 parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
317 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
318 parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="format of the output file [format: transcript file format]")
319 parser.add_option("-s", "--size", dest="size", action="store", type="int", help="size of the regions [compulsory] [format: int]")
320 parser.add_option("-e", "--overlap", dest="overlap", action="store", type="int", help="overlap between two consecutive regions [compulsory] [format: int]")
321 parser.add_option("-m", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
322 parser.add_option("-g", "--tag", dest="tag", action="store", default=None, type="string", help="use a given tag as input (instead of summing number of features) [format: string]")
323 parser.add_option("-r", "--operation", dest="operation", action="store", default=None, type="string", help="combine tag value with given operation [format: choice (sum, avg, med, min, max)]")
324 parser.add_option("-d", "--defaultValue",dest="defaultValue", action="store", type="float", help="default value for input tag [format: float]")
325 parser.add_option("-w", "--write", dest="writeTag", action="store", default=None, type="string", help="print the result in the given tag (default usually is 'nbElements') [format: string]")
326 parser.add_option("-2", "--strands", dest="strands", action="store_true", default=False, help="consider the two strands separately [format: bool] [default: false]")
327 parser.add_option("-p", "--plot", dest="plot", action="store", default=None, type="string", help="plot regions to the given file [format: output file in PNG format]")
328 parser.add_option("-x", "--excel", dest="excel", action="store", default=None, type="string", help="write an Excel file to the given file [format: output file in Excel format]")
329 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
330 (options, args) = parser.parse_args()
331
332 cbsw = ClusterizeBySlidingWindows(options.verbosity)
333 cbsw.setInputFile(options.inputFileName, options.inputFormat)
334 cbsw.setOutputFileName(options.outputFileName, options.outputFormat)
335 cbsw.setWindowSize(options.size)
336 cbsw.setWindowOverlap(options.overlap)
337 cbsw.setTag(options.tag)
338 cbsw.setDefaultValue(options.defaultValue)
339 cbsw.setOperation(options.operation)
340 cbsw.setOutputTag(options.writeTag)
341 cbsw.setBothStrands(options.strands)
342 cbsw.setPlot(options.plot)
343 cbsw.setExcel(options.excel)
344 cbsw.run()