comparison SMART/Java/Python/GetReadDistribution.py @ 38:2c0c0a89fad7

Uploaded
author m-zytnicki
date Thu, 02 May 2013 09:56:47 -0400
parents
children
comparison
equal deleted inserted replaced
37:d22fadc825e3 38:2c0c0a89fad7
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 import random, os, glob, subprocess
32 from commons.core.parsing.ParserChooser import ParserChooser
33 from commons.core.parsing.GffParser import GffParser
34 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
35 from SMART.Java.Python.misc.Progress import Progress
36 from SMART.Java.Python.misc import Utils
37 from commons.core.LoggerFactory import LoggerFactory
38 from commons.core.utils.RepetOptionParser import RepetOptionParser
39
40 LOG_DEPTH = "smart"
41 DEFAULT_REGION = "_all_"
42 MULTIPLE_STR = {1: "", 1000: " (in kbp)", 1000000: " (in Gbp)"}
43
44 class GetReadDistribution(object):
45
46 def __init__(self, verbosity = 0):
47 self.xLab = ""
48 self.yLab = "# reads"
49 self.verbosity = verbosity
50 self.number = random.randint(0, 100000)
51 self.log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
52 self.parsers = {}
53 self.distribution = {}
54 self.factors = {}
55 self.regions = None
56 self.tmpDatName = None
57 self.tmpRName = None
58 self.quorum = 1
59 self.strands = False
60 self.width = 800
61 self.height = 300
62 self.arial = False
63
64 def setNames(self, names):
65 self.names = names
66
67 def setInputFiles(self, fileNames, format):
68 chooser = ParserChooser(self.verbosity)
69 chooser.findFormat(format)
70 for cpt, fileName in enumerate(fileNames):
71 self.parsers[self.names[cpt]] = chooser.getParser(fileName)
72
73 def setOutputFileName(self, fileName):
74 self.outputFileName = fileName
75
76 def setLabs(self, xLab, yLab):
77 self.xLab = xLab
78 self.yLab = yLab
79
80 def setBinSize(self, binSize):
81 self.binSize = binSize
82
83 def setColors(self, colors):
84 self.colors = colors
85
86 def setFactors(self, factors):
87 if factors == None:
88 self.factors = dict([name, 1.0] for name in self.names)
89 else:
90 self.factors = dict(zip(self.names, factors))
91
92 def setMultiple(self, boolean):
93 self.multiple = boolean
94
95 def setImageSize(self, width, height):
96 if width != None:
97 self.width = width
98 if height != None:
99 self.height = height
100
101 def setQuorum(self, quorum):
102 self.quorum = quorum
103
104 def setRegionsFile(self, fileName):
105 if fileName != None:
106 self._loadRegions(fileName)
107
108 def setBothStrands(self, strands):
109 self.strands = strands
110
111 def setArial(self, arial):
112 self.arial = arial
113
114 def _checkOptions(self):
115 if not self.parsers:
116 self.logAndRaise("ERROR: Missing input file names")
117
118 def _logAndRaise(self, errorMsg):
119 self.log.error(errorMsg)
120 raise Exception(errorMsg)
121
122 def _loadRegions(self, fileName):
123 self.regions = {}
124 parser = GffParser(fileName, self.verbosity)
125 for transcript in parser.getIterator():
126 chromosome = transcript.getChromosome()
127 start = transcript.getStart()
128 end = transcript.getEnd()
129 name = transcript.getName()
130 if chromosome not in self.regions:
131 self.regions[chromosome] = {}
132 if start not in self.regions[chromosome]:
133 self.regions[chromosome][start] = {}
134 if end not in self.regions[chromosome][start]:
135 self.regions[chromosome][start][end] = []
136 self.regions[chromosome][start][end].append(name)
137
138 def _getRegions(self, transcript):
139 if self.regions == None:
140 return [DEFAULT_REGION]
141 chromosome = transcript.getChromosome()
142 start = transcript.getStart()
143 end = transcript.getEnd()
144 if chromosome not in self.regions:
145 return []
146 names = []
147 for loadedStart in sorted(self.regions[chromosome].keys()):
148 if loadedStart > end:
149 return names
150 for loadedEnd in reversed(sorted(self.regions[chromosome][loadedStart].keys())):
151 if loadedEnd < start:
152 break
153 names.extend(self.regions[chromosome][loadedStart][loadedEnd])
154 return names
155
156 def _parse(self, name):
157 progress = UnlimitedProgress(10000, "Reading file '%s'" % (name), self.verbosity)
158 for transcript in self.parsers[name].getIterator():
159 if transcript.__class__.__name__ == "Mapping":
160 transcript = transcript.getTranscript()
161 regions = self._getRegions(transcript)
162 for region in regions:
163 if region not in self.distribution:
164 self.distribution[region] = {}
165 if name not in self.distribution[region]:
166 self.distribution[region][name] = {}
167 chromosome = transcript.getChromosome()
168 nbElements = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1
169 nbElements *= self.factors.get(name, 1)
170 strand = transcript.getDirection() if self.strands else 1
171 if chromosome not in self.distribution[region][name]:
172 self.distribution[region][name][chromosome] = {}
173 if strand not in self.distribution[region][name][chromosome]:
174 self.distribution[region][name][chromosome][strand] = {}
175 previousBin = None
176 for exon in transcript.getExons():
177 for pos in range(exon.getStart(), exon.getEnd()+1):
178 bin = pos / self.binSize
179 if bin != previousBin:
180 self.distribution[region][name][chromosome][strand][bin] = self.distribution[region][name][chromosome][strand].get(bin, 0) + nbElements
181 previousBin = bin
182 progress.inc()
183 progress.done()
184
185 def _checkQuorum(self, region):
186 if self.quorum == None:
187 return True
188 return max([max([max([max(self.distribution[region][name][chromosome][strand].values()) for strand in self.distribution[region][name][chromosome]]) for chromosome in self.distribution[region][name]]) for name in self.distribution[region]])
189
190 def _writeData(self, region):
191 self.tmpDatName = "tmpFile%d.dat" % (self.number)
192 handle = open(self.tmpDatName, "w")
193 handle.write("Chr\tPos\tStrand\tCount\tSample\n")
194 for name in self.distribution[region]:
195 for chromosome in sorted(self.distribution[region][name].keys()):
196 for strand in sorted(self.distribution[region][name][chromosome].keys()):
197 for pos in sorted(self.distribution[region][name][chromosome][strand].keys()):
198 handle.write("%s\t%d\t%d\t%d\t\"%s\"\n" % (chromosome, pos * self.binSize, strand, self.distribution[region][name][chromosome][strand].get(pos, 0) * strand, name))
199 handle.close()
200
201 def _findMultiple(self, region):
202 if not self.multiple:
203 return 1
204 maxPosition = max([max([max([max(self.distribution[region][name][chromosome][strand].keys()) for strand in self.distribution[region][name][chromosome]]) for chromosome in self.distribution[region][name]]) for name in self.distribution[region]]) * self.binSize
205 if maxPosition > 2000000:
206 return 1000000
207 elif maxPosition > 2000:
208 return 1000
209 return 1
210
211 def _writeScript(self, region):
212 self.tmpRName = "tmpFile%d.R" % (self.number)
213 fileName = self.outputFileName if region == DEFAULT_REGION else "%s_%s.png" % (os.path.splitext(self.outputFileName)[0], region)
214 colors = "scale_fill_brewer(palette=\"Set1\") + scale_color_brewer(palette=\"Set1\")" if self.colors == None else "scale_fill_manual(values = c(%s)) + scale_color_manual(values = c(%s))" % (", ".join(["\"%s\"" % (color) for color in self.colors]), ", ".join(["\"%s\"" % (color) for color in self.colors]))
215 title = "" if region == DEFAULT_REGION else " + labs(title = \"Distribution of %s\") " % (region)
216 facet = "Sample ~ Chr" if region == DEFAULT_REGION else "Sample ~ ."
217 handle = open(self.tmpRName, "w")
218 multiple = self._findMultiple(region)
219 arial = ", text = element_text(family=\"Arial\", size=20)" if self.arial else ""
220 if self.arial:
221 handle.write("library(extrafont)\nloadfonts()\n")
222 handle.write("library(ggplot2)\n")
223 handle.write("data <- read.table(\"%s\", header = T)\n" % (self.tmpDatName))
224 handle.write("data$Sample <- factor(data$Sample, levels=c(%s))\n" % (", ".join(["\"%s\"" % (name) for name in self.names])))
225 handle.write("png(\"%s\", width = %d, height = %d)\n" % (fileName, self.width, self.height))
226 handle.write("ggplot(data, aes(x = Pos/%d, y = Count, fill = Sample, color = Sample)) %s + geom_bar(stat = \"identity\") + facet_grid(%s, space=\"free\") + xlab(\"%s%s\") + ylab(\"%s\") + %s + theme(legend.position = \"none\", panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank()%s)\n" % (multiple, title, facet, self.xLab, MULTIPLE_STR[multiple], self.yLab, colors, arial))
227 handle.write("dev.off()\n")
228
229 def _runR(self):
230 rCommand = os.environ["SMARTRPATH"] if "SMARTRPATH" in os.environ else "R"
231 command = "\"%s\" CMD BATCH %s" % (rCommand, self.tmpRName)
232 status = subprocess.call(command, shell=True)
233 if status != 0:
234 raise Exception("Problem with the execution of script file %s, status is: %s" % (self.tmpRName, status))
235
236 def _plot(self):
237 progress = Progress(len(self.distribution), "Plotting data", self.verbosity)
238 for region in self.distribution:
239 if not self._checkQuorum(region):
240 self.log.info("Not displaying '%s' for it contains insufficient data." % (region))
241 else:
242 self._writeData(region)
243 self._writeScript(region)
244 self._runR()
245 progress.inc()
246 progress.done()
247
248 def _cleanFiles(self):
249 for fileName in (self.tmpDatName, self.tmpRName):
250 if fileName != None and os.path.exists(fileName):
251 os.remove(fileName)
252 for otherFileName in glob.glob("%s*" % (fileName)):
253 os.remove(otherFileName)
254
255 def run(self):
256 LoggerFactory.setLevel(self.log, self.verbosity)
257 self._checkOptions()
258 self.log.info("START Get Read Distribution")
259 for name in self.names:
260 self._parse(name)
261 self._plot()
262 self._cleanFiles()
263 self.log.info("END Get Read Distribution")
264
265
266 if __name__ == "__main__":
267 description = "Usage: GetReadDistribution.py [options]\n\nGet Read Distribution v1.0.1: Get the distribution of a set of reads. [Category: Personal]\n"
268 epilog = ""
269 parser = RepetOptionParser(description = description, epilog = epilog)
270 parser.add_option("-i", "--input", dest="inputFileNames", action="store", default=None, type="string", help="input files, separated by commas [compulsory] [format: string]")
271 parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
272 parser.add_option("-n", "--names", dest="names", action="store", default=None, type="string", help="name of the input data, separated by commas [compulsory] [format: string]")
273 parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [format: output file in PNG format]")
274 parser.add_option("-s", "--binSize", dest="binSize", action="store", default=10000, type="int", help="bin size [format: int] [default: 10000]")
275 parser.add_option("-l", "--xLabel", dest="xLab", action="store", default="", type="string", help="x-axis label name [format: string]")
276 parser.add_option("-L", "--yLabel", dest="yLab", action="store", default="# reads", type="string", help="y-axis label name [format: string] [default: Reads]")
277 parser.add_option("-c", "--colors", dest="colors", action="store", default=None, type="string", help="colors of the bars, separated by commas [format: string]")
278 parser.add_option("-a", "--factors", dest="factors", action="store", default=None, type="string", help="normalization factors, separated by commas [format: string]")
279 parser.add_option("-r", "--regions", dest="regionsFileName", action="store", default=None, type="string", help="regions to plot [format: transcript file in GFF format]")
280 parser.add_option("-2", "--strands", dest="strands", action="store_true", default=False, help="plot negative strands on the negative x-axis [format: boolean] [default: False]")
281 parser.add_option("-m", "--multiple", dest="multiple", action="store_true", default=False, help="use human readable genomic positions (k, G) [format: boolean] [default: False]")
282 parser.add_option("-q", "--quorum", dest="quorum", action="store", default=1, type="int", help="minimum number of intervals to plot a region [format: int] [default: 1]")
283 parser.add_option("-z", "--width", dest="width", action="store", default=800, type="int", help="width of the image [format: int] [default: 800]")
284 parser.add_option("-Z", "--height", dest="height", action="store", default=300, type="int", help="height of the image [format: int] [default: 300]")
285 parser.add_option("-A", "--arial", dest="arial", action="store_true", default=False, help="use Arial font [format: boolean] [default: false]")
286 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
287 options = parser.parse_args()[0]
288 iGetReadDistribution = GetReadDistribution(options.verbosity)
289 iGetReadDistribution.setNames(options.names.split(","))
290 iGetReadDistribution.setInputFiles(options.inputFileNames.split(","), options.format)
291 iGetReadDistribution.setOutputFileName(options.outputFileName)
292 iGetReadDistribution.setLabs(options.xLab, options.yLab)
293 iGetReadDistribution.setBinSize(options.binSize)
294 iGetReadDistribution.setColors(None if options.colors == None else options.colors.split(","))
295 iGetReadDistribution.setFactors(None if options.factors == None else map(float, options.factors.split(",")))
296 iGetReadDistribution.setRegionsFile(options.regionsFileName)
297 iGetReadDistribution.setMultiple(options.multiple)
298 iGetReadDistribution.setQuorum(options.quorum)
299 iGetReadDistribution.setImageSize(options.width, options.height)
300 iGetReadDistribution.setBothStrands(options.strands)
301 iGetReadDistribution.setArial(options.arial)
302 iGetReadDistribution.run()
303