comparison smart_toolShed/SMART/Java/Python/plotCoverage.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 import os, subprocess, glob, random
32 from optparse import OptionParser
33 from SMART.Java.Python.structure.Interval import Interval
34 from SMART.Java.Python.structure.Transcript import Transcript
35 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
36 from SMART.Java.Python.misc.RPlotter import RPlotter
37 from SMART.Java.Python.misc.Progress import Progress
38 from commons.core.parsing.FastaParser import FastaParser
39
40 strands = [-1, 1]
41 colors = {-1: "blue", 1: "red", 0: "black"}
42 colorLine = "black"
43
44 def parseTargetField(field):
45 strand = "+"
46 splittedFieldSpace = field.split()
47 splittedFieldPlus = field.split("+", 4)
48 if len(splittedFieldSpace) == 3:
49 id, start, end = splittedFieldSpace
50 elif len(splittedFieldSpace) == 4:
51 id, start, end, strand = splittedFieldSpace
52 elif len(splittedFieldPlus) == 3:
53 id, start, end = splittedFieldPlus
54 elif len(splittedFieldPlus) == 4:
55 id, start, end, strand = splittedFieldPlus
56 else:
57 raise Exception("Cannot parse Target field '%s'." % (field))
58 return (id, int(start), int(end), strand)
59
60
61 class SimpleTranscript(object):
62 def __init__(self, transcript1, transcript2, color = None):
63 self.start = max(0, transcript1.getStart() - transcript2.getStart())
64 self.end = min(transcript2.getEnd() - transcript2.getStart(), transcript1.getEnd() - transcript2.getStart())
65 self.strand = transcript1.getDirection() * transcript2.getDirection()
66 self.exons = []
67 for exon in transcript1.getExons():
68 if exon.getEnd() >= transcript2.getStart() and exon.getStart() <= transcript2.getEnd():
69 start = max(0, exon.getStart() - transcript2.getStart())
70 end = min(transcript2.getEnd() - transcript2.getStart(), exon.getEnd() - transcript2.getStart())
71 self.addExon(start, end, self.strand, color)
72
73 def addExon(self, start, end, strand, color):
74 exon = SimpleExon(start, end, strand, color)
75 self.exons.append(exon)
76
77 def getRScript(self, yOffset, height):
78 rString = ""
79 previousEnd = None
80 for exon in sorted(self.exons, key=lambda exon: exon.start):
81 if previousEnd != None:
82 rString += "segments(%.1f, %.1f, %.1f, %.1f, col = \"%s\")\n" % (previousEnd, yOffset + height / 4.0, exon.start, yOffset + height / 4.0, colorLine)
83 rString += exon.getRScript(yOffset, height)
84 previousEnd = exon.end
85 return rString
86
87
88 class SimpleExon(object):
89 def __init__(self, start, end, strand, color = None):
90 self.start = start
91 self.end = end
92 self.strand = strand
93 self.color = color
94
95 def getRScript(self, yOffset, height):
96 color = self.color if self.color != None else colors[self.strand]
97 return "rect(%.1f, %.1f, %.1f, %.1f, col=\"%s\", border = \"%s\")\n" % (self.start, yOffset, self.end, yOffset + height / 2.0, color, colorLine)
98
99
100 class Plotter(object):
101
102 def __init__(self, seed, index, verbosity):
103 self.seed = seed
104 self.index = index
105 self.verbosity = verbosity
106 self.maxCoverage = 0
107 self.maxOverlap = 0
108 self.log = ""
109 self.merge = False
110 self.width = 1500
111 self.heigth = 1000
112 self.xLabel = ""
113 self.yLabel = ""
114 self.title = None
115 self.absPath = os.getcwd()
116 self.coverageDataFileName = "tmpFile_%d_%s.dat" % (seed, index)
117 self.coverageScript = ""
118 self.overlapScript = ""
119 self.outputFileName = None
120
121 def setOutputFileName(self, fileName):
122 self.outputFileName = fileName
123
124 def setTranscript(self, transcript):
125 self.transcript = transcript
126 self.name = transcript.getName()
127 self.size = transcript.getEnd() - transcript.getStart() + 1
128 if self.title == None:
129 self.title = self.name
130 else:
131 self.title += " " + self.name
132
133 def setTitle(self, title):
134 self.title = title + " " + self.name
135
136 def setPlotSize(self, width, height):
137 self.width = width
138 self.height = height
139
140 def setLabels(self, xLabel, yLabel):
141 self.xLabel = xLabel
142 self.yLabel = yLabel
143
144 def setMerge(self, merge):
145 self.merge = merge
146
147 def setCoverageData(self, coverage):
148 outputCoveragePerStrand = dict([strand, 0] for strand in strands)
149 outputCoverage = 0
150 dataFile = open(os.path.abspath(self.coverageDataFileName), "w")
151 for position in range(self.size+1):
152 sumValue = 0
153 found = False
154 dataFile.write("%d\t" % (position))
155 for strand in strands:
156 value = coverage[strand].get(position, 0)
157 sumValue += value
158 dataFile.write("%d\t" % (value))
159 if value > 0:
160 found = True
161 outputCoveragePerStrand[strand] += 1
162 self.maxCoverage = max(self.maxCoverage, sumValue)
163 dataFile.write("%d\n" % (sumValue))
164 if found:
165 outputCoverage += 1
166 dataFile.close()
167 self.log += "%s (%d nt):\n - both strands: %d (%.0f%%)\n - (+) strand: %d (%.0f%%)\n - (-) strand: %d (%.0f%%)\n" % (self.name, self.size, outputCoverage, float(outputCoverage) / self.size * 100, outputCoveragePerStrand[1], float(outputCoveragePerStrand[1]) / self.size * 100, outputCoveragePerStrand[-1], float(outputCoveragePerStrand[-1]) / self.size * 100)
168 self.coverageScript += "data = scan(\"%s\", list(pos = -666, minus = -666, plus = -666, sumValue = -666), sep=\"\t\")\n" % (os.path.abspath(self.coverageDataFileName))
169 self.coverageScript += "lines(x = data$pos, y = data$minus, col = \"%s\")\n" % (colors[-1])
170 self.coverageScript += "lines(x = data$pos, y = data$plus, col = \"%s\")\n" % (colors[1])
171 self.coverageScript += "lines(x = data$pos, y = data$sumValue, col = \"%s\")\n" % (colors[0])
172
173 def setOverlapData(self, overlap):
174 height = 1
175 self.maxOverlap = (len(overlap) + 1) * height
176 thisElement = SimpleTranscript(self.transcript, self.transcript, "black")
177 self.overlapScript += thisElement.getRScript(0, height)
178 for cpt, transcript in enumerate(sorted(overlap, cmp=lambda c1, c2: c1.start - c2.start if c1.start != c2.start else c1.end - c2.end)):
179 self.overlapScript += transcript.getRScript((cpt + 1) * height, height)
180
181 def getFirstLine(self, suffix = None):
182 return "png(file = \"%s_%s%s.png\", width = %d, height = %d, bg = \"white\")\n" % (self.outputFileName, self.name, "" if suffix == None or self.merge else "_%s" % (suffix), self.width, self.height)
183
184 def getLastLine(self):
185 return "dev.off()\n"
186
187 def startR(self, fileName, script):
188 scriptFile = open(fileName, "w")
189 scriptFile.write(script)
190 scriptFile.close()
191 command = "R CMD BATCH %s" % (fileName)
192 status = subprocess.call(command, shell=True)
193 if status != 0:
194 raise Exception("Problem with the execution of script file %s, status is: %s" % (fileName, status))
195
196 def plot(self):
197 print "outputfileName is written in :", self.outputFileName
198 if self.merge:
199 fileName = "%s_%d_%s.R" % (self.outputFileName, self.seed, self.index)
200 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, max(self.maxCoverage, self.maxOverlap), self.title)
201 script = self.getFirstLine() + plotLine + self.overlapScript + self.coverageScript + self.getLastLine()
202 self.startR(fileName, script)
203 else:
204 fileName = "%s_%d_%s_overlap.R" % (self.outputFileName, self.seed, self.index)
205 print "overlap file is written in :", fileName
206 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxOverlap, self.title)
207 script = self.getFirstLine("overlap") + plotLine + self.overlapScript + self.getLastLine()
208 self.startR(fileName, script)
209 fileName = "%s_%d_%s_coverage.R" % (self.outputFileName, self.seed, self.index)
210 plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxCoverage, self.title)
211 script = self.getFirstLine("coverage") + plotLine + self.coverageScript + self.getLastLine()
212 self.startR(fileName, script)
213
214
215 class PlotParser(object):
216
217 def __init__(self, verbosity):
218 self.verbosity = verbosity
219 self.parsers = [None, None]
220 self.sequenceParser = None
221 self.seed = random.randint(0, 10000)
222 self.title = ""
223 self.merge = False
224
225 def __del__(self):
226 for fileName in glob.glob("tmpFile_%d*.dat" % (self.seed)):
227 os.remove(fileName)
228 for fileName in glob.glob("%s*.R" % (os.path.abspath(self.outputFileName))):
229 os.remove(fileName)
230 for fileName in glob.glob("%s*.Rout" % (os.path.abspath(self.outputFileName))):
231 os.remove(fileName)
232
233 def addInput(self, inputNb, fileName, fileFormat):
234 if fileName == None:
235 return
236 self.parsers[inputNb] = TranscriptContainer(fileName, fileFormat, self.verbosity)
237 if inputNb == 0:
238 self.parsers[1] = self.parsers[0]
239
240 def addSequence(self, fileName):
241 if fileName == None:
242 return
243 self.sequenceParser = FastaParser(fileName, self.verbosity)
244
245 def setOutput(self, fileName):
246 self.outputFileName = fileName
247
248 def setPlotSize(self, width, height):
249 self.width = width
250 self.height = height
251
252 def setLabels(self, xLabel, yLabel):
253 self.xLabel = xLabel
254 self.yLabel = yLabel
255
256 def setTitle(self, title):
257 self.title = title
258
259 def setMerge(self, merge):
260 self.merge = merge
261
262 def initializeDataFromSequences(self):
263 self.sizes = {}
264 self.coverage = {}
265 self.overlap = {}
266 for region in self.sequenceParser.getRegions():
267 self.sizes[region] = self.sequenceParser.getSizeOfRegion(region)
268 self.coverage[region] = {}
269 self.overlap[region] = []
270 for strand in strands:
271 self.coverage[region][strand] = {}
272 self.coverage[region][strand][1] = 0
273 self.coverage[region][strand][self.sizes[region]] = 0
274
275
276 def initializeDataFromTranscripts(self):
277 self.coverage = dict([i, None] for i in range(self.parsers[1].getNbTranscripts()))
278 self.overlap = dict([i, None] for i in range(self.parsers[1].getNbTranscripts()))
279 self.sizes = dict([i, 0] for i in range(self.parsers[1].getNbTranscripts()))
280 self.parsers[0].findData()
281 progress = Progress(self.parsers[1].getNbTranscripts(), "Reading regions", self.verbosity)
282 for cpt, transcript in enumerate(self.parsers[1].getIterator()):
283 self.coverage[cpt] = {}
284 self.overlap[cpt] = []
285 for strand in strands:
286 self.coverage[cpt][strand] = {}
287 self.coverage[cpt][strand][0] = 0
288 self.coverage[cpt][strand][transcript.getEnd() - transcript.getStart()] = 0
289 for exon in transcript.getExons():
290 self.sizes[cpt] += exon.getSize()
291 progress.inc()
292 progress.done()
293
294 def initialize(self):
295 if self.sequenceParser == None:
296 self.initializeDataFromTranscripts()
297 else:
298 self.initializeDataFromSequences()
299
300 def computeCoverage(self, transcript1, transcript2, id):
301 strand = transcript1.getDirection() * transcript2.getDirection()
302 for exon1 in transcript1.getExons():
303 for exon2 in transcript2.getExons():
304 if exon1.overlapWith(exon2):
305 for position in range(max(exon1.getStart(), exon2.getStart()), min(exon1.getEnd(), exon2.getEnd()) + 1):
306 relativePosition = position - transcript2.getStart() + 1
307 self.coverage[id][strand][relativePosition] = self.coverage[id][strand].get(relativePosition, 0) + 1
308
309 def computeOverlap(self, transcript1, transcript2, id):
310 simpleTranscript = SimpleTranscript(transcript1, transcript2)
311 self.overlap[id].append(simpleTranscript)
312
313 def compute2TranscriptFiles(self):
314 progress = Progress(self.parsers[1].getNbTranscripts(), "Comparing regions", self.verbosity)
315 for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()):
316 for transcript1 in self.parsers[0].getIterator():
317 if transcript1.overlapWithExon(transcript2):
318 self.computeCoverage(transcript1, transcript2, cpt2)
319 self.computeOverlap(transcript1, transcript2, cpt2)
320 progress.inc()
321 progress.done()
322
323 def extractReferenceQuery(self, inputTranscript):
324 if "Target" not in inputTranscript.getTagNames():
325 raise Exception("Cannot extract Target field in line '%s'." % (inputTranscript))
326 id, start, end, strand = parseTargetField(inputTranscript.getTagValue("Target"))
327 if id not in self.sizes:
328 raise Exception("Target id '%s' of transcript '%s' does not correspond to anything in FASTA file." % (id, inputTranscript))
329 referenceTranscript = Transcript()
330 referenceTranscript.setChromosome(id)
331 referenceTranscript.setName(id)
332 referenceTranscript.setDirection("+")
333 referenceTranscript.setEnd(self.sizes[id])
334 referenceTranscript.setStart(1)
335 queryTranscript = Transcript()
336 queryTranscript.setChromosome(id)
337 queryTranscript.setName(id)
338 queryTranscript.setStart(start)
339 queryTranscript.setEnd(end)
340 queryTranscript.setDirection(strand)
341 if inputTranscript.getNbExons() > 1:
342 factor = float(end - start) / (inputTranscript.getEnd() - inputTranscript.getStart())
343 for exon in inputTranscript.getExons():
344 newExon = Interval()
345 newExon.setChromosome(id)
346 newExon.setDirection(strand)
347 if "Target" in inputTranscript.getTagNames():
348 id, start, end, strand = parseTargetField(exon.getTagValue("Target"))
349 newExon.setStart(start)
350 newExon.setEnd(end)
351 else:
352 newExon.setStart(int(round((exon.getStart() - inputTranscript.getStart()) * factor)) + start)
353 newExon.setEnd( int(round((exon.getEnd() - inputTranscript.getStart()) * factor)) + start)
354 queryTranscript.addExon(newExon)
355 return (referenceTranscript, queryTranscript)
356
357 def compute1TranscriptFiles(self):
358 progress = Progress(self.parsers[1].getNbTranscripts(), "Comparing regions", self.verbosity)
359 for transcript in self.parsers[1].getIterator():
360 referenceTranscript, queryTranscript = self.extractReferenceQuery(transcript)
361 self.computeCoverage(queryTranscript, referenceTranscript, referenceTranscript.getName())
362 self.computeOverlap(queryTranscript, referenceTranscript, referenceTranscript.getName())
363 progress.inc()
364 progress.done()
365
366 def compute(self):
367 if self.sequenceParser == None:
368 self.compute2TranscriptFiles()
369 else:
370 self.compute1TranscriptFiles()
371
372 def plotTranscript(self, index, transcript):
373 plotter = Plotter(self.seed, index, self.verbosity)
374 plotter.setOutputFileName(self.outputFileName)
375 plotter.setTranscript(transcript)
376 plotter.setTitle(self.title)
377 plotter.setLabels(self.xLabel, self.yLabel)
378 plotter.setPlotSize(self.width, self.height)
379 plotter.setCoverageData(self.coverage[index])
380 plotter.setOverlapData(self.overlap[index])
381 plotter.setMerge(self.merge)
382 plotter.plot()
383 output = plotter.log
384 return output
385
386 def plot1TranscriptFile(self):
387 self.outputCoverage = {}
388 self.outputCoveragePerStrand = {}
389 output = ""
390 progress = Progress(len(self.sequenceParser.getRegions()), "Plotting regions", self.verbosity)
391 for cpt2, region in enumerate(self.sequenceParser.getRegions()):
392 transcript = Transcript()
393 transcript.setName(region)
394 transcript.setDirection("+")
395 transcript.setEnd(self.sizes[region])
396 transcript.setStart(1)
397 output += self.plotTranscript(region, transcript)
398 progress.inc()
399 progress.done()
400 if self.verbosity > 0:
401 print output
402
403 def plot2TranscriptFiles(self):
404 self.outputCoverage = [0] * self.parsers[1].getNbTranscripts()
405 self.outputCoveragePerStrand = [None] * self.parsers[1].getNbTranscripts()
406 for cpt in range(self.parsers[1].getNbTranscripts()):
407 self.outputCoveragePerStrand[cpt] = dict([strand, 0] for strand in strands)
408 progress = Progress(self.parsers[1].getNbTranscripts(), "Plotting regions", self.verbosity)
409 output = ""
410 for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()):
411 output += self.plotTranscript(cpt2, transcript2)
412 progress.inc()
413 progress.done()
414 if self.verbosity > 0:
415 print output
416
417 def plot(self):
418 if self.sequenceParser == None:
419 self.plot2TranscriptFiles()
420 else:
421 self.plot1TranscriptFile()
422
423 def start(self):
424 self.initialize()
425 self.compute()
426 self.plot()
427
428
429 if __name__ == "__main__":
430
431 # parse command line
432 description = "Plot Coverage v1.0.1: Plot the coverage of the first data with respect to the second one. [Category: Visualization]"
433
434 parser = OptionParser(description = description)
435 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
436 parser.add_option("-f", "--inputFormat1", dest="inputFormat1", action="store", type="string", help="format of input file 1 [compulsory] [format: transcript file format]")
437 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
438 parser.add_option("-g", "--inputFormat2", dest="inputFormat2", action="store", type="string", help="format of input file 2 [compulsory] [format: transcript file format]")
439 parser.add_option("-q", "--sequence", dest="inputSequence", action="store", default=None, type="string", help="input sequence file [format: file in FASTA format] [default: None]")
440 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]")
441 parser.add_option("-w", "--width", dest="width", action="store", default=1500, type="int", help="width of the plots (in px) [format: int] [default: 1500]")
442 parser.add_option("-e", "--height", dest="height", action="store", default=1000, type="int", help="height of the plots (in px) [format: int] [default: 1000]")
443 parser.add_option("-t", "--title", dest="title", action="store", default="", type="string", help="title of the plots [format: string]")
444 parser.add_option("-x", "--xlab", dest="xLabel", action="store", default="", type="string", help="label on the x-axis [format: string]")
445 parser.add_option("-y", "--ylab", dest="yLabel", action="store", default="", type="string", help="label on the y-axis [format: string]")
446 parser.add_option("-p", "--plusColor", dest="plusColor", action="store", default="red", type="string", help="color for the elements on the plus strand [format: string] [default: red]")
447 parser.add_option("-m", "--minusColor", dest="minusColor", action="store", default="blue", type="string", help="color for the elements on the minus strand [format: string] [default: blue]")
448 parser.add_option("-s", "--sumColor", dest="sumColor", action="store", default="black", type="string", help="color for 2 strands coverage line [format: string] [default: black]")
449 parser.add_option("-l", "--lineColor", dest="lineColor", action="store", default="black", type="string", help="color for the lines [format: string] [default: black]")
450 parser.add_option("-1", "--merge", dest="merge", action="store_true", default=False, help="merge the 2 plots in 1 [format: boolean] [default: false]")
451 parser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]")
452 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
453 (options, args) = parser.parse_args()
454
455 colors[1] = options.plusColor
456 colors[-1] = options.minusColor
457 colors[0] = options.sumColor
458 colorLine = options.lineColor
459
460 pp = PlotParser(options.verbosity)
461 pp.addInput(0, options.inputFileName1, options.inputFormat1)
462 pp.addInput(1, options.inputFileName2, options.inputFormat2)
463 pp.addSequence(options.inputSequence)
464 if options.working_Dir[-1] != '/':
465 path = options.working_Dir + '/'
466 pp.setOutput(path + options.outputFileName)
467 pp.setPlotSize(options.width, options.height)
468 pp.setLabels(options.xLabel, options.yLabel)
469 pp.setTitle(options.title)
470 pp.setMerge(options.merge)
471 pp.start()
472
473