diff smart_toolShed/SMART/Java/Python/getWigProfile.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/smart_toolShed/SMART/Java/Python/getWigProfile.py	Thu Jan 17 10:52:14 2013 -0500
@@ -0,0 +1,160 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+# 
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# 
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+# 
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+# 
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+"""
+Cluster the data into regions (defined by size and overlap with next region) and keep only highest peaks.
+"""
+
+import math
+from optparse import OptionParser
+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+from commons.core.parsing.WigParser import WigParser
+from SMART.Java.Python.misc.Progress import Progress
+from SMART.Java.Python.misc.RPlotter import RPlotter
+
+class GetWigProfile(object):
+
+	def __init__(self, verbosity):
+		self.verbosity	= verbosity
+		self.values		 = {}
+		self.defaultValue = 0.0
+
+	def _iToJ(self, i, size):
+		return min(self.nbPoints+1, int(math.floor(float(i - self.distance) / (size) * (self.nbPoints))))
+
+	def readTranscripts(self):
+		self.strandNames = (1, -1) if self.strands else (1, )
+		self.values		= dict([(strand, dict([(i, 0.0) for i in range(self.nbPoints + 2 * self.distance)])) for strand in self.strandNames])
+		transcriptParser = TranscriptContainer(self.inputFileName, self.inputFormat, self.verbosity)
+		wigParser		= WigParser(self.wig)
+		nbValues		 = dict([(strand, dict([(i, 0.0) for i in range(self.nbPoints + 2 * self.distance)])) for strand in self.strandNames])
+		wigParser.setStrands(self.strands)
+		wigParser.setDefaultValue(self.defaultValue)
+
+		progress = Progress(transcriptParser.getNbTranscripts(), "Parsing %s" % (self.inputFileName), self.verbosity)
+		for transcript in transcriptParser.getIterator():
+			transcriptSize = transcript.getSize()
+			expectedSize   = transcriptSize + 2 * self.distance
+			transcript.extendStart(self.distance)
+			transcript.extendEnd(self.distance)
+			theseValues = transcript.extractWigData(wigParser)
+
+			if len(self.strandNames) == 1:
+				theseValues = {1: theseValues}
+			for strand in self.strandNames:
+				if len(theseValues[strand]) < expectedSize:
+					theseValues[strand] = [self.defaultValue] * (expectedSize - len(theseValues[strand])) + theseValues[strand]
+				if len(theseValues[strand]) != expectedSize:
+					raise Exception("Got something wrong with the size of the WIG data concerning %s [%s]: %d found instead of %d" % (transcript, ",".join(["%d-%d" % (exon.getStart(), exon.getEnd()) for exon in transcript.getExons()]), len(theseValues[strand]), expectedSize))
+				fivePValues = theseValues[strand][: self.distance]
+				nbValues         = [0.0] * (self.nbPoints)
+				transcriptValues = [0.0] * (self.nbPoints)
+				for i in range(self.distance, len(theseValues[strand]) - self.distance):
+					startJ = self._iToJ(i, transcriptSize)
+					endJ   = max(startJ+1, self._iToJ(i+1, transcriptSize))
+					for j in range(startJ, endJ):
+						transcriptValues[j]	+= theseValues[strand][i]
+						nbValues[j] += 1
+				threePValues = theseValues[strand][-self.distance: ]
+				values = fivePValues + [self.defaultValue if nbValue == 0 else transcriptValue / nbValue for transcriptValue, nbValue in zip(transcriptValues, nbValues)] + threePValues
+				for i, value in enumerate(values):
+					self.values[strand][i] += value
+			progress.inc()
+		progress.done()
+
+		for strand in self.strandNames:
+			if strand == 0:
+				strand = 1
+			for i in range(self.nbPoints + 2 * self.distance):
+				self.values[strand][i] /= transcriptParser.getNbTranscripts() * strand
+
+
+	def smoothen(self):
+		if self.smoothenForce == None:
+			return
+		for strand in self.strandNames:
+			averageValues = {}
+			for center in range(self.distance, self.distance + self.nbPoints):
+				sum		= 0.0
+				nbValues = 0.0
+				for i in range(center - self.smoothenForce + 1, center + self.smoothenForce):
+					if i > self.distance and i < self.distance + self.nbPoints:
+						nbValues += 1
+						sum		+= self.values[strand][i]
+				averageValues[center] = sum / nbValues
+			for position in range(self.distance, self.distance + self.nbPoints):
+				self.values[strand][position] = averageValues[position]
+		
+
+	def plot(self):
+		plotter = RPlotter(self.outputFileName, self.verbosity)
+		for strand in self.strandNames:
+			plotter.addLine(self.values[strand])
+		if self.log:
+			plotter.setLog("y")
+		plotter.setAxisLabel("x", {0: -self.distance, self.distance: "start", self.distance+self.nbPoints-1: "end", 2*self.distance+self.nbPoints-1: self.distance})
+		plotter.plot()
+
+
+
+if __name__ == "__main__":
+	
+	# parse command line
+	description = "Get WIG Profile v1.0.1: Compute the average profile of some genomic coordinates using WIG files (thus covering a large proportion of the genome). [Category: WIG Tools]"
+
+	parser = OptionParser(description = description)
+	parser.add_option("-i", "--input",			 dest="inputFileName",	action="store",											type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
+	parser.add_option("-f", "--inputFormat", dest="inputFormat",		action="store",											type="string", help="format of the input file [compulsory] [format: transcript file format]")
+	parser.add_option("-w", "--wig",				 dest="wig",						action="store",											type="string", help="wig file name [compulsory] [format: file in WIG format]")	
+	parser.add_option("-p", "--nbPoints",		 dest="nbPoints",				action="store",			 default=1000,	type="int",		 help="number of points on the x-axis [compulsory] [format: int] [default: 1000]")	
+	parser.add_option("-d", "--distance",		 dest="distance",				action="store",			 default=0,			type="int",		 help="distance around genomic coordinates [compulsory] [format: int] [default: 0]")	
+	parser.add_option("-s", "--strands",		 dest="strands",				action="store_true", default=False,								 help="consider both strands separately [format: boolean] [default: False]")	
+	parser.add_option("-m", "--smoothen",		 dest="smoothen",				action="store",			 default=None,	type="int",		 help="smoothen the curve [format: int] [default: None]")	
+	parser.add_option("-a", "--default",		 dest="defaultValue",	 action="store",			 default=0.0,	 type="float",	help="default value (when value is NA) [default: 0.0] [format: float]")
+	parser.add_option("-o", "--output",			 dest="outputFileName", action="store",											type="string", help="output file [compulsory] [format: output file in PNG format]")
+	parser.add_option("-l", "--log",				 dest="log",						action="store_true", default=False,								 help="use log scale for y-axis	[format: boolean] [default: False]")
+	parser.add_option("-v", "--verbosity",	 dest="verbosity",			action="store",			 default=1,			type="int",		 help="trace level [format: int]")
+	(options, args) = parser.parse_args()
+
+	wigProfile								= GetWigProfile(options.verbosity)
+	wigProfile.strands			 	= options.strands
+	wigProfile.inputFileName	= options.inputFileName
+	wigProfile.inputFormat		= options.inputFormat
+	wigProfile.wig						= options.wig
+	wigProfile.nbPoints				= options.nbPoints
+	wigProfile.distance				= options.distance
+	wigProfile.smoothenForce	= options.smoothen
+	wigProfile.defaultValue	  = options.defaultValue
+	wigProfile.outputFileName = options.outputFileName
+	wigProfile.log						= options.log
+
+	wigProfile.readTranscripts()
+	wigProfile.smoothen()
+	wigProfile.plot()