view SMART/Java/Python/GetReadDistribution.py @ 11:2da30502c2f1

Updated CompareOverlappingSmallQuery.xml
author m-zytnicki
date Thu, 14 Mar 2013 05:37:08 -0400
parents 769e306b7933
children 94ab73e8a190
line wrap: on
line source

#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
# 
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
# 
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
# 
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
# 
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import random, os, glob, subprocess
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.parsing.GffParser import GffParser
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc import Utils
from commons.core.LoggerFactory import LoggerFactory
from commons.core.utils.RepetOptionParser import RepetOptionParser

# Prefix of the logger name built in GetReadDistribution.__init__.
LOG_DEPTH      = "smart"
# Pseudo-region used as distribution key when no regions file was loaded.
DEFAULT_REGION = "_all_"
# Suffix appended to the x-axis label, keyed by the divisor applied to the
# positions (1 -> bp, 1000 -> kbp, 1000000 -> Mbp).
MULTIPLE_STR   = {1: "", 1000: " (in kbp)", 1000000: " (in Mbp)"}

class GetReadDistribution(object):
	"""Plot the distribution of reads along the chromosomes.

	Each input file is parsed, the reads are counted per bin of `binSize`
	positions (optionally restricted to the regions of a GFF file), the
	counts are written to a temporary table and a generated R/ggplot2
	script draws one PNG per region.
	"""

	def __init__(self, verbosity = 0):
		"""Set up the logger and the default value of every option."""
		self.xLab           = ""
		self.yLab           = "# reads"
		self.verbosity      = verbosity
		# Random suffix of the temporary file names.
		self.number         = random.randint(0, 100000)
		self.log            = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
		self.parsers        = {}
		# region -> sample name -> chromosome -> bin index -> count
		self.distribution   = {}
		self.factors        = {}
		self.regions        = None
		self.tmpDatName     = None
		self.tmpRName       = None
		self.quorum         = 1
		self.width          = 800
		self.height         = 300
		# Options that were previously only created by their setter; they
		# get a default here so that an omitted setter does not end in an
		# AttributeError (bin size default mirrors the command line).
		self.names          = []
		self.outputFileName = None
		self.binSize        = 10000
		self.colors         = None
		self.multiple       = False

	def setNames(self, names):
		"""Set the sample names (one per input file, same order)."""
		self.names = names

	def setInputFiles(self, fileNames, format):
		"""Create one parser per input file, keyed by the matching sample name.

		setNames() must have been called before.
		Raises an Exception if the numbers of names and files differ.
		"""
		if len(fileNames) != len(self.names):
			self._logAndRaise("ERROR: %d input file(s) given for %d name(s)" % (len(fileNames), len(self.names)))
		chooser = ParserChooser(self.verbosity)
		chooser.findFormat(format)
		for name, fileName in zip(self.names, fileNames):
			self.parsers[name] = chooser.getParser(fileName)

	def setOutputFileName(self, fileName):
		"""Set the name of the output PNG file."""
		self.outputFileName = fileName

	def setLabs(self, xLab, yLab):
		"""Set the labels of the x- and y-axes."""
		self.xLab = xLab
		self.yLab = yLab

	def setBinSize(self, binSize):
		"""Set the number of positions covered by one bin."""
		self.binSize = binSize

	def setColors(self, colors):
		"""Set the list of bar colors (None selects the "Set1" brewer palette)."""
		self.colors = colors

	def setFactors(self, factors):
		"""Set the normalization factors, given in the same order as the names.

		None (no normalization requested) resets the factors, so that every
		sample is counted with a factor of 1.
		"""
		if factors is None:
			self.factors = {}
		else:
			self.factors = dict(zip(self.names, factors))

	def setMultiple(self, boolean):
		"""Choose whether positions may be printed in kbp or Mbp."""
		self.multiple = boolean

	def setImageSize(self, width, height):
		"""Set the image size; a None value keeps the current dimension."""
		if width is not None:
			self.width = width
		if height is not None:
			self.height = height

	def setQuorum(self, quorum):
		"""Set the minimum bin count a region must reach to be plotted."""
		self.quorum = quorum

	def setRegionsFile(self, fileName):
		"""Load the regions of a GFF file, if a file name is given."""
		if fileName is not None:
			self._loadRegions(fileName)

	def _checkOptions(self):
		"""Raise an Exception if no input file was declared."""
		if not self.parsers:
			self._logAndRaise("ERROR: Missing input file names")

	def _logAndRaise(self, errorMsg):
		"""Log the message as an error, then raise it as an Exception."""
		self.log.error(errorMsg)
		raise Exception(errorMsg)

	def _loadRegions(self, fileName):
		"""Index the names of the GFF regions by chromosome, start and end."""
		self.regions = {}
		parser       = GffParser(fileName, self.verbosity)
		for transcript in parser.getIterator():
			byStart = self.regions.setdefault(transcript.getChromosome(), {})
			byEnd   = byStart.setdefault(transcript.getStart(), {})
			byEnd.setdefault(transcript.getEnd(), []).append(transcript.getName())

	def _getRegions(self, transcript):
		"""Return the names of the loaded regions that overlap the transcript.

		Without loaded regions, the single pseudo-region DEFAULT_REGION is
		returned.
		"""
		if self.regions is None:
			return [DEFAULT_REGION]
		chromosome = transcript.getChromosome()
		start      = transcript.getStart()
		end        = transcript.getEnd()
		if chromosome not in self.regions:
			return []
		names         = []
		regionsByStart = self.regions[chromosome]
		for loadedStart in sorted(regionsByStart):
			# Starts are visited in increasing order: nothing further overlaps.
			if loadedStart > end:
				break
			regionsByEnd = regionsByStart[loadedStart]
			# Ends are visited in decreasing order, so the scan can stop at
			# the first region finishing before the transcript.
			for loadedEnd in sorted(regionsByEnd, reverse = True):
				if loadedEnd < start:
					break
				names.extend(regionsByEnd[loadedEnd])
		return names

	def _addTranscript(self, name, transcript, regions):
		"""Add the bins covered by a transcript to the counts of each region.

		The count added to a bin is the "nbElements" tag of the transcript
		(1 if absent) times the normalization factor of the sample.
		"""
		chromosome  = transcript.getChromosome()
		nbElements  = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1
		nbElements *= self.factors.get(name, 1)
		for region in regions:
			bins        = self.distribution.setdefault(region, {}).setdefault(name, {}).setdefault(chromosome, {})
			previousBin = None
			for exon in transcript.getExons():
				exonStart = exon.getStart()
				exonEnd   = exon.getEnd()
				if exonEnd < exonStart:
					continue
				# Walk over the covered bins rather than over every position;
				# '//' keeps the bin index an integer on every Python version.
				for binIndex in range(exonStart // self.binSize, exonEnd // self.binSize + 1):
					# A bin shared with the end of the previous exon is
					# counted only once per transcript.
					if binIndex != previousBin:
						bins[binIndex] = bins.get(binIndex, 0) + nbElements
						previousBin    = binIndex

	def _parse(self, name):
		"""Read the input file of the given sample and fill the distribution."""
		progress = UnlimitedProgress(10000, "Reading file '%s'" % (name), self.verbosity)
		for transcript in self.parsers[name].getIterator():
			if transcript.__class__.__name__ == "Mapping":
				transcript = transcript.getTranscript()
			regions = self._getRegions(transcript)
			if regions:
				self._addTranscript(name, transcript, regions)
			progress.inc()
		progress.done()

	def _checkQuorum(self, region):
		"""Tell whether the highest bin count of the region reaches the quorum.

		A None quorum always passes; a region without any bin never does.
		"""
		if self.quorum is None:
			return True
		counts = []
		for chromosomes in self.distribution[region].values():
			for bins in chromosomes.values():
				counts.extend(bins.values())
		if not counts:
			return False
		return max(counts) >= self.quorum

	def _writeData(self, region):
		"""Write the counts of the region to the temporary data table."""
		self.tmpDatName = "tmpFile%d.dat" % (self.number)
		with open(self.tmpDatName, "w") as handle:
			handle.write("Chr\tPos\tCount\tSample\n")
			for name in self.distribution[region]:
				chromosomes = self.distribution[region][name]
				for chromosome in sorted(chromosomes):
					bins = chromosomes[chromosome]
					for pos in sorted(bins):
						# NOTE(review): '%d' truncates counts made fractional by a
						# normalization factor -- confirm this is intended.
						handle.write("%s\t%d\t%d\t\"%s\"\n" % (chromosome, pos * self.binSize, bins[pos], name))

	def _findMultiple(self, region):
		"""Return the divisor (1, 1000 or 1000000) applied to the plotted positions.

		The choice is made on the highest position written for the region,
		i.e. the highest bin index times the bin size.
		"""
		if not self.multiple:
			return 1
		binIndices = []
		for chromosomes in self.distribution[region].values():
			for bins in chromosomes.values():
				binIndices.extend(bins.keys())
		if not binIndices:
			return 1
		maxPosition = max(binIndices) * self.binSize
		if maxPosition > 2000000:
			return 1000000
		if maxPosition > 2000:
			return 1000
		return 1

	def _writeScript(self, region):
		"""Write the temporary R script that plots the data table of the region.

		The default region is drawn in the output file itself; any other
		region goes to "<output base name>_<region>.png".
		"""
		self.tmpRName = "tmpFile%d.R" % (self.number)
		if region == DEFAULT_REGION:
			fileName = self.outputFileName
			title    = ""
			facet    = "Sample ~ Chr"
		else:
			fileName = "%s_%s.png" % (os.path.splitext(self.outputFileName)[0], region)
			title    = " of %s" % (region)
			facet    = "Sample ~ ."
		if self.colors is None:
			colors = "scale_fill_brewer(palette=\"Set1\") + scale_color_brewer(palette=\"Set1\")"
		else:
			colorList = ", ".join(["\"%s\"" % (color) for color in self.colors])
			colors    = "scale_fill_manual(values = c(%s)) + scale_color_manual(values = c(%s))" % (colorList, colorList)
		sampleList = ", ".join(["\"%s\"" % (name) for name in self.names])
		multiple   = self._findMultiple(region)
		# The file is closed (hence flushed) before R is started on it.
		# NOTE(review): opts() and theme_blank() belong to the old ggplot2
		# API -- check them against the installed ggplot2 version.
		with open(self.tmpRName, "w") as handle:
			handle.write("library(ggplot2)\n")
			handle.write("data <- read.table(\"%s\", header = T)\n" % (self.tmpDatName))
			handle.write("data$Sample <- factor(data$Sample, levels=c(%s))\n" % (sampleList))
			handle.write("png(\"%s\", width = %d, height = %d)\n" % (fileName, self.width, self.height))
			handle.write("ggplot(data, aes(x = Pos/%d, y = Count, fill = Sample, color = Sample)) + opts(title = \"Distribution%s\") + geom_bar(stat = \"identity\") + facet_grid(%s, space=\"free\") + xlab(\"%s%s\") + ylab(\"%s\") + %s + opts(legend.position = \"none\", panel.grid.major = theme_blank(), panel.grid.minor = theme_blank(), panel.background = theme_blank())\n" % (multiple, title, facet, self.xLab, MULTIPLE_STR[multiple], self.yLab, colors))
			handle.write("dev.off()\n")

	def _runR(self):
		"""Run the temporary script with "R CMD BATCH".

		The R executable is taken from the SMARTRPATH environment variable
		when it is set, and is "R" otherwise.
		Raises an Exception if R cannot be started or exits with a non-zero
		status.
		"""
		rCommand = os.environ.get("SMARTRPATH", "R")
		try:
			# Argument list instead of a shell string: no quoting issue with
			# the path of the executable.
			status = subprocess.call([rCommand, "CMD", "BATCH", self.tmpRName])
		except OSError as error:
			raise Exception("Problem with the execution of script file %s, status is: %s" % (self.tmpRName, error))
		if status != 0:
			raise Exception("Problem with the execution of script file %s, status is: %s" % (self.tmpRName, status))

	def _plot(self):
		"""Plot every region whose counts reach the quorum."""
		progress = Progress(len(self.distribution), "Plotting data", self.verbosity)
		for region in self.distribution:
			if self._checkQuorum(region):
				self._writeData(region)
				self._writeScript(region)
				self._runR()
			else:
				self.log.info("Not displaying '%s' for it contains insufficient data." % (region))
			progress.inc()
		progress.done()

	def _cleanFiles(self):
		"""Remove the temporary data and script files, and the files named after them."""
		for fileName in (self.tmpDatName, self.tmpRName):
			if fileName is not None and os.path.exists(fileName):
				os.remove(fileName)
				# Also removes files sharing the prefix, such as the output
				# R writes next to the script.
				for otherFileName in glob.glob("%s*" % (fileName)):
					os.remove(otherFileName)

	def run(self):
		"""Check the options, parse every input file, plot and clean up."""
		LoggerFactory.setLevel(self.log, self.verbosity)
		self._checkOptions()
		self.log.info("START Get Read Distribution")
		for name in self.names:
			self._parse(name)
		self._plot()
		self._cleanFiles()
		self.log.info("END Get Read Distribution")


# Command-line entry point: parse the options, configure a GetReadDistribution
# instance accordingly and run it.
if __name__ == "__main__":
	description = "Usage: GetReadDistribution.py [options]\n\nGet Read Distribution v1.0.1: Get the distribution of a set of reads. [Category: Personal]\n"
	epilog = ""
	parser = RepetOptionParser(description = description, epilog = epilog)
	parser.add_option("-i", "--input",     dest="inputFileNames",  action="store",      default=None,      type="string", help="input files, separated by commas [compulsory] [format: string]")
	parser.add_option("-f", "--format",    dest="format",          action="store",      default=None,      type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
	parser.add_option("-n", "--names",     dest="names",           action="store",      default=None,      type="string", help="name of the input data, separated by commas [compulsory] [format: string]")
	parser.add_option("-o", "--output",    dest="outputFileName",  action="store",      default=None,      type="string", help="output file [format: output file in PNG format]")
	parser.add_option("-s", "--binSize",   dest="binSize",         action="store",      default=10000,     type="int",    help="bin size [format: int] [default: 10000]")
	parser.add_option("-l", "--xLabel",    dest="xLab",            action="store",      default="",        type="string", help="x-axis label name [format: string]")
	parser.add_option("-L", "--yLabel",    dest="yLab",            action="store",      default="# reads", type="string", help="y-axis label name [format: string] [default: # reads]")
	parser.add_option("-c", "--colors",    dest="colors",          action="store",      default=None,      type="string", help="colors of the bars, separated by commas  [format: string]")
	parser.add_option("-a", "--factors",   dest="factors",         action="store",      default=None,      type="string", help="normalization factors, separated by commas  [format: string]")
	parser.add_option("-r", "--regions",   dest="regionsFileName", action="store",      default=None,      type="string", help="regions to plot [format: transcript file in GFF format]")
	parser.add_option("-m", "--multiple",  dest="multiple",        action="store_true", default=False,                    help="print position using multiples (k, M) [format: boolean] [default: False]")
	parser.add_option("-q", "--quorum",    dest="quorum",          action="store",      default=1,         type="int",    help="minimum number of intervals to plot a region [format: int] [default: 1]")
	parser.add_option("-z", "--width",     dest="width",           action="store",      default=800,       type="int",    help="width of the image [format: int] [default: 800]")
	parser.add_option("-Z", "--height",    dest="height",          action="store",      default=300,       type="int",    help="height of the image [format: int] [default: 300]")
	parser.add_option("-v", "--verbosity", dest="verbosity",       action="store",      default=1,         type="int",    help="trace level [format: int]")
	options = parser.parse_args()[0]
	# Fail with a readable message, rather than with an AttributeError on
	# None.split(), when a compulsory option is missing.
	for optionName, optionValue in (("--input", options.inputFileNames), ("--format", options.format), ("--names", options.names)):
		if optionValue is None:
			raise SystemExit("ERROR: Missing compulsory option '%s'" % (optionName))
	iGetReadDistribution = GetReadDistribution(options.verbosity)
	# The names must be set before the input files and the factors, which
	# are both matched to them by position.
	iGetReadDistribution.setNames(options.names.split(","))
	iGetReadDistribution.setInputFiles(options.inputFileNames.split(","), options.format)
	iGetReadDistribution.setOutputFileName(options.outputFileName)
	iGetReadDistribution.setLabs(options.xLab, options.yLab)
	iGetReadDistribution.setBinSize(options.binSize)
	iGetReadDistribution.setColors(None if options.colors is None else options.colors.split(","))
	# Without factors, the instance keeps its default (no normalization).
	if options.factors is not None:
		iGetReadDistribution.setFactors([float(factor) for factor in options.factors.split(",")])
	iGetReadDistribution.setRegionsFile(options.regionsFileName)
	iGetReadDistribution.setMultiple(options.multiple)
	iGetReadDistribution.setQuorum(options.quorum)
	iGetReadDistribution.setImageSize(options.width, options.height)
	iGetReadDistribution.run()