| 36 | 1 #! /usr/bin/env python | 
|  | 2 # | 
|  | 3 # Copyright INRA-URGI 2009-2010 | 
|  | 4 # | 
|  | 5 # This software is governed by the CeCILL license under French law and | 
|  | 6 # abiding by the rules of distribution of free software. You can use, | 
|  | 7 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 8 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 9 # "http://www.cecill.info". | 
|  | 10 # | 
|  | 11 # As a counterpart to the access to the source code and rights to copy, | 
|  | 12 # modify and redistribute granted by the license, users are provided only | 
|  | 13 # with a limited warranty and the software's author, the holder of the | 
|  | 14 # economic rights, and the successive licensors have only limited | 
|  | 15 # liability. | 
|  | 16 # | 
|  | 17 # In this respect, the user's attention is drawn to the risks associated | 
|  | 18 # with loading, using, modifying and/or developing or reproducing the | 
|  | 19 # software by the user in light of its specific status of free software, | 
|  | 20 # that may mean that it is complicated to manipulate, and that also | 
|  | 21 # therefore means that it is reserved for developers and experienced | 
|  | 22 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 23 # encouraged to load and test the software's suitability as regards their | 
|  | 24 # requirements in conditions enabling the security of their systems and/or | 
|  | 25 # data to be ensured and, more generally, to use and operate it in the | 
|  | 26 # same conditions as regards security. | 
|  | 27 # | 
|  | 28 # The fact that you are presently reading this means that you have had | 
|  | 29 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 30 # | 
|  | 31 import os, sys | 
|  | 32 from optparse import OptionParser | 
|  | 33 from commons.core.parsing.FastaParser import FastaParser | 
|  | 34 from commons.core.parsing.FastqParser import FastqParser | 
|  | 35 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer | 
|  | 36 from commons.core.parsing.GffParser import GffParser | 
|  | 37 from SMART.Java.Python.misc.Progress import Progress | 
|  | 38 from SMART.Java.Python.misc.RPlotter import RPlotter | 
|  | 39 from SMART.Java.Python.misc import Utils | 
|  | 40 | 
|  | 41 from commons.core.LoggerFactory import LoggerFactory | 
|  | 42 from commons.core.utils.RepetOptionParser import RepetOptionParser | 
|  | 43 | 
|  | 44 LOG_DEPTH = "smart" | 
|  | 45 | 
|  | 46 class GetSizes(object): | 
|  | 47 | 
|  | 48 	def __init__(self, inFileName = None, inFormat=None, outFileName = None, query=None,xMax=None, xMin=None, verbosity = 0): | 
|  | 49 		self.inFileName = inFileName | 
|  | 50 		self.inFormat= inFormat | 
|  | 51 		self.outFileName = outFileName | 
|  | 52 		self.query = query | 
|  | 53 		self.xMax = xMax | 
|  | 54 		self.xMin = xMin | 
|  | 55 		self.xLab = "Size" | 
|  | 56 		self.yLab = "# reads" | 
|  | 57 		self.barplot = False | 
|  | 58 		self._verbosity = verbosity | 
|  | 59 		self.parser = None | 
|  | 60 		self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) | 
|  | 61 | 
|  | 62 	def setAttributesFromCmdLine(self): | 
|  | 63 		description = "Usage: getSizes.py [options]\n\nGet Sizes v1.0.2: Get the sizes of a set of genomic coordinates. [Category: Visualization]\n" | 
|  | 64 		epilog = "" | 
|  | 65 		parser = RepetOptionParser(description = description, epilog = epilog) | 
|  | 66 		parser.add_option("-i", "--input",	 dest="inputFileName", action="store",	    default=None,	   type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]") | 
|  | 67 		parser.add_option("-f", "--format",	dest="format",		   action="store",	    default=None,	   type="string", help="format of the input [compulsory] [format: transcript or sequence file format]") | 
|  | 68 		parser.add_option("-q", "--query",	 dest="query",		   action="store",	    default=None,	   type="string", help="type to mesure [default: size] [format: choice (size, intron size, exon size, 1st exon size)]") | 
|  | 69 		parser.add_option("-o", "--output",	dest="outputFileName", action="store",	    default=None,	   type="string", help="output file [format: output file in PNG format]") | 
|  | 70 		parser.add_option("-x", "--xMax",	  dest="xMax",		   action="store",	    default=None,	   type="int",	  help="maximum value on the x-axis to plot [format: int]") | 
|  | 71 		parser.add_option("-X", "--xMin",	  dest="xMin",		   action="store",	    default=None,	   type="int",	  help="minimum value on the x-axis to plot [format: int]") | 
|  | 72 		parser.add_option("-v", "--verbosity", dest="verbosity",   action="store",	    default=1,		   type="int",	  help="trace level [format: int]") | 
|  | 73 		parser.add_option("-a", "--xLabel",	dest="xLab",           action="store",	    default="Size",	   type="string", help="x absis label name [format: string] [default: Size]") | 
|  | 74 		parser.add_option("-b", "--yLabel",	dest="yLab",           action="store",	    default="# reads", type="string", help="y absis label name [format: string] [default: Reads]") | 
|  | 75 		parser.add_option("-B", "--barplot",   dest="barplot",     action="store_true", default=False,					  help="use barplot representation [format: bool] [default: false]") | 
|  | 76 		options = parser.parse_args()[0] | 
|  | 77 		self._setAttributesFromOptions(options) | 
|  | 78 | 
|  | 79 	def _setAttributesFromOptions(self, options): | 
|  | 80 		self.setInFileName(options.inputFileName) | 
|  | 81 		self.setInFormat(options.format) | 
|  | 82 		self.setQuery(options.query) | 
|  | 83 		self.setOutFileName(options.outputFileName) | 
|  | 84 		self.setXMax(options.xMax) | 
|  | 85 		self.setXMin(options.xMin) | 
|  | 86 		self.setxLab(options.xLab) | 
|  | 87 		self.setyLab(options.yLab) | 
|  | 88 		self.setBarplot(options.barplot) | 
|  | 89 		self.setVerbosity(options.verbosity) | 
|  | 90 | 
|  | 91 	def setInFileName(self, inputFileName): | 
|  | 92 		self.inFileName = inputFileName | 
|  | 93 | 
|  | 94 	def setInFormat(self, inFormat): | 
|  | 95 		self.inFormat = inFormat | 
|  | 96 | 
|  | 97 	def setQuery(self, query): | 
|  | 98 		self.query = query | 
|  | 99 | 
|  | 100 	def setOutFileName(self, outFileName): | 
|  | 101 		self.outFileName = outFileName | 
|  | 102 | 
|  | 103 	def setXMax(self, xMax): | 
|  | 104 		self.xMax = xMax | 
|  | 105 | 
|  | 106 	def setXMin(self, xMin): | 
|  | 107 		self.xMin = xMin | 
|  | 108 | 
|  | 109 	def setxLab(self, xLab): | 
|  | 110 		self.xLab = xLab | 
|  | 111 | 
|  | 112 	def setyLab(self, yLab): | 
|  | 113 		self.yLab = yLab | 
|  | 114 | 
|  | 115 	def setBarplot(self, barplot): | 
|  | 116 		self.barplot = barplot | 
|  | 117 | 
|  | 118 	def setVerbosity(self, verbosity): | 
|  | 119 		self._verbosity = verbosity | 
|  | 120 | 
|  | 121 	def _checkOptions(self): | 
|  | 122 		if self.inFileName == None: | 
|  | 123 			self._logAndRaise("ERROR: Missing input file name") | 
|  | 124 		if self.inFormat == "fasta": | 
|  | 125 			self.parser = FastaParser(self.inFileName, self._verbosity) | 
|  | 126 		elif self.inFormat == "fastq": | 
|  | 127 			self.parser = FastqParser(self.inFileName, self._verbosity) | 
|  | 128 		else: | 
|  | 129 			self.parser = TranscriptContainer(self.inFileName, self.inFormat, self._verbosity) | 
|  | 130 | 
|  | 131 	def _logAndRaise(self, errorMsg): | 
|  | 132 		self._log.error(errorMsg) | 
|  | 133 		raise Exception(errorMsg) | 
|  | 134 | 
|  | 135 	def run(self): | 
|  | 136 		LoggerFactory.setLevel(self._log, self._verbosity) | 
|  | 137 		self._checkOptions() | 
|  | 138 		self._log.info("START getsizes") | 
|  | 139 		self._log.debug("Input file name: %s" % self.inFileName) | 
|  | 140 | 
|  | 141 		nbItems = self.parser.getNbItems() | 
|  | 142 		self._log.info( "%i items found" % (nbItems)) | 
|  | 143 | 
|  | 144 		# treat items | 
|  | 145 		progress   = Progress(nbItems, "Analyzing sequences of %s" % (self.inFileName), self._verbosity) | 
|  | 146 		sizes      = {} | 
|  | 147 		minimum	   = 1000000000000 | 
|  | 148 		maximum	   = 0 | 
|  | 149 		sum		   = 0 | 
|  | 150 		number     = 0 | 
|  | 151 		nbSubItems = 0 | 
|  | 152 		for item in self.parser.getIterator(): | 
|  | 153 			items = [] | 
|  | 154 			if self.query == "exon": | 
|  | 155 				items = item.getExons() | 
|  | 156 			elif self.query == "exon1": | 
|  | 157 				if len(item.getExons()) > 1: | 
|  | 158 					item.sortExons() | 
|  | 159 					items = [item.getExons()[0]] | 
|  | 160 			elif self.query == "intron": | 
|  | 161 				items = item.getIntrons() | 
|  | 162 			else: | 
|  | 163 				items = [item, ] | 
|  | 164 | 
|  | 165 			for thisItem in items: | 
|  | 166 				try: | 
|  | 167 					nbElements = int(float(thisItem.getTagValue("nbElements"))) | 
|  | 168 					if nbElements == None: | 
|  | 169 						nbElements = 1 | 
|  | 170 				except: | 
|  | 171 					nbElements = 1 | 
|  | 172 				size	= thisItem.getSize() | 
|  | 173 				minimum = min(minimum, size) | 
|  | 174 				maximum = max(maximum, size) | 
|  | 175 | 
|  | 176 				if size not in sizes: | 
|  | 177 					sizes[size] = nbElements | 
|  | 178 				else: | 
|  | 179 					sizes[size] += nbElements | 
|  | 180 				sum		+= size | 
|  | 181 				nbSubItems += nbElements | 
|  | 182 			number += 1 | 
|  | 183 			progress.inc() | 
|  | 184 		progress.done() | 
|  | 185 | 
|  | 186 		if self.outFileName != None: | 
|  | 187 			plotter = RPlotter(self.outFileName, self._verbosity) | 
|  | 188 			plotter.setFill(0) | 
|  | 189 			plotter.setMinimumX(self.xMin) | 
|  | 190 			plotter.setMaximumX(self.xMax) | 
|  | 191 			plotter.setXLabel(self.xLab) | 
|  | 192 			plotter.setYLabel(self.yLab) | 
|  | 193 			plotter.setBarplot(self.barplot) | 
|  | 194 			plotter.addLine(sizes) | 
|  | 195 			plotter.plot() | 
|  | 196 | 
|  | 197 		if nbSubItems == 0: | 
|  | 198 			self._logAndRaise("No item found") | 
|  | 199 | 
|  | 200 		self.items = number | 
|  | 201 		self.subItems = nbSubItems | 
|  | 202 		self.nucleotides = sum | 
|  | 203 		self.minAvgMedMax = Utils.getMinAvgMedMax(sizes) | 
|  | 204 | 
|  | 205 		print "%d items" % (number) | 
|  | 206 		print "%d sub-items" % (nbSubItems) | 
|  | 207 		print "%d nucleotides" % (sum) | 
|  | 208 		print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(sizes) | 
|  | 209 | 
|  | 210 		self._log.info("END getsizes") | 
|  | 211 | 
|  | 212 | 
|  | 213 if __name__ == "__main__": | 
|  | 214 	iGetSizes = GetSizes() | 
|  | 215 	iGetSizes.setAttributesFromCmdLine() | 
|  | 216 	iGetSizes.run() | 
|  | 217 | 
|  | 218 #TODO: add two more options!!!!!! |