Mercurial > repos > yufei-luo > s_mart
diff smart_toolShed/SMART/Java/Python/getWigProfile.py @ 0:e0f8dcca02ed
Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author | yufei-luo |
---|---|
date | Thu, 17 Jan 2013 10:52:14 -0500 |
parents | |
children |
line wrap: on
line diff
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
# NOTE: reconstructed from the diff view of
# b/smart_toolShed/SMART/Java/Python/getWigProfile.py (added as a new file).
#
"""
Compute the average profile of a set of genomic coordinates using the values
stored in a WIG file, and plot it.

Every transcript is rescaled to a fixed number of points so that transcripts
of different sizes can be averaged; an optional flanking region around each
transcript is kept at its original resolution.
"""

import math
from optparse import OptionParser
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from commons.core.parsing.WigParser import WigParser
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.RPlotter import RPlotter


class GetWigProfile(object):
    """Average WIG values over a set of transcripts and plot the profile.

    The configuration attributes are plain instance attributes that the
    caller sets before calling readTranscripts(), smoothen() and plot():

    - inputFileName / inputFormat: transcript file and its format
    - wig:            name of the WIG file
    - nbPoints:       number of bins each transcript body is rescaled to
    - distance:       number of flanking positions kept on each side
    - strands:        when true, both strands are profiled separately
    - smoothenForce:  half-width of the smoothing window (None: no smoothing)
    - defaultValue:   value used where no WIG data is available
    - outputFileName: file handed to the plotter
    - log:            when true, the y-axis uses a log scale
    """

    def __init__(self, verbosity):
        """Store the verbosity and give every setting its command-line default."""
        self.verbosity = verbosity
        self.values = {}
        self.defaultValue = 0.0
        # Defaults mirror the command-line options declared below, so that the
        # class can also be used programmatically without setting everything.
        self.inputFileName = None
        self.inputFormat = None
        self.wig = None
        self.outputFileName = None
        self.nbPoints = 1000
        self.distance = 0
        self.strands = False
        self.smoothenForce = None
        self.log = False

    def _iToJ(self, i, size):
        """Map position i of an extended transcript to a bin index.

        i is an index in the extended value list (flank included), size is
        the size of the transcript body.  The result is capped at
        nbPoints + 1.
        """
        return min(self.nbPoints+1, int(math.floor(float(i - self.distance) / (size) * (self.nbPoints))))

    def _binTranscriptValues(self, wigValues, transcriptSize):
        """Turn the values of one extended transcript into a fixed-size profile.

        wigValues must hold exactly transcriptSize + 2 * distance values.
        The two flanks are copied as they are; the transcript body is
        averaged into nbPoints bins (a bin that receives no value gets
        defaultValue).  Returns a list of nbPoints + 2 * distance values.
        """
        bodyEnd = len(wigValues) - self.distance
        fivePValues = wigValues[:self.distance]
        # An explicit positive start index is required here: with a distance
        # of 0, wigValues[-0:] would be the whole list instead of an empty one.
        threePValues = wigValues[bodyEnd:]
        binSums = [0.0] * self.nbPoints
        binCounts = [0.0] * self.nbPoints
        for i in range(self.distance, bodyEnd):
            startJ = self._iToJ(i, transcriptSize)
            # Each position feeds at least one bin; when there are more bins
            # than positions, a position is spread over several bins.
            endJ = max(startJ + 1, self._iToJ(i + 1, transcriptSize))
            for j in range(startJ, endJ):
                binSums[j] += wigValues[i]
                binCounts[j] += 1
        bodyValues = [self.defaultValue if binCount == 0 else binSum / binCount for binSum, binCount in zip(binSums, binCounts)]
        return fivePValues + bodyValues + threePValues

    def readTranscripts(self):
        """Read the transcripts and accumulate their average WIG profile.

        Fills self.strandNames ((1, -1) when strands are considered
        separately, (1, ) otherwise) and self.values, a dict mapping each
        strand to a dict {position: average value}.  The values of strand -1
        are divided by a negative count, so they end up with the opposite
        sign.

        Raises an Exception when the WIG data extracted for a transcript is
        longer than the extended transcript.
        """
        profileSize = self.nbPoints + 2 * self.distance
        self.strandNames = (1, -1) if self.strands else (1, )
        self.values = dict((strand, dict((i, 0.0) for i in range(profileSize))) for strand in self.strandNames)
        transcriptParser = TranscriptContainer(self.inputFileName, self.inputFormat, self.verbosity)
        wigParser = WigParser(self.wig)
        wigParser.setStrands(self.strands)
        wigParser.setDefaultValue(self.defaultValue)

        progress = Progress(transcriptParser.getNbTranscripts(), "Parsing %s" % (self.inputFileName), self.verbosity)
        for transcript in transcriptParser.getIterator():
            transcriptSize = transcript.getSize()
            expectedSize = transcriptSize + 2 * self.distance
            transcript.extendStart(self.distance)
            transcript.extendEnd(self.distance)
            theseValues = transcript.extractWigData(wigParser)

            # With a single strand, the extracted data is used directly as
            # the value list; wrap it so both cases are handled the same way.
            if len(self.strandNames) == 1:
                theseValues = {1: theseValues}
            for strand in self.strandNames:
                strandValues = theseValues[strand]
                # Missing values are padded at the beginning of the list.
                if len(strandValues) < expectedSize:
                    strandValues = [self.defaultValue] * (expectedSize - len(strandValues)) + strandValues
                if len(strandValues) != expectedSize:
                    raise Exception("Got something wrong with the size of the WIG data concerning %s [%s]: %d found instead of %d" % (transcript, ",".join(["%d-%d" % (exon.getStart(), exon.getEnd()) for exon in transcript.getExons()]), len(strandValues), expectedSize))
                strandProfile = self.values[strand]
                for i, value in enumerate(self._binTranscriptValues(strandValues, transcriptSize)):
                    strandProfile[i] += value
            progress.inc()
        progress.done()

        nbTranscripts = transcriptParser.getNbTranscripts()
        if nbTranscripts == 0:
            # Nothing was accumulated: keep the all-zero profile instead of
            # dividing by zero.
            return
        for strand in self.strandNames:
            # Multiplying by the strand (1 or -1) flips the sign of the
            # reverse-strand profile.
            divisor = nbTranscripts * strand
            strandProfile = self.values[strand]
            for i in range(profileSize):
                strandProfile[i] /= divisor


    def smoothen(self):
        """Smooth the transcript body of each profile with a moving average.

        Each body position is replaced by the mean of the body positions
        located at less than smoothenForce positions from it; the flanking
        regions are neither modified nor used.  Nothing is done when
        smoothenForce is None or lower than 1 (the window would be empty).
        """
        if self.smoothenForce is None or self.smoothenForce < 1:
            return
        bodyStart = self.distance
        bodyEnd = self.distance + self.nbPoints
        for strand in self.strandNames:
            strandProfile = self.values[strand]
            averageValues = {}
            for center in range(bodyStart, bodyEnd):
                # The window is clipped to the body, both bounds included:
                # the first body position must take part in the average.
                windowStart = max(bodyStart, center - self.smoothenForce + 1)
                windowEnd = min(bodyEnd, center + self.smoothenForce)
                window = [strandProfile[i] for i in range(windowStart, windowEnd)]
                averageValues[center] = sum(window, 0.0) / len(window)
            # The averages are written back only once all of them have been
            # computed from the unsmoothed values.
            for position in range(bodyStart, bodyEnd):
                strandProfile[position] = averageValues[position]


    def plot(self):
        """Plot one line per strand into outputFileName using RPlotter."""
        plotter = RPlotter(self.outputFileName, self.verbosity)
        for strand in self.strandNames:
            plotter.addLine(self.values[strand])
        if self.log:
            plotter.setLog("y")
        lastPosition = 2 * self.distance + self.nbPoints - 1
        axisLabels = {}
        # Without flanking regions, the flank labels would sit on the same
        # positions as "start" and "end" and overwrite them.
        if self.distance > 0:
            axisLabels[0] = -self.distance
            axisLabels[lastPosition] = self.distance
        axisLabels[self.distance] = "start"
        axisLabels[self.distance + self.nbPoints - 1] = "end"
        plotter.setAxisLabel("x", axisLabels)
        plotter.plot()



if __name__ == "__main__":

    # parse command line
    description = "Get WIG Profile v1.0.1: Compute the average profile of some genomic coordinates using WIG files (thus covering a large proportion of the genome). [Category: WIG Tools]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-w", "--wig", dest="wig", action="store", type="string", help="wig file name [compulsory] [format: file in WIG format]")
    parser.add_option("-p", "--nbPoints", dest="nbPoints", action="store", default=1000, type="int", help="number of points on the x-axis [compulsory] [format: int] [default: 1000]")
    parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="distance around genomic coordinates [compulsory] [format: int] [default: 0]")
    parser.add_option("-s", "--strands", dest="strands", action="store_true", default=False, help="consider both strands separately [format: boolean] [default: False]")
    parser.add_option("-m", "--smoothen", dest="smoothen", action="store", default=None, type="int", help="smoothen the curve [format: int] [default: None]")
    parser.add_option("-a", "--default", dest="defaultValue", action="store", default=0.0, type="float", help="default value (when value is NA) [default: 0.0] [format: float]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]")
    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="use log scale for y-axis [format: boolean] [default: False]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # copy the command-line settings onto the profile builder
    wigProfile = GetWigProfile(options.verbosity)
    wigProfile.strands = options.strands
    wigProfile.inputFileName = options.inputFileName
    wigProfile.inputFormat = options.inputFormat
    wigProfile.wig = options.wig
    wigProfile.nbPoints = options.nbPoints
    wigProfile.distance = options.distance
    wigProfile.smoothenForce = options.smoothen
    wigProfile.defaultValue = options.defaultValue
    wigProfile.outputFileName = options.outputFileName
    wigProfile.log = options.log

    # build the average profile, optionally smooth it, then draw it
    wigProfile.readTranscripts()
    wigProfile.smoothen()
    wigProfile.plot()