6
+ − 1 #! /usr/bin/env python
+ − 2 #
+ − 3 # Copyright INRA-URGI 2009-2010
+ − 4 #
+ − 5 # This software is governed by the CeCILL license under French law and
+ − 6 # abiding by the rules of distribution of free software. You can use,
+ − 7 # modify and/ or redistribute the software under the terms of the CeCILL
+ − 8 # license as circulated by CEA, CNRS and INRIA at the following URL
+ − 9 # "http://www.cecill.info".
+ − 10 #
+ − 11 # As a counterpart to the access to the source code and rights to copy,
+ − 12 # modify and redistribute granted by the license, users are provided only
+ − 13 # with a limited warranty and the software's author, the holder of the
+ − 14 # economic rights, and the successive licensors have only limited
+ − 15 # liability.
+ − 16 #
+ − 17 # In this respect, the user's attention is drawn to the risks associated
+ − 18 # with loading, using, modifying and/or developing or reproducing the
+ − 19 # software by the user in light of its specific status of free software,
+ − 20 # that may mean that it is complicated to manipulate, and that also
+ − 21 # therefore means that it is reserved for developers and experienced
+ − 22 # professionals having in-depth computer knowledge. Users are therefore
+ − 23 # encouraged to load and test the software's suitability as regards their
+ − 24 # requirements in conditions enabling the security of their systems and/or
+ − 25 # data to be ensured and, more generally, to use and operate it in the
+ − 26 # same conditions as regards security.
+ − 27 #
+ − 28 # The fact that you are presently reading this means that you have had
+ − 29 # knowledge of the CeCILL license and that you accept its terms.
+ − 30 #
+ − 31 import re
+ − 32 from commons.core.writer.WriterChooser import WriterChooser
+ − 33 """
+ − 34 Cluster the data into regions (defined by size and overlap with next region) and keep only highest peaks.
+ − 35 """
+ − 36
+ − 37 import os, os.path
+ − 38 from optparse import OptionParser
+ − 39 from SMART.Java.Python.structure.Transcript import Transcript
+ − 40 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+ − 41 from SMART.Java.Python.misc.RPlotter import RPlotter
+ − 42 from SMART.Java.Python.misc.Progress import Progress
+ − 43 from commons.core.writer.Gff3Writer import Gff3Writer
+ − 44
+ − 45 class ClusterizeBySlidingWindows(object):
+ − 46
+ − 47 def __init__(self, verbosity = 0):
+ − 48 self.verbosity = verbosity
+ − 49 self.strands = (0, )
+ − 50 self.normalize = False
+ − 51 self.plot = None
+ − 52 self.excel = None
+ − 53 self.outputFileName = ''
+ − 54 self.defaultValue = None
+ − 55
+ − 56 def __del__(self):
+ − 57 pass
+ − 58
+ − 59 def setInputFile(self, fileName, format):
+ − 60 self.parser = TranscriptContainer(fileName, format, self.verbosity)
+ − 61
+ − 62 def setOutputFileName(self, fileName, format="gff", title="S-MART", feature="transcript", featurePart="exon"):
+ − 63 writerChooser = WriterChooser(self.verbosity)
+ − 64 writerChooser.findFormat(format)
+ − 65 self.writer = writerChooser.getWriter(fileName)
+ − 66 self.writer.setTitle(title)
+ − 67 self.writer.setFeature(feature)
+ − 68 self.writer.setFeaturePart(featurePart)
+ − 69 # self.outputFileName = fileName
+ − 70 # self.outputFormat = format
+ − 71
+ − 72 def setWindowSize(self, size):
+ − 73 self.size = size
+ − 74
+ − 75 def setWindowOverlap(self, overlap):
+ − 76 self.overlap = overlap
+ − 77
+ − 78 def setTag(self, tag):
+ − 79 self.tag = tag
+ − 80
+ − 81 def setOperation(self, operation):
+ − 82 self.operation = operation
+ − 83
+ − 84 def setBothStrands(self, bothStrands):
+ − 85 if bothStrands:
+ − 86 self.strands = (-1, 1)
+ − 87
+ − 88 def setNormalize(self, normalize):
+ − 89 self.normalize = normalize
+ − 90
+ − 91 def setPlot(self, plot):
+ − 92 self.plot = plot
+ − 93
+ − 94 def setExcel(self, excel):
+ − 95 self.excel = excel
+ − 96
+ − 97 def setOutputTag(self, tag):
+ − 98 self.outputTagName = tag
+ − 99
+ − 100 def setDefaultValue(self, defaultValue):
+ − 101 self.defaultValue = defaultValue
+ − 102
+ − 103 def checkOptions(self):
+ − 104 # if self.operation != None:
+ − 105 # raise Exception("Trying to combine the values without specifying tag! Aborting...")
+ − 106 if self.operation != None and self.operation not in ("sum", "avg", "med", "min", "max"):
+ − 107 raise Exception("Do not understand tag '%s'! Aborting..." % (self.operation))
+ − 108
+ − 109 def getChromosomeSizes(self):
+ − 110 self.sizes = {}
+ − 111 progress = Progress(self.parser.getNbTranscripts(), "Getting sizes in genome", self.verbosity)
+ − 112 for transcript in self.parser.getIterator():
+ − 113 self.sizes[transcript.getChromosome()] = max(transcript.getStart(), self.sizes.get(transcript.getChromosome(), 0))
+ − 114 progress.inc()
+ − 115 progress.done()
+ − 116
+ − 117 def getBinsFromPos(self, pos):
+ − 118 bin = (pos - 1) / (self.size - self.overlap)
+ − 119 if bin >= 1 and pos <= bin * (self.size - self.overlap) + self.overlap:
+ − 120 return (bin - 1, bin)
+ − 121 return (bin, )
+ − 122
+ − 123 def getPosFromBin(self, bin):
+ − 124 return (bin * (self.size - self.overlap) + 1, bin * (self.size - self.overlap) + self.size)
+ − 125
+ − 126 def initializeBins(self):
+ − 127 self.binsPerStrand = {}
+ − 128 self.sumsPerStrand = {}
+ − 129 self.valuesPerStrand = {}
+ − 130 self.toBePlottedPerStrand = {}
+ − 131 for strand in self.strands:
+ − 132 self.binsPerStrand[strand] = {}
+ − 133 self.sumsPerStrand[strand] = {}
+ − 134 self.valuesPerStrand[strand] = {}
+ − 135 self.toBePlottedPerStrand[strand] = {}
+ − 136 for chromosome in self.sizes:
+ − 137 binRange = range(self.getBinsFromPos(self.sizes[chromosome])[-1] + 1)
+ − 138 self.binsPerStrand[strand][chromosome] = dict([[i, 0] for i in binRange])
+ − 139 self.sumsPerStrand[strand][chromosome] = dict([[i, 0.0] for i in binRange])
+ − 140 self.valuesPerStrand[strand][chromosome] = dict([[i, []] for i in binRange])
+ − 141 self.toBePlottedPerStrand[strand][chromosome] = dict([[i, 0] for i in binRange])
+ − 142
+ − 143 def getNbElements(self, transcript):
+ − 144 nbOccurrences = 1 if "nbOccurrences" not in transcript.getTagNames() else transcript.getTagValue("nbOccurrences")
+ − 145 nbElements = 1 if "nbElements" not in transcript.getTagNames() else transcript.getTagValue("nbElements")
+ − 146 nbOccurrences = float(nbOccurrences)
+ − 147 nbElements = float(nbElements)
+ − 148 nbElements /= float(nbOccurrences)
+ − 149 return nbElements
+ − 150
+ − 151 def setBins(self):
+ − 152 progress = Progress(self.parser.getNbTranscripts(), "Setting bins", self.verbosity)
+ − 153 for transcript in self.parser.getIterator():
+ − 154 nbElements = self.getNbElements(transcript)
+ − 155 strand = transcript.getDirection() if len(self.strands) == 2 else 0
+ − 156 for bin in self.getBinsFromPos(transcript.getStart()):
+ − 157 self.binsPerStrand[strand][transcript.getChromosome()][bin] += nbElements
+ − 158 if self.tag != None:
+ − 159 if self.tag not in transcript.getTagNames():
+ − 160 if self.defaultValue is None:
+ − 161 raise Exception("Tag %s undefined in transcript %s" % (self.tag, transcript))
+ − 162 value = self.defaultValue
+ − 163 else:
+ − 164 value = float(transcript.getTagValue(self.tag))
+ − 165 self.sumsPerStrand[strand][transcript.getChromosome()][bin] += value
+ − 166 self.valuesPerStrand[strand][transcript.getChromosome()][bin].append(value)
+ − 167 progress.inc()
+ − 168 progress.done()
+ − 169
+ − 170 def aggregateData(self):
+ − 171 if self.operation == "sum":
+ − 172 self.computeSumData()
+ − 173 elif self.operation == "avg":
+ − 174 self.computeAvgData()
+ − 175 elif self.operation == "med":
+ − 176 self.computeMedData()
+ − 177 elif self.operation == "min":
+ − 178 self.computeMinData()
+ − 179 elif self.operation == "max":
+ − 180 self.computeMaxData()
+ − 181 elif self.operation == "GCpercent":
+ − 182 self.computeGCPercent()
+ − 183 else:
+ − 184 self.toBePlottedPerStrand = self.binsPerStrand
+ − 185
+ − 186 def computeSumData(self):
+ − 187 self.toBePlottedPerStrand = self.sumsPerStrand
+ − 188
+ − 189 def computeAvgData(self):
+ − 190 for strand in self.strands:
+ − 191 for chromosome in self.binsPerStrand[strand]:
+ − 192 for bin in self.binsPerStrand[strand][chromosome]:
+ − 193 if self.binsPerStrand[strand][chromosome][bin] != 0:
+ − 194 self.toBePlottedPerStrand[strand][chromosome][bin] = float(self.sumsPerStrand[strand][chromosome][bin]) / self.binsPerStrand[strand][chromosome][bin]
+ − 195
+ − 196 def computeMedData(self):
+ − 197 for strand in self.strands:
+ − 198 for chromosome in self.binsPerStrand[strand]:
+ − 199 for bin in self.binsPerStrand[strand][chromosome]:
+ − 200 if self.valuesPerStrand[strand][chromosome][bin]:
+ − 201 self.valuesPerStrand[strand][chromosome][bin].sort()
+ − 202 size = len(self.valuesPerStrand[strand][chromosome][bin])
+ − 203 if size % 2 == 1:
+ − 204 self.toBePlottedPerStrand[strand][chromosome][bin] = self.valuesPerStrand[strand][chromosome][bin][(size - 1) / 2]
+ − 205 else:
+ − 206 self.toBePlottedPerStrand[strand][chromosome][bin] = (self.valuesPerStrand[strand][chromosome][bin][size / 2 - 1] + self.valuesPerStrand[strand][chromosome][bin][size / 2]) / 2.0
+ − 207
+ − 208 def computeMinData(self):
+ − 209 for strand in self.strands:
+ − 210 for chromosome in self.binsPerStrand[strand]:
+ − 211 for bin in self.binsPerStrand[strand][chromosome]:
+ − 212 if self.valuesPerStrand[strand][chromosome][bin]:
+ − 213 self.toBePlottedPerStrand[strand][chromosome][bin] = min(self.valuesPerStrand[strand][chromosome][bin])
+ − 214
+ − 215 def computeMaxData(self):
+ − 216 for strand in self.strands:
+ − 217 for chromosome in self.binsPerStrand[strand]:
+ − 218 for bin in self.binsPerStrand[strand][chromosome]:
+ − 219 if self.valuesPerStrand[strand][chromosome][bin]:
+ − 220 self.toBePlottedPerStrand[strand][chromosome][bin] = max(self.valuesPerStrand[strand][chromosome][bin])
+ − 221
+ − 222 def computeGCPercent(self):
+ − 223 for strand in self.strands:
+ − 224 for chromosome in self.binsPerStrand[strand]:
+ − 225 for bin in self.binsPerStrand[strand][chromosome]:
+ − 226 if self.valuesPerStrand[strand][chromosome][bin]:
+ − 227 subSequence = self.valuesPerStrand[strand][chromosome][bin]
+ − 228 NPercent = 100 * (subSequence.countNt("N") / float(subSequence.getSize()))
+ − 229 if NPercent >= 50:
+ − 230 currentGCpercent = "NA"
+ − 231 else:
+ − 232 currentGCpercent = subSequence.getGCpercentageInSequenceWithoutCountNInLength()
+ − 233
+ − 234 self.toBePlottedPerStrand[strand][chromosome][bin] = currentGCpercent
+ − 235 #TODO: see if a map method could be used for the various "compute" methods
+ − 236 #return currentGCpercent, NPercent
+ − 237
+ − 238 def plotData(self):
+ − 239 if self.plot != None:
+ − 240 for strand in self.strands:
+ − 241 adjunct = ""
+ − 242 if strand != 0:
+ − 243 adjunct = "Strand%d" % (strand)
+ − 244 for chromosome in self.toBePlottedPerStrand[strand]:
+ − 245 if len(self.toBePlottedPerStrand[strand][chromosome].keys()) > 0:
+ − 246 plotter = RPlotter(self.plot, self.verbosity)
+ − 247 plotter.setFill(0)
+ − 248 plotter.addLine(self.toBePlottedPerStrand[strand][chromosome], chromosome)
+ − 249 plotter.plot()
+ − 250
+ − 251 def writeExcel(self):
+ − 252 if self.excel != None:
+ − 253 excelFile = open(self.excel, "w")
+ − 254 for strand in self.strands:
+ − 255 maxBin = max([max(self.toBePlottedPerStrand[strand][chromosome].keys()) for chromosome in self.binsPerStrand[strand]])
+ − 256 for bin in range(0, maxBin + 1):
+ − 257 excelFile.write(",%d-%d" % self.getPosFromBin(bin))
+ − 258 excelFile.write("\n")
+ − 259 for chromosome in self.toBePlottedPerStrand[strand]:
+ − 260 excelFile.write("%s" % (chromosome))
+ − 261 for bin in self.toBePlottedPerStrand[strand][chromosome]:
+ − 262 excelFile.write(",%f" % (self.toBePlottedPerStrand[strand][chromosome][bin]))
+ − 263 excelFile.write("\n")
+ − 264 excelFile.close()
+ − 265
+ − 266 def printRegions(self):
+ − 267 cpt = 1
+ − 268 tagOp = "nb"
+ − 269 tagName = "Elements"
+ − 270 outputTagName = "nbElements"
+ − 271 if self.operation != None:
+ − 272 tagOp = self.operation.lower()
+ − 273 if self.tag != None:
+ − 274 tagName = self.tag.title()
+ − 275 if self.outputTagName != None:
+ − 276 outputTagName = self.outputTagName
+ − 277
+ − 278
+ − 279 #writer = Gff3Writer(self.outputFileName, self.verbosity)
+ − 280
+ − 281 for strand in self.strands:
+ − 282 for chromosome in self.toBePlottedPerStrand[strand]:
+ − 283 for bin in self.toBePlottedPerStrand[strand][chromosome]:
+ − 284 transcript = Transcript()
+ − 285 transcript.setName("region%d" % cpt)
+ − 286 transcript.setChromosome(chromosome)
+ − 287 transcript.setStart(self.getPosFromBin(bin)[0])
+ − 288 transcript.setEnd(self.getPosFromBin(bin)[1])
+ − 289 transcript.setDirection(1 if strand == 0 else strand)
+ − 290 transcript.setTagValue(outputTagName, self.binsPerStrand[strand][chromosome][bin])
+ − 291 transcript.setTagValue("%s%s" % (tagOp, tagName), str(self.toBePlottedPerStrand[strand][chromosome][bin]))
+ − 292 self.writer.addTranscript(transcript)
+ − 293 cpt += 1
+ − 294 self.writer.close()
+ − 295
+ − 296 def run(self):
+ − 297 self.checkOptions()
+ − 298 self.getChromosomeSizes()
+ − 299 self.initializeBins()
+ − 300 self.setBins()
+ − 301 self.aggregateData()
+ − 302 if self.excel:
+ − 303 self.writeExcel()
+ − 304 if self.plot:
+ − 305 self.plotData()
+ − 306 self.printRegions()
+ − 307
+ − 308
+ − 309 if __name__ == "__main__":
+ − 310
+ − 311 # parse command line
+ − 312 description = "Clusterize by Sliding Windows v1.0.1: Produces a GFF3 file that clusters a list of transcripts using a sliding window. [Category: Sliding Windows]"
+ − 313
+ − 314 parser = OptionParser(description = description)
+ − 315 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
+ − 316 parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
+ − 317 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
+ − 318 parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="format of the output file [format: transcript file format]")
+ − 319 parser.add_option("-s", "--size", dest="size", action="store", type="int", help="size of the regions [compulsory] [format: int]")
+ − 320 parser.add_option("-e", "--overlap", dest="overlap", action="store", type="int", help="overlap between two consecutive regions [compulsory] [format: int]")
+ − 321 parser.add_option("-m", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
+ − 322 parser.add_option("-g", "--tag", dest="tag", action="store", default=None, type="string", help="use a given tag as input (instead of summing number of features) [format: string]")
+ − 323 parser.add_option("-r", "--operation", dest="operation", action="store", default=None, type="string", help="combine tag value with given operation [format: choice (sum, avg, med, min, max)]")
+ − 324 parser.add_option("-d", "--defaultValue",dest="defaultValue", action="store", type="float", help="default value for input tag [format: float]")
+ − 325 parser.add_option("-w", "--write", dest="writeTag", action="store", default=None, type="string", help="print the result in the given tag (default usually is 'nbElements') [format: string]")
+ − 326 parser.add_option("-2", "--strands", dest="strands", action="store_true", default=False, help="consider the two strands separately [format: bool] [default: false]")
+ − 327 parser.add_option("-p", "--plot", dest="plot", action="store", default=None, type="string", help="plot regions to the given file [format: output file in PNG format]")
+ − 328 parser.add_option("-x", "--excel", dest="excel", action="store", default=None, type="string", help="write an Excel file to the given file [format: output file in Excel format]")
+ − 329 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
+ − 330 (options, args) = parser.parse_args()
+ − 331
+ − 332 cbsw = ClusterizeBySlidingWindows(options.verbosity)
+ − 333 cbsw.setInputFile(options.inputFileName, options.inputFormat)
+ − 334 cbsw.setOutputFileName(options.outputFileName, options.outputFormat)
+ − 335 cbsw.setWindowSize(options.size)
+ − 336 cbsw.setWindowOverlap(options.overlap)
+ − 337 cbsw.setTag(options.tag)
+ − 338 cbsw.setDefaultValue(options.defaultValue)
+ − 339 cbsw.setOperation(options.operation)
+ − 340 cbsw.setOutputTag(options.writeTag)
+ − 341 cbsw.setBothStrands(options.strands)
+ − 342 cbsw.setPlot(options.plot)
+ − 343 cbsw.setExcel(options.excel)
+ − 344 cbsw.run()